Fix NFS exports of FUSE file systems for big directories

The FUSE protocol does not require that a directory entry's d_off field
outlive the lifetime of its directory's file handle.  Since the NFS
server must reopen the directory on every VOP_READDIR call, that means
it can't pass uio->uio_offset down to the FUSE server.  Instead, it must
read the directory from 0 each time.  It may need to issue multiple
FUSE_READDIR operations until it finds the d_off field that it's looking
for.  That was the intention behind SVN r348209 and r297887, but a logic
bug prevented subsequent FUSE_READDIR operations from ever being issued,
rendering large directories incompletely browseable.

Reviewed by:	rmacklem

(cherry picked from commit d088dc76e1)

fusefs: optimize NFS readdir for FUSE_NO_OPENDIR_SUPPORT

In its lowest common denominator, FUSE does not require that a directory
entry's d_off field is valid outside of the lifetime of the directory's
FUSE file handle.  But since NFS is stateless, it must reopen the
directory on every call to VOP_READDIR.  That means reading the
directory all the way from the first entry.  Not only does this create
an O(n^2) condition for large directories, but it can also result in
incorrect behavior if either:

* The file system _does_ change the d_off field for the last directory
  entry previously seen by NFS, or
* The file system deletes the last directory entry previously seen by
  NFS.

Handily, for file systems that set FUSE_NO_OPENDIR_SUPPORT d_off is
guaranteed to be valid for the lifetime of the directory entry, there is
no need to read the directory from the start.

Reviewed by:	rmacklem

(cherry picked from commit 4a6526d84a)

fusefs: require FUSE_NO_OPENDIR_SUPPORT for NFS exporting

FUSE file systems that do not set FUSE_NO_OPENDIR_SUPPORT do not
guarantee that d_off will be valid after closing and reopening a
directory.  That conflicts with NFS's statelessness, that results in
unresolvable bugs when NFS reads large directories, if:

* The file system _does_ change the d_off field for the last directory
  entry previously returned by VOP_READDIR, or
* The file system deletes the last directory entry previously seen by
  NFS.

Rather than doing a poor job of exporting such file systems, it's better
just to refuse.

Even though this is technically a breaking change, 13.0-RELEASE's
NFS-FUSE support was bad enough that an MFC should be allowed.

Reviewed by:	rmacklem
Differential Revision: https://reviews.freebsd.org/D33726

(cherry picked from commit 00134a0789)

fusefs: fix the build without INVARIANTS after 00134a0789

MFC with:	00134a0789
Reported by:	se

(cherry picked from commit 18ed2ce77a)
This commit is contained in:
Alan Somers 2022-01-02 10:18:47 -07:00
parent 1e40fc6fa9
commit 3d721de049
3 changed files with 61 additions and 74 deletions

View file

@ -554,7 +554,6 @@ fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
int
fuse_internal_readdir(struct vnode *vp,
struct uio *uio,
off_t startoff,
struct fuse_filehandle *fufh,
struct fuse_iov *cookediov,
int *ncookies,
@ -563,7 +562,6 @@ fuse_internal_readdir(struct vnode *vp,
int err = 0;
struct fuse_dispatcher fdi;
struct fuse_read_in *fri = NULL;
int fnd_start;
if (uio_resid(uio) == 0)
return 0;
@ -573,25 +571,9 @@ fuse_internal_readdir(struct vnode *vp,
* Note that we DO NOT have a UIO_SYSSPACE here (so no need for p2p
* I/O).
*/
/*
* fnd_start is set non-zero once the offset in the directory gets
* to the startoff. This is done because directories must be read
* from the beginning (offset == 0) when fuse_vnop_readdir() needs
* to do an open of the directory.
* If it is not set non-zero here, it will be set non-zero in
* fuse_internal_readdir_processdata() when uio_offset == startoff.
*/
fnd_start = 0;
if (uio->uio_offset == startoff)
fnd_start = 1;
while (uio_resid(uio) > 0) {
fdi.iosize = sizeof(*fri);
if (fri == NULL)
fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
else
fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
fri = fdi.indata;
fri->fh = fufh->fh_id;
fri->offset = uio_offset(uio);
@ -600,9 +582,8 @@ fuse_internal_readdir(struct vnode *vp,
if ((err = fdisp_wait_answ(&fdi)))
break;
if ((err = fuse_internal_readdir_processdata(uio, startoff,
&fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov,
ncookies, &cookies)))
if ((err = fuse_internal_readdir_processdata(uio, fri->size,
fdi.answ, fdi.iosize, cookediov, ncookies, &cookies)))
break;
}
@ -617,8 +598,6 @@ fuse_internal_readdir(struct vnode *vp,
*/
int
fuse_internal_readdir_processdata(struct uio *uio,
off_t startoff,
int *fnd_start,
size_t reqsize,
void *buf,
size_t bufsize,
@ -672,39 +651,32 @@ fuse_internal_readdir_processdata(struct uio *uio,
err = -1;
break;
}
/*
* Don't start to copy the directory entries out until
* the requested offset in the directory is found.
*/
if (*fnd_start != 0) {
fiov_adjust(cookediov, oreclen);
bzero(cookediov->base, oreclen);
fiov_adjust(cookediov, oreclen);
bzero(cookediov->base, oreclen);
de = (struct dirent *)cookediov->base;
de->d_fileno = fudge->ino;
de->d_off = fudge->off;
de->d_reclen = oreclen;
de->d_type = fudge->type;
de->d_namlen = fudge->namelen;
memcpy((char *)cookediov->base + sizeof(struct dirent) -
MAXNAMLEN - 1,
(char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
dirent_terminate(de);
de = (struct dirent *)cookediov->base;
de->d_fileno = fudge->ino;
de->d_off = fudge->off;
de->d_reclen = oreclen;
de->d_type = fudge->type;
de->d_namlen = fudge->namelen;
memcpy((char *)cookediov->base + sizeof(struct dirent) -
MAXNAMLEN - 1,
(char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
dirent_terminate(de);
err = uiomove(cookediov->base, cookediov->len, uio);
if (err)
err = uiomove(cookediov->base, cookediov->len, uio);
if (err)
break;
if (cookies != NULL) {
if (*ncookies == 0) {
err = -1;
break;
if (cookies != NULL) {
if (*ncookies == 0) {
err = -1;
break;
}
*cookies = fudge->off;
cookies++;
(*ncookies)--;
}
} else if (startoff == fudge->off)
*fnd_start = 1;
*cookies = fudge->off;
cookies++;
(*ncookies)--;
}
buf = (char *)buf + freclen;
bufsize -= freclen;
uio_setoffset(uio, fudge->off);

View file

@ -250,12 +250,12 @@ int fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
struct pseudo_dirent {
uint32_t d_namlen;
};
int fuse_internal_readdir(struct vnode *vp, struct uio *uio, off_t startoff,
int fuse_internal_readdir(struct vnode *vp, struct uio *uio,
struct fuse_filehandle *fufh, struct fuse_iov *cookediov, int *ncookies,
u_long *cookies);
int fuse_internal_readdir_processdata(struct uio *uio, off_t startoff,
int *fnd_start, size_t reqsize, void *buf, size_t bufsize,
struct fuse_iov *cookediov, int *ncookies, u_long **cookiesp);
uint64_t *cookies);
int fuse_internal_readdir_processdata(struct uio *uio, size_t reqsize,
void *buf, size_t bufsize, struct fuse_iov *cookediov, int *ncookies,
u_long **cookiesp);
/* remove */

View file

@ -1847,10 +1847,10 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
struct uio *uio = ap->a_uio;
struct ucred *cred = ap->a_cred;
struct fuse_filehandle *fufh = NULL;
struct mount *mp = vnode_mount(vp);
struct fuse_iov cookediov;
int err = 0;
u_long *cookies;
off_t startoff;
ssize_t tresid;
int ncookies;
bool closefufh = false;
@ -1867,24 +1867,17 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
}
tresid = uio->uio_resid;
startoff = uio->uio_offset;
err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
if (err == EBADF && mp->mnt_flag & MNT_EXPORTED) {
KASSERT(fuse_get_mpdata(mp)->dataflags
& FSESS_NO_OPENDIR_SUPPORT,
("FUSE file systems that don't set "
"FUSE_NO_OPENDIR_SUPPORT should not be exported"));
/*
* nfsd will do VOP_READDIR without first doing VOP_OPEN. We
* must implicitly open the directory here
* must implicitly open the directory here.
*/
err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred);
if (err == 0) {
/*
* When a directory is opened, it must be read from
* the beginning. Hopefully, the "startoff" still
* exists as an offset cookie for the directory.
* If not, it will read the entire directory without
* returning any entries and just return eof.
*/
uio->uio_offset = 0;
}
closefufh = true;
}
if (err)
@ -1902,7 +1895,7 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
#define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
fiov_init(&cookediov, DIRCOOKEDSIZE);
err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov,
err = fuse_internal_readdir(vp, uio, fufh, &cookediov,
&ncookies, cookies);
fiov_teardown(&cookediov);
@ -2999,8 +2992,30 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap)
struct vattr va;
int err;
if (!(data->dataflags & FSESS_EXPORT_SUPPORT))
if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) {
/* NFS requires lookups for "." and ".." */
SDT_PROBE2(fusefs, , vnops, trace, 1,
"VOP_VPTOFH without FUSE_EXPORT_SUPPORT");
return EOPNOTSUPP;
}
if ((mp->mnt_flag & MNT_EXPORTED) &&
!(data->dataflags & FSESS_NO_OPENDIR_SUPPORT))
{
/*
* NFS is stateless, so nfsd must reopen a directory on every
* call to VOP_READDIR, passing in the d_off field from the
* final dirent of the previous invocation. But without
* FUSE_NO_OPENDIR_SUPPORT, the FUSE protocol does not
* guarantee that d_off will be valid after a directory is
* closed and reopened. So prohibit exporting FUSE file
* systems that don't set that flag.
*
* But userspace NFS servers don't have this problem.
*/
SDT_PROBE2(fusefs, , vnops, trace, 1,
"VOP_VPTOFH without FUSE_NO_OPENDIR_SUPPORT");
return EOPNOTSUPP;
}
err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread);
if (err)