diff --git a/Makefile.inc1 b/Makefile.inc1 index 8838d31c795..f740cc5abcc 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -3042,6 +3042,7 @@ _prebuild_libs= ${_kerberos5_lib_libasn1} \ lib/libpam/libpam lib/libthr \ ${_lib_libradius} lib/libsbuf lib/libtacplus \ lib/libgeom \ + ${_lib_librt} \ ${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \ ${_cddl_lib_libuutil} \ ${_cddl_lib_libavl} \ @@ -3097,6 +3098,7 @@ lib/libpjdlog__L: lib/libutil__L lib/libcasper__L: lib/libnv__L lib/liblzma__L: lib/libmd__L lib/libthr__L lib/libzstd__L: lib/libthr__L +lib/librt__L: lib/libthr__L _generic_libs= ${_cddl_lib} gnu/lib ${_kerberos5_lib} lib ${_secure_lib} .if ${MK_IPFILTER} != "no" @@ -3122,6 +3124,7 @@ cddl/lib/libnvpair__L: cddl/lib/libspl__L cddl/lib/libuutil__L: cddl/lib/libavl__L cddl/lib/libspl__L .if ${MK_ZFS} != "no" +_lib_librt= lib/librt _cddl_lib_libicp= cddl/lib/libicp _cddl_lib_libicp_rescue= cddl/lib/libicp_rescue _cddl_lib_libtpool= cddl/lib/libtpool @@ -3136,7 +3139,7 @@ cddl/lib/libzutil__L: cddl/lib/libavl__L lib/libgeom__L lib/msun__L cddl/lib/lib cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L cddl/lib/libspl__L cddl/lib/libzutil__L -cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L +cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L lib/librt__L cddl/lib/libzfs__L: lib/libthr__L lib/libmd__L lib/libz__L cddl/lib/libumem__L cddl/lib/libzfs__L: cddl/lib/libuutil__L cddl/lib/libavl__L lib/libgeom__L cddl/lib/libzfs__L: cddl/lib/libnvpair__L cddl/lib/libzutil__L diff --git a/ObsoleteFiles.inc b/ObsoleteFiles.inc index 655a81cd900..e8f6e2e1693 100644 --- a/ObsoleteFiles.inc +++ b/ObsoleteFiles.inc @@ -51,6 +51,9 @@ # xargs -n1 | sort | uniq -d; # done +# 20230902: libzfs now requires librt, moved to /lib +MOVED_LIBS+=usr/lib/librt.so.1 + # 20230807: GoogleTest 1.14.0 upgrade. OLD_FILES+=usr/include/private/gmock/gmock-generated-actions.h OLD_FILES+=usr/include/private/gmock/gmock-generated-function-mockers.h diff --git a/cddl/lib/libzfs/Makefile b/cddl/lib/libzfs/Makefile index f96ba4f24e5..6f8965e4c14 100644 --- a/cddl/lib/libzfs/Makefile +++ b/cddl/lib/libzfs/Makefile @@ -19,6 +19,7 @@ LIBADD= \ md \ nvpair \ pthread \ + rt \ umem \ util \ uutil \ diff --git a/cddl/lib/libzfs/Makefile.depend b/cddl/lib/libzfs/Makefile.depend index 60fad109048..1fead4b4f29 100644 --- a/cddl/lib/libzfs/Makefile.depend +++ b/cddl/lib/libzfs/Makefile.depend @@ -15,6 +15,7 @@ DIRDEPS = \ lib/libexpat \ lib/libgeom \ lib/libmd \ + lib/librt \ lib/libthr \ lib/libutil \ lib/libz \ diff --git a/lib/librt/Makefile b/lib/librt/Makefile index 9a54c3ea281..c755bb123c7 100644 --- a/lib/librt/Makefile +++ b/lib/librt/Makefile @@ -1,3 +1,4 @@ +SHLIBDIR?=/lib .include diff --git a/rescue/rescue/Makefile b/rescue/rescue/Makefile index 9eb41eecb1c..0283aed9391 100644 --- a/rescue/rescue/Makefile +++ b/rescue/rescue/Makefile @@ -143,7 +143,7 @@ CRUNCH_PROGS_usr.sbin+= zdb CRUNCH_LIBS+= -l80211 -lalias -lcam -lncursesw -ldevstat -lipsec -llzma .if ${MK_ZFS} != "no" -CRUNCH_LIBS+= -lavl -lpthread -luutil -lumem -ltpool -lspl +CRUNCH_LIBS+= -lavl -lpthread -luutil -lumem -ltpool -lspl -lrt CRUNCH_LIBS_zfs+= ${LIBBE} \ ${LIBZPOOL} \ ${LIBZFS} \ diff --git a/rescue/rescue/Makefile.depend b/rescue/rescue/Makefile.depend index d9f20a6afd5..0bfb25d3f17 100644 --- a/rescue/rescue/Makefile.depend +++ b/rescue/rescue/Makefile.depend @@ -39,6 +39,7 @@ DIRDEPS = \ lib/libmd \ lib/libmt \ lib/libnv \ + lib/librt \ lib/libsbuf \ lib/libthr \ lib/libufs \ diff --git a/share/mk/src.libnames.mk b/share/mk/src.libnames.mk index d458b28adf3..6a79d0b864d 100644 --- a/share/mk/src.libnames.mk +++ b/share/mk/src.libnames.mk @@ -413,7 +413,7 @@ _DP_fifolog= z _DP_ipf= kvm _DP_tpool= spl _DP_uutil= avl spl -_DP_zfs= md pthread umem util uutil m avl bsdxml crypto geom nvpair \ +_DP_zfs= md pthread rt umem util uutil m avl bsdxml crypto geom nvpair \ z zfs_core zutil _DP_zfsbootenv= zfs nvpair _DP_zfs_core= nvpair spl zutil diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 5f834d5cc7c..e4e770026b5 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -2,9 +2,9 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.2.0 -Release: rc1 +Release: rc3 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.3 +Linux-Maximum: 6.4 Linux-Minimum: 3.10 diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 9568d2bbfe3..4b9921d47b8 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = { #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) +typedef struct zdb_brt_entry { + dva_t zbre_dva; + uint64_t zbre_refcount; + avl_node_t zbre_node; +} zdb_brt_entry_t; + typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_clone_asize; + uint64_t zcb_clone_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; @@ -5368,6 +5377,8 @@ typedef struct zdb_cb { int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; + avl_tree_t zcb_brt; + boolean_t zcb_brt_is_active; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ @@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); + if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if its there, we note it and its refcount then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. + */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre != NULL) { + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + return; + } + + uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); + if (crefcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = crefcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } + if (dump_opt['L']) return; @@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa) iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } +static int +zdb_brt_entry_compare(const void *zcn1, const void *zcn2) +{ + const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; + const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; + int cmp; + + cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (cmp == 0) + cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); + + return (cmp); +} + static int dump_block_stats(spa_t *spa) { @@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa) zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); + if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, + sizeof (zdb_brt_entry_t), + offsetof(zdb_brt_entry_t, zbre_node)); + zcb->zcb_brt_is_active = B_TRUE; + } + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", @@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa) metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); - total_found = tzb->zb_asize - zcb->zcb_dedup_asize + + total_found = + tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { @@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa) "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, (u_longlong_t)zcb->zcb_dedup_blocks, (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\t%-16s %14llu count: %6llu\n", + "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, + (u_longlong_t)zcb->zcb_clone_blocks); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c index b07a0271229..a8d084bb4bd 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c @@ -607,8 +607,6 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) */ if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || strcmp(dp->dd_compare, path) != 0) { - zed_log_msg(LOG_INFO, " %s: no match (%s != vdev %s)", - __func__, dp->dd_compare, path); return; } if (dp->dd_new_vdev_guid != 0 && dp->dd_new_vdev_guid != guid) { diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c index f83ae09259a..a0e377a4a0c 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c @@ -416,6 +416,11 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) return; + if (vdev_guid == 0) { + fmd_hdl_debug(hdl, "Got a zero GUID"); + return; + } + if (spare) { int nspares = find_and_remove_spares(zhdl, vdev_guid); fmd_hdl_debug(hdl, "%d spares removed", nspares); diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am index c65b43fb027..812558cf6d0 100644 --- a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am +++ b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am @@ -16,6 +16,7 @@ dist_zedexec_SCRIPTS = \ %D%/scrub_finish-notify.sh \ %D%/statechange-led.sh \ %D%/statechange-notify.sh \ + %D%/statechange-slot_off.sh \ %D%/trim_finish-notify.sh \ %D%/vdev_attach-led.sh \ %D%/vdev_clear-led.sh @@ -35,6 +36,7 @@ zedconfdefaults = \ scrub_finish-notify.sh \ statechange-led.sh \ statechange-notify.sh \ + statechange-slot_off.sh \ vdev_attach-led.sh \ vdev_clear-led.sh diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-slot_off.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-slot_off.sh new file mode 100755 index 00000000000..150012abe71 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-slot_off.sh @@ -0,0 +1,64 @@ +#!/bin/sh +# shellcheck disable=SC3014,SC2154,SC2086,SC2034 +# +# Turn off disk's enclosure slot if it becomes FAULTED. +# +# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos +# as they flip between FAULTED and ONLINE. If +# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets +# FAULTED, then power down the slot via sysfs: +# +# /sys/class/enclosure///power_status +# +# We assume the user will be responsible for turning the slot back on again. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI Enclosure services (SES) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: slot successfully powered off +# 1: enclosure not available +# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled +# 3: vdev was not FAULTED +# 4: The enclosure sysfs path passed from ZFS does not exist +# 5: Enclosure slot didn't actually turn off after we told it to + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + # No JBOD enclosure or NVMe slots + exit 1 +fi + +if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then + exit 2 +fi + +if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then + exit 3 +fi + +if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then + exit 4 +fi + +# Turn off the slot and wait for sysfs to report that the slot is off. +# It can take ~400ms on some enclosures and multiple retries may be needed. +for i in $(seq 1 20) ; do + echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + + for j in $(seq 1 5) ; do + if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then + break 2 + fi + sleep 0.1 + done +done + +if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then + exit 5 +fi + +zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH" diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc index c55a70c79f7..78dc1afc7b1 100644 --- a/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc +++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc @@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # Disabled by default, 1 to enable and 0 to disable. #ZED_SYSLOG_DISPLAY_GUIDS=1 +## +# Power off the drive's slot in the enclosure if it becomes FAULTED. This can +# help silence misbehaving drives. This assumes your drive enclosure fully +# supports slot power control via sysfs. +#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1 diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 10a3b5b14fc..6d0dae8d8b0 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -3143,6 +3143,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; + int ms_status = 0; zpool_handle_t *zhp; const char *name; uint64_t version; @@ -3232,10 +3233,15 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, ret = 1; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && - !(flags & ZFS_IMPORT_ONLY) && - zpool_enable_datasets(zhp, mntopts, 0) != 0) { - zpool_close(zhp); - return (1); + !(flags & ZFS_IMPORT_ONLY)) { + ms_status = zpool_enable_datasets(zhp, mntopts, 0); + if (ms_status == EZFS_SHAREFAILED) { + (void) fprintf(stderr, gettext("Import was " + "successful, but unable to share some datasets")); + } else if (ms_status == EZFS_MOUNTFAILED) { + (void) fprintf(stderr, gettext("Import was " + "successful, but unable to mount some datasets")); + } } zpool_close(zhp); @@ -6755,6 +6761,7 @@ zpool_do_split(int argc, char **argv) char *mntopts = NULL; splitflags_t flags; int c, ret = 0; + int ms_status = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; @@ -6891,13 +6898,18 @@ zpool_do_split(int argc, char **argv) ret = 1; } - if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && - zpool_enable_datasets(zhp, mntopts, 0) != 0) { - ret = 1; - (void) fprintf(stderr, gettext("Split was successful, but " - "the datasets could not all be mounted\n")); - (void) fprintf(stderr, gettext("Try doing '%s' with a " - "different altroot\n"), "zpool import"); + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { + ms_status = zpool_enable_datasets(zhp, mntopts, 0); + if (ms_status == EZFS_SHAREFAILED) { + (void) fprintf(stderr, gettext("Split was successful, " + "datasets are mounted but sharing of some datasets " + "has failed\n")); + } else if (ms_status == EZFS_MOUNTFAILED) { + (void) fprintf(stderr, gettext("Split was successful" + ", but some datasets could not be mounted\n")); + (void) fprintf(stderr, gettext("Try doing '%s' with a " + "different altroot\n"), "zpool import"); + } } zpool_close(zhp); nvlist_free(config); diff --git a/sys/contrib/openzfs/cmd/ztest.c b/sys/contrib/openzfs/cmd/ztest.c index b6b99bfff6d..398c519cfc3 100644 --- a/sys/contrib/openzfs/cmd/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest.c @@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, RL_READER); @@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, DMU_READ_NO_PREFETCH); ASSERT0(error); } else { + ASSERT3P(zio, !=, NULL); size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); diff --git a/sys/contrib/openzfs/config/Shellcheck.am b/sys/contrib/openzfs/config/Shellcheck.am index 1cff81e04be..1ab13516066 100644 --- a/sys/contrib/openzfs/config/Shellcheck.am +++ b/sys/contrib/openzfs/config/Shellcheck.am @@ -4,6 +4,7 @@ # Not following: a was not specified as input (see shellcheck -x). [SC1091] # Prefer putting braces around variable references even when not strictly required. [SC2250] # Consider invoking this command separately to avoid masking its return value (or use '|| true' to ignore). [SC2312] +# Command appears to be unreachable. Check usage (or ignore if invoked indirectly). [SC2317] # In POSIX sh, 'local' is undefined. [SC2039] # older ShellCheck versions # In POSIX sh, 'local' is undefined. [SC3043] # newer ShellCheck versions @@ -18,7 +19,7 @@ PHONY += shellcheck _STGT = $(subst ^,/,$(subst shellcheck-here-,,$@)) shellcheck-here-%: if HAVE_SHELLCHECK - shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)" + shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC2317,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)" else @echo "skipping shellcheck of" $(_STGT) "because shellcheck is not installed" endif diff --git a/sys/contrib/openzfs/config/kernel-blkdev.m4 b/sys/contrib/openzfs/config/kernel-blkdev.m4 index 28e5364581e..887acee670b 100644 --- a/sys/contrib/openzfs/config/kernel-blkdev.m4 +++ b/sys/contrib/openzfs/config/kernel-blkdev.m4 @@ -103,6 +103,33 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ ]) ]) +dnl # +dnl # 6.5.x API change +dnl # disk_check_media_change() was added +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ + ZFS_LINUX_TEST_SRC([disk_check_media_change], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + bool error; + + error = disk_check_media_change(bdev->bd_disk); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ + AC_MSG_CHECKING([whether disk_check_media_change() exists]) + ZFS_LINUX_TEST_RESULT([disk_check_media_change], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1, + [disk_check_media_change() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # bdev_kobj() is introduced from 5.12 dnl # @@ -443,6 +470,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [ ]) ]) +dnl # +dnl # 6.5.x API change +dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [ + ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [ + #include + ],[ + blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ + AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined]) + ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [ + AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1, [BLK_STS_RESV_CONFLICT is defined]) + AC_MSG_RESULT(yes) + ], [ + AC_MSG_RESULT(no) + ]) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT @@ -458,6 +508,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV + ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE + ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ @@ -476,4 +528,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV + ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE + ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT ]) diff --git a/sys/contrib/openzfs/config/kernel-vfs-extended-file_range.m4 b/sys/contrib/openzfs/config/kernel-vfs-extended-file_range.m4 new file mode 100644 index 00000000000..a2622313129 --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-vfs-extended-file_range.m4 @@ -0,0 +1,50 @@ +dnl # +dnl # EL7 have backported copy_file_range and clone_file_range and +dnl # added them to an "extended" file_operations struct. +dnl # +dnl # We're testing for both functions in one here, because they will only +dnl # ever appear together and we don't want to match a similar method in +dnl # some future vendor kernel. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND], [ + ZFS_LINUX_TEST_SRC([vfs_file_operations_extend], [ + #include + + static ssize_t test_copy_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; (void) flags; + return (0); + } + + static int test_clone_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + u64 len) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; + return (0); + } + + static const struct file_operations_extend + fops __attribute__ ((unused)) = { + .kabi_fops = {}, + .copy_file_range = test_copy_file_range, + .clone_file_range = test_clone_file_range, + }; + ],[]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND], [ + AC_MSG_CHECKING([whether file_operations_extend takes \ +.copy_file_range() and .clone_file_range()]) + ZFS_LINUX_TEST_RESULT([vfs_file_operations_extend], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_FILE_OPERATIONS_EXTEND, 1, + [file_operations_extend takes .copy_file_range() + and .clone_file_range()]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel-vfs-file_range.m4 b/sys/contrib/openzfs/config/kernel-vfs-file_range.m4 new file mode 100644 index 00000000000..cc96404d8bb --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-vfs-file_range.m4 @@ -0,0 +1,164 @@ +dnl # +dnl # The *_file_range APIs have a long history: +dnl # +dnl # 2.6.29: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE ioctl introduced +dnl # 3.12: BTRFS_IOC_FILE_EXTENT_SAME ioctl introduced +dnl # +dnl # 4.5: copy_file_range() syscall introduced, added to VFS +dnl # 4.5: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE renamed to FICLONE ands +dnl # FICLONERANGE, added to VFS as clone_file_range() +dnl # 4.5: BTRFS_IOC_FILE_EXTENT_SAME renamed to FIDEDUPERANGE, added to VFS +dnl # as dedupe_file_range() +dnl # +dnl # 4.20: VFS clone_file_range() and dedupe_file_range() replaced by +dnl # remap_file_range() +dnl # +dnl # 5.3: VFS copy_file_range() expected to do its own fallback, +dnl # generic_copy_file_range() added to support it +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [ + #include + + static ssize_t test_copy_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; (void) flags; + return (0); + } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .copy_file_range = test_copy_file_range, + }; + ],[]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE], [ + AC_MSG_CHECKING([whether fops->copy_file_range() is available]) + ZFS_LINUX_TEST_RESULT([vfs_copy_file_range], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_COPY_FILE_RANGE, 1, + [fops->copy_file_range() is available]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([generic_copy_file_range], [ + #include + ], [ + struct file *src_file __attribute__ ((unused)) = NULL; + loff_t src_off __attribute__ ((unused)) = 0; + struct file *dst_file __attribute__ ((unused)) = NULL; + loff_t dst_off __attribute__ ((unused)) = 0; + size_t len __attribute__ ((unused)) = 0; + unsigned int flags __attribute__ ((unused)) = 0; + generic_copy_file_range(src_file, src_off, dst_file, dst_off, + len, flags); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [ + AC_MSG_CHECKING([whether generic_copy_file_range() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_copy_file_range], + [generic_copy_file_range], [fs/read_write.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFS_GENERIC_COPY_FILE_RANGE, 1, + [generic_copy_file_range() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [ + #include + + static int test_clone_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + u64 len) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; + return (0); + } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .clone_file_range = test_clone_file_range, + }; + ],[]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE], [ + AC_MSG_CHECKING([whether fops->clone_file_range() is available]) + ZFS_LINUX_TEST_RESULT([vfs_clone_file_range], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_CLONE_FILE_RANGE, 1, + [fops->clone_file_range() is available]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([vfs_dedupe_file_range], [ + #include + + static int test_dedupe_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + u64 len) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; + return (0); + } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .dedupe_file_range = test_dedupe_file_range, + }; + ],[]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE], [ + AC_MSG_CHECKING([whether fops->dedupe_file_range() is available]) + ZFS_LINUX_TEST_RESULT([vfs_dedupe_file_range], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_DEDUPE_FILE_RANGE, 1, + [fops->dedupe_file_range() is available]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([vfs_remap_file_range], [ + #include + + static loff_t test_remap_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + loff_t len, unsigned int flags) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; (void) flags; + return (0); + } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .remap_file_range = test_remap_file_range, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE], [ + AC_MSG_CHECKING([whether fops->remap_file_range() is available]) + ZFS_LINUX_TEST_RESULT([vfs_remap_file_range], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_REMAP_FILE_RANGE, 1, + [fops->remap_file_range() is available]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 index cb7e736c9a4..1487fa2e779 100644 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -116,6 +116,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER + ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN @@ -249,6 +255,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_VFS_IOV_ITER + ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE + ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE + ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE + ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE + ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN diff --git a/sys/contrib/openzfs/contrib/debian/changelog b/sys/contrib/openzfs/contrib/debian/changelog index 6273d603834..ba42ea59fa8 100644 --- a/sys/contrib/openzfs/contrib/debian/changelog +++ b/sys/contrib/openzfs/contrib/debian/changelog @@ -1,3 +1,9 @@ +openzfs-linux (2.2.0-0) unstable; urgency=low + + * OpenZFS 2.2.0 is tagged. + + -- Umer Saleem Tue, 25 Jul 2023 15:00:00 +0500 + openzfs-linux (2.1.99-1) unstable; urgency=low * Integrate minimally modified Debian packaging from ZFS on Linux diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfs-test.install b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-test.install index cafcfdc0e15..b3afef50dbd 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-zfs-test.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-test.install @@ -1,10 +1,8 @@ -sbin/zinject sbin/ztest usr/bin/raidz_test usr/share/man/man1/raidz_test.1 usr/share/man/man1/test-runner.1 usr/share/man/man1/ztest.1 -usr/share/man/man8/zinject.8 usr/share/zfs/common.sh usr/share/zfs/runfiles/ usr/share/zfs/test-runner diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install index 49f0ec0d5d9..301d8f67b3a 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install @@ -27,6 +27,7 @@ sbin/zfs sbin/zfs_ids_to_path sbin/zgenhostid sbin/zhack +sbin/zinject sbin/zpool sbin/zstream sbin/zstreamdump @@ -59,7 +60,6 @@ usr/share/man/man8/zfs-get.8 usr/share/man/man8/zfs-groupspace.8 usr/share/man/man8/zfs-hold.8 usr/share/man/man8/zfs-inherit.8 -usr/share/man/man8/zfs-jail.8 usr/share/man/man8/zfs-list.8 usr/share/man/man8/zfs-load-key.8 usr/share/man/man8/zfs-mount-generator.8 @@ -79,7 +79,6 @@ usr/share/man/man8/zfs-set.8 usr/share/man/man8/zfs-share.8 usr/share/man/man8/zfs-snapshot.8 usr/share/man/man8/zfs-unallow.8 -usr/share/man/man8/zfs-unjail.8 usr/share/man/man8/zfs-unload-key.8 usr/share/man/man8/zfs-unmount.8 usr/share/man/man8/zfs-unzone.8 @@ -92,6 +91,7 @@ usr/share/man/man8/zfs_ids_to_path.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 +usr/share/man/man8/zinject.8 usr/share/man/man8/zpool-add.8 usr/share/man/man8/zpool-attach.8 usr/share/man/man8/zpool-checkpoint.8 diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-env-bootfs.service.in b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-env-bootfs.service.in index 7ebab4c1a58..fe362b930bf 100644 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-env-bootfs.service.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-env-bootfs.service.in @@ -12,11 +12,12 @@ ExecStart=/bin/sh -c ' decode_root_args || exit 0; \ [ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \ rootflags="$(getarg rootflags=)"; \ - case ",$rootflags," in \ - *,zfsutil,*) ;; \ - ,,) rootflags=zfsutil ;; \ - *) rootflags="zfsutil,$rootflags" ;; \ - esac; \ + [ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \ + case ",$rootflags," in \ + *,zfsutil,*) ;; \ + ,,) rootflags=zfsutil ;; \ + *) rootflags="zfsutil,$rootflags" ;; \ + esac; \ exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"' [Install] diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in index 68fdcb1f323..12d8ac703e3 100644 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in @@ -2,7 +2,7 @@ Description=Rollback bootfs just before it is mounted Requisite=zfs-import.target After=zfs-import.target dracut-pre-mount.service zfs-snapshot-bootfs.service -Before=dracut-mount.service +Before=dracut-mount.service sysroot.mount DefaultDependencies=no ConditionKernelCommandLine=bootfs.rollback ConditionEnvironment=BOOTFS diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h index a7037e3e626..fa05b7921bb 100644 --- a/sys/contrib/openzfs/include/libzfs.h +++ b/sys/contrib/openzfs/include/libzfs.h @@ -156,6 +156,7 @@ typedef enum zfs_error { EZFS_NOT_USER_NAMESPACE, /* a file is not a user namespace */ EZFS_CKSUM, /* insufficient replicas */ EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ + EZFS_SHAREFAILED, /* filesystem share failed */ EZFS_UNKNOWN } zfs_error_t; diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode.h index 91dadfae661..0779e58e495 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode.h @@ -36,7 +36,11 @@ struct xucred; typedef struct flock flock64_t; typedef struct vnode vnode_t; typedef struct vattr vattr_t; -#define vtype_t __enum_uint8(vtype) +#if __FreeBSD_version < 1400093 +typedef enum vtype vtype_t; +#else +#define vtype_t __enum_uint8(vtype) +#endif #include #include diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h index c5c6385be6f..e0f20ba3200 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h @@ -181,7 +181,11 @@ bi_status_to_errno(blk_status_t status) return (ENOLINK); case BLK_STS_TARGET: return (EREMOTEIO); +#ifdef HAVE_BLK_STS_RESV_CONFLICT + case BLK_STS_RESV_CONFLICT: +#else case BLK_STS_NEXUS: +#endif return (EBADE); case BLK_STS_MEDIUM: return (ENODATA); @@ -215,7 +219,11 @@ errno_to_bi_status(int error) case EREMOTEIO: return (BLK_STS_TARGET); case EBADE: +#ifdef HAVE_BLK_STS_RESV_CONFLICT + return (BLK_STS_RESV_CONFLICT); +#else return (BLK_STS_NEXUS); +#endif case ENODATA: return (BLK_STS_MEDIUM); case EILSEQ: @@ -337,6 +345,8 @@ zfs_check_media_change(struct block_device *bdev) return (0); } #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) +#elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE) +#define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk) #else /* * This is encountered if check_disk_change() and bdev_check_media_change() diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h index 1d77f0487a3..699b8a57182 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h @@ -147,6 +147,15 @@ #error "Toolchain needs to support the XSAVE assembler instruction" #endif +#ifndef XFEATURE_MASK_XTILE +/* + * For kernels where this doesn't exist yet, we still don't want to break + * by save/restoring this broken nonsense. + * See issue #14989 or Intel errata SPR4 for why + */ +#define XFEATURE_MASK_XTILE 0x60000 +#endif + #include #include @@ -315,18 +324,18 @@ kfpu_begin(void) uint8_t *state = zfs_kfpu_fpregs[smp_processor_id()]; #if defined(HAVE_XSAVES) if (static_cpu_has(X86_FEATURE_XSAVES)) { - kfpu_do_xsave("xsaves", state, ~0); + kfpu_do_xsave("xsaves", state, ~XFEATURE_MASK_XTILE); return; } #endif #if defined(HAVE_XSAVEOPT) if (static_cpu_has(X86_FEATURE_XSAVEOPT)) { - kfpu_do_xsave("xsaveopt", state, ~0); + kfpu_do_xsave("xsaveopt", state, ~XFEATURE_MASK_XTILE); return; } #endif if (static_cpu_has(X86_FEATURE_XSAVE)) { - kfpu_do_xsave("xsave", state, ~0); + kfpu_do_xsave("xsave", state, ~XFEATURE_MASK_XTILE); } else if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_save_fxsr(state); } else { @@ -376,12 +385,12 @@ kfpu_end(void) uint8_t *state = zfs_kfpu_fpregs[smp_processor_id()]; #if defined(HAVE_XSAVES) if (static_cpu_has(X86_FEATURE_XSAVES)) { - kfpu_do_xrstor("xrstors", state, ~0); + kfpu_do_xrstor("xrstors", state, ~XFEATURE_MASK_XTILE); goto out; } #endif if (static_cpu_has(X86_FEATURE_XSAVE)) { - kfpu_do_xrstor("xrstor", state, ~0); + kfpu_do_xrstor("xrstor", state, ~XFEATURE_MASK_XTILE); } else if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_restore_fxsr(state); } else { diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem_cache.h b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem_cache.h index cc9cafa84f9..20eeadc46e1 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem_cache.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem_cache.h @@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) #define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +/* + * This is necessary to be compatible with other kernel modules + * or in-tree filesystem that may define kmem_cache_alloc, + * like bcachefs does it now. + */ +#ifdef kmem_cache_alloc +#undef kmem_cache_alloc +#endif #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) #define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/types.h b/sys/contrib/openzfs/include/os/linux/spl/sys/types.h index a7666187ec2..d89a91c36f9 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/types.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/types.h @@ -38,7 +38,7 @@ typedef unsigned long ulong_t; typedef unsigned long long u_longlong_t; typedef long long longlong_t; -typedef unsigned long intptr_t; +typedef long intptr_t; typedef unsigned long long rlim64_t; typedef struct task_struct kthread_t; diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h index 2b302e9dab0..0bd20f64897 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h @@ -52,7 +52,11 @@ extern const struct inode_operations zpl_special_inode_operations; /* zpl_file.c */ extern const struct address_space_operations zpl_address_space_operations; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND +extern const struct file_operations_extend zpl_file_operations; +#else extern const struct file_operations zpl_file_operations; +#endif extern const struct file_operations zpl_dir_file_operations; /* zpl_super.c */ @@ -180,6 +184,55 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx) } #endif /* HAVE_VFS_ITERATE */ + +/* zpl_file_range.c */ + +/* handlers for file_operations of the same name */ +extern ssize_t zpl_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags); +extern loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags); +extern int zpl_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len); +extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len); + +/* compat for FICLONE/FICLONERANGE/FIDEDUPERANGE ioctls */ +typedef struct { + int64_t fcr_src_fd; + uint64_t fcr_src_offset; + uint64_t fcr_src_length; + uint64_t fcr_dest_offset; +} zfs_ioc_compat_file_clone_range_t; + +typedef struct { + int64_t fdri_dest_fd; + uint64_t fdri_dest_offset; + uint64_t fdri_bytes_deduped; + int32_t fdri_status; + uint32_t fdri_reserved; +} zfs_ioc_compat_dedupe_range_info_t; + +typedef struct { + uint64_t fdr_src_offset; + uint64_t fdr_src_length; + uint16_t fdr_dest_count; + uint16_t fdr_reserved1; + uint32_t fdr_reserved2; + zfs_ioc_compat_dedupe_range_info_t fdr_info[]; +} zfs_ioc_compat_dedupe_range_t; + +#define ZFS_IOC_COMPAT_FICLONE _IOW(0x94, 9, int) +#define ZFS_IOC_COMPAT_FICLONERANGE \ + _IOW(0x94, 13, zfs_ioc_compat_file_clone_range_t) +#define ZFS_IOC_COMPAT_FIDEDUPERANGE \ + _IOWR(0x94, 54, zfs_ioc_compat_dedupe_range_t) + +extern long zpl_ioctl_ficlone(struct file *filp, void *arg); +extern long zpl_ioctl_ficlonerange(struct file *filp, void *arg); +extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); + + #if defined(HAVE_INODE_TIMESTAMP_TRUNCATE) #define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) #elif defined(HAVE_INODE_TIMESPEC64_TIMES) diff --git a/sys/contrib/openzfs/include/sys/bpobj.h b/sys/contrib/openzfs/include/sys/bpobj.h index f3384f52645..81bc0fe2108 100644 --- a/sys/contrib/openzfs/include/sys/bpobj.h +++ b/sys/contrib/openzfs/include/sys/bpobj.h @@ -60,7 +60,7 @@ typedef struct bpobj { kmutex_t bpo_lock; objset_t *bpo_os; uint64_t bpo_object; - int bpo_epb; + uint32_t bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; uint8_t bpo_havefreed; diff --git a/sys/contrib/openzfs/include/sys/brt.h b/sys/contrib/openzfs/include/sys/brt.h index 0761159e3f5..f73df95058d 100644 --- a/sys/contrib/openzfs/include/sys/brt.h +++ b/sys/contrib/openzfs/include/sys/brt.h @@ -36,6 +36,7 @@ extern "C" { #endif extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp); +extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp); extern uint64_t brt_get_dspace(spa_t *spa); extern uint64_t brt_get_used(spa_t *spa); diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h index 7e57d133c2e..615ba8fe749 100644 --- a/sys/contrib/openzfs/include/sys/dmu.h +++ b/sys/contrib/openzfs/include/sys/dmu.h @@ -572,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, + dmu_buf_t **dbp); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. diff --git a/sys/contrib/openzfs/include/sys/dmu_impl.h b/sys/contrib/openzfs/include/sys/dmu_impl.h index ce6ae3c665a..83ae2b76ba1 100644 --- a/sys/contrib/openzfs/include/sys/dmu_impl.h +++ b/sys/contrib/openzfs/include/sys/dmu_impl.h @@ -247,8 +247,6 @@ typedef struct dmu_sendstatus { void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); -int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, - const void *, dmu_buf_t **); #ifdef __cplusplus } diff --git a/sys/contrib/openzfs/include/sys/dmu_zfetch.h b/sys/contrib/openzfs/include/sys/dmu_zfetch.h index 0fbc3bacffb..f00e13cf03a 100644 --- a/sys/contrib/openzfs/include/sys/dmu_zfetch.h +++ b/sys/contrib/openzfs/include/sys/dmu_zfetch.h @@ -36,8 +36,6 @@ extern "C" { #endif -extern uint64_t zfetch_array_rd_sz; - struct dnode; /* so we can reference dnode */ typedef struct zfetch { diff --git a/sys/contrib/openzfs/include/sys/fm/fs/zfs.h b/sys/contrib/openzfs/include/sys/fm/fs/zfs.h index b9bac7e252e..fb9e8649221 100644 --- a/sys/contrib/openzfs/include/sys/fm/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fm/fs/zfs.h @@ -102,8 +102,6 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" #define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" @@ -112,8 +110,6 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" #define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" #define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" #define FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME "snapshot_name" #define FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME "device_name" #define FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME "raw_name" diff --git a/sys/contrib/openzfs/include/sys/metaslab.h b/sys/contrib/openzfs/include/sys/metaslab.h index fec080139a2..0df6e5f81fc 100644 --- a/sys/contrib/openzfs/include/sys/metaslab.h +++ b/sys/contrib/openzfs/include/sys/metaslab.h @@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); #define METASLAB_ASYNC_ALLOC 0x8 #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 -#define METASLAB_FASTWRITE 0x40 #define METASLAB_ZIL 0x80 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, @@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); -void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); -void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); diff --git a/sys/contrib/openzfs/include/sys/metaslab_impl.h b/sys/contrib/openzfs/include/sys/metaslab_impl.h index 5beb1b73777..d328068890c 100644 --- a/sys/contrib/openzfs/include/sys/metaslab_impl.h +++ b/sys/contrib/openzfs/include/sys/metaslab_impl.h @@ -313,7 +313,7 @@ struct metaslab_group { * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segment are removed from the ms_allocatable and + * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context @@ -366,9 +366,9 @@ struct metaslab_group { struct metaslab { /* * This is the main lock of the metaslab and its purpose is to - * coordinate our allocations and frees [e.g metaslab_block_alloc(), + * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing - * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. + * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. * * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 44afa763283..588c72f6e4f 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -250,6 +250,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h index 2b22b973ba4..ad9dc3aefd8 100644 --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -266,7 +266,6 @@ struct vdev { metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ - uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ @@ -420,6 +419,7 @@ struct vdev { boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ + boolean_t vdev_attaching; /* vdev attach ashift handling */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ diff --git a/sys/contrib/openzfs/include/sys/zil_impl.h b/sys/contrib/openzfs/include/sys/zil_impl.h index 03a409c5257..f780ad3d61b 100644 --- a/sys/contrib/openzfs/include/sys/zil_impl.h +++ b/sys/contrib/openzfs/include/sys/zil_impl.h @@ -38,14 +38,22 @@ extern "C" { /* * Possible states for a given lwb structure. * - * An lwb will start out in the "closed" state, and then transition to - * the "opened" state via a call to zil_lwb_write_open(). When - * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" - * must be held. + * An lwb will start out in the "new" state, and transition to the "opened" + * state via a call to zil_lwb_write_open() on first itx assignment. When + * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be + * held. * - * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must - * be held when making this transition. + * After the lwb is "opened", it can be assigned number of itxs and transition + * into the "closed" state via zil_lwb_write_close() when full or on timeout. + * When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock" + * must be held. New lwb allocation also takes "zl_lock" to protect the list. + * + * After the lwb is "closed", it can transition into the "ready" state via + * zil_lwb_write_issue(). "zl_lock" must be held when making this transition. + * Since it is done by the same thread, "zl_issuer_lock" is not needed. + * + * When lwb in "ready" state receives its block pointer, it can transition to + * "issued". "zl_lock" must be held when making this transition. * * After the lwb's write zio completes, it transitions into the "write * done" state via zil_lwb_write_done(); and then into the "flush done" @@ -62,17 +70,20 @@ extern "C" { * * Additionally, correctness when reading an lwb's state is often * achieved by exploiting the fact that these state transitions occur in - * this specific order; i.e. "closed" to "opened" to "issued" to "done". + * this specific order; i.e. "new" to "opened" to "closed" to "ready" to + * "issued" to "write_done" and finally "flush_done". * - * Thus, if an lwb is in the "closed" or "opened" state, holding the + * Thus, if an lwb is in the "new" or "opened" state, holding the * "zl_issuer_lock" will prevent a concurrent thread from transitioning - * that lwb to the "issued" state. Likewise, if an lwb is already in the - * "issued" state, holding the "zl_lock" will prevent a concurrent - * thread from transitioning that lwb to the "write done" state. + * that lwb to the "closed" state. Likewise, if an lwb is already in the + * "ready" state, holding the "zl_lock" will prevent a concurrent thread + * from transitioning that lwb to the "issued" state. */ typedef enum { - LWB_STATE_CLOSED, + LWB_STATE_NEW, LWB_STATE_OPENED, + LWB_STATE_CLOSED, + LWB_STATE_READY, LWB_STATE_ISSUED, LWB_STATE_WRITE_DONE, LWB_STATE_FLUSH_DONE, @@ -91,18 +102,21 @@ typedef enum { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ + boolean_t lwb_slim; /* log block has slim format */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ - boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ + int lwb_error; /* log block allocation error */ + int lwb_nmax; /* max bytes in the buffer */ int lwb_nused; /* # used bytes in buffer */ int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ + zio_t *lwb_child_zio; /* parent zio for children */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ uint64_t lwb_issued_txg; /* the txg when the write is issued */ + uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index f4da80783e5..e1f4d5c0449 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -222,7 +222,6 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NOPWRITE (1ULL << 28) #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) -#define ZIO_FLAG_FASTWRITE (1ULL << 31) #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) diff --git a/sys/contrib/openzfs/include/sys/zio_checksum.h b/sys/contrib/openzfs/include/sys/zio_checksum.h index 9fb79ab4a54..37fd65b7cb3 100644 --- a/sys/contrib/openzfs/include/sys/zio_checksum.h +++ b/sys/contrib/openzfs/include/sys/zio_checksum.h @@ -94,8 +94,6 @@ typedef const struct zio_checksum_info { } zio_checksum_info_t; typedef struct zio_bad_cksum { - zio_cksum_t zbc_expected; - zio_cksum_t zbc_actual; const char *zbc_checksum_name; uint8_t zbc_byteswapped; uint8_t zbc_injected; diff --git a/sys/contrib/openzfs/lib/libzfs/Makefile.am b/sys/contrib/openzfs/lib/libzfs/Makefile.am index cffe341220c..5e74d908de3 100644 --- a/sys/contrib/openzfs/lib/libzfs/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfs/Makefile.am @@ -57,7 +57,7 @@ libzfs_la_LIBADD = \ libzutil.la \ libuutil.la -libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) +libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) libzfs_la_LDFLAGS = -pthread diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c b/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c index 5d1fe651c97..b38ad88096b 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c @@ -1300,7 +1300,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, zfs_mount_one, &ms, B_TRUE); if (ms.ms_mntstatus != 0) - ret = ms.ms_mntstatus; + ret = EZFS_MOUNTFAILED; /* * Share all filesystems that need to be shared. This needs to be @@ -1311,7 +1311,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, zfs_share_one, &ms, B_FALSE); if (ms.ms_mntstatus != 0) - ret = ms.ms_mntstatus; + ret = EZFS_SHAREFAILED; else zfs_commit_shares(NULL); diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c index d4af31c50cf..85564edfd86 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c @@ -3926,6 +3926,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) switch (errno) { + case EALREADY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "removal for this vdev is already in progress.")); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + break; + case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; all top-level vdevs must " diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c index 87a30f54fea..e9bc78aa8d3 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c @@ -928,6 +928,39 @@ zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, return (0); } +static volatile boolean_t send_progress_thread_signal_duetotimer; +static void +send_progress_thread_act(int sig, siginfo_t *info, void *ucontext) +{ + (void) sig, (void) ucontext; + send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER; +} + +struct timer_desirability { + timer_t timer; + boolean_t desired; +}; +static void +timer_delete_cleanup(void *timer) +{ + struct timer_desirability *td = timer; + if (td->desired) + timer_delete(td->timer); +} + +#ifdef SIGINFO +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO) +#else +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO +#endif +#define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \ + sigset_t new; \ + sigemptyset(&new); \ + sigaddset(&new, SIGUSR1); \ + SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \ + pthread_sigmask(SIG_BLOCK, &new, old); \ +} + static void * send_progress_thread(void *arg) { @@ -941,6 +974,26 @@ send_progress_thread(void *arg) struct tm tm; int err; + const struct sigaction signal_action = + {.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO}; + struct sigevent timer_cfg = + {.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1}; + const struct itimerspec timer_time = + {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}}; + struct timer_desirability timer = {}; + + sigaction(SIGUSR1, &signal_action, NULL); +#ifdef SIGINFO + sigaction(SIGINFO, &signal_action, NULL); +#endif + + if ((timer.desired = pa->pa_progress || pa->pa_astitle)) { + if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer)) + return ((void *)(uintptr_t)errno); + (void) timer_settime(timer.timer, 0, &timer_time, NULL); + } + pthread_cleanup_push(timer_delete_cleanup, &timer); + if (!pa->pa_parsable && pa->pa_progress) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", @@ -953,12 +1006,12 @@ send_progress_thread(void *arg) * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { - (void) sleep(1); + pause(); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) - return ((void *)0); - return ((void *)(uintptr_t)err); + err = 0; + pthread_exit(((void *)(uintptr_t)err)); } (void) time(&t); @@ -991,21 +1044,25 @@ send_progress_thread(void *arg) (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); - } else if (pa->pa_progress) { + } else if (pa->pa_progress || + !send_progress_thread_signal_duetotimer) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, zhp->zfs_name); } } + pthread_cleanup_pop(B_TRUE); } static boolean_t -send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid) +send_progress_thread_exit( + libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); + pthread_sigmask(SIG_SETMASK, oldmask, NULL); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error(hdl, error, @@ -1199,7 +1256,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (sdd->progress || sdd->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; @@ -1214,13 +1272,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) zfs_close(zhp); return (err); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); - if ((sdd->progress || sdd->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, tid)) + if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask)) return (-1); } @@ -1562,8 +1620,9 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; + sigset_t oldmask; - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -1577,6 +1636,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, @@ -1584,8 +1644,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, redactbook, fd, &size); *sizep = size; - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, ptid)) + if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask)) return (-1); if (!flags->progress && !flags->parsable) @@ -1876,11 +1935,12 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; + sigset_t oldmask; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; @@ -1898,6 +1958,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, zfs_close(zhp); return (error); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, @@ -1905,8 +1966,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (redact_book != NULL) free(redact_book); - if ((flags->progressastitle || flags->progress) && - send_progress_thread_exit(hdl, tid)) { + if (send_progress_thread_exit(hdl, tid, &oldmask)) { zfs_close(zhp); return (-1); } @@ -2691,7 +2751,8 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (flags->progress || flags->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -2708,13 +2769,13 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(hdl, ptid)) + if (send_progress_thread_exit(hdl, ptid, &oldmask)) return (-1); if (err == 0 && (flags->props || flags->holds || flags->backup)) { diff --git a/sys/contrib/openzfs/man/Makefile.am b/sys/contrib/openzfs/man/Makefile.am index 2973520324f..36c1aede106 100644 --- a/sys/contrib/openzfs/man/Makefile.am +++ b/sys/contrib/openzfs/man/Makefile.am @@ -38,7 +38,6 @@ dist_man_MANS = \ %D%/man8/zfs-groupspace.8 \ %D%/man8/zfs-hold.8 \ %D%/man8/zfs-inherit.8 \ - %D%/man8/zfs-jail.8 \ %D%/man8/zfs-list.8 \ %D%/man8/zfs-load-key.8 \ %D%/man8/zfs-mount.8 \ @@ -57,14 +56,11 @@ dist_man_MANS = \ %D%/man8/zfs-share.8 \ %D%/man8/zfs-snapshot.8 \ %D%/man8/zfs-unallow.8 \ - %D%/man8/zfs-unjail.8 \ %D%/man8/zfs-unload-key.8 \ %D%/man8/zfs-unmount.8 \ - %D%/man8/zfs-unzone.8 \ %D%/man8/zfs-upgrade.8 \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ - %D%/man8/zfs-zone.8 \ %D%/man8/zfs_ids_to_path.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ @@ -104,6 +100,18 @@ dist_man_MANS = \ %D%/man8/zstreamdump.8 \ %D%/man8/zpool_influxdb.8 +if BUILD_FREEBSD +dist_man_MANS += \ + %D%/man8/zfs-jail.8 \ + %D%/man8/zfs-unjail.8 +endif + +if BUILD_LINUX +dist_man_MANS += \ + %D%/man8/zfs-unzone.8 \ + %D%/man8/zfs-zone.8 +endif + nodist_man_MANS = \ %D%/man8/zed.8 \ %D%/man8/zfs-mount-generator.8 diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 271b02b6ee4..3843419731b 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -15,7 +15,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd January 10, 2023 +.Dd July 21, 2023 .Dt ZFS 4 .Os . @@ -239,6 +239,11 @@ relative to the pool. Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . +.It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint +For blocks that could be forced to be a gang block (due to +.Sy metaslab_force_ganging ) , +force this many of them to be gang blocks. +. .It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created @@ -519,9 +524,6 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . -.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 -If prefetching is enabled, disable prefetching for reads larger than this size. -. .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to diff --git a/sys/contrib/openzfs/man/man7/zfsprops.7 b/sys/contrib/openzfs/man/man7/zfsprops.7 index 8f6b919cfc0..51ddd85eb79 100644 --- a/sys/contrib/openzfs/man/man7/zfsprops.7 +++ b/sys/contrib/openzfs/man/man7/zfsprops.7 @@ -38,7 +38,7 @@ .\" Copyright (c) 2019, Kjeld Schouten-Lebbing .\" Copyright (c) 2022 Hewlett Packard Enterprise Development LP. .\" -.Dd April 18, 2023 +.Dd August 8, 2023 .Dt ZFSPROPS 7 .Os . @@ -1916,13 +1916,15 @@ See for more information. Jails are a .Fx -feature and are not relevant on other platforms. -The default value is -.Sy off . -.It Sy zoned Ns = Ns Sy on Ns | Ns Sy off +feature and this property is not available on other platforms. +.It Sy zoned Ns = Ns Sy off Ns | Ns Sy on Controls whether the dataset is managed from a non-global zone or namespace. -The default value is -.Sy off . +See +.Xr zfs-zone 8 +for more information. +Zoning is a +Linux +feature and this property is not available on other platforms. .El .Pp The following three properties cannot be changed after the file system is diff --git a/sys/contrib/openzfs/man/man8/zfs-send.8 b/sys/contrib/openzfs/man/man8/zfs-send.8 index 8cc6ae6ad59..ba604bf7785 100644 --- a/sys/contrib/openzfs/man/man8/zfs-send.8 +++ b/sys/contrib/openzfs/man/man8/zfs-send.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd January 12, 2023 +.Dd July 27, 2023 .Dt ZFS-SEND 8 .Os . @@ -297,6 +297,12 @@ This flag can only be used in conjunction with .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. @@ -433,6 +439,12 @@ and the verbose output goes to standard error .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .El .It Xo .Nm zfs @@ -669,6 +681,10 @@ ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. . +.Sh SIGNALS +See +.Fl v . +. .Sh EXAMPLES .\" These are, respectively, examples 12, 13 from zfs.8 .\" Make sure to update them bidirectionally diff --git a/sys/contrib/openzfs/man/man8/zpool-events.8 b/sys/contrib/openzfs/man/man8/zpool-events.8 index 341f902fe66..e1436f6ded5 100644 --- a/sys/contrib/openzfs/man/man8/zpool-events.8 +++ b/sys/contrib/openzfs/man/man8/zpool-events.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd July 11, 2023 .Dt ZPOOL-EVENTS 8 .Os . @@ -305,10 +305,6 @@ The time when a given I/O request was submitted. The time required to service a given I/O request. .It Sy prev_state The previous state of the vdev. -.It Sy cksum_expected -The expected checksum value for the block. -.It Sy cksum_actual -The actual checksum value for an errant block. .It Sy cksum_algorithm Checksum algorithm used. See @@ -362,23 +358,6 @@ Like but contains .Pq Ar good data No & ~( Ns Ar bad data ) ; that is, the bits set in the good data which are cleared in the bad data. -.It Sy bad_set_histogram -If this field exists, it is an array of counters. -Each entry counts bits set in a particular bit of a big-endian uint64 type. -The first entry counts bits -set in the high-order bit of the first byte, the 9th byte, etc, and the last -entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc. -This information is useful for observing a stuck bit in a parallel data path, -such as IDE or parallel SCSI. -.It Sy bad_cleared_histogram -If this field exists, it is an array of counters. -Each entry counts bit clears in a particular bit of a big-endian uint64 type. -The first entry counts bits -clears of the high-order bit of the first byte, the 9th byte, etc, and the -last entry counts clears of the low-order bit of the 8th byte, the 16th byte, -etc. -This information is useful for observing a stuck bit in a parallel data -path, such as IDE or parallel SCSI. .El . .Sh I/O STAGES diff --git a/sys/contrib/openzfs/man/man8/zpool.8 b/sys/contrib/openzfs/man/man8/zpool.8 index e8eadffa6fc..4c4020bdd81 100644 --- a/sys/contrib/openzfs/man/man8/zpool.8 +++ b/sys/contrib/openzfs/man/man8/zpool.8 @@ -110,9 +110,10 @@ Removes ZFS label information from the specified .It Xo .Xr zpool-attach 8 Ns / Ns Xr zpool-detach 8 .Xc -Increases or decreases redundancy by -.Cm attach Ns ing or -.Cm detach Ns ing a device on an existing vdev (virtual device). +Converts a non-redundant disk into a mirror, or increases +the redundancy level of an existing mirror +.Cm ( attach Ns ), or performs the inverse operation ( +.Cm detach Ns ). .It Xo .Xr zpool-add 8 Ns / Ns Xr zpool-remove 8 .Xc @@ -233,16 +234,16 @@ Invalid command line options were specified. .El . .Sh EXAMPLES -.\" Examples 1, 2, 3, 4, 11, 12 are shared with zpool-create.8. -.\" Examples 5, 13 are shared with zpool-add.8. -.\" Examples 6, 15 are shared with zpool-list.8. -.\" Examples 7 are shared with zpool-destroy.8. -.\" Examples 8 are shared with zpool-export.8. -.\" Examples 9 are shared with zpool-import.8. -.\" Examples 10 are shared with zpool-upgrade.8. -.\" Examples 14 are shared with zpool-remove.8. -.\" Examples 16 are shared with zpool-status.8. -.\" Examples 13, 16 are also shared with zpool-iostat.8. +.\" Examples 1, 2, 3, 4, 12, 13 are shared with zpool-create.8. +.\" Examples 6, 14 are shared with zpool-add.8. +.\" Examples 7, 16 are shared with zpool-list.8. +.\" Examples 8 are shared with zpool-destroy.8. +.\" Examples 9 are shared with zpool-export.8. +.\" Examples 10 are shared with zpool-import.8. +.\" Examples 11 are shared with zpool-upgrade.8. +.\" Examples 15 are shared with zpool-remove.8. +.\" Examples 17 are shared with zpool-status.8. +.\" Examples 14, 17 are also shared with zpool-iostat.8. .\" Make sure to update them omnidirectionally .Ss Example 1 : No Creating a RAID-Z Storage Pool The following command creates a pool with a single raidz root vdev that @@ -264,14 +265,21 @@ While not recommended, a pool based on files can be useful for experimental purposes. .Dl # Nm zpool Cm create Ar tank Pa /path/to/file/a /path/to/file/b . -.Ss Example 5 : No Adding a Mirror to a ZFS Storage Pool +.Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored +The following command converts an existing single device +.Ar sda +into a mirror by attaching a second device to it, +.Ar sdb . +.Dl # Nm zpool Cm attach Ar tank Pa sda sdb +. +.Ss Example 6 : No Adding a Mirror to a ZFS Storage Pool The following command adds two mirrored disks to the pool .Ar tank , assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool. .Dl # Nm zpool Cm add Ar tank Sy mirror Pa sda sdb . -.Ss Example 6 : No Listing Available ZFS Storage Pools +.Ss Example 7 : No Listing Available ZFS Storage Pools The following command lists all available pools on the system. In this case, the pool .Ar zion @@ -285,19 +293,19 @@ tank 61.5G 20.0G 41.5G - 48% 32% 1.00x ONLINE - zion - - - - - - - FAULTED - .Ed . -.Ss Example 7 : No Destroying a ZFS Storage Pool +.Ss Example 8 : No Destroying a ZFS Storage Pool The following command destroys the pool .Ar tank and any datasets contained within: .Dl # Nm zpool Cm destroy Fl f Ar tank . -.Ss Example 8 : No Exporting a ZFS Storage Pool +.Ss Example 9 : No Exporting a ZFS Storage Pool The following command exports the devices in pool .Ar tank so that they can be relocated or later imported: .Dl # Nm zpool Cm export Ar tank . -.Ss Example 9 : No Importing a ZFS Storage Pool +.Ss Example 10 : No Importing a ZFS Storage Pool The following command displays available pools, and then imports the pool .Ar tank for use on the system. @@ -318,7 +326,7 @@ config: .No # Nm zpool Cm import Ar tank .Ed . -.Ss Example 10 : No Upgrading All ZFS Storage Pools to the Current Version +.Ss Example 11 : No Upgrading All ZFS Storage Pools to the Current Version The following command upgrades all ZFS Storage pools to the current version of the software: .Bd -literal -compact -offset Ds @@ -326,7 +334,7 @@ the software: This system is currently running ZFS version 2. .Ed . -.Ss Example 11 : No Managing Hot Spares +.Ss Example 12 : No Managing Hot Spares The following command creates a new pool with an available hot spare: .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy spare Pa sdc .Pp @@ -341,12 +349,12 @@ The hot spare can be permanently removed from the pool using the following command: .Dl # Nm zpool Cm remove Ar tank Pa sdc . -.Ss Example 12 : No Creating a ZFS Pool with Mirrored Separate Intent Logs +.Ss Example 13 : No Creating a ZFS Pool with Mirrored Separate Intent Logs The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Dl # Nm zpool Cm create Ar pool Sy mirror Pa sda sdb Sy mirror Pa sdc sdd Sy log mirror Pa sde sdf . -.Ss Example 13 : No Adding Cache Devices to a ZFS Pool +.Ss Example 14 : No Adding Cache Devices to a ZFS Pool The following command adds two disks for use as cache devices to a ZFS storage pool: .Dl # Nm zpool Cm add Ar pool Sy cache Pa sdc sdd @@ -359,7 +367,7 @@ Capacity and reads can be monitored using the subcommand as follows: .Dl # Nm zpool Cm iostat Fl v Ar pool 5 . -.Ss Example 14 : No Removing a Mirrored top-level (Log or Data) Device +.Ss Example 15 : No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device @@ -394,7 +402,7 @@ The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 . -.Ss Example 15 : No Displaying expanded space on a device +.Ss Example 16 : No Displaying expanded space on a device The following command displays the detailed information for the pool .Ar data . This pool is comprised of a single raidz vdev where one of its devices @@ -411,7 +419,7 @@ data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - sdc - - - - - .Ed . -.Ss Example 16 : No Adding output columns +.Ss Example 17 : No Adding output columns Additional columns can be added to the .Nm zpool Cm status No and Nm zpool Cm iostat No output with Fl c . .Bd -literal -compact -offset Ds diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 485331ac655..c132171592a 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -461,6 +461,7 @@ ZFS_OBJS_OS := \ zpl_ctldir.o \ zpl_export.o \ zpl_file.o \ + zpl_file_range.o \ zpl_inode.o \ zpl_super.o \ zpl_xattr.o \ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 0d6d006781d..f672deed34d 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6268,7 +6268,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) goto bad_write_fallback; } } else { -#if __FreeBSD_version >= 1400086 +#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \ + __FreeBSD_version >= 1400086 vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false, LK_EXCLUSIVE); #else @@ -6294,7 +6295,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); - if (error == EXDEV) + if (error == EXDEV || error == EAGAIN || error == EINVAL || + error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; out_locked: diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index dca48e1e401..c45a3eb5a4e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -478,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip) */ static struct inode * zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, - const struct file_operations *fops, const struct inode_operations *ops) + const struct file_operations *fops, const struct inode_operations *ops, + uint64_t creation) { - inode_timespec_t now; struct inode *ip; znode_t *zp; + inode_timespec_t now = {.tv_sec = creation}; ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); - now = current_time(ip); + if (!creation) + now = current_time(ip); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); @@ -552,14 +554,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; + uint64_t creation = 0; + dsl_dataset_t *snap_ds; + dsl_pool_t *pool; while (ip == NULL) { ip = ilookup(zfsvfs->z_sb, (unsigned long)id); if (ip) break; + if (id <= ZFSCTL_INO_SNAPDIRS && !creation) { + pool = dmu_objset_pool(zfsvfs->z_os); + dsl_pool_config_enter(pool, FTAG); + if (!dsl_dataset_hold_obj(pool, + ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) { + creation = dsl_get_creation(snap_ds); + dsl_dataset_rele(snap_ds, FTAG); + } + dsl_pool_config_exit(pool, FTAG); + } + /* May fail due to concurrent zfsctl_inode_alloc() */ - ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); + ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation); } return (ip); @@ -581,7 +597,7 @@ zfsctl_create(zfsvfs_t *zfsvfs) ASSERT(zfsvfs->z_ctldir == NULL); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, - &zpl_fops_root, &zpl_ops_root); + &zpl_fops_root, &zpl_ops_root, 0); if (zfsvfs->z_ctldir == NULL) return (SET_ERROR(ENOENT)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index 6b6293b9e48..464c12e1108 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1662,6 +1662,7 @@ zfs_umount(struct super_block *sb) } zfsvfs_free(zfsvfs); + sb->s_fs_info = NULL; return (0); } @@ -2091,6 +2092,9 @@ zfs_init(void) zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + register_fo_extend(&zpl_file_operations); +#endif } void @@ -2101,6 +2105,9 @@ zfs_fini(void) */ taskq_wait(system_delay_taskq); taskq_wait(system_taskq); +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + unregister_fo_extend(&zpl_file_operations); +#endif unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index 02b1af3edc4..335ae3460c5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -415,7 +415,11 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + ip->i_fop = &zpl_file_operations.kabi_fops; +#else ip->i_fop = &zpl_file_operations; +#endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; @@ -455,7 +459,11 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + ip->i_fop = &zpl_file_operations.kabi_fops; +#else ip->i_fop = &zpl_file_operations; +#endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index e690525d3cd..73526db731c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -1257,6 +1257,12 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return (zpl_ioctl_getdosflags(filp, (void *)arg)); case ZFS_IOC_SETDOSFLAGS: return (zpl_ioctl_setdosflags(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FICLONE: + return (zpl_ioctl_ficlone(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FICLONERANGE: + return (zpl_ioctl_ficlonerange(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FIDEDUPERANGE: + return (zpl_ioctl_fideduperange(filp, (void *)arg)); default: return (-ENOTTY); } @@ -1283,7 +1289,6 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } #endif /* CONFIG_COMPAT */ - const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_READPAGES .readpages = zpl_readpages, @@ -1306,7 +1311,12 @@ const struct address_space_operations zpl_address_space_operations = { #endif }; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND +const struct file_operations_extend zpl_file_operations = { + .kabi_fops = { +#else const struct file_operations zpl_file_operations = { +#endif .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, @@ -1333,6 +1343,18 @@ const struct file_operations zpl_file_operations = { .aio_fsync = zpl_aio_fsync, #endif .fallocate = zpl_fallocate, +#ifdef HAVE_VFS_COPY_FILE_RANGE + .copy_file_range = zpl_copy_file_range, +#endif +#ifdef HAVE_VFS_CLONE_FILE_RANGE + .clone_file_range = zpl_clone_file_range, +#endif +#ifdef HAVE_VFS_REMAP_FILE_RANGE + .remap_file_range = zpl_remap_file_range, +#endif +#ifdef HAVE_VFS_DEDUPE_FILE_RANGE + .dedupe_file_range = zpl_dedupe_file_range, +#endif #ifdef HAVE_FILE_FADVISE .fadvise = zpl_fadvise, #endif @@ -1340,6 +1362,11 @@ const struct file_operations zpl_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + }, /* kabi_fops */ + .copy_file_range = zpl_copy_file_range, + .clone_file_range = zpl_clone_file_range, +#endif }; const struct file_operations zpl_dir_file_operations = { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c new file mode 100644 index 00000000000..2abbf44df58 --- /dev/null +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c @@ -0,0 +1,272 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2023, Klara Inc. + */ + +#ifdef CONFIG_COMPAT +#include +#endif +#include +#include +#include +#include +#include + +/* + * Clone part of a file via block cloning. + * + * Note that we are not required to update file offsets; the kernel will take + * care of that depending on how it was called. + */ +static ssize_t +__zpl_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, size_t len) +{ + struct inode *src_i = file_inode(src_file); + struct inode *dst_i = file_inode(dst_file); + uint64_t src_off_o = (uint64_t)src_off; + uint64_t dst_off_o = (uint64_t)dst_off; + uint64_t len_o = (uint64_t)len; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int err; + + if (!spa_feature_is_enabled( + dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING)) + return (-EOPNOTSUPP); + + if (src_i != dst_i) + spl_inode_lock_shared(src_i); + spl_inode_lock(dst_i); + + crhold(cr); + cookie = spl_fstrans_mark(); + + err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i), + &dst_off_o, &len_o, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + spl_inode_unlock(dst_i); + if (src_i != dst_i) + spl_inode_unlock_shared(src_i); + + if (err < 0) + return (err); + + return ((ssize_t)len_o); +} + +#if defined(HAVE_VFS_COPY_FILE_RANGE) || \ + defined(HAVE_VFS_FILE_OPERATIONS_EXTEND) +/* + * Entry point for copy_file_range(). Copy len bytes from src_off in src_file + * to dst_off in dst_file. We are permitted to do this however we like, so we + * try to just clone the blocks, and if we can't support it, fall back to the + * kernel's generic byte copy function. + */ +ssize_t +zpl_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags) +{ + ssize_t ret; + + if (flags != 0) + return (-EINVAL); + + /* Try to do it via zfs_clone_range() */ + ret = __zpl_clone_file_range(src_file, src_off, + dst_file, dst_off, len); + +#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE + /* + * Since Linux 5.3 the filesystem driver is responsible for executing + * an appropriate fallback, and a generic fallback function is provided. + */ + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) + ret = generic_copy_file_range(src_file, src_off, dst_file, + dst_off, len, flags); +#else + /* + * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal + * to the kernel that it should fallback to a content copy. + */ + if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) + ret = -EOPNOTSUPP; +#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ + + return (ret); +} +#endif /* HAVE_VFS_COPY_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ + +#ifdef HAVE_VFS_REMAP_FILE_RANGE +/* + * Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE. + * + * FICLONE and FICLONERANGE are basically the same as copy_file_range(), except + * that they must clone - they cannot fall back to copying. FICLONE is exactly + * FICLONERANGE, for the entire file. We don't need to try to tell them apart; + * the kernel will sort that out for us. + * + * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the + * range in both files and if they're the same, arrange for them to be backed + * by the same storage. + */ +loff_t +zpl_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags) +{ + if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) + return (-EINVAL); + + /* + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given + * range if we want. Its designed for filesystems that make data past + * EOF available, and don't want it to be visible in both files. ZFS + * doesn't do that, so we just turn the flag off. + */ + flags &= ~REMAP_FILE_CAN_SHORTEN; + + if (flags & REMAP_FILE_DEDUP) + /* No support for dedup yet */ + return (-EOPNOTSUPP); + + /* Zero length means to clone everything to the end of the file */ + if (len == 0) + len = i_size_read(file_inode(src_file)) - src_off; + + return (__zpl_clone_file_range(src_file, src_off, + dst_file, dst_off, len)); +} +#endif /* HAVE_VFS_REMAP_FILE_RANGE */ + +#if defined(HAVE_VFS_CLONE_FILE_RANGE) || \ + defined(HAVE_VFS_FILE_OPERATIONS_EXTEND) +/* + * Entry point for FICLONE and FICLONERANGE, before Linux 4.20. + */ +int +zpl_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len) +{ + /* Zero length means to clone everything to the end of the file */ + if (len == 0) + len = i_size_read(file_inode(src_file)) - src_off; + + return (__zpl_clone_file_range(src_file, src_off, + dst_file, dst_off, len)); +} +#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ + +#ifdef HAVE_VFS_DEDUPE_FILE_RANGE +/* + * Entry point for FIDEDUPERANGE, before Linux 4.20. + */ +int +zpl_dedupe_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len) +{ + /* No support for dedup yet */ + return (-EOPNOTSUPP); +} +#endif /* HAVE_VFS_DEDUPE_FILE_RANGE */ + +/* Entry point for FICLONE, before Linux 4.5. */ +long +zpl_ioctl_ficlone(struct file *dst_file, void *arg) +{ + unsigned long sfd = (unsigned long)arg; + + struct file *src_file = fget(sfd); + if (src_file == NULL) + return (-EBADF); + + if (dst_file->f_op != src_file->f_op) + return (-EXDEV); + + size_t len = i_size_read(file_inode(src_file)); + + ssize_t ret = + __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + + fput(src_file); + + if (ret < 0) { + if (ret == -EOPNOTSUPP) + return (-ENOTTY); + return (ret); + } + + if (ret != len) + return (-EINVAL); + + return (0); +} + +/* Entry point for FICLONERANGE, before Linux 4.5. */ +long +zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) +{ + zfs_ioc_compat_file_clone_range_t fcr; + + if (copy_from_user(&fcr, arg, sizeof (fcr))) + return (-EFAULT); + + struct file *src_file = fget(fcr.fcr_src_fd); + if (src_file == NULL) + return (-EBADF); + + if (dst_file->f_op != src_file->f_op) + return (-EXDEV); + + size_t len = fcr.fcr_src_length; + if (len == 0) + len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; + + ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, + dst_file, fcr.fcr_dest_offset, len); + + fput(src_file); + + if (ret < 0) { + if (ret == -EOPNOTSUPP) + return (-ENOTTY); + return (ret); + } + + if (ret != len) + return (-EINVAL); + + return (0); +} + +/* Entry point for FIDEDUPERANGE, before Linux 4.5. */ +long +zpl_ioctl_fideduperange(struct file *filp, void *arg) +{ + (void) arg; + + /* No support for dedup yet */ + return (-ENOTTY); +} diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index c5c230bee14..ad52a11aada 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -277,8 +277,6 @@ zpl_test_super(struct super_block *s, void *data) { zfsvfs_t *zfsvfs = s->s_fs_info; objset_t *os = data; - int match; - /* * If the os doesn't match the z_os in the super_block, assume it is * not a match. Matching would imply a multimount of a dataset. It is @@ -286,19 +284,7 @@ zpl_test_super(struct super_block *s, void *data) * that changes the z_os, e.g., rollback, where the match will be * missed, but in that case the user will get an EBUSY. */ - if (zfsvfs == NULL || os != zfsvfs->z_os) - return (0); - - /* - * If they do match, recheck with the lock held to prevent mounting the - * wrong dataset since z_os can be stale when the teardown lock is held. - */ - if (zpl_enter(zfsvfs, FTAG) != 0) - return (0); - match = (os == zfsvfs->z_os); - zpl_exit(zfsvfs, FTAG); - - return (match); + return (zfsvfs != NULL && os == zfsvfs->z_os); } static struct super_block * @@ -324,12 +310,35 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) s = sget(fs_type, zpl_test_super, set_anon_super, flags, os); + /* + * Recheck with the lock held to prevent mounting the wrong dataset + * since z_os can be stale when the teardown lock is held. + * + * We can't do this in zpl_test_super in since it's under spinlock and + * also s_umount lock is not held there so it would race with + * zfs_umount and zfsvfs can be freed. + */ + if (!IS_ERR(s) && s->s_fs_info != NULL) { + zfsvfs_t *zfsvfs = s->s_fs_info; + if (zpl_enter(zfsvfs, FTAG) == 0) { + if (os != zfsvfs->z_os) + err = -SET_ERROR(EBUSY); + zpl_exit(zfsvfs, FTAG); + } else { + err = -SET_ERROR(EBUSY); + } + } dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(s)) return (ERR_CAST(s)); + if (err) { + deactivate_locked_super(s); + return (ERR_PTR(err)); + } + if (s->s_root == NULL) { err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0); if (err) { diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c index 211bab56519..e772caead29 100644 --- a/sys/contrib/openzfs/module/zfs/bpobj.c +++ b/sys/contrib/openzfs/module/zfs/bpobj.c @@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { + int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; + uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) * + sizeof (blkptr_t); + uint64_t ps = start * sizeof (blkptr_t); + uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0, + ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb, + ZIO_PRIORITY_ASYNC_READ); + } + for (; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); @@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, if (dbuf) dmu_buf_rele(dbuf, FTAG); err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, FTAG, &dbuf, 0); + offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH); if (err) break; + pe = pb; + pb = MAX((dbuf->db_offset > dmu_prefetch_max) ? + dbuf->db_offset - dmu_prefetch_max : 0, ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + pb, pe - pb, ZIO_PRIORITY_ASYNC_READ); + } } ASSERT3U(offset, >=, dbuf->db_offset); @@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int64_t i = bpi->bpi_unprocessed_subobjs - 1; uint64_t offset = i * sizeof (uint64_t); - uint64_t obj_from_sublist; + uint64_t subobj; err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - offset, sizeof (uint64_t), &obj_from_sublist, - DMU_READ_PREFETCH); + offset, sizeof (uint64_t), &subobj, + DMU_READ_NO_PREFETCH); if (err) break; - bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), + + bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t), KM_SLEEP); - - err = bpobj_open(sublist, bpo->bpo_os, - obj_from_sublist); - if (err) + err = bpobj_open(subbpo, bpo->bpo_os, subobj); + if (err) { + kmem_free(subbpo, sizeof (bpobj_t)); break; + } - list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); - mutex_enter(&sublist->bpo_lock); + if (subbpo->bpo_havesubobj && + subbpo->bpo_phys->bpo_subobjs != 0) { + dmu_prefetch(subbpo->bpo_os, + subbpo->bpo_phys->bpo_subobjs, 0, 0, 0, + ZIO_PRIORITY_ASYNC_READ); + } + + list_insert_head(&stack, bpi_alloc(subbpo, bpi, i)); + mutex_enter(&subbpo->bpo_lock); bpi->bpi_unprocessed_subobjs--; } } diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c index 99bd472d6fb..ddd8eefe600 100644 --- a/sys/contrib/openzfs/module/zfs/brt.c +++ b/sys/contrib/openzfs/module/zfs/brt.c @@ -174,7 +174,7 @@ * size_t len, unsigned int flags); * * Even though offsets and length represent bytes, they have to be - * block-aligned or we will return the EXDEV error so the upper layer can + * block-aligned or we will return an error so the upper layer can * fallback to the generic mechanism that will just copy the data. * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. * This function was implemented based on zfs_write(), but instead of writing @@ -192,9 +192,9 @@ * Some special cases to consider and how we address them: * - The block we want to clone may have been created within the same * transaction group that we are trying to clone. Such block has no BP - * allocated yet, so cannot be immediately cloned. We return EXDEV. + * allocated yet, so cannot be immediately cloned. We return EAGAIN. * - The block we want to clone may have been modified within the same - * transaction group. We return EXDEV. + * transaction group. We return EAGAIN. * - A block may be cloned multiple times during one transaction group (that's * why pending list is actually a tree and not an append-only list - this * way we can figure out faster if this block is cloned for the first time @@ -680,7 +680,7 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); - entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); + entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); @@ -709,7 +709,7 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), BT_SIZEOFMAP(brtvd->bv_nblocks))); - kmem_free(brtvd->bv_entcount, + vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); } @@ -792,7 +792,7 @@ brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_initiated); - kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); + vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); brtvd->bv_bitmap = NULL; @@ -1544,6 +1544,37 @@ out: return (B_FALSE); } +uint64_t +brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search, *bre; + uint64_t vdevid, refcnt; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre == NULL) { + error = brt_entry_lookup(brt, brtvd, &bre_search); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) + refcnt = 0; + else + refcnt = bre_search.bre_refcount; + } else + refcnt = bre->bre_refcount; + + brt_unlock(brt); + return (refcnt); +} + static void brt_prefetch(brt_t *brt, const blkptr_t *bp) { diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 1ea075217fb..f2831a0e8ab 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -2701,7 +2701,7 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) */ mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); - ASSERT(list_head(&db->db_dirty_records) == NULL); + ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; @@ -4457,6 +4457,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else if (db->db_state == DB_READ) { + /* + * This buffer has a clone we need to write, and an in-flight + * read on the BP we're about to clone. Its safe to issue the + * write here because the read has already been issued and the + * contents won't change. + */ + ASSERT(dr->dt.dl.dr_brtwrite && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index dda869287c7..ddb29020b09 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -89,7 +89,11 @@ static int zfs_dmu_offset_next_sync = 1; * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ +#ifdef _ILP32 +uint_t dmu_prefetch_max = 8 * 1024 * 1024; +#else uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +#endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, @@ -161,7 +165,7 @@ dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { zfs_acl_byteswap, "acl" } }; -static int +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp) { @@ -181,6 +185,7 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, *dbp = &db->db; return (0); } + int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp) @@ -552,8 +557,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - length <= zfetch_array_rd_sz) { + if ((flags & DMU_READ_NO_PREFETCH) == 0) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we @@ -1650,10 +1654,22 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, { dmu_sync_arg_t *dsa; dmu_tx_t *tx; + int error; + + error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, + DB_RF_CANFAIL | DB_RF_NOPREFETCH); + if (error != 0) + return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + /* + * This transaction does not produce any dirty data or log blocks, so + * it should not be throttled. All other cases wait for TXG sync, by + * which time the log block we are writing will be obsolete, so we can + * skip waiting and just return error here instead. + */ + if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index 2fdd7c1ece7..05ca91717c2 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -1795,17 +1795,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa, } /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. + * The dmu does not currently support decreasing nlevels or changing + * indirect block size if there is already one, same as changing the + * number of of dnode slots on an object. For non-raw sends this + * does not matter and the new object can just use the previous one's + * parameters. For raw sends, however, the structure of the received + * dnode (including indirects and dnode slots) must match that of the + * send side. Therefore, instead of using dmu_object_reclaim(), we + * must free the object completely and call dmu_object_claim_dnsize() + * instead. */ - if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + if ((rwa->raw && ((doi->doi_indirection > 1 && + indblksz != doi->doi_metadata_block_size) || + drro->drr_nlevels < doi->doi_indirection)) || dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_object(rwa->os, drro->drr_object); if (err != 0) diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index b70459380c2..d0acaf50206 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -52,14 +52,19 @@ static unsigned int zfetch_max_streams = 8; static unsigned int zfetch_min_sec_reap = 1; /* max time before stream delete */ static unsigned int zfetch_max_sec_reap = 2; +#ifdef _ILP32 +/* min bytes to prefetch per stream (default 2MB) */ +static unsigned int zfetch_min_distance = 2 * 1024 * 1024; +/* max bytes to prefetch per stream (default 8MB) */ +unsigned int zfetch_max_distance = 8 * 1024 * 1024; +#else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ unsigned int zfetch_max_distance = 64 * 1024 * 1024; +#endif /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; -/* max number of bytes in an array_read in which we allow prefetching (1MB) */ -uint64_t zfetch_array_rd_sz = 1024 * 1024; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; @@ -580,6 +585,3 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); - -ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW, - "Number of bytes in a array_read"); diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index d15268cd7bc..7cf03264dce 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -1882,7 +1882,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs == dn->dn_indblkshift) ibs = 0; - if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + if (size == dn->dn_datablksz && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); @@ -1905,24 +1905,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs && dn->dn_nlevels != 1) goto fail; - /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) { - dbuf_new_size(db, size, tx); - } else if (err != ENOENT) { - goto fail; - } - - dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (size != dn->dn_datablksz) { + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); + if (err == 0) { + dbuf_new_size(db, size, tx); + } else if (err != ENOENT) { + goto fail; + } + + dnode_setdblksz(dn, size); + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size; + if (db) + dbuf_rele(db, FTAG); + } if (ibs) { dn->dn_indblkshift = ibs; - dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; } - /* release after we have fixed the blocksize in the dnode */ - if (db) - dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c index 181efd8ed69..47c234f76c4 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c +++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c @@ -892,9 +892,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) for (zap_cursor_init(&zc, dl->dl_os, obj); (error = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { - uint64_t mintxg = zfs_strtonum(za->za_name, NULL); - dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx); - VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); + dsl_deadlist_insert_bpobj(dl, za->za_first_integer, + zfs_strtonum(za->za_name, NULL), tx); + VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx)); if (perror == 0) { dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, zfs_strtonum(pza->za_name, NULL)); diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 50428bff3ef..34012db82de 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -2015,6 +2015,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (OBJSET_BUF_HAS_USERUSED(buf)) { + if (OBJSET_BUF_HAS_PROJECTUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_projectused_dnode, zb->zb_objset, + DMU_PROJECTUSED_OBJECT); + } dsl_scan_prefetch_dnode(scn, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); @@ -2075,10 +2080,16 @@ dsl_scan_prefetch_thread(void *arg) zio_flags |= ZIO_FLAG_RAW; } + /* We don't need data L1 buffer since we do not prefetch L0. */ + blkptr_t *bp = &spic->spic_bp; + if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE && + BP_GET_TYPE(bp) != DMU_OT_OBJSET) + flags |= ARC_FLAG_NO_BUF; + /* issue the prefetch asynchronously */ - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, - &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); + (void) arc_read(scn->scn_zio_root, spa, bp, + dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB, + zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 176247d63b7..20dc934593f 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -58,6 +58,11 @@ static uint64_t metaslab_aliquot = 1024 * 1024; */ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; +/* + * Of blocks of size >= metaslab_force_ganging, actually gang them this often. + */ +uint_t metaslab_force_ganging_pct = 3; + /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each @@ -1287,7 +1292,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, /* * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt + * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) @@ -5096,7 +5101,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - metaslab_group_t *mg, *fast_mg, *rotor; + metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5109,7 +5114,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ - if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) { + if (psize >= metaslab_force_ganging && + metaslab_force_ganging_pct > 0 && + (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); @@ -5157,15 +5164,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; - } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mca->mca_rotor; - - do { - if (fast_mg->mg_vd->vdev_pending_fastwrite < - mg->mg_vd->vdev_pending_fastwrite) - mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); - } else { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; @@ -5290,7 +5288,7 @@ top: mg->mg_bias = 0; } - if ((flags & METASLAB_FASTWRITE) || + if ((flags & METASLAB_ZIL) || atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mca->mca_rotor = mg->mg_next; @@ -5303,11 +5301,6 @@ top: ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); - if (flags & METASLAB_FASTWRITE) { - atomic_add_64(&vd->vdev_pending_fastwrite, - psize); - } - return (0); } next: @@ -5943,55 +5936,6 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } -void -metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - atomic_add_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - -void -metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); - atomic_sub_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) @@ -6266,7 +6210,10 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, - "Blocks larger than this size are forced to be gang blocks"); + "Blocks larger than this size are sometimes forced to be gang blocks"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, + "Percentage of large blocks that will be forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 06f64076904..3b355e0debc 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -772,6 +772,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 30551feb632..87c14559323 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -889,9 +889,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_not_present); /* - * Get the alignment requirement. + * Get the alignment requirement. Ignore pool ashift for vdev + * attach case. */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + if (alloctype != VDEV_ALLOC_ATTACH) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, + &vd->vdev_ashift); + } else { + vd->vdev_attaching = B_TRUE; + } /* * Retrieve the vdev creation time. @@ -1186,7 +1192,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT(tvd == tvd->vdev_top); - tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; @@ -1393,6 +1398,36 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +/* + * Choose GCD for spa_gcd_alloc. + */ +static uint64_t +vdev_gcd(uint64_t a, uint64_t b) +{ + while (b != 0) { + uint64_t t = b; + b = a % b; + a = t; + } + return (a); +} + +/* + * Set spa_min_alloc and spa_gcd_alloc. + */ +static void +vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) +{ + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + if (spa->spa_gcd_alloc == INT_MAX) { + spa->spa_gcd_alloc = min_alloc; + } else { + spa->spa_gcd_alloc = vdev_gcd(min_alloc, + spa->spa_gcd_alloc); + } +} + void vdev_metaslab_group_create(vdev_t *vd) { @@ -1445,8 +1480,7 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } } } @@ -1620,7 +1654,6 @@ vdev_metaslab_fini(vdev_t *vd) } } ASSERT0(vd->vdev_ms_count); - ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { @@ -2144,9 +2177,9 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EDOM)); } - if (vd->vdev_top == vd) { + if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); - } + vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { @@ -2207,8 +2240,7 @@ vdev_open(vdev_t *vd) if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } /* @@ -5688,6 +5720,7 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; + uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); @@ -5698,31 +5731,28 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) if (vd == NULL) return; + /* + * Set vdev property values in the vdev props mos object. + */ + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + panic("unexpected vdev type"); + } + mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { - uint64_t intval, objid = 0; + uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; - /* - * Set vdev property values in the vdev props mos object. - */ - if (vd->vdev_root_zap != 0) { - objid = vd->vdev_root_zap; - } else if (vd->vdev_top_zap != 0) { - objid = vd->vdev_top_zap; - } else if (vd->vdev_leaf_zap != 0) { - objid = vd->vdev_leaf_zap; - } else { - /* - * XXX: implement vdev_props_set_check() - */ - panic("vdev not root/top/leaf"); - } - switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { @@ -5791,6 +5821,12 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ASSERT(vd != NULL); + /* Check that vdev has a zap we can use */ + if (vd->vdev_root_zap == 0 && + vd->vdev_top_zap == 0 && + vd->vdev_leaf_zap == 0) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index 89667585345..acb72569667 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -1398,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio, vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); - zio_bad_cksum_t zbc = {{{ 0 }}}; + zio_bad_cksum_t zbc = { 0 }; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 14b98a76b84..3445fa9d35d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -1785,7 +1785,7 @@ vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) static int raidz_checksum_verify(zio_t *zio) { - zio_bad_cksum_t zbc = {{{0}}}; + zio_bad_cksum_t zbc = {0}; raidz_map_t *rm = zio->io_vsd; int ret = zio_checksum_error(zio, &zbc); diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index c42ef048dd7..c4eb74e873d 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -754,10 +754,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, #define MAX_RANGES 16 typedef struct zfs_ecksum_info { - /* histograms of set and cleared bits by bit number in a 64-bit word */ - uint8_t zei_histogram_set[sizeof (uint64_t) * NBBY]; - uint8_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; - /* inline arrays of bits set and cleared. */ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; @@ -781,7 +777,7 @@ typedef struct zfs_ecksum_info { } zfs_ecksum_info_t; static void -update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count) +update_bad_bits(uint64_t value_arg, uint32_t *count) { size_t i; size_t bits = 0; @@ -789,10 +785,8 @@ update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count) /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { - if (value & (1ull << i)) { - hist[63 - i]++; + if (value & (1ull << i)) ++bits; - } } /* update the count of bits changed */ *count += bits; @@ -920,14 +914,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, if (info != NULL && info->zbc_has_cksum) { fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_expected) / sizeof (uint64_t), - (uint64_t *)&info->zbc_expected, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_actual) / sizeof (uint64_t), - (uint64_t *)&info->zbc_actual, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, DATA_TYPE_STRING, info->zbc_checksum_name, @@ -1010,10 +996,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, offset++; } - update_histogram(set, eip->zei_histogram_set, - &eip->zei_range_sets[range]); - update_histogram(cleared, eip->zei_histogram_cleared, - &eip->zei_range_clears[range]); + update_bad_bits(set, &eip->zei_range_sets[range]); + update_bad_bits(cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ @@ -1049,15 +1033,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); - } else { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, - DATA_TYPE_UINT8_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, - DATA_TYPE_UINT8_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, - NULL); } return (eip); } diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 7bdcc163938..f8d13075d5c 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* @@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated @@ -917,8 +917,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } #endif if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread(os, object, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -1028,6 +1028,10 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) * * On success, the function return the number of bytes copied in *lenp. * Note, it doesn't return how much bytes are left to be copied. + * On errors which are caused by any file system limitations or + * brt limitations `EINVAL` is returned. In the most cases a user + * requested bad parameters, it could be possible to clone the file but + * some parameters don't match the requirements. */ int zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, @@ -1078,6 +1082,16 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, return (SET_ERROR(EXDEV)); } + /* + * outos and inos belongs to the same storage pool. + * see a few lines above, only one check. + */ + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EOPNOTSUPP)); + } + ASSERT(!outzfsvfs->z_replay); error = zfs_verify_zp(inzp); @@ -1088,12 +1102,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, return (error); } - if (!spa_feature_is_enabled(dmu_objset_spa(outos), - SPA_FEATURE_BLOCK_CLONING)) { - zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); - return (SET_ERROR(EXDEV)); - } - /* * We don't copy source file's flags that's why we don't allow to clone * files that are in quarantine. @@ -1167,7 +1175,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * We cannot clone into files with different block size. */ if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1175,7 +1183,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * Offsets and len must be at block boundries. */ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } /* @@ -1183,7 +1191,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ if ((len % inblksz) != 0 && (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1212,7 +1220,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, gid = KGID_TO_SGID(ZTOGID(outzp)); projid = outzp->z_projid; - bps = kmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); /* * Clone the file in reasonable size chunks. Each chunk is cloned @@ -1238,13 +1246,11 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, &nbps); if (error != 0) { /* - * If we are tyring to clone a block that was created - * in the current transaction group. Return an error, - * so the caller can fallback to just copying the data. + * If we are trying to clone a block that was created + * in the current transaction group, error will be + * EAGAIN here, which we can just return to the caller + * so it can fallback if it likes. */ - if (error == EAGAIN) { - error = SET_ERROR(EXDEV); - } break; } /* @@ -1330,7 +1336,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, done += size; } - kmem_free(bps, sizeof (bps[0]) * maxblocks); + vmem_free(bps, sizeof (bps[0]) * maxblocks); zfs_znode_update_vfs(outzp); unlock: diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index be5b9edf6ed..b30676b42d8 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -151,7 +151,6 @@ static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); -static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb); static itx_t *zil_itx_clone(itx_t *oitx); static int @@ -290,7 +289,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); @@ -304,7 +303,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { @@ -760,35 +759,52 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) return (TREE_CMP(v1, v2)); } +/* + * Allocate a new lwb. We may already have a block pointer for it, in which + * case we get size and version from there. Or we may not yet, in which case + * we choose them here and later make the block allocation match. + */ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, - boolean_t fastwrite) +zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, + uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; - lwb->lwb_blk = *bp; - lwb->lwb_fastwrite = fastwrite; - lwb->lwb_slog = slog; - lwb->lwb_indirect = B_FALSE; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); + if (bp) { + lwb->lwb_blk = *bp; + lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + sz = BP_GET_LSIZE(bp); } else { - lwb->lwb_nused = lwb->lwb_nfilled = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + BP_ZERO(&lwb->lwb_blk); + lwb->lwb_slim = (spa_version(zilog->zl_spa) >= + SPA_VERSION_SLIM_ZIL); } - lwb->lwb_state = LWB_STATE_CLOSED; - lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); + lwb->lwb_slog = slog; + lwb->lwb_error = 0; + if (lwb->lwb_slim) { + lwb->lwb_nmax = sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } + lwb->lwb_sz = sz; + lwb->lwb_state = state; + lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; - lwb->lwb_max_txg = txg; + lwb->lwb_alloc_txg = txg; + lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); + if (state != LWB_STATE_NEW) + zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); @@ -798,15 +814,17 @@ static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - VERIFY(list_is_empty(&lwb->lwb_itxs)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_FLUSH_DONE); + ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); + VERIFY(list_is_empty(&lwb->lwb_itxs)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer @@ -916,7 +934,6 @@ zil_create(zilog_t *zilog) dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; - boolean_t fastwrite = FALSE; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); @@ -949,8 +966,6 @@ zil_create(zilog_t *zilog) error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); - fastwrite = TRUE; - if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -959,7 +974,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite); + lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction @@ -1044,12 +1059,10 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, - &lwb->lwb_blk); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { @@ -1277,21 +1290,21 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. + * zl_issuer_lock while the lwb is open and zl_lock otherwise. + * zl_issuer_lock also protects leaving the open state. + * zcw_lwb setting is protected by zl_issuer_lock and state != + * flush_done, which transition is protected by zl_lock. */ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); + IMPLY(lwb->lwb_state != LWB_STATE_OPENED, + MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3P(lwb, !=, NULL); - ASSERT(lwb->lwb_state == LWB_STATE_OPENED || - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE); - list_insert_tail(&lwb->lwb_waiters, zcw); + ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; - mutex_exit(&zcw->zcw_lock); } /* @@ -1302,11 +1315,9 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); list_insert_tail(nolwb, zcw); - mutex_exit(&zcw->zcw_lock); + ASSERT3P(zcw->zcw_lwb, ==, NULL); } void @@ -1318,6 +1329,9 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) int ndvas = BP_GET_NDVAS(bp); int i; + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + if (zil_nocacheflush) return; @@ -1397,15 +1411,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog_t *zilog = lwb->lwb_zilog; zil_commit_waiter_t *zcw; itx_t *itx; - uint64_t txg; - list_t itxs, waiters; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); - list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); - list_create(&waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); @@ -1414,6 +1422,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio) lwb->lwb_root_zio = NULL; + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); + lwb->lwb_state = LWB_STATE_FLUSH_DONE; + if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number @@ -1424,22 +1435,13 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } - list_move_tail(&itxs, &lwb->lwb_itxs); - list_move_tail(&waiters, &lwb->lwb_waiters); - txg = lwb->lwb_issued_txg; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - lwb->lwb_state = LWB_STATE_FLUSH_DONE; - - mutex_exit(&zilog->zl_lock); - - while ((itx = list_remove_head(&itxs)) != NULL) + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); - list_destroy(&itxs); - while ((zcw = list_remove_head(&waiters)) != NULL) { + while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); + ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been @@ -1464,7 +1466,11 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } - list_destroy(&waiters); + + uint64_t txg = lwb->lwb_issued_txg; + + /* Once we drop the lock, lwb may be freed by zil_sync(). */ + mutex_exit(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); @@ -1492,7 +1498,7 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); - while (lwb != NULL && lwb->lwb_max_txg <= txg) { + while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); @@ -1535,14 +1541,6 @@ zil_lwb_write_done(zio_t *zio) ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); - ASSERT(!BP_IS_GANG(zio->io_bp)); - ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(BP_GET_FILL(zio->io_bp) == 0); - abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; @@ -1550,8 +1548,8 @@ zil_lwb_write_done(zio_t *zio) mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; - lwb->lwb_fastwrite = FALSE; nlwb = list_next(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); @@ -1615,124 +1613,75 @@ zil_lwb_write_done(zio_t *zio) } } +/* + * Build the zio dependency chain, which is used to preserve the ordering of + * lwb completions that is required by the semantics of the ZIL. Each new lwb + * zio becomes a parent of the previous lwb zio, such that the new lwb's zio + * cannot complete until the previous lwb's zio completes. + * + * This is required by the semantics of zil_commit(): the commit waiters + * attached to the lwbs will be woken in the lwb zio's completion callback, + * so this zio dependency graph ensures the waiters are woken in the correct + * order (the same order the lwbs were created). + */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { - lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_lock)); + lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); + if (prev_lwb == NULL || + prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) + return; + /* - * The zilog's "zl_last_lwb_opened" field is used to build the - * lwb/zio dependency chain, which is used to preserve the - * ordering of lwb completions that is required by the semantics - * of the ZIL. Each new lwb zio becomes a parent of the - * "previous" lwb zio, such that the new lwb's zio cannot - * complete until the "previous" lwb's zio completes. + * If the previous lwb's write hasn't already completed, we also want + * to order the completion of the lwb write zios (above, we only order + * the completion of the lwb root zios). This is required because of + * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * - * This is required by the semantics of zil_commit(); the commit - * waiters attached to the lwbs will be woken in the lwb zio's - * completion callback, so this zio dependency graph ensures the - * waiters are woken in the correct order (the same order the - * lwbs were created). + * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous + * lwb will rely on this lwb to flush the vdevs written to by that + * previous lwb. Thus, we need to ensure this lwb doesn't issue the + * flush until after the previous lwb's write completes. We ensure + * this ordering by setting the zio parent/child relationship here. + * + * Without this relationship on the lwb's write zio, it's possible + * for this lwb's write to complete prior to the previous lwb's write + * completing; and thus, the vdevs for the previous lwb would be + * flushed prior to that lwb's data being written to those vdevs (the + * vdevs are flushed in the lwb write zio's completion handler, + * zil_lwb_write_done()). */ - if (last_lwb_opened != NULL && - last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED || - last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); - - ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); - zio_add_child(lwb->lwb_root_zio, - last_lwb_opened->lwb_root_zio); - - /* - * If the previous lwb's write hasn't already completed, - * we also want to order the completion of the lwb write - * zios (above, we only order the completion of the lwb - * root zios). This is required because of how we can - * defer the DKIOCFLUSHWRITECACHE commands for each lwb. - * - * When the DKIOCFLUSHWRITECACHE commands are deferred, - * the previous lwb will rely on this lwb to flush the - * vdevs written to by that previous lwb. Thus, we need - * to ensure this lwb doesn't issue the flush until - * after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child - * relationship here. - * - * Without this relationship on the lwb's write zio, - * it's possible for this lwb's write to complete prior - * to the previous lwb's write completing; and thus, the - * vdevs for the previous lwb would be flushed prior to - * that lwb's data being written to those vdevs (the - * vdevs are flushed in the lwb write zio's completion - * handler, zil_lwb_write_done()). - */ - if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED); - - ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, - last_lwb_opened->lwb_write_zio); - } + if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { + ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); + zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); + } else { + ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } + + ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); + zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to - * accept new itxs being committed to it. To do this, the lwb's zio - * structures are created, and linked to the lwb. This function is - * idempotent; if the passed in lwb has already been opened, this - * function is essentially a no-op. + * accept new itxs being committed to it. This function is idempotent; if + * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { - zbookmark_phys_t zb; - zio_priority_t prio; - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); - EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); - if (lwb->lwb_root_zio != NULL) + if (lwb->lwb_state != LWB_STATE_NEW) { + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); - - /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ - mutex_enter(&zilog->zl_lock); - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; } - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, - &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), - zil_lwb_write_done, lwb, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - + mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; - - zil_lwb_set_zio_dependency(zilog, lwb); zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } @@ -1769,57 +1718,21 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - lwb_t *nlwb = NULL; - zil_chain_t *zilc; - spa_t *spa = zilog->zl_spa; - blkptr_t *bp; - dmu_tx_t *tx; - uint64_t txg; - uint64_t zil_blksz; - int i, error; - boolean_t slog; + int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + lwb->lwb_state = LWB_STATE_CLOSED; /* - * If this lwb includes indirect writes, we have to commit before - * creating the transaction, otherwise we may end up in dead lock. + * If there was an allocation failure then returned NULL will trigger + * zil_commit_writer_stall() at the caller. This is inherently racy, + * since allocation may not have happened yet. */ - if (lwb->lwb_indirect) { - for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); - lwb->lwb_nused = lwb->lwb_nfilled; - } - - /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. - */ - - tx = dmu_tx_create(zilog->zl_os); - - /* - * Since we are not going to create any new dirty data, and we - * can even help with clearing the existing dirty data, we - * should not be subject to the dirty data based delays. We - * use TXG_NOTHROTTLE to bypass the delay mechanism. - */ - VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); - - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - mutex_enter(&zilog->zl_lwb_io_lock); - lwb->lwb_issued_txg = txg; - zilog->zl_lwb_inflight[txg & TXG_MASK]++; - zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); - mutex_exit(&zilog->zl_lwb_io_lock); + if (lwb->lwb_error != 0) + return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next @@ -1837,7 +1750,7 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) * the maximum block size because we can exhaust the available * pool log space. */ - zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); + uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); @@ -1849,94 +1762,149 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) - zilc = (zil_chain_t *)lwb->lwb_buf; - else - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; - BP_ZERO(bp); - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; - - /* - * Allocate a new log write block (lwb). - */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); - } - - lwb->lwb_state = LWB_STATE_ISSUED; - - dmu_tx_commit(tx); - - /* - * We need to acquire the config lock for the lwb to issue it later. - * However, if we already have a queue of closed parent lwbs already - * holding the config lock (but not yet issued), we can't block here - * waiting on the lock or we will deadlock. In that case we must - * first issue to parent IOs before waiting on the lock. - */ - if (ilwbs && !list_is_empty(ilwbs)) { - if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) { - lwb_t *tlwb; - while ((tlwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, tlwb); - spa_config_enter(spa, SCL_STATE, lwb, RW_READER); - } - } else { - spa_config_enter(spa, SCL_STATE, lwb, RW_READER); - } - - if (ilwbs) - list_insert_tail(ilwbs, lwb); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); + return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); } /* * Finalize previously closed block and issue the write zio. - * Does not require locking. */ static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { + spa_t *spa = zilog->zl_spa; zil_chain_t *zilc; - int wsz; + boolean_t slog; + zbookmark_phys_t zb; + zio_priority_t prio; + int error; - /* Actually fill the lwb with the data if not yet. */ - if (!lwb->lwb_indirect) { - for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); - lwb->lwb_nused = lwb->lwb_nfilled; - } + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); - ASSERT3S(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); - wsz = lwb->lwb_write_zio->io_size; + /* Actually fill the lwb with the data. */ + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; - zilc = (zil_chain_t *)lwb->lwb_buf; - } else { - wsz = lwb->lwb_sz; - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - } - zilc->zc_pad = 0; - zilc->zc_nused = lwb->lwb_nused; - zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, + ZIO_FLAG_CANFAIL); /* - * clear unused data for security + * The lwb is now ready to be issued, but it can be only if it already + * got its block pointer allocated or the allocation has failed. + * Otherwise leave it as-is, relying on some other thread to issue it + * after allocating its block pointer via calling zil_lwb_write_issue() + * for the previous lwb(s) in the chain. */ - memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + mutex_enter(&zilog->zl_lock); + lwb->lwb_state = LWB_STATE_READY; + if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { + mutex_exit(&zilog->zl_lock); + return; + } + mutex_exit(&zilog->zl_lock); + +next_lwb: + if (lwb->lwb_slim) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); + int wsz = lwb->lwb_sz; + if (lwb->lwb_error == 0) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, + &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + lwb, prio, ZIO_FLAG_CANFAIL, &zb); + zil_lwb_add_block(lwb, &lwb->lwb_blk); + + if (lwb->lwb_slim) { + /* For Slim ZIL only write what is used. */ + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, + int); + ASSERT3S(wsz, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; + } + memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + } else { + /* + * We can't write the lwb if there was an allocation failure, + * so create a null zio instead just to maintain dependencies. + */ + lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, + zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); + lwb->lwb_write_zio->io_error = lwb->lwb_error; + } + if (lwb->lwb_child_zio) + zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); + + /* + * Open transaction to allocate the next block pointer. + */ + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Allocate next the block pointer unless we are already in error. + */ + lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); + blkptr_t *bp = &zilc->zc_next_blk; + BP_ZERO(bp); + error = lwb->lwb_error; + if (error == 0) { + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, + &slog); + } + if (error == 0) { + ASSERT3U(bp->blk_birth, ==, txg); + BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : + ZIO_CHECKSUM_ZILOG); + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; + } + + /* + * Reduce TXG open time by incrementing inflight counter and committing + * the transaciton. zil_sync() will wait for it to return to zero. + */ + mutex_enter(&zilog->zl_lwb_io_lock); + lwb->lwb_issued_txg = txg; + zilog->zl_lwb_inflight[txg & TXG_MASK]++; + zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); + mutex_exit(&zilog->zl_lwb_io_lock); + dmu_tx_commit(tx); + + spa_config_enter(spa, SCL_STATE, lwb, RW_READER); + + /* + * We've completed all potentially blocking operations. Update the + * nlwb and allow it proceed without possible lock order reversals. + */ + mutex_enter(&zilog->zl_lock); + zil_lwb_set_zio_dependency(zilog, lwb); + lwb->lwb_state = LWB_STATE_ISSUED; + + if (nlwb) { + nlwb->lwb_blk = *bp; + nlwb->lwb_error = error; + nlwb->lwb_slog = slog; + nlwb->lwb_alloc_txg = txg; + if (nlwb->lwb_state != LWB_STATE_READY) + nlwb = NULL; + } + mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); @@ -1955,11 +1923,19 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } - ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER)); - zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); - zio_nowait(lwb->lwb_root_zio); + if (lwb->lwb_child_zio) + zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); + zio_nowait(lwb->lwb_root_zio); + + /* + * If nlwb was ready when we gave it the block pointer, + * it is on us to issue it and possibly following ones. + */ + lwb = nlwb; + if (lwb) + goto next_lwb; } /* @@ -2031,10 +2007,7 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) * For more details, see the comment above zil_commit(). */ if (lr->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); - itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } @@ -2053,17 +2026,17 @@ cont: * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_close(zilog, lwb, ilwbs); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); - zil_lwb_write_open(zilog, lwb); - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; /* * There must be enough space in the new, empty log block to @@ -2101,7 +2074,7 @@ cont: clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); @@ -2113,22 +2086,9 @@ cont: goto cont; } - /* - * We have to really issue all queued LWBs before we may have to - * wait for a txg sync. Otherwise we may end up in a dead lock. - */ - if (lr->lrc_txtype == TX_WRITE) { - boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa); - if (frozen || itx->itx_wr_state == WR_INDIRECT) { - lwb_t *tlwb; - while ((tlwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, tlwb); - } - if (itx->itx_wr_state == WR_INDIRECT) - lwb->lwb_indirect = B_TRUE; - if (frozen) - txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); - } + if (lr->lrc_txtype == TX_WRITE && + lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } @@ -2191,26 +2151,24 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); + if (lwb->lwb_child_zio == NULL) { + lwb->lwb_child_zio = zio_root( + zilog->zl_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } } /* - * We pass in the "lwb_write_zio" rather than - * "lwb_root_zio" so that the "lwb_write_zio" - * becomes the parent of any zio's created by - * the "zl_get_data" callback. The vdevs are - * flushed after the "lwb_write_zio" completes, - * so we want to make sure that completion - * callback waits for these additional zio's, - * such that the vdevs used by those zio's will - * be included in the lwb's vdev tree, and those - * vdevs will be properly flushed. If we passed - * in "lwb_root_zio" here, then these additional - * vdevs may not be flushed; e.g. if these zio's - * completed after "lwb_write_zio" completed. + * The "lwb_child_zio" we pass in will become a child of + * "lwb_write_zio", when one is created, so one will be + * a parent of any zio's created by the "zl_get_data". + * This way "lwb_write_zio" will first wait for children + * block pointers before own writing, and then for their + * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, - lwb->lwb_write_zio); + lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ memset((char *)dbuf + lrwb->lr_length, 0, @@ -2243,12 +2201,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) error); zfs_fallthrough; case EIO: - if (lwb->lwb_indirect) { - txg_wait_synced(zilog->zl_dmu_pool, - lr->lrc_txg); - } else { - lwb->lwb_write_zio->io_error = error; - } + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; @@ -2692,7 +2646,6 @@ zil_prune_commit_list(zilog_t *zilog) zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); - itx->itx_private = NULL; } mutex_exit(&zilog->zl_lock); @@ -2770,10 +2723,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - first = (lwb->lwb_state != LWB_STATE_OPENED) && + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); + first = (lwb->lwb_state == LWB_STATE_NEW) && ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || plwb->lwb_state == LWB_STATE_FLUSH_DONE); } @@ -2897,37 +2849,32 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" - * variable is in one of the following states: "closed" - * or "open". + * variable is in "new" or "opened" state. * - * If it's "closed", then no itxs have been committed to - * it, so there's no point in issuing its zio (i.e. it's - * "empty"). + * If it's "new", then no itxs have been committed to it, so + * there's no point in issuing its zio (i.e. it's "empty"). * - * If it's "open", then it contains one or more itxs that + * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * - * 1. Ideally, there will be more ZIL activity occurring - * on the system, such that this function will be - * immediately called again (not necessarily by the same - * thread) and this lwb's zio will be issued via - * zil_lwb_assign(). This way, the lwb is guaranteed to - * be "full" when it is issued to disk, and we'll make - * use of the lwb's size the best we can. + * 1. Ideally, there will be more ZIL activity occurring on + * the system, such that this function will be immediately + * called again by different thread and this lwb will be + * closed by zil_lwb_assign(). This way, the lwb will be + * "full" when it is issued to disk, and we'll make use of + * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on - * the system, such that this lwb's zio isn't issued via - * zil_lwb_assign(), zil_commit_waiter() will issue the - * lwb's zio. If this occurs, the lwb is not guaranteed + * the system, zil_commit_waiter() will close it and issue + * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. @@ -2957,8 +2904,11 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) hrtime_t sleep = zilog->zl_last_lwb_latency * zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || - lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - lwb = zil_lwb_write_close(zilog, lwb, ilwbs); + lwb->lwb_nmax - lwb->lwb_nused < + lwb->lwb_nmax / 8) { + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, + LWB_STATE_NEW); zilog->zl_cur_used = 0; if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) @@ -3041,7 +2991,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can @@ -3050,9 +3000,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + if (lwb->lwb_state != LWB_STATE_OPENED) return; /* @@ -3075,8 +3023,8 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * wind up with a use-after-free error below. */ if (zcw->zcw_done) { - lwb = NULL; - goto out; + mutex_exit(&zilog->zl_issuer_lock); + return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -3087,28 +3035,33 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition - * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb - * _can_ transition from ISSUED to DONE, but it's OK to race with + * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb + * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in - * the ISSUED or DONE states. + * the CLOSED, ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on - * if it's ISSUED or OPENED, and block any other threads that might - * attempt to issue this lwb. For that reason we hold the + * if it's OPENED or CLOSED, and block any other threads that might + * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_close() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) { - lwb = NULL; - goto out; + if (lwb->lwb_state != LWB_STATE_OPENED) { + mutex_exit(&zilog->zl_issuer_lock); + return; } - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + /* + * We do not need zcw_lock once we hold zl_issuer_lock and know lwb + * is still open. But we have to drop it to avoid a deadlock in case + * callback of zio issued by zil_lwb_write_issue() try to get it, + * while zil_lwb_write_issue() is blocked on attempt to issue next + * lwb it found in LWB_STATE_READY state. + */ + mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and @@ -3116,9 +3069,9 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* * Since the lwb's zio hadn't been issued by the time this thread @@ -3141,34 +3094,15 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. - * - * We must drop the commit waiter's lock prior to - * calling zil_commit_writer_stall() or else we can wind - * up with the following deadlock: - * - * - This thread is waiting for the txg to sync while - * holding the waiter's lock; txg_wait_synced() is - * used within txg_commit_writer_stall(). - * - * - The txg can't sync because it is waiting for this - * lwb's zio callback to call dmu_tx_commit(). - * - * - The lwb's zio callback can't call dmu_tx_commit() - * because it's blocked trying to acquire the waiter's - * lock, which occurs prior to calling dmu_tx_commit() */ - mutex_exit(&zcw->zcw_lock); zil_lwb_write_issue(zilog, lwb); - lwb = NULL; zil_commit_writer_stall(zilog); - mutex_enter(&zcw->zcw_lock); - } - -out: - mutex_exit(&zilog->zl_issuer_lock); - if (lwb) + mutex_exit(&zilog->zl_issuer_lock); + } else { + mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); + } + mutex_enter(&zcw->zcw_lock); } /* @@ -3233,7 +3167,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ - IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); + IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); @@ -3281,6 +3215,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) */ IMPLY(lwb != NULL, + lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); @@ -3635,10 +3571,11 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || - lwb->lwb_max_txg > txg) + lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); - zio_free(spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* @@ -3651,18 +3588,6 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) BP_ZERO(&zh->zh_log); } - /* - * Remove fastwrite on any blocks that have been pre-allocated for - * the next commit. This prevents fastwrite counter pollution by - * unused, long-lived LWBs. - */ - for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { - if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) { - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 0; - } - } - mutex_exit(&zilog->zl_lock); } @@ -3854,17 +3779,18 @@ zil_close(zilog_t *zilog) } mutex_enter(&zilog->zl_lock); + txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) - txg = zilog->zl_dirty_max_txg; - else - txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); + if (lwb != NULL) { + txg = MAX(txg, lwb->lwb_alloc_txg); + txg = MAX(txg, lwb->lwb_max_txg); + } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in - * zil_lwb_write_close(). + * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); @@ -3893,11 +3819,7 @@ zil_close(zilog_t *zilog) lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } @@ -4018,7 +3940,7 @@ zil_suspend(const char *osname, void **cookiep) /* * We need to use zil_commit_impl to ensure we wait for all - * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed + * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 10279fde89d..3b3b40fa73d 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -1596,6 +1596,19 @@ zio_shrink(zio_t *zio, uint64_t size) } } +/* + * Round provided allocation size up to a value that can be allocated + * by at least some vdev(s) in the pool with minimum or no additional + * padding and without extra space usage on others + */ +static uint64_t +zio_roundup_alloc_size(spa_t *spa, uint64_t size) +{ + if (size > spa->spa_min_alloc) + return (roundup(size, spa->spa_gcd_alloc)); + return (spa->spa_min_alloc); +} + /* * ========================================================================== * Prepare to read and write logical blocks @@ -1762,8 +1775,9 @@ zio_write_compress(zio_t *zio) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); + ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || + MIN(zp->zp_copies, spa_max_replication(spa)) + == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ @@ -1802,9 +1816,8 @@ zio_write_compress(zio_t *zio) * in that we charge for the padding used to fill out * the last sector. */ - ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)roundup(psize, - spa->spa_min_alloc); + size_t rounded = (size_t)zio_roundup_alloc_size(spa, + psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1847,8 +1860,8 @@ zio_write_compress(zio_t *zio) * take this codepath because it will change the on-disk block * and decryption will fail. */ - size_t rounded = MIN((size_t)roundup(psize, - spa->spa_min_alloc), lsize); + size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), + lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); @@ -3012,11 +3025,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - /* - * We didn't allocate this bp, so make sure it doesn't get unmarked. - */ - pio->io_flags &= ~ZIO_FLAG_FASTWRITE; - zio_nowait(zio); return (pio); @@ -3604,7 +3612,6 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) @@ -3764,7 +3771,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ - int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, @@ -4460,8 +4467,8 @@ zio_ready(zio_t *zio) zio_t *pio, *pio_next; zio_link_t *zl = NULL; - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, - ZIO_WAIT_READY)) { + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | + ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } @@ -4919,12 +4926,6 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } - if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && - !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && - !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { - metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); - } - /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c index 6090959c5b8..9de515e8767 100644 --- a/sys/contrib/openzfs/module/zfs/zio_checksum.c +++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c @@ -515,8 +515,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, } if (info != NULL) { - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index cd4e6f0c755..53dcb4dee44 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -698,7 +698,6 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); @@ -717,6 +716,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change @@ -727,8 +727,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); - error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run index 618eeb93401..2c8d5cb0ecb 100644 --- a/sys/contrib/openzfs/tests/runfiles/linux.run +++ b/sys/contrib/openzfs/tests/runfiles/linux.run @@ -34,6 +34,17 @@ tags = ['functional', 'acl', 'posix-sa'] tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] +[tests/functional/block_cloning:Linux] +tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', + 'block_cloning_copyfilerange_fallback', + 'block_cloning_ficlone', 'block_cloning_ficlonerange', + 'block_cloning_ficlonerange_partial', + 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', + 'block_cloning_disabled_ficlonerange', + 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_copyfilerange_fallback_same_txg'] +tags = ['functional', 'block_cloning'] + [tests/functional/chattr:Linux] tests = ['chattr_001_pos', 'chattr_002_neg'] tags = ['functional', 'chattr'] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index cf438e0e649..e1bbe063ab4 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -134,6 +134,12 @@ ci_reason = 'CI runner doesn\'t have all requirements' # idmap_reason = 'Idmapped mount needs kernel 5.12+' +# +# copy_file_range() is not supported by all kernels +# +cfr_reason = 'Kernel copy_file_range support required' +cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' + # # These tests are known to fail, thus we use this list to prevent these # failures from failing the job as a whole; only unexpected failures @@ -288,6 +294,18 @@ elif sys.platform.startswith('linux'): 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_004': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_005': ['SKIP', idmap_reason], + 'block_cloning/block_cloning_disabled_copyfilerange': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_partial': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_fallback': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_cross_dataset': + ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': + ['SKIP', cfr_cross_reason], }) diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore b/sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore index f68f5807281..5f53b687191 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore @@ -1,6 +1,7 @@ /badsend /btree_test /chg_usr_exec +/clonefile /devname2devid /dir_rd_update /draid diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am index 066abb6ce3b..9bdb3c20975 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am @@ -119,6 +119,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/renameat2 scripts_zfs_tests_bin_PROGRAMS += %D%/xattrtest scripts_zfs_tests_bin_PROGRAMS += %D%/zed_fd_spill-zedlet scripts_zfs_tests_bin_PROGRAMS += %D%/idmap_util +scripts_zfs_tests_bin_PROGRAMS += %D%/clonefile %C%_idmap_util_LDADD = libspl.la diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/clonefile.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/clonefile.c new file mode 100644 index 00000000000..696dc471d8c --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/clonefile.c @@ -0,0 +1,333 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright (c) 2023, Rob Norris + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* + * This program is to test the availability and behaviour of copy_file_range, + * FICLONE, FICLONERANGE and FIDEDUPERANGE in the Linux kernel. It should + * compile and run even if these features aren't exposed through the libc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __NR_copy_file_range +#if defined(__x86_64__) +#define __NR_copy_file_range (326) +#elif defined(__i386__) +#define __NR_copy_file_range (377) +#elif defined(__s390__) +#define __NR_copy_file_range (375) +#elif defined(__arm__) +#define __NR_copy_file_range (391) +#elif defined(__aarch64__) +#define __NR_copy_file_range (285) +#elif defined(__powerpc__) +#define __NR_copy_file_range (379) +#else +#error "no definition of __NR_copy_file_range for this platform" +#endif +#endif /* __NR_copy_file_range */ + +ssize_t +copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int) + __attribute__((weak)); + +static inline ssize_t +cf_copy_file_range(int sfd, loff_t *soff, int dfd, loff_t *doff, + size_t len, unsigned int flags) +{ + if (copy_file_range) + return (copy_file_range(sfd, soff, dfd, doff, len, flags)); + return ( + syscall(__NR_copy_file_range, sfd, soff, dfd, doff, len, flags)); +} + +/* Define missing FICLONE */ +#ifdef FICLONE +#define CF_FICLONE FICLONE +#else +#define CF_FICLONE _IOW(0x94, 9, int) +#endif + +/* Define missing FICLONERANGE and support structs */ +#ifdef FICLONERANGE +#define CF_FICLONERANGE FICLONERANGE +typedef struct file_clone_range cf_file_clone_range_t; +#else +typedef struct { + int64_t src_fd; + uint64_t src_offset; + uint64_t src_length; + uint64_t dest_offset; +} cf_file_clone_range_t; +#define CF_FICLONERANGE _IOW(0x94, 13, cf_file_clone_range_t) +#endif + +/* Define missing FIDEDUPERANGE and support structs */ +#ifdef FIDEDUPERANGE +#define CF_FIDEDUPERANGE FIDEDUPERANGE +#define CF_FILE_DEDUPE_RANGE_SAME FILE_DEDUPE_RANGE_SAME +#define CF_FILE_DEDUPE_RANGE_DIFFERS FILE_DEDUPE_RANGE_DIFFERS +typedef struct file_dedupe_range_info cf_file_dedupe_range_info_t; +typedef struct file_dedupe_range cf_file_dedupe_range_t; +#else +typedef struct { + int64_t dest_fd; + uint64_t dest_offset; + uint64_t bytes_deduped; + int32_t status; + uint32_t reserved; +} cf_file_dedupe_range_info_t; +typedef struct { + uint64_t src_offset; + uint64_t src_length; + uint16_t dest_count; + uint16_t reserved1; + uint32_t reserved2; + cf_file_dedupe_range_info_t info[0]; +} cf_file_dedupe_range_t; +#define CF_FIDEDUPERANGE _IOWR(0x94, 54, cf_file_dedupe_range_t) +#define CF_FILE_DEDUPE_RANGE_SAME (0) +#define CF_FILE_DEDUPE_RANGE_DIFFERS (1) +#endif + +typedef enum { + CF_MODE_NONE, + CF_MODE_CLONE, + CF_MODE_CLONERANGE, + CF_MODE_COPYFILERANGE, + CF_MODE_DEDUPERANGE, +} cf_mode_t; + +static int +usage(void) +{ + printf( + "usage:\n" + " FICLONE:\n" + " clonefile -c \n" + " FICLONERANGE:\n" + " clonefile -r \n" + " copy_file_range:\n" + " clonefile -f \n" + " FIDEDUPERANGE:\n" + " clonefile -d \n"); + return (1); +} + +int do_clone(int sfd, int dfd); +int do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len); +int do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len); +int do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len); + +int quiet = 0; + +int +main(int argc, char **argv) +{ + cf_mode_t mode = CF_MODE_NONE; + + char c; + while ((c = getopt(argc, argv, "crfdq")) != -1) { + switch (c) { + case 'c': + mode = CF_MODE_CLONE; + break; + case 'r': + mode = CF_MODE_CLONERANGE; + break; + case 'f': + mode = CF_MODE_COPYFILERANGE; + break; + case 'd': + mode = CF_MODE_DEDUPERANGE; + break; + case 'q': + quiet = 1; + break; + } + } + + if (mode == CF_MODE_NONE || (argc-optind) < 2 || + (mode != CF_MODE_CLONE && (argc-optind) < 5)) + return (usage()); + + loff_t soff = 0, doff = 0; + size_t len = 0; + if (mode != CF_MODE_CLONE) { + soff = strtoull(argv[optind+2], NULL, 10); + if (soff == ULLONG_MAX) { + fprintf(stderr, "invalid source offset"); + return (1); + } + doff = strtoull(argv[optind+3], NULL, 10); + if (doff == ULLONG_MAX) { + fprintf(stderr, "invalid dest offset"); + return (1); + } + len = strtoull(argv[optind+4], NULL, 10); + if (len == ULLONG_MAX) { + fprintf(stderr, "invalid length"); + return (1); + } + } + + int sfd = open(argv[optind], O_RDONLY); + if (sfd < 0) { + fprintf(stderr, "open: %s: %s\n", + argv[optind], strerror(errno)); + return (1); + } + + int dfd = open(argv[optind+1], O_WRONLY|O_CREAT, + S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); + if (dfd < 0) { + fprintf(stderr, "open: %s: %s\n", + argv[optind+1], strerror(errno)); + close(sfd); + return (1); + } + + int err; + switch (mode) { + case CF_MODE_CLONE: + err = do_clone(sfd, dfd); + break; + case CF_MODE_CLONERANGE: + err = do_clonerange(sfd, dfd, soff, doff, len); + break; + case CF_MODE_COPYFILERANGE: + err = do_copyfilerange(sfd, dfd, soff, doff, len); + break; + case CF_MODE_DEDUPERANGE: + err = do_deduperange(sfd, dfd, soff, doff, len); + break; + default: + abort(); + } + + off_t spos = lseek(sfd, 0, SEEK_CUR); + off_t slen = lseek(sfd, 0, SEEK_END); + off_t dpos = lseek(dfd, 0, SEEK_CUR); + off_t dlen = lseek(dfd, 0, SEEK_END); + + fprintf(stderr, "file offsets: src=%lu/%lu; dst=%lu/%lu\n", spos, slen, + dpos, dlen); + + close(dfd); + close(sfd); + + return (err == 0 ? 0 : 1); +} + +int +do_clone(int sfd, int dfd) +{ + fprintf(stderr, "using FICLONE\n"); + int err = ioctl(dfd, CF_FICLONE, sfd); + if (err < 0) { + fprintf(stderr, "ioctl(FICLONE): %s\n", strerror(errno)); + return (err); + } + return (0); +} + +int +do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) +{ + fprintf(stderr, "using FICLONERANGE\n"); + cf_file_clone_range_t fcr = { + .src_fd = sfd, + .src_offset = soff, + .src_length = len, + .dest_offset = doff, + }; + int err = ioctl(dfd, CF_FICLONERANGE, &fcr); + if (err < 0) { + fprintf(stderr, "ioctl(FICLONERANGE): %s\n", strerror(errno)); + return (err); + } + return (0); +} + +int +do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) +{ + fprintf(stderr, "using copy_file_range\n"); + ssize_t copied = cf_copy_file_range(sfd, &soff, dfd, &doff, len, 0); + if (copied < 0) { + fprintf(stderr, "copy_file_range: %s\n", strerror(errno)); + return (1); + } + if (copied != len) { + fprintf(stderr, "copy_file_range: copied less than requested: " + "requested=%lu; copied=%lu\n", len, copied); + return (1); + } + return (0); +} + +int +do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) +{ + fprintf(stderr, "using FIDEDUPERANGE\n"); + + char buf[sizeof (cf_file_dedupe_range_t)+ + sizeof (cf_file_dedupe_range_info_t)] = {0}; + cf_file_dedupe_range_t *fdr = (cf_file_dedupe_range_t *)&buf[0]; + cf_file_dedupe_range_info_t *fdri = + (cf_file_dedupe_range_info_t *) + &buf[sizeof (cf_file_dedupe_range_t)]; + + fdr->src_offset = soff; + fdr->src_length = len; + fdr->dest_count = 1; + + fdri->dest_fd = dfd; + fdri->dest_offset = doff; + + int err = ioctl(sfd, CF_FIDEDUPERANGE, fdr); + if (err != 0) + fprintf(stderr, "ioctl(FIDEDUPERANGE): %s\n", strerror(errno)); + + if (fdri->status < 0) { + fprintf(stderr, "dedup failed: %s\n", strerror(-fdri->status)); + err = -1; + } else if (fdri->status == CF_FILE_DEDUPE_RANGE_DIFFERS) { + fprintf(stderr, "dedup failed: range differs\n"); + err = -1; + } + + return (err); +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/readmmap.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/readmmap.c index 704ffd55c8a..a5c8079d0e4 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/readmmap.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/readmmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include int diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg index b3cfe149ffa..fa545e06bbf 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg @@ -182,6 +182,7 @@ export ZFS_FILES='zdb export ZFSTEST_FILES='badsend btree_test chg_usr_exec + clonefile devname2devid dir_rd_update draid diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index ff65dc1ac2b..66aff5026f8 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -90,6 +90,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ + functional/block_cloning/block_cloning.kshlib \ functional/cache/cache.cfg \ functional/cache/cache.kshlib \ functional/cachefile/cachefile.cfg \ @@ -437,6 +438,19 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ + functional/block_cloning/cleanup.ksh \ + functional/block_cloning/setup.ksh \ + functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ + functional/block_cloning/block_cloning_copyfilerange.ksh \ + functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ + functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ + functional/block_cloning/block_cloning_disabled_ficlone.ksh \ + functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ + functional/block_cloning/block_cloning_ficlone.ksh \ + functional/block_cloning/block_cloning_ficlonerange.ksh \ + functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib new file mode 100644 index 00000000000..8e16366b4cd --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -0,0 +1,54 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +function have_same_content +{ + typeset hash1=$(cat $1 | md5sum) + typeset hash2=$(cat $2 | md5sum) + + log_must [ "$hash1" = "$hash2" ] +} + +# +# get_same_blocks dataset1 path/to/file1 dataset2 path/to/file2 +# +# Returns a space-separated list of the indexes (starting at 0) of the L0 +# blocks that are shared between both files (by first DVA and checksum). +# Assumes that the two files have the same content, use have_same_content to +# confirm that. +# +function get_same_blocks +{ + typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$ + zdb -vvvvv $1 -O $2 | \ + awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a + zdb -vvvvv $3 -O $4 | \ + awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b + echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') +} + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh new file mode 100755 index 00000000000..43ea47b0ef1 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="The copy_file_range syscall can clone whole files." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh new file mode 100755 index 00000000000..74e6b04903a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then + log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" +fi + +claim="The copy_file_range syscall can clone across datasets." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must zfs create $TESTPOOL/$TESTFS1 +log_must zfs create $TESTPOOL/$TESTFS2 + +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS1/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must \ + clonefile -f /$TESTPOOL/$TESTFS1/file1 /$TESTPOOL/$TESTFS2/file2 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/$TESTFS1/file1 /$TESTPOOL/$TESTFS2/file2 + +typeset blocks=$(get_same_blocks \ + $TESTPOOL/$TESTFS1 file1 $TESTPOOL/$TESTFS2 file2) +log_must [ "$blocks" = "0 1 2 3" ] + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh new file mode 100755 index 00000000000..9a96eacd60a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning not possible." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must sync_pool $TESTPOOL + + +log_note "Copying entire file with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "0 1 2 3" ] + + +log_note "Copying within a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 32768 32768 65536 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1 2 3" ] + + +log_note "Copying across a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 327680 327680 131072 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1" ] + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh new file mode 100755 index 00000000000..a10545bc076 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning on same txg" + +log_assert $claim + +typeset timeout=$(get_tunable TXG_TIMEOUT) + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 TXG_TIMEOUT $timeout +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must set_tunable64 TXG_TIMEOUT 5000 + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 + +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "" ] + +log_pass $claim + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh new file mode 100755 index 00000000000..a5da0a0bd35 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="The copy_file_range syscall can clone parts of a file." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "" ] + +log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2" ] + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh new file mode 100755 index 00000000000..d21b6251134 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="The copy_file_range syscall copies files when block cloning is disabled." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=disabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "" ] + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlone.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlone.ksh new file mode 100755 index 00000000000..10a2715ea25 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlone.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONE ioctl fails when block cloning is disabled." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=disabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_mustnot clonefile -c /$TESTPOOL/file1 /$TESTPOOL/file2 + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlonerange.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlonerange.ksh new file mode 100755 index 00000000000..e8461e6d3c3 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlonerange.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONERANGE ioctl fails when block cloning is disabled." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=disabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_mustnot clonefile -r /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288 + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh new file mode 100755 index 00000000000..3f227fb68ee --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh @@ -0,0 +1,56 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONE ioctl can clone files." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must clonefile -c /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh new file mode 100755 index 00000000000..cefc4336aef --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh @@ -0,0 +1,56 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONERANGE ioctl can clone whole files." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must clonefile -r /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh new file mode 100755 index 00000000000..067f55aaa65 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONERANGE ioctl can clone parts of a file." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "" ] + +log_must clonefile -r /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2" ] + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh new file mode 100755 index 00000000000..7ac13adb632 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +default_cleanup_noexit + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/setup.ksh new file mode 100755 index 00000000000..512f5a0644d --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/setup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +if ! command -v clonefile > /dev/null ; then + log_unsupported "clonefile program required to test block cloning" +fi + +verify_runnable "global" + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh index 6ccec6abd66..574cb7654d1 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh @@ -35,7 +35,7 @@ # # STRATEGY: # 1. Create various pools with different ashift values. -# 2. Verify 'attach -o ashift=' works only with allowed values. +# 2. Verify 'attach' works. # verify_runnable "global" @@ -66,26 +66,14 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do - for cmdval in ${ashifts[@]} - do - log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 - log_must verify_ashift $disk1 $ashift - - # ashift_of(attached_disk) <= ashift_of(existing_vdev) - if [[ $cmdval -le $ashift ]] - then - log_must zpool attach -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - log_must verify_ashift $disk2 $ashift - else - log_mustnot zpool attach -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - fi - # clean things for the next run - log_must zpool destroy $TESTPOOL1 - log_must zpool labelclear $disk1 - log_must zpool labelclear $disk2 - done + log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 + log_must verify_ashift $disk1 $ashift + log_must zpool attach $TESTPOOL1 $disk1 $disk2 + log_must verify_ashift $disk2 $ashift + # clean things for the next run + log_must zpool destroy $TESTPOOL1 + log_must zpool labelclear $disk1 + log_must zpool labelclear $disk2 done typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-") diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh index 37ed0062e61..9595e51241b 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh @@ -35,7 +35,7 @@ # # STRATEGY: # 1. Create various pools with different ashift values. -# 2. Verify 'replace -o ashift=' works only with allowed values. +# 2. Verify 'replace' works. # verify_runnable "global" @@ -66,26 +66,16 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do - for cmdval in ${ashifts[@]} - do - log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 - log_must verify_ashift $disk1 $ashift - # ashift_of(replacing_disk) <= ashift_of(existing_vdev) - if [[ $cmdval -le $ashift ]] - then - log_must zpool replace -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - log_must verify_ashift $disk2 $ashift - wait_replacing $TESTPOOL1 - else - log_mustnot zpool replace -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - fi - # clean things for the next run - log_must zpool destroy $TESTPOOL1 - log_must zpool labelclear $disk1 - log_must zpool labelclear $disk2 - done + log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 + log_must verify_ashift $disk1 $ashift + # ashift_of(replacing_disk) <= ashift_of(existing_vdev) + log_must zpool replace $TESTPOOL1 $disk1 $disk2 + log_must verify_ashift $disk2 $ashift + wait_replacing $TESTPOOL1 + # clean things for the next run + log_must zpool destroy $TESTPOOL1 + log_must zpool labelclear $disk1 + log_must zpool labelclear $disk2 done typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-") diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh index ffdaf91a284..b4ac18e5ea2 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh @@ -34,10 +34,8 @@ # # STRATEGY: # 1. Create a pool with default values. -# 2. Verify 'zpool replace' uses the ashift pool property value when -# replacing an existing device. -# 3. Verify the default ashift value can still be overridden by manually -# specifying '-o ashift=' from the command line. +# 2. Override the pool ashift property. +# 3. Verify 'zpool replace' works. # verify_runnable "global" @@ -72,21 +70,9 @@ do do log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 log_must zpool set ashift=$pprop $TESTPOOL1 - # ashift_of(replacing_disk) <= ashift_of(existing_vdev) - if [[ $pprop -le $ashift ]] - then - log_must zpool replace $TESTPOOL1 $disk1 $disk2 - wait_replacing $TESTPOOL1 - log_must verify_ashift $disk2 $ashift - else - # cannot replace if pool prop ashift > vdev ashift - log_mustnot zpool replace $TESTPOOL1 $disk1 $disk2 - # verify we can override the pool prop value manually - log_must zpool replace -o ashift=$ashift $TESTPOOL1 \ - $disk1 $disk2 - wait_replacing $TESTPOOL1 - log_must verify_ashift $disk2 $ashift - fi + log_must zpool replace $TESTPOOL1 $disk1 $disk2 + wait_replacing $TESTPOOL1 + log_must verify_ashift $disk2 $ashift # clean things for the next run log_must zpool destroy $TESTPOOL1 log_must zpool labelclear $disk1 diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index bc0ec800535..29425d3368e 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -213,6 +213,9 @@ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ +/* BLK_STS_RESV_CONFLICT is defined */ +/* #undef HAVE_BLK_STS_RESV_CONFLICT */ + /* Define if revalidate_disk() in block_device_operations */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */ @@ -271,6 +274,9 @@ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ +/* disk_check_media_change() exists */ +/* #undef HAVE_DISK_CHECK_MEDIA_CHANGE */ + /* disk_*_io_acct() available */ /* #undef HAVE_DISK_IO_ACCT */ @@ -849,6 +855,15 @@ /* iops->getattr() takes a vfsmount */ /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ +/* fops->clone_file_range() is available */ +/* #undef HAVE_VFS_CLONE_FILE_RANGE */ + +/* fops->copy_file_range() is available */ +/* #undef HAVE_VFS_COPY_FILE_RANGE */ + +/* fops->dedupe_file_range() is available */ +/* #undef HAVE_VFS_DEDUPE_FILE_RANGE */ + /* aops->direct_IO() uses iovec */ /* #undef HAVE_VFS_DIRECT_IO_IOVEC */ @@ -864,6 +879,12 @@ /* filemap_dirty_folio exists */ /* #undef HAVE_VFS_FILEMAP_DIRTY_FOLIO */ +/* file_operations_extend takes .copy_file_range() and .clone_file_range() */ +/* #undef HAVE_VFS_FILE_OPERATIONS_EXTEND */ + +/* generic_copy_file_range() is available */ +/* #undef HAVE_VFS_GENERIC_COPY_FILE_RANGE */ + /* All required iov_iter interfaces are available */ /* #undef HAVE_VFS_IOV_ITER */ @@ -882,6 +903,9 @@ /* read_folio exists */ /* #undef HAVE_VFS_READ_FOLIO */ +/* fops->remap_file_range() is available */ +/* #undef HAVE_VFS_REMAP_FILE_RANGE */ + /* fops->read/write_iter() are available */ /* #undef HAVE_VFS_RW_ITERATE */ @@ -1050,7 +1074,7 @@ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.2.0-FreeBSD_g009d3288" +#define ZFS_META_ALIAS "zfs-2.2.0-FreeBSD_g32949f256" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -1059,7 +1083,7 @@ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ -#define ZFS_META_KVER_MAX "6.3" +#define ZFS_META_KVER_MAX "6.4" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" @@ -1080,7 +1104,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "FreeBSD_g009d3288" +#define ZFS_META_RELEASE "FreeBSD_g32949f256" /* Define the project version. */ #define ZFS_META_VERSION "2.2.0" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index afe3b29c865..71be2d813a0 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.2.0-rc1-0-g009d3288d" +#define ZFS_META_GITREV "zfs-2.2.0-rc3-31-g32949f256"