From 83f359245adc9c03ee0ededa2ff00b7dd9f82d2a Mon Sep 17 00:00:00 2001
From: Gleb Smirnoff <glebius@FreeBSD.org>
Date: Thu, 15 Aug 2024 09:08:43 -0700
Subject: [PATCH 01/59] FreeBSD: fix build without kernel option MAC

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Mark Johnston <markj@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Gleb Smirnoff <glebius@FreeBSD.org>
Closes #16446
---
 module/os/freebsd/zfs/zfs_vnops_os.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 283f5696317..01b964f98f3 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -6125,7 +6125,9 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 	    error == EOPNOTSUPP)
 		goto bad_locked_fallback;
 	*ap->a_lenp = (size_t)len;
+#ifdef MAC
 out_locked:
+#endif
 	if (invp != outvp)
 		VOP_UNLOCK(invp);
 	VOP_UNLOCK(outvp);

From 963e6c9f3ffc0bc767ca8b89549be595f29f9470 Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Fri, 16 Aug 2024 00:39:44 +0500
Subject: [PATCH 02/59] Fix incorrect error report on vdev attach/replace

Report the correct error message in libzfs when attaching/replacing a
vdev with a higher ashift.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #16449
---
 lib/libzfs/libzfs_pool.c | 7 +++++++
 module/zfs/spa.c         | 6 ++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 8a043aa0f87..e493e8562a7 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -3733,6 +3733,13 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
 			(void) zpool_standard_error(hdl, errno, errbuf);
 		}
 		break;
+
+	case ZFS_ERR_ASHIFT_MISMATCH:
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "The new device cannot have a higher alignment requirement "
+		    "than the top-level vdev."));
+		(void) zfs_error(hdl, EZFS_BADTARGET, errbuf);
+		break;
 	default:
 		(void) zpool_standard_error(hdl, errno, errbuf);
 	}
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index cafc7196c35..99a8d107eca 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -7602,8 +7602,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
 	 * The new device cannot have a higher alignment requirement
 	 * than the top-level vdev.
 	 */
-	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
-		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) {
+		return (spa_vdev_exit(spa, newrootvd, txg,
+		    ZFS_ERR_ASHIFT_MISMATCH));
+	}
 
 	/*
 	 * RAIDZ-expansion-specific checks.

From f2f4ada240e3560faa721082c4a62dcb0cce0115 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Thu, 15 Aug 2024 14:00:18 -0700
Subject: [PATCH 03/59] Linux 6.10 compat: fix rpm-kmod and builtin

The 6.10 kernel broke our rpm-kmod builds.  The 6.10 kernel really
wants the source files in the same directory as the object files.
This workaround makes rpm-kmod work again.  It also updates
the builtin kernel codepath to work correctly with 6.10.

See kernel commits:

b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source
                     directory
9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern
                     rules

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #16439
Closes #16450
---
 module/Kbuild.in             |  4 ++--
 rpm/generic/zfs-kmod.spec.in | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/module/Kbuild.in b/module/Kbuild.in
index 4f48cb9da0c..57682214dfd 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -16,8 +16,8 @@ src = @abs_srcdir@
 obj = @abs_builddir@
 else
 zfs_include = $(srctree)/include/zfs
-icp_include = $(srctree)/$(src)/icp/include
-zstd_include = $(srctree)/$(src)/zstd/include
+icp_include = $(src)/icp/include
+zstd_include = $(src)/zstd/include
 ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
 endif
 
diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in
index 4cc075585d4..30524474d1a 100644
--- a/rpm/generic/zfs-kmod.spec.in
+++ b/rpm/generic/zfs-kmod.spec.in
@@ -145,6 +145,24 @@ for kernel_version in %{?kernel_versions}; do
         %{?kernel_cc} \
         %{?kernel_ld} \
         %{?kernel_llvm}
+
+    # Pre-6.10 kernel builds didn't need to copy over the source files to the
+    # build directory.  However, we do need to do it post-6.10 due to
+    # these commits:
+    #
+    # b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source
+    #                      directory
+    #
+    # 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern
+    #                      rules
+    #
+    # Note that kmodtool actually copies over the source into the build
+    # directory, so what we're doing here is normal.  For efficiency reasons
+    # though we just use hardlinks instead of copying.
+    #
+    # See https://github.com/openzfs/zfs/issues/16439 for more info.
+    cp -lR ../%{module}-%{version}/module/* module/
+
     make %{?_smp_mflags}
     cd ..
 done

From fb432660c3691b2ac4a4cc462b9789e593c5ac29 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Thu, 15 Aug 2024 14:05:58 -0700
Subject: [PATCH 04/59] Linux 6.10 compat: Fix zvol NULL pointer dereference

zvol_alloc_non_blk_mq()->blk_queue_set_write_cache() needs the disk
queue to be set up first to prevent a NULL pointer dereference.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #16453
---
 module/os/linux/zfs/zvol_os.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 2beec6436bf..5daf00c647c 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1261,13 +1261,14 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 		return (1);
 	}
 
+	zso->zvo_disk = disk;
+	zso->zvo_disk->minors = ZVOL_MINORS;
+	zso->zvo_queue = zso->zvo_disk->queue;
+
 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
 	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE);
 #endif
 
-	zso->zvo_disk = disk;
-	zso->zvo_disk->minors = ZVOL_MINORS;
-	zso->zvo_queue = zso->zvo_disk->queue;
 #else
 	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)

From 5807de90a14127ee78de45c85c2e010841759536 Mon Sep 17 00:00:00 2001
From: Justin Gottula <justin@jgottula.com>
Date: Thu, 15 Aug 2024 14:13:18 -0700
Subject: [PATCH 05/59] Fix null ptr deref when renaming a zvol with snaps and
 snapdev=visible (#16316)

If a zvol is renamed, and it has one or more snapshots, and
snapdev=visible is true for the zvol, then the rename causes a kernel
null pointer dereference error. This has the effect (on Linux, anyway)
of killing the z_zvol taskq kthread, with locks still held; which in
turn causes a variety of zvol-related operations afterward to hang
indefinitely (such as udev workers, among other things).

The problem occurs because of an oversight in #15486
(e36ff84c338d2f7b15aef2538f6a9507115bbf4a). As documented in
dataset_kstats_create, some datasets may not actually have kstats
allocated for them; and at least at the present time, this is true for
snapshots. In practical terms, this means that for snapshots,
dk->dk_kstats will be NULL. The dataset_kstats_rename function
introduced in the patch above does not first check whether dk->dk_kstats
is NULL before proceeding, unlike e.g. the nearby
dataset_kstats_update_* functions.

In the very particular circumstance in which a zvol is renamed, AND that
zvol has one or more snapshots, AND that zvol also has snapdev=visible,
zvol_rename_minors_impl will loop over not just the zvol dataset itself,
but each of the zvol's snapshots as well, so that their device nodes
will be renamed as well. This results in dataset_kstats_rename being
called for snapshots, where, as we've established, dk->dk_kstats is
NULL.

Fix this by simply adding a NULL check before doing anything in
dataset_kstats_rename.

This still allows the dataset_name kstat value for the zvol to be
updated (as was the intent of the original patch), and merely blocks
attempts by the code to act upon the zvol's non-kstat-having snapshots.
If at some future time, kstats are added for snapshots, then things
should work as intended in that case as well.

Signed-off-by: Justin Gottula <justin@jgottula.com>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Alan Somers <asomers@gmail.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
---
 module/zfs/dataset_kstats.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c
index 8faa6c2a252..914260e742f 100644
--- a/module/zfs/dataset_kstats.c
+++ b/module/zfs/dataset_kstats.c
@@ -204,6 +204,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
 void
 dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
 {
+	if (dk->dk_kstats == NULL)
+		return;
+
 	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
 	char *ds_name;
 

From bdf4d6be1de870b16d4f7997b235d9f19dd7e30e Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Fri, 16 Aug 2024 02:29:50 +0500
Subject: [PATCH 06/59] linux/zvol_os: fix zvol queue limits initialization

zvol queue limits initialization depends on `zv_volblocksize`, but it is
initialized later, leading to several limits being initialized with
incorrect values, including `max_discard_*` limits. This also causes
`blkdiscard` command to consistently fail, as `blk_ioctl_discard` reads
`bdev_max_discard_sectors()` limits as 0, leading to failure. The fix is
straightforward: initialize `zv->zv_volblocksize` early, before setting
the queue limits. This PR should fix `zvol/zvol_misc/zvol_misc_trim`
failure on recent PRs, as the test case issues `blkdiscard` for a zvol.
Additionally, `zvol_misc_trim` was recently enabled in `6c7d41a`,
which is why the issue wasn't identified earlier.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #16454
---
 module/os/linux/zfs/zvol_os.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 5daf00c647c..e04f64e232a 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1362,7 +1362,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
  * request queue and generic disk structures for the block device.
  */
 static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name)
+zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
 {
 	zvol_state_t *zv;
 	struct zvol_state_os *zso;
@@ -1382,6 +1382,7 @@ zvol_alloc(dev_t dev, const char *name)
 	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 	zv->zv_zso = zso;
 	zv->zv_volmode = volmode;
+	zv->zv_volblocksize = volblocksize;
 
 	list_link_init(&zv->zv_next);
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1671,7 +1672,8 @@ zvol_os_create_minor(const char *name)
 	if (error)
 		goto out_dmu_objset_disown;
 
-	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
+	    doi->doi_data_block_size);
 	if (zv == NULL) {
 		error = SET_ERROR(EAGAIN);
 		goto out_dmu_objset_disown;
@@ -1681,7 +1683,6 @@ zvol_os_create_minor(const char *name)
 	if (dmu_objset_is_snapshot(os))
 		zv->zv_flags |= ZVOL_RDONLY;
 
-	zv->zv_volblocksize = doi->doi_data_block_size;
 	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 

From db2b1fdb796619823b22b4882ebe0c09db5fa05f Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 20 Jun 2023 12:06:13 +1000
Subject: [PATCH 07/59] ddt: add FDT feature and support for legacy and new
 on-disk formats

This is the supporting infrastructure for the upcoming dedup features.

Traditionally, dedup objects live directly in the MOS root. While their
details vary (checksum, type and class), they are all the same "kind" of
thing - a store of dedup entries.

The new features are more varied than that, and are better thought of as
a set of related stores for the overall state of a dedup table.

This adds a new feature flag, SPA_FEATURE_FAST_DEDUP. Enabling this will
cause new DDTs to be created as a ZAP in the MOS root, named
DDT-<checksum>. This is used as the root object for the normal type/class
store objects, but will also be a place for any storage required by new
features.

This commit adds two new fields to ddt_t, for version and flags. These
are intended to describe the structure and features of the overall dedup
table, and are stored as-is in the DDT root. In this commit, flags are
always zero, but the intent is that they can be used to hang optional
logic or state onto for new dedup features. Version is always 1.

For a "legacy" dedup table, where no DDT root directory exists, the
version will be 0.

ddt_configure() is expected to determine the version and flags features
currently in operation based on whether or not the fast_dedup feature is
enabled, and from what's available on disk. In this way, it's possible to
support both old and new tables.

This also provides a migration path. A legacy setup can be upgraded to
FDT by creating the DDT root ZAP, moving the existing objects into it,
and setting version and flags appropriately. There's no support for that
here, but it would be straightforward to add later and allows the
possibility that newer features could be applied to existing dedup
tables.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15892
---
 include/sys/ddt.h                             |  18 +-
 include/sys/ddt_impl.h                        |   8 +
 include/sys/dmu.h                             |   1 +
 include/zfeature_common.h                     |   1 +
 lib/libzfs/libzfs.abi                         |  11 +-
 man/man7/zpool-features.7                     |  17 +-
 module/zcommon/zfeature_common.c              |   6 +
 module/zfs/ddt.c                              | 260 +++++++++++++++++-
 module/zfs/zio_compress.c                     |   4 +
 .../cli_root/zpool_get/zpool_get.cfg          |   1 +
 10 files changed, 307 insertions(+), 20 deletions(-)

diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 66d59cebacd..02d0cf5daab 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -39,6 +39,12 @@ extern "C" {
 
 struct abd;
 
+/*
+ * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
+ */
+/* No flags yet. */
+#define	DDT_FLAG_MASK	(0)
+
 /*
  * DDT on-disk storage object types. Each one corresponds to specific
  * implementation, see ddt_ops_t. The value itself is not stored on disk.
@@ -185,11 +191,15 @@ typedef struct {
 
 	avl_tree_t	ddt_tree;	/* "live" (changed) entries this txg */
 
-	avl_tree_t	ddt_repair_tree;	/* entries being repaired */
+	avl_tree_t	ddt_repair_tree; /* entries being repaired */
 
-	enum zio_checksum ddt_checksum;		/* checksum algorithm in use */
-	spa_t		*ddt_spa;		/* pool this ddt is on */
-	objset_t	*ddt_os;		/* ddt objset (always MOS) */
+	enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
+	spa_t		*ddt_spa;	/* pool this ddt is on */
+	objset_t	*ddt_os;	/* ddt objset (always MOS) */
+
+	uint64_t	ddt_dir_object;	/* MOS dir holding ddt objects */
+	uint64_t	ddt_version;	/* DDT version */
+	uint64_t	ddt_flags;	/* FDT option flags */
 
 	/* per-type/per-class entry store objects */
 	uint64_t	ddt_object[DDT_TYPES][DDT_CLASSES];
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index 4aaab10c873..9c0fea64f38 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -33,6 +33,14 @@
 extern "C" {
 #endif
 
+/* DDT version numbers */
+#define	DDT_VERSION_LEGACY	(0)
+#define	DDT_VERSION_FDT		(1)
+
+/* Names of interesting objects in the DDT root dir */
+#define	DDT_DIR_VERSION		"version"
+#define	DDT_DIR_FLAGS		"flags"
+
 /*
  * Ops vector to access a specific DDT object type.
  */
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 1376cbef763..5b80dc31594 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -376,6 +376,7 @@ typedef struct dmu_buf {
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
+#define	DMU_POOL_DDT_DIR		"DDT-%s"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
 #define	DMU_POOL_SCAN			"scan"
 #define	DMU_POOL_ERRORSCRUB		"error_scrub"
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 2515ba32175..5733a8187a9 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -82,6 +82,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_AVZ_V2,
 	SPA_FEATURE_REDACTION_LIST_SPILL,
 	SPA_FEATURE_RAIDZ_EXPANSION,
+	SPA_FEATURE_FAST_DEDUP,
 	SPA_FEATURES
 } spa_feature_t;
 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 51c8dc9647e..88baa4168c3 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -616,7 +616,7 @@
     <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='spa_feature_table' size='2296' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='spa_feature_table' size='2352' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -6006,7 +6006,8 @@
       <enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
       <enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
       <enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
-      <enumerator name='SPA_FEATURES' value='41'/>
+      <enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
+      <enumerator name='SPA_FEATURES' value='42'/>
     </enum-decl>
     <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
     <qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@@ -9131,8 +9132,8 @@
     </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
-    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18368' id='b93e4d14'>
-      <subrange length='41' type-id='7359adad' id='cb834f44'/>
+    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18816' id='b937914f'>
+      <subrange length='42' type-id='7359adad' id='cb7c937f'/>
     </array-type-def>
     <enum-decl name='zfeature_flags' id='6db816a4'>
       <underlying-type type-id='9cac1fee'/>
@@ -9209,7 +9210,7 @@
     <pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
     <qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
     <pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
-    <var-decl name='spa_feature_table' type-id='b93e4d14' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
+    <var-decl name='spa_feature_table' type-id='b937914f' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
     <var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
     <function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='80f4b756'/>
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index ea3c68dc608..ff6e485a481 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -17,8 +17,9 @@
 .\" Copyright (c) 2019, Klara Inc.
 .\" Copyright (c) 2019, Allan Jude
 .\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+.\" Copyright (c) 2023, Klara Inc.
 .\"
-.Dd June 23, 2022
+.Dd February 14, 2024
 .Dt ZPOOL-FEATURES 7
 .Os
 .
@@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the
 .Sy enabled
 state when all datasets that use this feature are destroyed.
 .
+.feature com.klarasystems fast_dedup yes
+This feature allows more advanced deduplication features to be enabled on new
+dedup tables.
+.Pp
+This feature will be
+.Sy active
+when the first deduplicated block is written after a new dedup table is created
+(ie after a new pool creation, or new checksum used on a dataset with
+.Sy dedup
+enabled).
+It will be returned to the
+.Sy enabled
+state when all deduplicated blocks using it are freed.
+.
 .feature com.delphix extensible_dataset no
 This feature allows more flexible use of internal ZFS data structures,
 and exists for other features to depend on.
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 309d9bf14cd..8dec5f27b0a 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -754,6 +754,12 @@ zpool_feature_init(void)
 	    "Support for raidz expansion",
 	    ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
 
+	zfeature_register(SPA_FEATURE_FAST_DEDUP,
+	    "com.klarasystems:fast_dedup", "fast_dedup",
+	    "Support for advanced deduplication",
+	    ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
+	    sfeatures);
+
 	zfs_mod_list_supported_free(sfeatures);
 }
 
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index d70ae1a031d..7e2010c423c 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -39,6 +39,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/dsl_scan.h>
 #include <sys/abd.h>
+#include <sys/zfeature.h>
 
 /*
  * # DDT: Deduplication tables
@@ -185,6 +186,18 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
 	"unique",
 };
 
+/*
+ * DDT feature flags automatically enabled for each on-disk version. Note that
+ * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled.
+ */
+static const uint64_t ddt_version_flags[] = {
+	[DDT_VERSION_LEGACY] = 0,
+	[DDT_VERSION_FDT] = 0,
+};
+
+/* Dummy version to signal that configure is still necessary */
+#define	DDT_VERSION_UNCONFIGURED	(UINT64_MAX)
+
 static void
 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     dmu_tx_t *tx)
@@ -196,14 +209,18 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 	    ZCHECKSUM_FLAG_DEDUP;
 	char name[DDT_NAMELEN];
 
+	ASSERT3U(ddt->ddt_dir_object, >, 0);
+
 	ddt_object_name(ddt, type, class, name);
 
 	ASSERT3U(*objectp, ==, 0);
 	VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
 	ASSERT3U(*objectp, !=, 0);
 
-	VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
-	    sizeof (uint64_t), 1, objectp, tx));
+	ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
+
+	VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1,
+	    objectp, tx));
 
 	VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
@@ -220,13 +237,15 @@ ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 	uint64_t count;
 	char name[DDT_NAMELEN];
 
+	ASSERT3U(ddt->ddt_dir_object, >, 0);
+
 	ddt_object_name(ddt, type, class, name);
 
 	ASSERT3U(*objectp, !=, 0);
 	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
 	VERIFY0(ddt_object_count(ddt, type, class, &count));
 	VERIFY0(count);
-	VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+	VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx));
 	VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
 	VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
 	memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
@@ -243,9 +262,18 @@ ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 	char name[DDT_NAMELEN];
 	int error;
 
+	if (ddt->ddt_dir_object == 0) {
+		/*
+		 * If we're configured but the containing dir doesn't exist
+		 * yet, then this object can't possibly exist either.
+		 */
+		ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
+		return (SET_ERROR(ENOENT));
+	}
+
 	ddt_object_name(ddt, type, class, name);
 
-	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+	error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
 	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 	if (error != 0)
 		return (error);
@@ -684,6 +712,8 @@ ddt_prefetch_all(spa_t *spa)
 	}
 }
 
+static int ddt_configure(ddt_t *ddt, boolean_t new);
+
 ddt_entry_t *
 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 {
@@ -697,6 +727,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 
 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 
+	if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) {
+		/*
+		 * This is the first use of this DDT since the pool was
+		 * created; finish getting it ready for use.
+		 */
+		VERIFY0(ddt_configure(ddt, B_TRUE));
+		ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
+	}
+
 	ddt_key_fill(&search, bp);
 
 	/* Find an existing live entry */
@@ -837,6 +876,181 @@ ddt_key_compare(const void *x1, const void *x2)
 	return (TREE_ISIGN(cmp));
 }
 
+/* Create this DDT's dir, store version/flags in it, bump feature count */
+static void
+ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ASSERT3U(ddt->ddt_dir_object, ==, 0);
+	ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
+
+	char name[DDT_NAMELEN];
+	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
+	    zio_checksum_table[ddt->ddt_checksum].ci_name);
+
+	ddt->ddt_dir_object = zap_create_link(ddt->ddt_os,
+	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx);
+
+	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION,
+	    sizeof (uint64_t), 1, &ddt->ddt_version, tx));
+	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS,
+	    sizeof (uint64_t), 1, &ddt->ddt_flags, tx));
+
+	spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
+}
+
+/* Destroy the containing dir and decrement the feature count */
+static void
+ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ASSERT3U(ddt->ddt_dir_object, !=, 0);
+	ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT);
+	ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
+
+	char name[DDT_NAMELEN];
+	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
+	    zio_checksum_table[ddt->ddt_checksum].ci_name);
+
+	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
+			ASSERT(!ddt_object_exists(ddt, type, class));
+		}
+	}
+
+	uint64_t count;
+	ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
+	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
+	    DDT_DIR_VERSION));
+	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS));
+	ASSERT3U(count, ==, 2);	/* only version and flags remain */
+
+	VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+	VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx));
+
+	ddt->ddt_dir_object = 0;
+
+	spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
+}
+
+/*
+ * Determine version, flags and on-disk layout from what's already stored. If
+ * there's nothing stored, then if new is false, returns ENOENT, and if true,
+ * selects based on pool config.
+ */
+static int
+ddt_configure(ddt_t *ddt, boolean_t new)
+{
+	spa_t *spa = ddt->ddt_spa;
+	char name[DDT_NAMELEN];
+	int error;
+
+	ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE);
+
+	boolean_t fdt_enabled =
+	    spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP);
+	boolean_t fdt_active =
+	    spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP);
+
+	/*
+	 * First, look for the global DDT stats object. If it's not there, then
+	 * there's never been a DDT written before ever, and we know we're
+	 * starting from scratch.
+	 */
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+	    &spa->spa_ddt_stat_object);
+	if (error != 0) {
+		if (error != ENOENT)
+			return (error);
+		goto not_found;
+	}
+
+	if (fdt_active) {
+		/*
+		 * Now look for a DDT directory. If it exists, then it has
+		 * everything we need.
+		 */
+		snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
+		    zio_checksum_table[ddt->ddt_checksum].ci_name);
+
+		error = zap_lookup(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
+		    &ddt->ddt_dir_object);
+		if (error == 0) {
+			ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os);
+
+			error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
+			    DDT_DIR_VERSION, sizeof (uint64_t), 1,
+			    &ddt->ddt_version);
+			if (error != 0)
+				return (error);
+
+			error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
+			    DDT_DIR_FLAGS, sizeof (uint64_t), 1,
+			    &ddt->ddt_flags);
+			if (error != 0)
+				return (error);
+
+			if (ddt->ddt_version != DDT_VERSION_FDT) {
+				zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
+				    "unknown version %llu", spa_name(spa),
+				    name, (u_longlong_t)ddt->ddt_version);
+				return (SET_ERROR(EINVAL));
+			}
+
+			if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) {
+				zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
+				    "version=%llu unknown flags %llx",
+				    spa_name(spa), name,
+				    (u_longlong_t)ddt->ddt_version,
+				    (u_longlong_t)ddt->ddt_flags);
+				return (SET_ERROR(EINVAL));
+			}
+
+			return (0);
+		}
+		if (error != ENOENT)
+			return (error);
+	}
+
+	/* Any object in the root indicates a traditional setup. */
+	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
+			ddt_object_name(ddt, type, class, name);
+			uint64_t obj;
+			error = zap_lookup(spa->spa_meta_objset,
+			    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t),
+			    1, &obj);
+			if (error == ENOENT)
+				continue;
+			if (error != 0)
+				return (error);
+
+			ddt->ddt_version = DDT_VERSION_LEGACY;
+			ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
+			ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
+
+			return (0);
+		}
+	}
+
+not_found:
+	if (!new)
+		return (SET_ERROR(ENOENT));
+
+	/* Nothing on disk, so set up for the best version we can */
+	if (fdt_enabled) {
+		ddt->ddt_version = DDT_VERSION_FDT;
+		ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
+		ddt->ddt_dir_object = 0; /* create on first use */
+	} else {
+		ddt->ddt_version = DDT_VERSION_LEGACY;
+		ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
+		ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
+	}
+
+	return (0);
+}
+
 static ddt_t *
 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 {
@@ -853,6 +1067,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 	ddt->ddt_checksum = c;
 	ddt->ddt_spa = spa;
 	ddt->ddt_os = spa->spa_meta_objset;
+	ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
 
 	return (ddt);
 }
@@ -889,7 +1104,6 @@ ddt_load(spa_t *spa)
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 	    &spa->spa_ddt_stat_object);
-
 	if (error)
 		return (error == ENOENT ? 0 : error);
 
@@ -898,6 +1112,12 @@ ddt_load(spa_t *spa)
 			continue;
 
 		ddt_t *ddt = spa->spa_ddt[c];
+		error = ddt_configure(ddt, B_FALSE);
+		if (error == ENOENT)
+			continue;
+		if (error != 0)
+			return (error);
+
 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 			for (ddt_class_t class = 0; class < DDT_CLASSES;
 			    class++) {
@@ -912,10 +1132,11 @@ ddt_load(spa_t *spa)
 		 */
 		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
 		    sizeof (ddt->ddt_histogram));
-		spa->spa_dedup_dspace = ~0ULL;
-		spa->spa_dedup_dsize = ~0ULL;
 	}
 
+	spa->spa_dedup_dspace = ~0ULL;
+	spa->spa_dedup_dsize = ~0ULL;
+
 	return (0);
 }
 
@@ -1147,25 +1368,44 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 		    DMU_POOL_DDT_STATS, tx);
 	}
 
+	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
+		ddt_create_dir(ddt, tx);
+
 	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
 		ddt_sync_entry(ddt, dde, tx, txg);
 		ddt_free(dde);
 	}
 
+	uint64_t count = 0;
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
-		uint64_t add, count = 0;
+		uint64_t add, tcount = 0;
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
 			if (ddt_object_exists(ddt, type, class)) {
 				ddt_object_sync(ddt, type, class, tx);
 				VERIFY0(ddt_object_count(ddt, type, class,
 				    &add));
-				count += add;
+				tcount += add;
 			}
 		}
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
-			if (count == 0 && ddt_object_exists(ddt, type, class))
+			if (tcount == 0 && ddt_object_exists(ddt, type, class))
 				ddt_object_destroy(ddt, type, class, tx);
 		}
+		count += tcount;
+	}
+
+	if (count == 0) {
+		/*
+		 * No entries left on the DDT, so reset the version for next
+		 * time. This allows us to handle the feature being changed
+		 * since the DDT was originally created. New entries should get
+		 * whatever the feature currently demands.
+		 */
+		if (ddt->ddt_version == DDT_VERSION_FDT)
+			ddt_destroy_dir(ddt, tx);
+
+		ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
+		ddt->ddt_flags = 0;
 	}
 
 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index e12d5498ccd..c3bceababa3 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -48,6 +48,10 @@ static unsigned long zio_decompress_fail_fraction = 0;
 
 /*
  * Compression vectors.
+ *
+ * NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS.
+ * THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE
+ * PART OF THE ON-DISK FORMAT.
  */
 zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
 	{"inherit",	0,	NULL,		NULL, NULL},
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index e8a94ce209b..50c1b7a9d09 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -109,5 +109,6 @@ if is_linux || is_freebsd; then
 	    "feature@block_cloning"
 	    "feature@vdev_zaps_v2"
 	    "feature@raidz_expansion"
+	    "feature@fast_dedup"
 	)
 fi

From 2b131d734577bf489c86fdb9dbb63460a5675613 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 13 Jun 2024 14:50:33 +1000
Subject: [PATCH 08/59] ZTS: tests for dedup legacy/FDT tables

Very basic coverage to make sure things appear to work, have the right
format on disk, and pool upgrades and mixed table types work as
expected.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15892
---
 tests/runfiles/common.run                     |   4 +-
 tests/zfs-tests/tests/Makefile.am             |   6 +
 .../functional/dedup/dedup_fdt_create.ksh     |  99 ++++++++++++++
 .../functional/dedup/dedup_fdt_import.ksh     | 112 ++++++++++++++++
 .../functional/dedup/dedup_legacy_create.ksh  |  95 ++++++++++++++
 .../dedup/dedup_legacy_fdt_mixed.ksh          |  97 ++++++++++++++
 .../dedup/dedup_legacy_fdt_upgrade.ksh        | 122 ++++++++++++++++++
 .../functional/dedup/dedup_legacy_import.ksh  | 104 +++++++++++++++
 .../tests/functional/dedup/setup.ksh          |   4 -
 9 files changed, 638 insertions(+), 5 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
 create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
 create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
 create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
 create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
 create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh

diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 326eb2a44d3..ad131664698 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -672,7 +672,9 @@ post =
 tags = ['functional', 'deadman']
 
 [tests/functional/dedup]
-tests = ['dedup_quota']
+tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_legacy_create',
+    'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
+    'dedup_legacy_fdt_mixed', 'dedup_quota']
 pre =
 post =
 tags = ['functional', 'dedup']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 9dcb097e2b3..bbeabc6dfb4 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1424,6 +1424,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/deadman/deadman_zio.ksh \
 	functional/dedup/cleanup.ksh \
 	functional/dedup/setup.ksh \
+	functional/dedup/dedup_fdt_create.ksh \
+	functional/dedup/dedup_fdt_import.ksh \
+	functional/dedup/dedup_legacy_create.ksh \
+	functional/dedup/dedup_legacy_import.ksh \
+	functional/dedup/dedup_legacy_fdt_upgrade.ksh \
+	functional/dedup/dedup_legacy_fdt_mixed.ksh \
 	functional/dedup/dedup_quota.ksh \
 	functional/delegate/cleanup.ksh \
 	functional/delegate/setup.ksh \
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
new file mode 100755
index 00000000000..83c4d7c8e2a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
@@ -0,0 +1,99 @@
+#!/bin/ksh -p
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 Klara, Inc.
+#
+
+# Simple test of dedup table operations (FDT)
+
+. $STF_SUITE/include/libtest.shlib
+
+log_assert "basic dedup (FDT) operations work"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+# create a pool with fast dedup enabled. we disable block cloning to ensure
+# it doesn't get in the way of dedup, and we disable compression so our writes
+# create predictable results on disk
+# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
+log_must zpool create -f \
+    -o feature@fast_dedup=enabled \
+    -O dedup=on \
+    -o feature@block_cloning=disabled \
+    -O compression=off \
+    -O xattr=sa \
+    $TESTPOOL $DISKS
+
+# confirm the feature is enabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
+
+# confirm there's no DDT keys in the MOS root
+log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256"
+
+# create a file. this is four full blocks, so will produce four entries in the
+# dedup table
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4
+log_must zpool sync
+
+# feature should now be active
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active"
+
+# four entries in the unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'"
+
+# single containing object in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1
+obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }')
+
+# with only one ZAP inside
+log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1
+
+# copy the file
+log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2
+log_must zpool sync
+
+# now four entries in the duplicate table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'"
+
+# now two DDT ZAPs in the container object; DDT ZAPs aren't cleaned up until
+# the entire logical table is destroyed
+log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 2
+
+# remove the files
+log_must rm -f /$TESTPOOL/file*
+log_must zpool sync
+
+# feature should move back to enabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
+
+# all DDTs empty
+log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
+
+# logical table now destroyed; containing object destroyed
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0
+
+log_pass "basic dedup (FDT) operations work"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
new file mode 100755
index 00000000000..f0f20671b95
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
@@ -0,0 +1,112 @@
+#!/bin/ksh -p
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 Klara, Inc.
+#
+
+# Ensure dedup retains version after import (FDT)
+
+. $STF_SUITE/include/libtest.shlib
+
+log_assert "dedup (FDT) retains version after import"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+# create a pool with fast dedup enabled. we disable block cloning to ensure
+# it doesn't get in the way of dedup, and we disable compression so our writes
+# create predictable results on disk
+# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
+log_must zpool create -f \
+    -o feature@fast_dedup=enabled \
+    -O dedup=on \
+    -o feature@block_cloning=disabled \
+    -O compression=off \
+    -O xattr=sa \
+    $TESTPOOL $DISKS
+
+# confirm the feature is enabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
+
+# confirm there's no DDT keys in the MOS root
+log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256"
+
+# create a file. this is four full blocks, so will produce four entries in the
+# dedup table
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4
+log_must zpool sync
+
+# feature should now be active
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active"
+
+# four entries in the unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'"
+
+# single containing object in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1
+obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }')
+
+# with only one ZAP inside
+log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1
+
+# export and import the pool
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+# feature still active
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active"
+
+# remove the file
+log_must rm -f /$TESTPOOL/file1
+log_must zpool sync
+
+# feature should revert to enabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
+
+# all DDTs empty
+log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
+
+# logical table now destroyed; containing object destroyed
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0
+
+# create a new file
+log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=128k count=4
+log_must zpool sync
+
+# feature should be active again
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active"
+
+# four entries in the unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'"
+
+# single containing object in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1
+obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }')
+
+# with only one ZAP inside
+log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1
+
+log_pass "dedup (FDT) retains version after import"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
new file mode 100755
index 00000000000..e3efcf5c8b3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
@@ -0,0 +1,95 @@
+#!/bin/ksh -p
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 Klara, Inc.
+#
+
+# Simple test of dedup table operations (legacy)
+
+. $STF_SUITE/include/libtest.shlib
+
+log_assert "basic dedup (legacy) operations work"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+# create a pool with legacy dedup enabled. we disable block cloning to ensure
+# it doesn't get in the way of dedup, and we disable compression so our writes
+# create predictable results on disk
+# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
+log_must zpool create -f \
+    -o feature@fast_dedup=disabled \
+    -O dedup=on \
+    -o feature@block_cloning=disabled \
+    -O compression=off \
+    -O xattr=sa \
+    $TESTPOOL $DISKS
+
+# confirm the feature is disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# confirm there's no DDT keys in the MOS root
+log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256"
+
+# create a file. this is four full blocks, so will produce four entries in the
+# dedup table
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4
+log_must zpool sync
+
+# feature should still be disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# should be four entries in the unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'"
+
+# should be just one DDT ZAP in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1
+
+# copy the file
+log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2
+log_must zpool sync
+
+# now four entries in the duplicate table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'"
+
+# now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire
+# logical table is destroyed
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 2
+
+# remove the files
+log_must rm -f /$TESTPOOL/file*
+log_must zpool sync
+
+# feature should still be disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# all DDTs empty
+log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
+
+# logical table now destroyed; all DDT ZAPs removed
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0
+
+log_pass "basic dedup (legacy) operations work"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
new file mode 100755
index 00000000000..049ccaae3dc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
@@ -0,0 +1,97 @@
+#!/bin/ksh -p
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 Klara, Inc.
+#
+
+# Check that legacy and FDT dedup tables can coexist on one pool: a table made
+# before fast_dedup is enabled stays legacy, one made afterwards is FDT
+
+. $STF_SUITE/include/libtest.shlib
+
+log_assert "legacy and FDT dedup tables on the same pool can happily coexist"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+# create a pool with legacy dedup enabled. we disable block cloning to ensure
+# it doesn't get in the way of dedup, and we disable compression so our writes
+# create predictable results on disk
+# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
+log_must zpool create -f \
+    -o feature@fast_dedup=disabled \
+    -o feature@block_cloning=disabled \
+    -O compression=off \
+    -O xattr=sa \
+    $TESTPOOL $DISKS
+
+# create two datasets, enabling a different dedup algorithm on each
+log_must zfs create -o dedup=skein $TESTPOOL/ds1
+log_must zfs create -o dedup=blake3 $TESTPOOL/ds2
+
+# confirm the feature is disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# confirm there's no DDT keys in the MOS root
+log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-skein"
+log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-blake3"
+
+# create a file in the first dataset
+log_must dd if=/dev/urandom of=/$TESTPOOL/ds1/file1 bs=128k count=4
+log_must zpool sync
+
+# should be four entries in the skein unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-skein-zap-unique: 4 entries'"
+
+# should be just one DDT ZAP in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1
+
+# enable the fast_dedup feature
+log_must zpool set feature@fast_dedup=enabled $TESTPOOL
+
+# confirm the feature is now enabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
+
+# create a file in the second dataset
+log_must dd if=/dev/urandom of=/$TESTPOOL/ds2/file1 bs=128k count=4
+log_must zpool sync
+
+# feature should now be active
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active"
+
+# now also four entries in the blake3 unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-blake3-zap-unique: 4 entries'"
+
+# two entries in the MOS: the legacy skein DDT ZAP, and the containing dir for
+# the blake3 FDT table
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | wc -l) -eq 1
+
+# containing object has one ZAP inside
+obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | awk '{ print $NF }')
+log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-.*-zap- | wc -l) -eq 1
+
+log_pass "legacy and FDT dedup tables on the same pool can happily coexist"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
new file mode 100755
index 00000000000..d563fade88a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
@@ -0,0 +1,122 @@
+#!/bin/ksh -p
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 Klara, Inc.
+#
+
+# Check legacy dedup table continues to work after pool upgrade to fast_dedup,
+# but if deleted and recreated, the new table is FDT
+
+. $STF_SUITE/include/libtest.shlib
+
+log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+# create a pool with legacy dedup enabled. we disable block cloning to ensure
+# it doesn't get in the way of dedup, and we disable compression so our writes
+# create predictable results on disk
+# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
+log_must zpool create -f \
+    -o feature@fast_dedup=disabled \
+    -O dedup=on \
+    -o feature@block_cloning=disabled \
+    -O compression=off \
+    -O xattr=sa \
+    $TESTPOOL $DISKS
+
+# confirm the feature is disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# confirm there's no DDT keys in the MOS root
+log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256"
+
+# create a file. this is four full blocks, so will produce four entries in the
+# dedup table
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4
+log_must zpool sync
+
+# feature should still be disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# should be four entries in the unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'"
+
+# should be just one DDT ZAP in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1
+
+# enable the fast_dedup feature
+log_must zpool set feature@fast_dedup=enabled $TESTPOOL
+
+# confirm the feature is now enabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
+
+# copy the file
+log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2
+log_must zpool sync
+
+# feature should still be enabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
+
+# now four entries in the duplicate table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'"
+
+# now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire
+# logical table is destroyed
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 2
+
+# remove the files
+log_must rm -f /$TESTPOOL/file*
+log_must zpool sync
+
+# feature should still be enabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
+
+# all DDTs empty
+log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
+
+# logical table now destroyed; all DDT ZAPs removed
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0
+
+# create a new file
+log_must dd if=/dev/urandom of=/$TESTPOOL/file3 bs=128k count=4
+log_must zpool sync
+
+# feature should now be active
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active"
+
+# four entries in the unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'"
+
+# single containing object in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1
+obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }')
+
+# with one ZAP inside
+log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1
+
+log_pass "legacy dedup tables work after upgrade; new dedup tables created as FDT"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh
new file mode 100755
index 00000000000..a7b667eaf88
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh
@@ -0,0 +1,104 @@
+#!/bin/ksh -p
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 Klara, Inc.
+#
+
+# Ensure dedup retains version after import (legacy)
+
+. $STF_SUITE/include/libtest.shlib
+
+log_assert "dedup (legacy) retains version after import"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+# create a pool with legacy dedup enabled. we disable block cloning to ensure
+# it doesn't get in the way of dedup, and we disable compression so our writes
+# create predictable results on disk
+# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
+log_must zpool create -f \
+    -o feature@fast_dedup=disabled \
+    -O dedup=on \
+    -o feature@block_cloning=disabled \
+    -O compression=off \
+    -O xattr=sa \
+    $TESTPOOL $DISKS
+
+# confirm the feature is disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# confirm there's no DDT keys in the MOS root
+log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256"
+
+# create a file. this is four full blocks, so will produce four entries in the
+# dedup table
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4
+log_must zpool sync
+
+# feature should still be disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# should be four entries in the unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'"
+
+# should be just one DDT ZAP in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1
+
+# export and import the pool
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+# confirm the feature is disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# remove the file
+log_must rm -f /$TESTPOOL/file1
+log_must zpool sync
+
+# feature should still be disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# all DDTs empty
+log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
+
+# logical table now destroyed; all DDT ZAPs removed
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0
+
+# create a new file
+log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=128k count=4
+log_must zpool sync
+
+# feature should still be disabled
+log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled"
+
+# should be four entries in the unique table
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'"
+
+# should be just one DDT ZAP in the MOS
+log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1
+
+log_pass "dedup (legacy) retains version after import"
diff --git a/tests/zfs-tests/tests/functional/dedup/setup.ksh b/tests/zfs-tests/tests/functional/dedup/setup.ksh
index 3c0830401f8..a21238879fa 100755
--- a/tests/zfs-tests/tests/functional/dedup/setup.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/setup.ksh
@@ -25,7 +25,3 @@
 #
 
 . $STF_SUITE/include/libtest.shlib
-
-DISK=${DISKS%% *}
-
-default_setup $DISK

From d63f5d7e50b65c76d9a8b79db0b66ebb6a49742c Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 18 Jun 2024 14:11:11 +1000
Subject: [PATCH 09/59] zdb: rework DDT block count and leak check to just
 count the blocks

The upcoming dedup features break the long held assumption that all
blocks on disk with a 'D' dedup bit will always be present in the DDT,
or will have the same set of DVA allocations on disk as in the DDT.

If the DDT is no longer a complete picture of all the dedup blocks that
will be and should be on disk, then it does us no good to walk and prime
it up front, since it won't necessarily match up with every block we'll
see anyway.

Instead, we rework things here to be more like the BRT checks. When we
see a dedup'd block, we look it up in the DDT, consume a refcount, and
for the second-or-later instances, count them as duplicates.

The DDT and BRT are moved ahead of the space accounting. This will
become important for the "flat" feature, which may need to count a
modified version of the block.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15892
---
 cmd/zdb/zdb.c     | 315 ++++++++++++++++++++++++++++------------------
 include/sys/ddt.h |   2 +-
 module/zfs/ddt.c  |   8 +-
 module/zfs/zio.c  |   4 +-
 4 files changed, 200 insertions(+), 129 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index dec70c60cec..fcf0e477978 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -33,7 +33,7 @@
  *     under sponsorship from the FreeBSD Foundation.
  * Copyright (c) 2021 Allan Jude
  * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
- * Copyright (c) 2023, Klara Inc.
+ * Copyright (c) 2023, 2024, Klara Inc.
  * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
  */
 
@@ -3287,9 +3287,46 @@ fuid_table_destroy(void)
 	}
 }
 
+/*
+ * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on
+ * a live pool are normally cleaned up during ddt_sync(). We can't do that (and
+ * wouldn't want to anyway), but if we don't clean up the presence of stuff on
+ * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves.
+ *
+ * Note that this is not a particularly efficient way to do this, but
+ * ddt_remove() is the only public method that can do the work we need, and it
+ * requires the right locks and etc to do the job. This is only ever called
+ * during zdb shutdown so efficiency is not especially important.
+ */
+static void
+zdb_ddt_cleanup(spa_t *spa)
+{
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (!ddt)
+			continue;
+
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		ddt_enter(ddt);
+		ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
+		while (dde) {
+			next = AVL_NEXT(&ddt->ddt_tree, dde);
+			memset(&dde->dde_lead_zio, 0,
+			    sizeof (dde->dde_lead_zio));
+			ddt_remove(ddt, dde);
+			dde = next;
+		}
+		ddt_exit(ddt);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+	}
+}
+
 static void
 zdb_exit(int reason)
 {
+	if (spa != NULL)
+		zdb_ddt_cleanup(spa);
+
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else if (spa != NULL) {
@@ -5633,7 +5670,6 @@ static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
-	uint64_t refcnt = 0;
 	int i;
 
 	ASSERT(type < ZDB_OT_TOTAL);
@@ -5641,8 +5677,144 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
+	/*
+	 * This flag controls if we will issue a claim for the block while
+	 * counting it, to ensure that all blocks are referenced in space maps.
+	 * We don't issue claims if we're not doing leak tracking, because it's
+	 * expensive if the user isn't interested. We also don't claim the
+	 * second or later occurences of cloned or dedup'd blocks, because we
+	 * already claimed them the first time.
+	 */
+	boolean_t do_claim = !dump_opt['L'];
+
 	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
 
+	if (BP_GET_DEDUP(bp)) {
+		/*
+		 * Dedup'd blocks are special. We need to count them, so we can
+		 * later uncount them when reporting leaked space, and we must
+		 * only claim them once.
+		 *
+		 * We use the existing dedup system to track what we've seen.
+		 * The first time we see a block, we do a ddt_lookup() to see
+		 * if it exists in the DDT. If we're doing leak tracking, we
+		 * claim the block at this time.
+		 *
+		 * Each time we see a block, we reduce the refcount in the
+		 * entry by one, and add to the size and count of dedup'd
+		 * blocks to report at the end.
+		 */
+
+		ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
+
+		ddt_enter(ddt);
+
+		/*
+		 * Find the block. This will create the entry in memory, but
+		 * we'll know if that happened by its refcount.
+		 */
+		ddt_entry_t *dde = ddt_lookup(ddt, bp);
+
+		/*
+		 * ddt_lookup() can only return NULL if this block didn't exist
+		 * in the DDT and creating it would take the DDT over its
+		 * quota. Since we got the block from disk, it must exist in
+		 * the DDT, so this can't happen.
+		 */
+		VERIFY3P(dde, !=, NULL);
+
+		/* Get the phys for this variant */
+		ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+		VERIFY3P(ddp, !=, NULL);
+
+		/*
+		 * This entry may have multiple sets of DVAs. We must claim
+		 * each set the first time we see them in a real block on disk,
+		 * or count them on subsequent occurences. We don't have a
+		 * convenient way to track the first time we see each variant,
+		 * so we repurpose dde_lead_zio[] as a per-phys "seen" flag. We
+		 * can do this safely in zdb because it never writes, so it
+		 * will never have a writing zio for this block in that
+		 * pointer.
+		 */
+
+		/*
+		 * Work out which dde_phys index was used, get the seen flag,
+		 * and update it if necessary.
+		 */
+		uint_t idx =
+		    ((uint_t)((uintptr_t)ddp - (uintptr_t)dde->dde_phys)) /
+		    sizeof (ddt_phys_t);
+		VERIFY3P(ddp, ==, &dde->dde_phys[idx]);
+		boolean_t seen = (boolean_t)(uintptr_t)dde->dde_lead_zio[idx];
+		if (!seen)
+			dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE;
+
+		/* Consume a reference for this block. */
+		VERIFY3U(ddt_phys_total_refcnt(dde), >, 0);
+		ddt_phys_decref(ddp);
+
+		if (seen) {
+			/*
+			 * The second or later time we see this block,
+			 * it's a duplicate and we count it.
+			 */
+			zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
+			zcb->zcb_dedup_blocks++;
+
+			/* Already claimed, don't do it again. */
+			do_claim = B_FALSE;
+		}
+
+		ddt_exit(ddt);
+	} else if (zcb->zcb_brt_is_active &&
+	    brt_maybe_exists(zcb->zcb_spa, bp)) {
+		/*
+		 * Cloned blocks are special. We need to count them, so we can
+		 * later uncount them when reporting leaked space, and we must
+		 * only claim them once.
+		 *
+		 * To do this, we keep our own in-memory BRT. For each block
+		 * we haven't seen before, we look it up in the real BRT and
+		 * if its there, we note it and its refcount then proceed as
+		 * normal. If we see the block again, we count it as a clone
+		 * and then give it no further consideration.
+		 */
+		zdb_brt_entry_t zbre_search, *zbre;
+		avl_index_t where;
+
+		zbre_search.zbre_dva = bp->blk_dva[0];
+		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
+		if (zbre == NULL) {
+			/* Not seen before; track it */
+			uint64_t refcnt =
+			    brt_entry_get_refcount(zcb->zcb_spa, bp);
+			if (refcnt > 0) {
+				zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
+				    UMEM_NOFAIL);
+				zbre->zbre_dva = bp->blk_dva[0];
+				zbre->zbre_refcount = refcnt;
+				avl_insert(&zcb->zcb_brt, zbre, where);
+			}
+		} else  {
+			/*
+			 * Second or later occurrence, count it and take a
+			 * refcount.
+			 */
+			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
+			zcb->zcb_clone_blocks++;
+
+			zbre->zbre_refcount--;
+			if (zbre->zbre_refcount == 0) {
+				avl_remove(&zcb->zcb_brt, zbre);
+				umem_free(zbre, sizeof (zdb_brt_entry_t));
+			}
+
+			/* Already claimed, don't do it again. */
+			do_claim = B_FALSE;
+		}
+	}
+
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
@@ -5745,71 +5917,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
 	zcb->zcb_asize_total += BP_GET_ASIZE(bp);
 
-	if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
-		/*
-		 * Cloned blocks are special. We need to count them, so we can
-		 * later uncount them when reporting leaked space, and we must
-		 * only claim them them once.
-		 *
-		 * To do this, we keep our own in-memory BRT. For each block
-		 * we haven't seen before, we look it up in the real BRT and
-		 * if its there, we note it and its refcount then proceed as
-		 * normal. If we see the block again, we count it as a clone
-		 * and then give it no further consideration.
-		 */
-		zdb_brt_entry_t zbre_search, *zbre;
-		avl_index_t where;
-
-		zbre_search.zbre_dva = bp->blk_dva[0];
-		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
-		if (zbre != NULL) {
-			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
-			zcb->zcb_clone_blocks++;
-
-			zbre->zbre_refcount--;
-			if (zbre->zbre_refcount == 0) {
-				avl_remove(&zcb->zcb_brt, zbre);
-				umem_free(zbre, sizeof (zdb_brt_entry_t));
-			}
-			return;
-		}
-
-		uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
-		if (crefcnt > 0) {
-			zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
-			    UMEM_NOFAIL);
-			zbre->zbre_dva = bp->blk_dva[0];
-			zbre->zbre_refcount = crefcnt;
-			avl_insert(&zcb->zcb_brt, zbre, where);
-		}
-	}
-
-	if (dump_opt['L'])
+	if (!do_claim)
 		return;
 
-	if (BP_GET_DEDUP(bp)) {
-		ddt_t *ddt;
-		ddt_entry_t *dde;
-
-		ddt = ddt_select(zcb->zcb_spa, bp);
-		ddt_enter(ddt);
-		dde = ddt_lookup(ddt, bp, B_FALSE);
-
-		if (dde == NULL) {
-			refcnt = 0;
-		} else {
-			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
-			ddt_phys_decref(ddp);
-			refcnt = ddp->ddp_refcnt;
-			if (ddt_phys_total_refcnt(dde) == 0)
-				ddt_remove(ddt, dde);
-		}
-		ddt_exit(ddt);
-	}
-
-	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
-	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
-	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+	VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	    spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
+	    ZIO_FLAG_CANFAIL)));
 }
 
 static void
@@ -6120,49 +6233,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
 	return (counts);
 }
 
-static void
-zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
-{
-	ddt_bookmark_t ddb = {0};
-	ddt_entry_t dde;
-	int error;
-	int p;
-
-	ASSERT(!dump_opt['L']);
-
-	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
-		blkptr_t blk;
-		ddt_phys_t *ddp = dde.dde_phys;
-
-		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
-			return;
-
-		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
-		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
-		VERIFY(ddt);
-
-		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-			if (ddp->ddp_phys_birth == 0)
-				continue;
-			ddt_bp_create(ddb.ddb_checksum,
-			    &dde.dde_key, ddp, &blk);
-			if (p == DDT_PHYS_DITTO) {
-				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
-			} else {
-				zcb->zcb_dedup_asize +=
-				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
-				zcb->zcb_dedup_blocks++;
-			}
-		}
-
-		ddt_enter(ddt);
-		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
-		ddt_exit(ddt);
-	}
-
-	ASSERT(error == ENOENT);
-}
-
 typedef struct checkpoint_sm_exclude_entry_arg {
 	vdev_t *cseea_vd;
 	uint64_t cseea_checkpoint_size;
@@ -6546,10 +6616,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
 		    increment_indirect_mapping_cb, zcb, NULL);
 	}
-
-	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-	zdb_ddt_leak_init(spa, zcb);
-	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 static boolean_t
@@ -6814,6 +6880,8 @@ dump_block_stats(spa_t *spa)
 	int e, c, err;
 	bp_embedded_type_t i;
 
+	ddt_prefetch_all(spa);
+
 	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
@@ -6938,7 +7006,6 @@ dump_block_stats(spa_t *spa)
 		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
 		    (longlong_t)(total_alloc - total_found));
-		leaks = B_TRUE;
 	}
 
 	if (tzb->zb_count == 0) {
@@ -8022,16 +8089,21 @@ dump_mos_leaks(spa_t *spa)
 
 	mos_leak_vdev(spa->spa_root_vdev);
 
-	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
-		for (uint64_t type = 0; type < DDT_TYPES; type++) {
-			for (uint64_t cksum = 0;
-			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
-				ddt_t *ddt = spa->spa_ddt[cksum];
-				if (!ddt)
-					continue;
+	for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (!ddt)
+			continue;
+
+		/* DDT store objects */
+		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+			for (ddt_class_t class = 0; class < DDT_CLASSES;
+			    class++) {
 				mos_obj_refd(ddt->ddt_object[type][class]);
 			}
 		}
+
+		/* FDT container */
+		mos_obj_refd(ddt->ddt_dir_object);
 	}
 
 	if (spa->spa_brt != NULL) {
@@ -9624,6 +9696,9 @@ retry_lookup:
 	}
 
 fini:
+	if (spa != NULL)
+		zdb_ddt_cleanup(spa);
+
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else if (spa != NULL) {
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 02d0cf5daab..20bae8ce0fc 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -253,7 +253,7 @@ extern void ddt_enter(ddt_t *ddt);
 extern void ddt_exit(ddt_t *ddt);
 extern void ddt_init(void);
 extern void ddt_fini(void);
-extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp);
 extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
 extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
 extern void ddt_prefetch_all(spa_t *spa);
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 7e2010c423c..84d7800cbc7 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -715,7 +715,7 @@ ddt_prefetch_all(spa_t *spa)
 static int ddt_configure(ddt_t *ddt, boolean_t new);
 
 ddt_entry_t *
-ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 {
 	spa_t *spa = ddt->ddt_spa;
 	ddt_key_t search;
@@ -767,10 +767,6 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 		return (dde);
 	}
 
-	/* Not found. */
-	if (!add)
-		return (NULL);
-
 	/* Time to make a new entry. */
 	dde = ddt_alloc(&search);
 	avl_insert(&ddt->ddt_tree, dde, where);
@@ -1502,7 +1498,7 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 	ddt = ddt_select(spa, bp);
 	ddt_enter(ddt);
 
-	dde = ddt_lookup(ddt, bp, B_TRUE);
+	dde = ddt_lookup(ddt, bp);
 
 	/* Can be NULL if the entry for this block was pruned. */
 	if (dde == NULL) {
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 6d08d4bd163..5810e811a39 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3518,7 +3518,7 @@ zio_ddt_write(zio_t *zio)
 	ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
 
 	ddt_enter(ddt);
-	dde = ddt_lookup(ddt, bp, B_TRUE);
+	dde = ddt_lookup(ddt, bp);
 	if (dde == NULL) {
 		/* DDT size is over its quota so no new entries */
 		zp->zp_dedup = B_FALSE;
@@ -3598,7 +3598,7 @@ zio_ddt_free(zio_t *zio)
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
-	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+	freedde = dde = ddt_lookup(ddt, bp);
 	if (dde) {
 		ddp = ddt_phys_select(dde, bp);
 		if (ddp)

From d17ab631a9142b81b100d87f0619f5e59bc211ac Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 3 Jul 2023 15:16:02 +1000
Subject: [PATCH 10/59] ddt: rework access to phys array slots

The "flat phys" feature will use only a single phys slot for all
entries, which means the old "single", "double" etc naming now makes no
sense, and more importantly, means that choosing the right slot for a
given block pointer will depend on how many slots are in use for a given
DDT.

This removes the old names, and adds accessor macros to decouple
specific phys array indexes from any particular meaning.

(These macros look strange in isolation, mainly in the way they take the
ddt_t* as an arg but don't use it. This is mostly a separate commit to
introduce the concept to the reader before the "flat phys" commit
extends it.)

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
---
 cmd/zdb/zdb.c          | 13 +++++-----
 include/sys/ddt.h      | 27 ++++++++-------------
 include/sys/ddt_impl.h |  2 +-
 module/zfs/ddt.c       | 54 ++++++++++++++++++++++++------------------
 module/zfs/ddt_stats.c |  5 ++--
 module/zfs/ddt_zap.c   |  1 +
 module/zfs/dsl_scan.c  |  6 +++--
 module/zfs/zio.c       | 36 ++++++++++++++++++----------
 8 files changed, 79 insertions(+), 65 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index fcf0e477978..7a6459b756b 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1916,21 +1916,20 @@ dump_log_spacemaps(spa_t *spa)
 static void
 dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
 {
-	const ddt_phys_t *ddp = dde->dde_phys;
 	const ddt_key_t *ddk = &dde->dde_key;
-	const char *types[4] = { "ditto", "single", "double", "triple" };
 	char blkbuf[BP_SPRINTF_LEN];
 	blkptr_t blk;
 	int p;
 
-	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+	for (p = 0; p < DDT_NPHYS(ddt); p++) {
+		const ddt_phys_t *ddp = &dde->dde_phys[p];
 		if (ddp->ddp_phys_birth == 0)
 			continue;
 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
-		(void) printf("index %llx refcnt %llu %s %s\n",
+		(void) printf("index %llx refcnt %llu phys %d %s\n",
 		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
-		    types[p], blkbuf);
+		    p, blkbuf);
 	}
 }
 
@@ -5724,7 +5723,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 		VERIFY3P(dde, !=, NULL);
 
 		/* Get the phys for this variant */
-		ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+		ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp);
 		VERIFY3P(ddp, !=, NULL);
 
 		/*
@@ -5751,7 +5750,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 			dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE;
 
 		/* Consume a reference for this block. */
-		VERIFY3U(ddt_phys_total_refcnt(dde), >, 0);
+		VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0);
 		ddt_phys_decref(ddp);
 
 		if (seen) {
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 20bae8ce0fc..a2e069f1392 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -137,19 +137,10 @@ typedef struct {
 	uint64_t	ddp_phys_birth;
 } ddt_phys_t;
 
-/*
- * Named indexes into the ddt_phys_t array in each entry.
- *
- * Note, we no longer generate new DDT_PHYS_DITTO-type blocks.  However,
- * we maintain the ability to free existing dedup-ditto blocks.
- */
-enum ddt_phys_type {
-	DDT_PHYS_DITTO = 0,
-	DDT_PHYS_SINGLE = 1,
-	DDT_PHYS_DOUBLE = 2,
-	DDT_PHYS_TRIPLE = 3,
-	DDT_PHYS_TYPES
-};
+#define	DDT_PHYS_MAX			(4)
+#define	DDT_NPHYS(ddt)			((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX)
+#define	DDT_PHYS_IS_DITTO(ddt, p)	((ddt) && p == 0)
+#define	DDT_PHYS_FOR_COPIES(ddt, p)	((ddt) ? (p) : (p))
 
 /*
  * A "live" entry, holding changes to an entry made this txg, and other data to
@@ -162,11 +153,11 @@ enum ddt_phys_type {
 
 typedef struct {
 	/* key must be first for ddt_key_compare */
-	ddt_key_t	dde_key;			/* ddt_tree key */
-	ddt_phys_t	dde_phys[DDT_PHYS_TYPES];	/* on-disk data */
+	ddt_key_t	dde_key;		/* ddt_tree key */
+	ddt_phys_t	dde_phys[DDT_PHYS_MAX];	/* on-disk data */
 
 	/* in-flight update IOs */
-	zio_t		*dde_lead_zio[DDT_PHYS_TYPES];
+	zio_t		*dde_lead_zio[DDT_PHYS_MAX];
 
 	/* copy of data after a repair read, to be rewritten */
 	struct abd	*dde_repair_abd;
@@ -234,7 +225,8 @@ extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
 extern void ddt_phys_clear(ddt_phys_t *ddp);
 extern void ddt_phys_addref(ddt_phys_t *ddp);
 extern void ddt_phys_decref(ddt_phys_t *ddp);
-extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde,
+    const blkptr_t *bp);
 
 extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
 extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
@@ -249,6 +241,7 @@ extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
 extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize);
 
 extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern ddt_t *ddt_select_checksum(spa_t *spa, enum zio_checksum checksum);
 extern void ddt_enter(ddt_t *ddt);
 extern void ddt_exit(ddt_t *ddt);
 extern void ddt_init(void);
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index 9c0fea64f38..e97b71621c3 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -82,7 +82,7 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
  */
 #define	DDT_NAMELEN	32
 
-extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde);
 
 extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
 
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 84d7800cbc7..9bb0b8f15fc 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -540,11 +540,10 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
 }
 
 ddt_phys_t *
-ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
 {
-	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
-
-	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		ddt_phys_t *ddp = (ddt_phys_t *)&dde->dde_phys[p];
 		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 		    BP_GET_BIRTH(bp) == ddp->ddp_phys_birth)
 			return (ddp);
@@ -553,12 +552,15 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
 }
 
 uint64_t
-ddt_phys_total_refcnt(const ddt_entry_t *dde)
+ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
 {
 	uint64_t refcnt = 0;
 
-	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		if (DDT_PHYS_IS_DITTO(ddt, p))
+			continue;
 		refcnt += dde->dde_phys[p].ddp_refcnt;
+	}
 
 	return (refcnt);
 }
@@ -570,6 +572,12 @@ ddt_select(spa_t *spa, const blkptr_t *bp)
 	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 }
 
+ddt_t *
+ddt_select_checksum(spa_t *spa, enum zio_checksum checksum)
+{
+	return (spa->spa_ddt[checksum]);
+}
+
 void
 ddt_enter(ddt_t *ddt)
 {
@@ -613,9 +621,9 @@ ddt_alloc(const ddt_key_t *ddk)
 }
 
 static void
-ddt_free(ddt_entry_t *dde)
+ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
 {
-	for (int p = 0; p < DDT_PHYS_TYPES; p++)
+	for (int p = 0; p < DDT_NPHYS(ddt); p++)
 		ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
 
 	if (dde->dde_repair_abd != NULL)
@@ -631,7 +639,7 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 
 	avl_remove(&ddt->ddt_tree, dde);
-	ddt_free(dde);
+	ddt_free(ddt, dde);
 }
 
 static boolean_t
@@ -759,7 +767,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
 			if (dde->dde_waiters == 0) {
 				avl_remove(&ddt->ddt_tree, dde);
-				ddt_free(dde);
+				ddt_free(ddt, dde);
 			}
 			return (NULL);
 		}
@@ -805,7 +813,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		/* Over quota. If no one is waiting, clean up right now. */
 		if (dde->dde_waiters == 0) {
 			avl_remove(&ddt->ddt_tree, dde);
-			ddt_free(dde);
+			ddt_free(ddt, dde);
 			return (NULL);
 		}
 
@@ -1212,7 +1220,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 		avl_insert(&ddt->ddt_repair_tree, dde, where);
 	else
-		ddt_free(dde);
+		ddt_free(ddt, dde);
 
 	ddt_exit(ddt);
 }
@@ -1220,16 +1228,15 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 static void
 ddt_repair_entry_done(zio_t *zio)
 {
+	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *rdde = zio->io_private;
 
-	ddt_free(rdde);
+	ddt_free(ddt, rdde);
 }
 
 static void
 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 {
-	ddt_phys_t *ddp = dde->dde_phys;
-	ddt_phys_t *rddp = rdde->dde_phys;
 	ddt_key_t *ddk = &dde->dde_key;
 	ddt_key_t *rddk = &rdde->dde_key;
 	zio_t *zio;
@@ -1238,7 +1245,9 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 	zio = zio_null(rio, rio->io_spa, NULL,
 	    ddt_repair_entry_done, rdde, rio->io_flags);
 
-	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		ddt_phys_t *ddp = &dde->dde_phys[p];
+		ddt_phys_t *rddp = &rdde->dde_phys[p];
 		if (ddp->ddp_phys_birth == 0 ||
 		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
 		    memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
@@ -1281,7 +1290,6 @@ static void
 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 {
 	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
-	ddt_phys_t *ddp = dde->dde_phys;
 	ddt_key_t *ddk = &dde->dde_key;
 	ddt_type_t otype = dde->dde_type;
 	ddt_type_t ntype = DDT_TYPE_DEFAULT;
@@ -1291,13 +1299,14 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 
 	ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
 
-	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
+		ddt_phys_t *ddp = &dde->dde_phys[p];
 		if (ddp->ddp_phys_birth == 0) {
 			ASSERT0(ddp->ddp_refcnt);
 			continue;
 		}
-		if (p == DDT_PHYS_DITTO) {
+		if (DDT_PHYS_IS_DITTO(ddt, p)) {
 			/*
 			 * Note, we no longer create DDT-DITTO blocks, but we
 			 * don't want to leak any written by older software.
@@ -1310,8 +1319,6 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 		total_refcnt += ddp->ddp_refcnt;
 	}
 
-	/* We do not create new DDT-DITTO blocks. */
-	ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth);
 	if (total_refcnt > 1)
 		nclass = DDT_CLASS_DUPLICATE;
 	else
@@ -1369,7 +1376,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 
 	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
 		ddt_sync_entry(ddt, dde, tx, txg);
-		ddt_free(dde);
+		ddt_free(ddt, dde);
 	}
 
 	uint64_t count = 0;
@@ -1512,7 +1519,8 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 
 		ASSERT3S(dde->dde_class, <, DDT_CLASSES);
 
-		ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
+		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
+		ddp = &dde->dde_phys[p];
 
 		/*
 		 * This entry already existed (dde_type is real), so it must
diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c
index 82b682019ae..5449eca3afb 100644
--- a/module/zfs/ddt_stats.c
+++ b/module/zfs/ddt_stats.c
@@ -36,14 +36,15 @@ static void
 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
 {
 	spa_t *spa = ddt->ddt_spa;
-	ddt_phys_t *ddp = dde->dde_phys;
 	ddt_key_t *ddk = &dde->dde_key;
 	uint64_t lsize = DDK_GET_LSIZE(ddk);
 	uint64_t psize = DDK_GET_PSIZE(ddk);
 
 	memset(dds, 0, sizeof (*dds));
 
-	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		ddt_phys_t *ddp = &dde->dde_phys[p];
+
 		uint64_t dsize = 0;
 		uint64_t refcnt = ddp->ddp_refcnt;
 
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
index 7ce7461a2b2..8f1bbeeecd8 100644
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2023, Klara Inc.
  */
 
 #include <sys/zfs_context.h>
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 085cfd3c569..737ee4f6600 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2933,7 +2933,6 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
 {
 	(void) tx;
 	const ddt_key_t *ddk = &dde->dde_key;
-	ddt_phys_t *ddp = dde->dde_phys;
 	blkptr_t bp;
 	zbookmark_phys_t zb = { 0 };
 
@@ -2954,7 +2953,10 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
 	if (scn->scn_done_txg != 0)
 		return;
 
-	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+	ddt_t *ddt = ddt_select_checksum(tx->tx_pool->dp_spa, checksum);
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		ddt_phys_t *ddp = &dde->dde_phys[p];
+
 		if (ddp->ddp_phys_birth == 0 ||
 		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
 			continue;
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 5810e811a39..914f83fb9f9 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3254,12 +3254,14 @@ static void
 zio_ddt_child_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
+	ddt_t *ddt;
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp;
 	zio_t *pio = zio_unique_parent(zio);
 
 	mutex_enter(&pio->io_lock);
-	ddp = ddt_phys_select(dde, bp);
+	ddt = ddt_select(zio->io_spa, bp);
+	ddp = ddt_phys_select(ddt, dde, bp);
 	if (zio->io_error == 0)
 		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
 
@@ -3282,8 +3284,7 @@ zio_ddt_read_start(zio_t *zio)
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
-		ddt_phys_t *ddp = dde->dde_phys;
-		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+		ddt_phys_t *ddp_self = ddt_phys_select(ddt, dde, bp);
 		blkptr_t blk;
 
 		ASSERT(zio->io_vsd == NULL);
@@ -3292,7 +3293,8 @@ zio_ddt_read_start(zio_t *zio)
 		if (ddp_self == NULL)
 			return (zio);
 
-		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+			ddt_phys_t *ddp = &dde->dde_phys[p];
 			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
 				continue;
 			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
@@ -3372,7 +3374,10 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 	 * loaded).
 	 */
 
-	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		if (DDT_PHYS_IS_DITTO(ddt, p))
+			continue;
+
 		zio_t *lio = dde->dde_lead_zio[p];
 
 		if (lio != NULL && do_raw) {
@@ -3384,7 +3389,10 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 		}
 	}
 
-	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		if (DDT_PHYS_IS_DITTO(ddt, p))
+			continue;
+
 		ddt_phys_t *ddp = &dde->dde_phys[p];
 
 		if (ddp->ddp_phys_birth != 0 && do_raw) {
@@ -3452,15 +3460,16 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 static void
 zio_ddt_child_write_ready(zio_t *zio)
 {
-	int p = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
-	ddt_phys_t *ddp = &dde->dde_phys[p];
 	zio_t *pio;
 
 	if (zio->io_error)
 		return;
 
+	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
+	ddt_phys_t *ddp = &dde->dde_phys[p];
+
 	ddt_enter(ddt);
 
 	ASSERT(dde->dde_lead_zio[p] == zio);
@@ -3477,9 +3486,10 @@ zio_ddt_child_write_ready(zio_t *zio)
 static void
 zio_ddt_child_write_done(zio_t *zio)
 {
-	int p = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
+
+	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 
 	ddt_enter(ddt);
@@ -3506,11 +3516,9 @@ zio_ddt_write(zio_t *zio)
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
 	zio_prop_t *zp = &zio->io_prop;
-	int p = zp->zp_copies;
 	zio_t *cio = NULL;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
-	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
@@ -3528,7 +3536,9 @@ zio_ddt_write(zio_t *zio)
 		ddt_exit(ddt);
 		return (zio);
 	}
-	ddp = &dde->dde_phys[p];
+
+	int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
+	ddt_phys_t *ddp = &dde->dde_phys[p];
 
 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
 		/*
@@ -3600,7 +3610,7 @@ zio_ddt_free(zio_t *zio)
 	ddt_enter(ddt);
 	freedde = dde = ddt_lookup(ddt, bp);
 	if (dde) {
-		ddp = ddt_phys_select(dde, bp);
+		ddp = ddt_phys_select(ddt, dde, bp);
 		if (ddp)
 			ddt_phys_decref(ddp);
 	}

From 4d686c3da53db5e5f3f3cc52060d9fbca2baf092 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 3 Jul 2023 22:16:04 +1000
Subject: [PATCH 11/59] ddt: introduce lightweight entry

The idea here is that sometimes you need the contents of an entry with
no intent to modify it, and/or from a place where it's difficult to get
hold of its originating ddt_t to know how to interpret it.

A lightweight entry contains everything you might need to "read" an
entry - its key, type and phys contents - but none of the extras for
modifying it or using it in a larger context. It also has the full
complement of phys slots, so it can represent any kind of dedup entry
without having to know the specific configuration of the table it came
from.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
---
 cmd/zdb/zdb.c          | 15 ++++++++-------
 include/sys/ddt.h      | 16 ++++++++++++++--
 include/sys/ddt_impl.h | 13 ++++++++++++-
 include/sys/dsl_scan.h |  2 +-
 module/zfs/ddt.c       | 31 ++++++++++++++++---------------
 module/zfs/dsl_scan.c  | 15 +++++++--------
 6 files changed, 58 insertions(+), 34 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 7a6459b756b..3bde5736c0f 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1914,15 +1914,16 @@ dump_log_spacemaps(spa_t *spa)
 }
 
 static void
-dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
+    uint64_t index)
 {
-	const ddt_key_t *ddk = &dde->dde_key;
+	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
 	char blkbuf[BP_SPRINTF_LEN];
 	blkptr_t blk;
 	int p;
 
-	for (p = 0; p < DDT_NPHYS(ddt); p++) {
-		const ddt_phys_t *ddp = &dde->dde_phys[p];
+	for (p = 0; p < ddlwe->ddlwe_nphys; p++) {
+		const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p];
 		if (ddp->ddp_phys_birth == 0)
 			continue;
 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
@@ -1959,7 +1960,7 @@ static void
 dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
 	char name[DDT_NAMELEN];
-	ddt_entry_t dde;
+	ddt_lightweight_entry_t ddlwe;
 	uint64_t walk = 0;
 	dmu_object_info_t doi;
 	uint64_t count, dspace, mspace;
@@ -2000,8 +2001,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 
 	(void) printf("%s contents:\n\n", name);
 
-	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
-		dump_dde(ddt, &dde, walk);
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0)
+		dump_ddt_entry(ddt, &ddlwe, walk);
 
 	ASSERT3U(error, ==, ENOENT);
 
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index a2e069f1392..7a091669090 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -173,6 +173,18 @@ typedef struct {
 	avl_node_t	dde_node;	/* ddt_tree node */
 } ddt_entry_t;
 
+/*
+ * A lightweight entry is for short-lived or transient uses, like iterating or
+ * inspecting, when you don't care where it came from.
+ */
+typedef struct {
+	ddt_key_t	ddlwe_key;
+	ddt_type_t	ddlwe_type;
+	ddt_class_t	ddlwe_class;
+	uint8_t		ddlwe_nphys;
+	ddt_phys_t	ddlwe_phys[DDT_PHYS_MAX];
+} ddt_lightweight_entry_t;
+
 /*
  * In-core DDT object. This covers all entries and stats for a the whole pool
  * for a given checksum type.
@@ -241,7 +253,6 @@ extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
 extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize);
 
 extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
-extern ddt_t *ddt_select_checksum(spa_t *spa, enum zio_checksum checksum);
 extern void ddt_enter(ddt_t *ddt);
 extern void ddt_exit(ddt_t *ddt);
 extern void ddt_init(void);
@@ -263,7 +274,8 @@ extern void ddt_create(spa_t *spa);
 extern int ddt_load(spa_t *spa);
 extern void ddt_unload(spa_t *spa);
 extern void ddt_sync(spa_t *spa, uint64_t txg);
-extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
+    ddt_lightweight_entry_t *ddlwe);
 
 extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
 
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index e97b71621c3..e88a046ab8a 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -41,6 +41,17 @@ extern "C" {
 #define	DDT_DIR_VERSION		"version"
 #define	DDT_DIR_FLAGS		"flags"
 
+/* Fill a lightweight entry from a live entry. */
+#define	DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do {		\
+	memset((ddlwe), 0, sizeof (*ddlwe));			\
+	(ddlwe)->ddlwe_key = (dde)->dde_key;			\
+	(ddlwe)->ddlwe_type = (dde)->dde_type;			\
+	(ddlwe)->ddlwe_class = (dde)->dde_class;		\
+	(ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt);			\
+	for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++)		\
+		(ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p];	\
+} while (0)
+
 /*
  * Ops vector to access a specific DDT object type.
  */
@@ -91,7 +102,7 @@ extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
 extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
     char *name);
 extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
-    uint64_t *walk, ddt_entry_t *dde);
+    uint64_t *walk, ddt_lightweight_entry_t *ddlwe);
 extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
     uint64_t *count);
 extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h
index f32f59a2bed..b91d7f4be88 100644
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
 boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
 boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
 void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
-    ddt_entry_t *dde, dmu_tx_t *tx);
+    ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
 void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 9bb0b8f15fc..aac2250bf30 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -401,13 +401,20 @@ ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 
 int
 ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
-    uint64_t *walk, ddt_entry_t *dde)
+    uint64_t *walk, ddt_lightweight_entry_t *ddlwe)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));
 
-	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
-	    ddt->ddt_object[type][class], walk, &dde->dde_key,
-	    dde->dde_phys, sizeof (dde->dde_phys)));
+	int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+	    ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key,
+	    ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys));
+	if (error == 0) {
+		ddlwe->ddlwe_type = type;
+		ddlwe->ddlwe_class = class;
+		ddlwe->ddlwe_nphys = DDT_NPHYS(ddt);
+		return (0);
+	}
+	return (error);
 }
 
 int
@@ -572,12 +579,6 @@ ddt_select(spa_t *spa, const blkptr_t *bp)
 	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 }
 
-ddt_t *
-ddt_select_checksum(spa_t *spa, enum zio_checksum checksum)
-{
-	return (spa->spa_ddt[checksum]);
-}
-
 void
 ddt_enter(ddt_t *ddt)
 {
@@ -1347,8 +1348,10 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 		 * traversing.)
 		 */
 		if (nclass < oclass) {
+			ddt_lightweight_entry_t ddlwe;
+			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
 			dsl_scan_ddt_entry(dp->dp_scan,
-			    ddt->ddt_checksum, dde, tx);
+			    ddt->ddt_checksum, &ddlwe, tx);
 		}
 	}
 }
@@ -1455,7 +1458,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
 }
 
 int
-ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
 {
 	do {
 		do {
@@ -1468,10 +1471,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
 				    ddb->ddb_class)) {
 					error = ddt_object_walk(ddt,
 					    ddb->ddb_type, ddb->ddb_class,
-					    &ddb->ddb_cursor, dde);
+					    &ddb->ddb_cursor, ddlwe);
 				}
-				dde->dde_type = ddb->ddb_type;
-				dde->dde_class = ddb->ddb_class;
 				if (error == 0)
 					return (0);
 				if (error != ENOENT)
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 737ee4f6600..dec0eb28dc5 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2929,10 +2929,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 
 void
 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
-    ddt_entry_t *dde, dmu_tx_t *tx)
+    ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
 {
 	(void) tx;
-	const ddt_key_t *ddk = &dde->dde_key;
+	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
 	blkptr_t bp;
 	zbookmark_phys_t zb = { 0 };
 
@@ -2953,9 +2953,8 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
 	if (scn->scn_done_txg != 0)
 		return;
 
-	ddt_t *ddt = ddt_select_checksum(tx->tx_pool->dp_spa, checksum);
-	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		ddt_phys_t *ddp = &dde->dde_phys[p];
+	for (int p = 0; p < ddlwe->ddlwe_nphys; p++) {
+		ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p];
 
 		if (ddp->ddp_phys_birth == 0 ||
 		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
@@ -3004,11 +3003,11 @@ static void
 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
-	ddt_entry_t dde = {{{{0}}}};
+	ddt_lightweight_entry_t ddlwe = {0};
 	int error;
 	uint64_t n = 0;
 
-	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) {
 		ddt_t *ddt;
 
 		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
@@ -3023,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
 		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
 
-		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &ddlwe, tx);
 		n++;
 
 		if (dsl_scan_check_suspend(scn, NULL))

From 0ba5f503c5d644d28429c366fd1cdbd1c6c9b2b9 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 3 Jul 2023 19:54:40 +1000
Subject: [PATCH 12/59] ddt: slim down ddt_entry_t

This slims down the in-memory entry to as small as it can be. The
IO-related parts are made into a separate entry, since they're
relatively rarely needed.

The variable allocation for dde_phys is to support the upcoming flat
format.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
---
 include/sys/ddt.h | 22 ++++++++++++++++------
 module/zfs/ddt.c  | 46 +++++++++++++++++++++++++++++++++-------------
 module/zfs/zio.c  | 26 ++++++++++++++------------
 3 files changed, 63 insertions(+), 31 deletions(-)

diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 7a091669090..222373c98a0 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -151,16 +151,22 @@ typedef struct {
 #define	DDE_FLAG_LOADED		(1 << 0)	/* entry ready for use */
 #define	DDE_FLAG_OVERQUOTA	(1 << 1)	/* entry unusable, no space */
 
+/*
+ * Additional data to support entry update or repair. This is fixed size
+ * because its relatively rarely used.
+ */
 typedef struct {
-	/* key must be first for ddt_key_compare */
-	ddt_key_t	dde_key;		/* ddt_tree key */
-	ddt_phys_t	dde_phys[DDT_PHYS_MAX];	/* on-disk data */
+	/* copy of data after a repair read, to be rewritten */
+	abd_t		*dde_repair_abd;
 
 	/* in-flight update IOs */
 	zio_t		*dde_lead_zio[DDT_PHYS_MAX];
+} ddt_entry_io_t;
 
-	/* copy of data after a repair read, to be rewritten */
-	struct abd	*dde_repair_abd;
+typedef struct {
+	/* key must be first for ddt_key_compare */
+	ddt_key_t	dde_key;	/* ddt_tree key */
+	avl_node_t	dde_node;	/* ddt_tree_node */
 
 	/* storage type and class the entry was loaded from */
 	ddt_type_t	dde_type;
@@ -170,7 +176,9 @@ typedef struct {
 	kcondvar_t	dde_cv;		/* signaled when load completes */
 	uint64_t	dde_waiters;	/* count of waiters on dde_cv */
 
-	avl_node_t	dde_node;	/* ddt_tree node */
+	ddt_entry_io_t	*dde_io;	/* IO support, when required */
+
+	ddt_phys_t	dde_phys[];	/* physical data */
 } ddt_entry_t;
 
 /*
@@ -265,6 +273,8 @@ extern void ddt_prefetch_all(spa_t *spa);
 extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
     const blkptr_t *bp);
 
+extern void ddt_alloc_entry_io(ddt_entry_t *dde);
+
 extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
 extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
 
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index aac2250bf30..213e042394f 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -164,6 +164,9 @@
 static kmem_cache_t *ddt_cache;
 static kmem_cache_t *ddt_entry_cache;
 
+#define	DDT_ENTRY_SIZE	\
+	(sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX)
+
 /*
  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  */
@@ -343,7 +346,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 
 	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 	    ddt->ddt_object[type][class], &dde->dde_key,
-	    dde->dde_phys, sizeof (dde->dde_phys)));
+	    dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)));
 }
 
 static int
@@ -386,7 +389,7 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 
 	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
 	    ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys,
-	    sizeof (dde->dde_phys), tx));
+	    sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx));
 }
 
 static int
@@ -597,7 +600,7 @@ ddt_init(void)
 	ddt_cache = kmem_cache_create("ddt_cache",
 	    sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
-	    sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	    DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 void
@@ -613,7 +616,7 @@ ddt_alloc(const ddt_key_t *ddk)
 	ddt_entry_t *dde;
 
 	dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
-	memset(dde, 0, sizeof (ddt_entry_t));
+	memset(dde, 0, DDT_ENTRY_SIZE);
 	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 
 	dde->dde_key = *ddk;
@@ -621,14 +624,27 @@ ddt_alloc(const ddt_key_t *ddk)
 	return (dde);
 }
 
+void
+ddt_alloc_entry_io(ddt_entry_t *dde)
+{
+	if (dde->dde_io != NULL)
+		return;
+
+	dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
+}
+
 static void
 ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
 {
-	for (int p = 0; p < DDT_NPHYS(ddt); p++)
-		ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
+	if (dde->dde_io != NULL) {
+		for (int p = 0; p < DDT_NPHYS(ddt); p++)
+			ASSERT3P(dde->dde_io->dde_lead_zio[p], ==, NULL);
 
-	if (dde->dde_repair_abd != NULL)
-		abd_free(dde->dde_repair_abd);
+		if (dde->dde_io->dde_repair_abd != NULL)
+			abd_free(dde->dde_io->dde_repair_abd);
+
+		kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
+	}
 
 	cv_destroy(&dde->dde_cv);
 	kmem_cache_free(ddt_entry_cache, dde);
@@ -1191,6 +1207,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 	ddt_key_fill(&ddk, bp);
 
 	dde = ddt_alloc(&ddk);
+	ddt_alloc_entry_io(dde);
 
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
@@ -1205,7 +1222,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 		}
 	}
 
-	memset(dde->dde_phys, 0, sizeof (dde->dde_phys));
+	memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt));
 
 	return (dde);
 }
@@ -1217,7 +1234,8 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 
 	ddt_enter(ddt);
 
-	if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
+	if (dde->dde_io->dde_repair_abd != NULL &&
+	    spa_writeable(ddt->ddt_spa) &&
 	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 		avl_insert(&ddt->ddt_repair_tree, dde, where);
 	else
@@ -1255,8 +1273,9 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 			continue;
 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
-		    rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
-		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+		    rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
+		    NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+		    ZIO_DDT_CHILD_FLAGS(zio), NULL));
 	}
 
 	zio_nowait(zio);
@@ -1301,7 +1320,8 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 	ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
 
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
+		ASSERT(dde->dde_io == NULL ||
+		    dde->dde_io->dde_lead_zio[p] == NULL);
 		ddt_phys_t *ddp = &dde->dde_phys[p];
 		if (ddp->ddp_phys_birth == 0) {
 			ASSERT0(ddp->ddp_refcnt);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 914f83fb9f9..1ca71c738c8 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3265,8 +3265,8 @@ zio_ddt_child_read_done(zio_t *zio)
 	if (zio->io_error == 0)
 		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
 
-	if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
-		dde->dde_repair_abd = zio->io_abd;
+	if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
+		dde->dde_io->dde_repair_abd = zio->io_abd;
 	else
 		abd_free(zio->io_abd);
 	mutex_exit(&pio->io_lock);
@@ -3340,8 +3340,8 @@ zio_ddt_read_done(zio_t *zio)
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 			return (NULL);
 		}
-		if (dde->dde_repair_abd != NULL) {
-			abd_copy(zio->io_abd, dde->dde_repair_abd,
+		if (dde->dde_io->dde_repair_abd != NULL) {
+			abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd,
 			    zio->io_size);
 			zio->io_child_error[ZIO_CHILD_DDT] = 0;
 		}
@@ -3378,7 +3378,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 		if (DDT_PHYS_IS_DITTO(ddt, p))
 			continue;
 
-		zio_t *lio = dde->dde_lead_zio[p];
+		zio_t *lio = dde->dde_io->dde_lead_zio[p];
 
 		if (lio != NULL && do_raw) {
 			return (lio->io_size != zio->io_size ||
@@ -3472,7 +3472,7 @@ zio_ddt_child_write_ready(zio_t *zio)
 
 	ddt_enter(ddt);
 
-	ASSERT(dde->dde_lead_zio[p] == zio);
+	ASSERT(dde->dde_io->dde_lead_zio[p] == zio);
 
 	ddt_phys_fill(ddp, zio->io_bp);
 
@@ -3495,8 +3495,8 @@ zio_ddt_child_write_done(zio_t *zio)
 	ddt_enter(ddt);
 
 	ASSERT(ddp->ddp_refcnt == 0);
-	ASSERT(dde->dde_lead_zio[p] == zio);
-	dde->dde_lead_zio[p] = NULL;
+	ASSERT(dde->dde_io->dde_lead_zio[p] == zio);
+	dde->dde_io->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error == 0) {
 		zio_link_t *zl = NULL;
@@ -3563,11 +3563,13 @@ zio_ddt_write(zio_t *zio)
 		return (zio);
 	}
 
-	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+	ddt_alloc_entry_io(dde);
+
+	if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) {
 		if (ddp->ddp_phys_birth != 0)
 			ddt_bp_fill(ddp, bp, txg);
-		if (dde->dde_lead_zio[p] != NULL)
-			zio_add_child(zio, dde->dde_lead_zio[p]);
+		if (dde->dde_io->dde_lead_zio[p] != NULL)
+			zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
 		else
 			ddt_phys_addref(ddp);
 	} else if (zio->io_bp_override) {
@@ -3583,7 +3585,7 @@ zio_ddt_write(zio_t *zio)
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
-		dde->dde_lead_zio[p] = cio;
+		dde->dde_io->dde_lead_zio[p] = cio;
 	}
 
 	ddt_exit(ddt);

From f4aeb23f521cb4c5d94b103c926a3cc7b7be8abc Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 20 Jun 2023 11:09:48 +1000
Subject: [PATCH 13/59] ddt: add "flat phys" feature

Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, copies=) parameter. Each of these are tracked
independently of each other, and have their own set of DVAs. This leads
to an (admittedly rare) situation where you can create as many as six
copies of the data, by changing the copies= parameter between copying.
This is both a waste of storage on disk and a waste of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.

This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.

Because a single phys is no longer all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.

Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
---
 cmd/zdb/zdb.c          |  68 ++++---
 include/sys/ddt.h      | 122 +++++++++---
 include/sys/ddt_impl.h |  20 +-
 include/sys/dsl_scan.h |   2 +-
 include/sys/spa.h      |   7 +-
 module/zfs/ddt.c       | 344 +++++++++++++++++++++++++---------
 module/zfs/ddt_stats.c |  20 +-
 module/zfs/ddt_zap.c   |   6 +-
 module/zfs/dsl_scan.c  |  14 +-
 module/zfs/zio.c       | 412 ++++++++++++++++++++++++++++++++---------
 10 files changed, 757 insertions(+), 258 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 3bde5736c0f..142f55b299e 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1922,14 +1922,16 @@ dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
 	blkptr_t blk;
 	int p;
 
-	for (p = 0; p < ddlwe->ddlwe_nphys; p++) {
-		const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p];
-		if (ddp->ddp_phys_birth == 0)
+	for (p = 0; p < DDT_NPHYS(ddt); p++) {
+		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+
+		if (ddt_phys_birth(ddp, v) == 0)
 			continue;
-		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
 		(void) printf("index %llx refcnt %llu phys %d %s\n",
-		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+		    (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
 		    p, blkbuf);
 	}
 }
@@ -3311,8 +3313,7 @@ zdb_ddt_cleanup(spa_t *spa)
 		ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
 		while (dde) {
 			next = AVL_NEXT(&ddt->ddt_tree, dde);
-			memset(&dde->dde_lead_zio, 0,
-			    sizeof (dde->dde_lead_zio));
+			dde->dde_io = NULL;
 			ddt_remove(ddt, dde);
 			dde = next;
 		}
@@ -5689,6 +5690,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 
 	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
 
+	blkptr_t tempbp;
 	if (BP_GET_DEDUP(bp)) {
 		/*
 		 * Dedup'd blocks are special. We need to count them, so we can
@@ -5724,35 +5726,51 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 		VERIFY3P(dde, !=, NULL);
 
 		/* Get the phys for this variant */
-		ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp);
-		VERIFY3P(ddp, !=, NULL);
+		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
 
 		/*
 		 * This entry may have multiple sets of DVAs. We must claim
 		 * each set the first time we see them in a real block on disk,
 		 * or count them on subsequent occurences. We don't have a
 		 * convenient way to track the first time we see each variant,
-		 * so we repurpose dde_lead_zio[] as a per-phys "seen" flag. We
-		 * can do this safely in zdb because it never writes, so it
-		 * will never have a writing zio for this block in that
-		 * pointer.
+		 * so we repurpose dde_io as a set of "seen" flag bits. We can
+		 * do this safely in zdb because it never writes, so it will
+		 * never have a writing zio for this block in that pointer.
 		 */
-
-		/*
-		 * Work out which dde_phys index was used, get the seen flag,
-		 * and update it if necessary.
-		 */
-		uint_t idx =
-		    ((uint_t)((uintptr_t)ddp - (uintptr_t)dde->dde_phys)) /
-		    sizeof (ddt_phys_t);
-		VERIFY3P(ddp, ==, &dde->dde_phys[idx]);
-		boolean_t seen = (boolean_t)(uintptr_t)dde->dde_lead_zio[idx];
+		boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
 		if (!seen)
-			dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE;
+			dde->dde_io =
+			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));
 
 		/* Consume a reference for this block. */
 		VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0);
-		ddt_phys_decref(ddp);
+		ddt_phys_decref(dde->dde_phys, v);
+
+		/*
+		 * If this entry has a single flat phys, it may have been
+		 * extended with additional DVAs at some time in its life.
+		 * This block might be from before it was fully extended, and
+		 * so have fewer DVAs.
+		 *
+		 * If this is the first time we've seen this block, and we
+		 * claimed it as-is, then we would miss the claim on some
+		 * number of DVAs, which would then be seen as leaked.
+		 *
+		 * In all cases, if we've had fewer DVAs, then the asize would
+		 * be too small, and would lead to the pool apparently using
+		 * more space than allocated.
+		 *
+		 * To handle this, we copy the canonical set of DVAs from the
+		 * entry back to the block pointer before we claim it.
+		 */
+		if (v == DDT_PHYS_FLAT) {
+			ASSERT3U(BP_GET_BIRTH(bp), ==,
+			    ddt_phys_birth(dde->dde_phys, v));
+			tempbp = *bp;
+			ddt_bp_fill(dde->dde_phys, v, &tempbp,
+			    BP_GET_BIRTH(bp));
+			bp = &tempbp;
+		}
 
 		if (seen) {
 			/*
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 222373c98a0..11e09eef3bc 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -42,8 +42,8 @@ struct abd;
 /*
  * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
  */
-/* No flags yet. */
-#define	DDT_FLAG_MASK	(0)
+#define	DDT_FLAG_FLAT	(1 << 0)	/* single extensible phys */
+#define	DDT_FLAG_MASK	(DDT_FLAG_FLAT)
 
 /*
  * DDT on-disk storage object types. Each one corresponds to specific
@@ -126,21 +126,80 @@ typedef struct {
  * characteristics of the stored block, such as its location on disk (DVAs),
  * birth txg and ref count.
  *
- * Note that an entry has an array of four ddt_phys_t, one for each number of
- * DVAs (copies= property) and another for additional "ditto" copies. Most
- * users of ddt_phys_t will handle indexing into or counting the phys they
- * want.
+ * The "traditional" entry has an array of four, one for each number of DVAs
+ * (copies= property) and another for additional "ditto" copies. Users of the
+ * traditional struct will specify the variant (index) of the one they want.
+ *
+ * The newer "flat" entry has only a single form that is specified using the
+ * DDT_PHYS_FLAT variant.
+ *
+ * Since the value size varies, use one of the size macros when interfacing
+ * with the ddt zap.
  */
-typedef struct {
-	dva_t		ddp_dva[SPA_DVAS_PER_BP];
-	uint64_t	ddp_refcnt;
-	uint64_t	ddp_phys_birth;
-} ddt_phys_t;
 
-#define	DDT_PHYS_MAX			(4)
-#define	DDT_NPHYS(ddt)			((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX)
-#define	DDT_PHYS_IS_DITTO(ddt, p)	((ddt) && p == 0)
-#define	DDT_PHYS_FOR_COPIES(ddt, p)	((ddt) ? (p) : (p))
+#define	DDT_PHYS_MAX	(4)
+
+/*
+ * Note - this can be used in a flexible array and allocated for
+ * a specific size (ddp_trad or ddp_flat). So be careful not to
+ * copy using "=" assignment but instead use ddt_phys_copy().
+ */
+typedef union {
+	/*
+	 * Traditional physical payload value for DDT zap (256 bytes)
+	 */
+	struct {
+		dva_t		ddp_dva[SPA_DVAS_PER_BP];
+		uint64_t	ddp_refcnt;
+		uint64_t	ddp_phys_birth;
+	} ddp_trad[DDT_PHYS_MAX];
+
+	/*
+	 * Flat physical payload value for DDT zap (72 bytes)
+	 */
+	struct {
+		dva_t		ddp_dva[SPA_DVAS_PER_BP];
+		uint64_t	ddp_refcnt;
+		uint64_t	ddp_phys_birth; /* txg based from BP */
+		uint64_t	ddp_class_start; /* in realtime seconds */
+	} ddp_flat;
+} ddt_univ_phys_t;
+
+/*
+ * This enum denotes which variant of a ddt_univ_phys_t to target. For
+ * a traditional DDT entry, it represents the indexes into the ddp_trad
+ * array. Any consumer of a ddt_univ_phys_t needs to know which variant
+ * is being targeted.
+ *
+ * Note, we no longer generate new DDT_PHYS_DITTO-type blocks.  However,
+ * we maintain the ability to free existing dedup-ditto blocks.
+ */
+
+typedef enum {
+	DDT_PHYS_DITTO = 0,
+	DDT_PHYS_SINGLE = 1,
+	DDT_PHYS_DOUBLE = 2,
+	DDT_PHYS_TRIPLE = 3,
+	DDT_PHYS_FLAT = 4,
+	DDT_PHYS_NONE = 5
+} ddt_phys_variant_t;
+
+#define	DDT_PHYS_VARIANT(ddt, p)	\
+	(ASSERT((p) < DDT_PHYS_NONE),	\
+	((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))
+
+#define	DDT_TRAD_PHYS_SIZE	sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
+#define	DDT_FLAT_PHYS_SIZE	sizeof (((ddt_univ_phys_t *)0)->ddp_flat)
+
+#define	_DDT_PHYS_SWITCH(ddt, flat, trad)	\
+	(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))
+
+#define	DDT_PHYS_SIZE(ddt)		_DDT_PHYS_SWITCH(ddt,	\
+	DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)
+
+#define	DDT_NPHYS(ddt)			_DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
+#define	DDT_PHYS_FOR_COPIES(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, p)
+#define	DDT_PHYS_IS_DITTO(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, (p == 0))
 
 /*
  * A "live" entry, holding changes to an entry made this txg, and other data to
@@ -159,6 +218,9 @@ typedef struct {
 	/* copy of data after a repair read, to be rewritten */
 	abd_t		*dde_repair_abd;
 
+	/* original phys contents before update, for error handling */
+	ddt_univ_phys_t	dde_orig_phys;
+
 	/* in-flight update IOs */
 	zio_t		*dde_lead_zio[DDT_PHYS_MAX];
 } ddt_entry_io_t;
@@ -178,7 +240,7 @@ typedef struct {
 
 	ddt_entry_io_t	*dde_io;	/* IO support, when required */
 
-	ddt_phys_t	dde_phys[];	/* physical data */
+	ddt_univ_phys_t	dde_phys[];	/* flexible -- allocated size varies */
 } ddt_entry_t;
 
 /*
@@ -189,8 +251,7 @@ typedef struct {
 	ddt_key_t	ddlwe_key;
 	ddt_type_t	ddlwe_type;
 	ddt_class_t	ddlwe_class;
-	uint8_t		ddlwe_nphys;
-	ddt_phys_t	ddlwe_phys[DDT_PHYS_MAX];
+	ddt_univ_phys_t	ddlwe_phys;
 } ddt_lightweight_entry_t;
 
 /*
@@ -236,17 +297,26 @@ typedef struct {
 	uint64_t	ddb_cursor;
 } ddt_bookmark_t;
 
-extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
-    uint64_t txg);
+extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
+    blkptr_t *bp, uint64_t txg);
 extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
-    const ddt_phys_t *ddp, blkptr_t *bp);
+    const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);
 
-extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
-extern void ddt_phys_clear(ddt_phys_t *ddp);
-extern void ddt_phys_addref(ddt_phys_t *ddp);
-extern void ddt_phys_decref(ddt_phys_t *ddp);
-extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde,
+extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
     const blkptr_t *bp);
+extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
+    ddt_phys_variant_t v);
+extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
+extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
+extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
+extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
+    ddt_phys_variant_t v);
+extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
+    const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
+    ddt_phys_variant_t v);
+extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
+    boolean_t encrypted);
 
 extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
 extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index e88a046ab8a..c4e681fb117 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -42,14 +42,12 @@ extern "C" {
 #define	DDT_DIR_FLAGS		"flags"
 
 /* Fill a lightweight entry from a live entry. */
-#define	DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do {		\
-	memset((ddlwe), 0, sizeof (*ddlwe));			\
-	(ddlwe)->ddlwe_key = (dde)->dde_key;			\
-	(ddlwe)->ddlwe_type = (dde)->dde_type;			\
-	(ddlwe)->ddlwe_class = (dde)->dde_class;		\
-	(ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt);			\
-	for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++)		\
-		(ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p];	\
+#define	DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do {			\
+	memset((ddlwe), 0, sizeof (*ddlwe));				\
+	(ddlwe)->ddlwe_key = (dde)->dde_key;				\
+	(ddlwe)->ddlwe_type = (dde)->dde_type;				\
+	(ddlwe)->ddlwe_class = (dde)->dde_class;			\
+	memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
 } while (0)
 
 /*
@@ -61,19 +59,19 @@ typedef struct {
 	    boolean_t prehash);
 	int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
 	int (*ddt_op_lookup)(objset_t *os, uint64_t object,
-	    const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
+	    const ddt_key_t *ddk, void *phys, size_t psize);
 	int (*ddt_op_contains)(objset_t *os, uint64_t object,
 	    const ddt_key_t *ddk);
 	void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
 	    const ddt_key_t *ddk);
 	void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
 	int (*ddt_op_update)(objset_t *os, uint64_t object,
-	    const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
+	    const ddt_key_t *ddk, const void *phys, size_t psize,
 	    dmu_tx_t *tx);
 	int (*ddt_op_remove)(objset_t *os, uint64_t object,
 	    const ddt_key_t *ddk, dmu_tx_t *tx);
 	int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
-	    ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
+	    ddt_key_t *ddk, void *phys, size_t psize);
 	int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
 } ddt_ops_t;
 
diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h
index b91d7f4be88..63734dbc176 100644
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
 boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
 boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
 void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
-    ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
+    ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
 void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 3998f5a6de7..a70912335b1 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -572,7 +572,7 @@ typedef struct blkptr {
 #define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
 				BP_GET_PSIZE(bp))
 
-#define	BP_ZERO(bp)				\
+#define	BP_ZERO_DVAS(bp)			\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
 	(bp)->blk_dva[0].dva_word[1] = 0;	\
@@ -580,6 +580,11 @@ typedef struct blkptr {
 	(bp)->blk_dva[1].dva_word[1] = 0;	\
 	(bp)->blk_dva[2].dva_word[0] = 0;	\
 	(bp)->blk_dva[2].dva_word[1] = 0;	\
+}
+
+#define	BP_ZERO(bp)				\
+{						\
+	BP_ZERO_DVAS(bp);			\
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 213e042394f..59526394bd0 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -75,12 +75,19 @@
  * fill the BP with the DVAs from the entry, increment the refcount and cause
  * the write IO to return immediately.
  *
- * Each ddt_phys_t slot in the entry represents a separate dedup block for the
- * same content/checksum. The slot is selected based on the zp_copies parameter
- * the block is written with, that is, the number of DVAs in the block. The
- * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto"
- * feature. These are no longer written, and will be freed if encountered on
- * old pools.
+ * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
+ * block for the same content/checksum. The slot is selected based on the
+ * zp_copies parameter the block is written with, that is, the number of DVAs
+ * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for the
+ * now-removed "dedupditto" feature. These are no longer written, and will be
+ * freed if encountered on old pools.
+ *
+ * If the "fast_dedup" feature is enabled, new dedup tables will be created
+ * with the "flat phys" option. In this mode, there is only one ddt_phys_t
+ * slot. If a write is issued for an entry that exists, but has fewer DVAs,
+ * then only as many new DVAs are allocated and written to make up the
+ * shortfall. The existing entry is then extended (ddt_phys_extend()) with the
+ * new DVAs.
  *
  * ## Lifetime of an entry
  *
@@ -130,6 +137,16 @@
  * from the alternate block. If the block is actually damaged, this will invoke
  * the pool's "self-healing" mechanism, and repair the block.
  *
+ * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
+ * use, so there is only ever one ddt_phys_t slot. The repair process will
+ * still happen in this case, though it is unlikely to succeed as there will
+ * usually be no other equivalent blocks to fall back on (though there might
+ * be, if this was an early version of a dedup'd block that has since been
+ * extended).
+ *
+ * Note that this repair mechanism is in addition to and separate from the
+ * regular OpenZFS scrub and self-healing mechanisms.
+ *
  * ## Scanning (scrub/resilver)
  *
  * If dedup is active, the scrub machinery will walk the dedup table first, and
@@ -162,10 +179,15 @@
 	c == ZIO_CHECKSUM_BLAKE3)
 
 static kmem_cache_t *ddt_cache;
-static kmem_cache_t *ddt_entry_cache;
 
-#define	DDT_ENTRY_SIZE	\
-	(sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX)
+static kmem_cache_t *ddt_entry_flat_cache;
+static kmem_cache_t *ddt_entry_trad_cache;
+
+#define	DDT_ENTRY_FLAT_SIZE	(sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
+#define	DDT_ENTRY_TRAD_SIZE	(sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
+
+#define	DDT_ENTRY_SIZE(ddt)	\
+	_DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
 
 /*
  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
@@ -195,7 +217,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
  */
 static const uint64_t ddt_version_flags[] = {
 	[DDT_VERSION_LEGACY] = 0,
-	[DDT_VERSION_FDT] = 0,
+	[DDT_VERSION_FDT] = DDT_FLAG_FLAT,
 };
 
 /* Dummy version to signal that configure is still necessary */
@@ -346,7 +368,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 
 	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 	    ddt->ddt_object[type][class], &dde->dde_key,
-	    dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)));
+	    dde->dde_phys, DDT_PHYS_SIZE(ddt)));
 }
 
 static int
@@ -388,8 +410,8 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 	ASSERT(ddt_object_exists(ddt, type, class));
 
 	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
-	    ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys,
-	    sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx));
+	    ddt->ddt_object[type][class], &dde->dde_key,
+	    dde->dde_phys, DDT_PHYS_SIZE(ddt), tx));
 }
 
 static int
@@ -410,11 +432,10 @@ ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 
 	int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 	    ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key,
-	    ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys));
+	    &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
 	if (error == 0) {
 		ddlwe->ddlwe_type = type;
 		ddlwe->ddlwe_class = class;
-		ddlwe->ddlwe_nphys = DDT_NPHYS(ddt);
 		return (0);
 	}
 	return (error);
@@ -451,13 +472,25 @@ ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 }
 
 void
-ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
+    blkptr_t *bp, uint64_t txg)
 {
 	ASSERT3U(txg, !=, 0);
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+	uint64_t phys_birth;
+	const dva_t *dvap;
+
+	if (v == DDT_PHYS_FLAT) {
+		phys_birth = ddp->ddp_flat.ddp_phys_birth;
+		dvap = ddp->ddp_flat.ddp_dva;
+	} else {
+		phys_birth = ddp->ddp_trad[v].ddp_phys_birth;
+		dvap = ddp->ddp_trad[v].ddp_dva;
+	}
 
 	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
-		bp->blk_dva[d] = ddp->ddp_dva[d];
-	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+		bp->blk_dva[d] = dvap[d];
+	BP_SET_BIRTH(bp, txg, phys_birth);
 }
 
 /*
@@ -465,13 +498,13 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
  * will be missing the salt / IV required to do a full decrypting read.
  */
 void
-ddt_bp_create(enum zio_checksum checksum,
-    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
+    const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp)
 {
 	BP_ZERO(bp);
 
 	if (ddp != NULL)
-		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+		ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v));
 
 	bp->blk_cksum = ddk->ddk_cksum;
 
@@ -502,42 +535,101 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
 }
 
 void
-ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
 {
-	ASSERT0(ddp->ddp_phys_birth);
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+	int bp_ndvas = BP_GET_NDVAS(bp);
+	int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
+	    SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
+	dva_t *dvas = (v == DDT_PHYS_FLAT) ?
+	    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
 
-	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
-		ddp->ddp_dva[d] = bp->blk_dva[d];
-	ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
-}
+	int s = 0, d = 0;
+	while (s < bp_ndvas && d < ddp_max_dvas) {
+		if (DVA_IS_VALID(&dvas[d])) {
+			d++;
+			continue;
+		}
+		dvas[d] = bp->blk_dva[s];
+		s++; d++;
+	}
 
-void
-ddt_phys_clear(ddt_phys_t *ddp)
-{
-	memset(ddp, 0, sizeof (*ddp));
-}
+	/*
+	 * If the caller offered us more DVAs than we can fit, something has
+	 * gone wrong in their accounting. zio_ddt_write() should never ask for
+	 * more than we need.
+	 */
+	ASSERT3U(s, ==, bp_ndvas);
 
-void
-ddt_phys_addref(ddt_phys_t *ddp)
-{
-	ddp->ddp_refcnt++;
-}
+	if (BP_IS_ENCRYPTED(bp))
+		dvas[2] = bp->blk_dva[2];
 
-void
-ddt_phys_decref(ddt_phys_t *ddp)
-{
-	if (ddp) {
-		ASSERT3U(ddp->ddp_refcnt, >, 0);
-		ddp->ddp_refcnt--;
+	if (ddt_phys_birth(ddp, v) == 0) {
+		if (v == DDT_PHYS_FLAT)
+			ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp);
+		else
+			ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp);
 	}
 }
 
+void
+ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
+    ddt_phys_variant_t v)
+{
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+
+	if (v == DDT_PHYS_FLAT)
+		dst->ddp_flat = src->ddp_flat;
+	else
+		dst->ddp_trad[v] = src->ddp_trad[v];
+}
+
+void
+ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
+{
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+
+	if (v == DDT_PHYS_FLAT)
+		memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE);
+	else
+		memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
+}
+
+void
+ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
+{
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+
+	if (v == DDT_PHYS_FLAT)
+		ddp->ddp_flat.ddp_refcnt++;
+	else
+		ddp->ddp_trad[v].ddp_refcnt++;
+}
+
+uint64_t
+ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
+{
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+
+	uint64_t *refcntp;
+
+	if (v == DDT_PHYS_FLAT)
+		refcntp = &ddp->ddp_flat.ddp_refcnt;
+	else
+		refcntp = &ddp->ddp_trad[v].ddp_refcnt;
+
+	ASSERT3U(*refcntp, >, 0);
+	(*refcntp)--;
+	return (*refcntp);
+}
+
 static void
-ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp,
+    ddt_phys_variant_t v, uint64_t txg)
 {
 	blkptr_t blk;
 
-	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
 
 	/*
 	 * We clear the dedup bit so that zio_free() will actually free the
@@ -545,20 +637,67 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
 	 */
 	BP_SET_DEDUP(&blk, 0);
 
-	ddt_phys_clear(ddp);
+	ddt_phys_clear(ddp, v);
 	zio_free(ddt->ddt_spa, txg, &blk);
 }
 
-ddt_phys_t *
+uint64_t
+ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
+{
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+
+	if (v == DDT_PHYS_FLAT)
+		return (ddp->ddp_flat.ddp_phys_birth);
+	else
+		return (ddp->ddp_trad[v].ddp_phys_birth);
+}
+
+int
+ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
+    boolean_t encrypted)
+{
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+
+	const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
+	    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
+
+	return (DVA_IS_VALID(&dvas[0]) +
+	    DVA_IS_VALID(&dvas[1]) +
+	    DVA_IS_VALID(&dvas[2]) * !encrypted);
+}
+
+ddt_phys_variant_t
 ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
 {
-	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		ddt_phys_t *ddp = (ddt_phys_t *)&dde->dde_phys[p];
-		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
-		    BP_GET_BIRTH(bp) == ddp->ddp_phys_birth)
-			return (ddp);
+	const ddt_univ_phys_t *ddp = dde->dde_phys;
+
+	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
+		    BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
+			return (DDT_PHYS_FLAT);
+		}
+	} else /* traditional phys */ {
+		for (int p = 0; p < DDT_PHYS_MAX; p++) {
+			if (DVA_EQUAL(BP_IDENTITY(bp),
+			    &ddp->ddp_trad[p].ddp_dva[0]) &&
+			    BP_GET_BIRTH(bp) ==
+			    ddp->ddp_trad[p].ddp_phys_birth) {
+				return (p);
+			}
+		}
 	}
-	return (NULL);
+	return (DDT_PHYS_NONE);
+}
+
+uint64_t
+ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
+{
+	ASSERT3U(v, <, DDT_PHYS_NONE);
+
+	if (v == DDT_PHYS_FLAT)
+		return (ddp->ddp_flat.ddp_refcnt);
+	else
+		return (ddp->ddp_trad[v].ddp_refcnt);
 }
 
 uint64_t
@@ -566,10 +705,11 @@ ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
 {
 	uint64_t refcnt = 0;
 
-	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		if (DDT_PHYS_IS_DITTO(ddt, p))
-			continue;
-		refcnt += dde->dde_phys[p].ddp_refcnt;
+	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+		refcnt = dde->dde_phys->ddp_flat.ddp_refcnt;
+	} else {
+		for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+			refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt;
 	}
 
 	return (refcnt);
@@ -599,24 +739,33 @@ ddt_init(void)
 {
 	ddt_cache = kmem_cache_create("ddt_cache",
 	    sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-	ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
-	    DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+	ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
+	    DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+	ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
+	    DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 void
 ddt_fini(void)
 {
-	kmem_cache_destroy(ddt_entry_cache);
+	kmem_cache_destroy(ddt_entry_trad_cache);
+	kmem_cache_destroy(ddt_entry_flat_cache);
 	kmem_cache_destroy(ddt_cache);
 }
 
 static ddt_entry_t *
-ddt_alloc(const ddt_key_t *ddk)
+ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
 {
 	ddt_entry_t *dde;
 
-	dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
-	memset(dde, 0, DDT_ENTRY_SIZE);
+	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+		dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
+		memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
+	} else {
+		dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
+		memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
+	}
+
 	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 
 	dde->dde_key = *ddk;
@@ -647,7 +796,8 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
 	}
 
 	cv_destroy(&dde->dde_cv);
-	kmem_cache_free(ddt_entry_cache, dde);
+	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+	    ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
 }
 
 void
@@ -793,7 +943,12 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 	}
 
 	/* Time to make a new entry. */
-	dde = ddt_alloc(&search);
+	dde = ddt_alloc(ddt, &search);
+
+	/* Record the time this class was created (used by ddt prune) */
+	if (ddt->ddt_flags & DDT_FLAG_FLAT)
+		dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec();
+
 	avl_insert(&ddt->ddt_tree, dde, where);
 
 	/*
@@ -1206,7 +1361,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 
 	ddt_key_fill(&ddk, bp);
 
-	dde = ddt_alloc(&ddk);
+	dde = ddt_alloc(ddt, &ddk);
 	ddt_alloc_entry_io(dde);
 
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
@@ -1222,7 +1377,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 		}
 	}
 
-	memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt));
+	memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt));
 
 	return (dde);
 }
@@ -1265,13 +1420,26 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 	    ddt_repair_entry_done, rdde, rio->io_flags);
 
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		ddt_phys_t *ddp = &dde->dde_phys[p];
-		ddt_phys_t *rddp = &rdde->dde_phys[p];
-		if (ddp->ddp_phys_birth == 0 ||
-		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
-		    memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+		ddt_univ_phys_t *ddp = dde->dde_phys;
+		ddt_univ_phys_t *rddp = rdde->dde_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+		uint64_t phys_birth = ddt_phys_birth(ddp, v);
+		const dva_t *dvas, *rdvas;
+
+		if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+			dvas = ddp->ddp_flat.ddp_dva;
+			rdvas = rddp->ddp_flat.ddp_dva;
+		} else {
+			dvas = ddp->ddp_trad[p].ddp_dva;
+			rdvas = rddp->ddp_trad[p].ddp_dva;
+		}
+
+		if (phys_birth == 0 ||
+		    phys_birth != ddt_phys_birth(rddp, v) ||
+		    memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP))
 			continue;
-		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
 		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
 		    rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
 		    NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
@@ -1297,7 +1465,8 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio)
 		rdde_next = AVL_NEXT(t, rdde);
 		avl_remove(&ddt->ddt_repair_tree, rdde);
 		ddt_exit(ddt);
-		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL,
+		    DDT_PHYS_NONE, &blk);
 		dde = ddt_repair_start(ddt, &blk);
 		ddt_repair_entry(ddt, dde, rdde, rio);
 		ddt_repair_done(ddt, dde);
@@ -1322,9 +1491,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ASSERT(dde->dde_io == NULL ||
 		    dde->dde_io->dde_lead_zio[p] == NULL);
-		ddt_phys_t *ddp = &dde->dde_phys[p];
-		if (ddp->ddp_phys_birth == 0) {
-			ASSERT0(ddp->ddp_refcnt);
+		ddt_univ_phys_t *ddp = dde->dde_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
+
+		if (ddt_phys_birth(ddp, v) == 0) {
+			ASSERT0(phys_refcnt);
 			continue;
 		}
 		if (DDT_PHYS_IS_DITTO(ddt, p)) {
@@ -1332,12 +1504,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 			 * Note, we no longer create DDT-DITTO blocks, but we
 			 * don't want to leak any written by older software.
 			 */
-			ddt_phys_free(ddt, ddk, ddp, txg);
+			ddt_phys_free(ddt, ddk, ddp, v, txg);
 			continue;
 		}
-		if (ddp->ddp_refcnt == 0)
-			ddt_phys_free(ddt, ddk, ddp, txg);
-		total_refcnt += ddp->ddp_refcnt;
+		if (phys_refcnt == 0)
+			ddt_phys_free(ddt, ddk, ddp, v, txg);
+		total_refcnt += phys_refcnt;
 	}
 
 	if (total_refcnt > 1)
@@ -1371,7 +1543,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 			ddt_lightweight_entry_t ddlwe;
 			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
 			dsl_scan_ddt_entry(dp->dp_scan,
-			    ddt->ddt_checksum, &ddlwe, tx);
+			    ddt->ddt_checksum, ddt, &ddlwe, tx);
 		}
 	}
 }
@@ -1536,12 +1708,10 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 	}
 
 	if (dde->dde_type < DDT_TYPES) {
-		ddt_phys_t *ddp;
-
 		ASSERT3S(dde->dde_class, <, DDT_CLASSES);
 
 		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
-		ddp = &dde->dde_phys[p];
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
 		/*
 		 * This entry already existed (dde_type is real), so it must
@@ -1553,9 +1723,9 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 		 * likely further action is required to fill out the DDT entry,
 		 * and this is a place that is likely to be missed in testing.
 		 */
-		ASSERT3U(ddp->ddp_refcnt, >, 0);
+		ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0);
 
-		ddt_phys_addref(ddp);
+		ddt_phys_addref(dde->dde_phys, v);
 		result = B_TRUE;
 	} else {
 		/*
diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c
index 5449eca3afb..6da77bbca5c 100644
--- a/module/zfs/ddt_stats.c
+++ b/module/zfs/ddt_stats.c
@@ -43,18 +43,22 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
 	memset(dds, 0, sizeof (*dds));
 
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		ddt_phys_t *ddp = &dde->dde_phys[p];
+		const ddt_univ_phys_t *ddp = dde->dde_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
-		uint64_t dsize = 0;
-		uint64_t refcnt = ddp->ddp_refcnt;
-
-		if (ddp->ddp_phys_birth == 0)
+		if (ddt_phys_birth(ddp, v) == 0)
 			continue;
 
-		int ndvas = DDK_GET_CRYPT(&dde->dde_key) ?
-		    SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
+		int ndvas = ddt_phys_dva_count(ddp, v,
+		    DDK_GET_CRYPT(&dde->dde_key));
+		const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
+		    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
+
+		uint64_t dsize = 0;
 		for (int d = 0; d < ndvas; d++)
-			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+			dsize += dva_get_dsize_sync(spa, &dvas[d]);
+
+		uint64_t refcnt = ddt_phys_refcnt(ddp, v);
 
 		dds->dds_blocks += 1;
 		dds->dds_lsize += lsize;
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
index 8f1bbeeecd8..4e01624f368 100644
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@@ -109,7 +109,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
 
 static int
 ddt_zap_lookup(objset_t *os, uint64_t object,
-    const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize)
+    const ddt_key_t *ddk, void *phys, size_t psize)
 {
 	uchar_t *cbuf;
 	uint64_t one, csize;
@@ -156,7 +156,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object)
 
 static int
 ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
-    const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx)
+    const void *phys, size_t psize, dmu_tx_t *tx)
 {
 	const size_t cbuf_size = psize + 1;
 
@@ -182,7 +182,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
 
 static int
 ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
-    ddt_phys_t *phys, size_t psize)
+    void *phys, size_t psize)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index dec0eb28dc5..daf1bd5d637 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2929,7 +2929,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 
 void
 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
-    ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
+    ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
 {
 	(void) tx;
 	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
@@ -2953,13 +2953,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
 	if (scn->scn_done_txg != 0)
 		return;
 
-	for (int p = 0; p < ddlwe->ddlwe_nphys; p++) {
-		ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p];
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+		uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
 
-		if (ddp->ddp_phys_birth == 0 ||
-		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+		if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
 			continue;
-		ddt_bp_create(checksum, ddk, ddp, &bp);
+		ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);
 
 		scn->scn_visited_this_txg++;
 		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
@@ -3022,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
 		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
 
-		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &ddlwe, tx);
+		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
 		n++;
 
 		if (dsl_scan_check_suspend(scn, NULL))
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 1ca71c738c8..1f3acb9b921 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3256,14 +3256,16 @@ zio_ddt_child_read_done(zio_t *zio)
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt;
 	ddt_entry_t *dde = zio->io_private;
-	ddt_phys_t *ddp;
 	zio_t *pio = zio_unique_parent(zio);
 
 	mutex_enter(&pio->io_lock);
 	ddt = ddt_select(zio->io_spa, bp);
-	ddp = ddt_phys_select(ddt, dde, bp);
-	if (zio->io_error == 0)
-		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
+
+	if (zio->io_error == 0) {
+		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
+		/* this phys variant doesn't need repair */
+		ddt_phys_clear(dde->dde_phys, v);
+	}
 
 	if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
 		dde->dde_io->dde_repair_abd = zio->io_abd;
@@ -3284,21 +3286,25 @@ zio_ddt_read_start(zio_t *zio)
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
-		ddt_phys_t *ddp_self = ddt_phys_select(ddt, dde, bp);
+		ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp);
+		ddt_univ_phys_t *ddp = dde->dde_phys;
 		blkptr_t blk;
 
 		ASSERT(zio->io_vsd == NULL);
 		zio->io_vsd = dde;
 
-		if (ddp_self == NULL)
+		if (v_self == DDT_PHYS_NONE)
 			return (zio);
 
+		/* issue I/O for the other copies */
 		for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-			ddt_phys_t *ddp = &dde->dde_phys[p];
-			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+			ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+
+			if (ddt_phys_birth(ddp, v) == 0 || v == v_self)
 				continue;
-			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
-			    &blk);
+
+			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
+			    ddp, v, &blk);
 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
 			    abd_alloc_for_io(zio->io_size, B_TRUE),
 			    zio->io_size, zio_ddt_child_read_done, dde,
@@ -3378,30 +3384,32 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 		if (DDT_PHYS_IS_DITTO(ddt, p))
 			continue;
 
-		zio_t *lio = dde->dde_io->dde_lead_zio[p];
+		if (dde->dde_io == NULL)
+			continue;
 
-		if (lio != NULL && do_raw) {
+		zio_t *lio = dde->dde_io->dde_lead_zio[p];
+		if (lio == NULL)
+			continue;
+
+		if (do_raw)
 			return (lio->io_size != zio->io_size ||
 			    abd_cmp(zio->io_abd, lio->io_abd) != 0);
-		} else if (lio != NULL) {
-			return (lio->io_orig_size != zio->io_orig_size ||
-			    abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
-		}
+
+		return (lio->io_orig_size != zio->io_orig_size ||
+		    abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
 	}
 
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		if (DDT_PHYS_IS_DITTO(ddt, p))
-			continue;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+		uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v);
 
-		ddt_phys_t *ddp = &dde->dde_phys[p];
-
-		if (ddp->ddp_phys_birth != 0 && do_raw) {
+		if (phys_birth != 0 && do_raw) {
 			blkptr_t blk = *zio->io_bp;
 			uint64_t psize;
 			abd_t *tmpabd;
 			int error;
 
-			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+			ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
 			psize = BP_GET_PSIZE(&blk);
 
 			if (psize != zio->io_size)
@@ -3424,13 +3432,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 			abd_free(tmpabd);
 			ddt_enter(ddt);
 			return (error != 0);
-		} else if (ddp->ddp_phys_birth != 0) {
+		} else if (phys_birth != 0) {
 			arc_buf_t *abuf = NULL;
 			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
-			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+			ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
 
 			if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
 				return (B_TRUE);
@@ -3457,53 +3465,88 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 	return (B_FALSE);
 }
 
-static void
-zio_ddt_child_write_ready(zio_t *zio)
-{
-	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
-	ddt_entry_t *dde = zio->io_private;
-	zio_t *pio;
-
-	if (zio->io_error)
-		return;
-
-	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
-	ddt_phys_t *ddp = &dde->dde_phys[p];
-
-	ddt_enter(ddt);
-
-	ASSERT(dde->dde_io->dde_lead_zio[p] == zio);
-
-	ddt_phys_fill(ddp, zio->io_bp);
-
-	zio_link_t *zl = NULL;
-	while ((pio = zio_walk_parents(zio, &zl)) != NULL)
-		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
-
-	ddt_exit(ddt);
-}
-
 static void
 zio_ddt_child_write_done(zio_t *zio)
 {
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 
+	zio_link_t *zl = NULL;
+	ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
+
 	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
-	ddt_phys_t *ddp = &dde->dde_phys[p];
+	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+	ddt_univ_phys_t *ddp = dde->dde_phys;
 
 	ddt_enter(ddt);
 
-	ASSERT(ddp->ddp_refcnt == 0);
-	ASSERT(dde->dde_io->dde_lead_zio[p] == zio);
-	dde->dde_io->dde_lead_zio[p] = NULL;
+	/* we're the lead, so once we're done there's no one else outstanding */
+	if (dde->dde_io->dde_lead_zio[p] == zio)
+		dde->dde_io->dde_lead_zio[p] = NULL;
 
-	if (zio->io_error == 0) {
-		zio_link_t *zl = NULL;
-		while (zio_walk_parents(zio, &zl) != NULL)
-			ddt_phys_addref(ddp);
-	} else {
-		ddt_phys_clear(ddp);
+	ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys;
+
+	if (zio->io_error != 0) {
+		/*
+		 * The write failed, so we're about to abort the entire IO
+		 * chain. We need to revert the entry back to what it was at
+		 * the last time it was successfully extended.
+		 */
+		ddt_phys_copy(ddp, orig, v);
+		ddt_phys_clear(orig, v);
+
+		ddt_exit(ddt);
+		return;
+	}
+
+	/*
+	 * We've successfully added new DVAs to the entry. Clear the saved
+	 * state or, if there's still outstanding IO, remember it so we can
+	 * revert to a known good state if that IO fails.
+	 */
+	if (dde->dde_io->dde_lead_zio[p] == NULL)
+		ddt_phys_clear(orig, v);
+	else
+		ddt_phys_copy(orig, ddp, v);
+
+	/*
+	 * Add references for all dedup writes that were waiting on the
+	 * physical one, skipping any other physical writes that are waiting.
+	 */
+	zio_t *pio;
+	zl = NULL;
+	while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+		if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
+			ddt_phys_addref(ddp, v);
+	}
+
+	ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+	ddt_entry_t *dde = zio->io_private;
+
+	zio_link_t *zl = NULL;
+	ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
+
+	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
+	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+
+	if (zio->io_error != 0)
+		return;
+
+	ddt_enter(ddt);
+
+	ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
+
+	zio_t *pio;
+	zl = NULL;
+	while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+		if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
+			ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
 	}
 
 	ddt_exit(ddt);
@@ -3516,7 +3559,6 @@ zio_ddt_write(zio_t *zio)
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
 	zio_prop_t *zp = &zio->io_prop;
-	zio_t *cio = NULL;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 
@@ -3537,9 +3579,6 @@ zio_ddt_write(zio_t *zio)
 		return (zio);
 	}
 
-	int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
-	ddt_phys_t *ddp = &dde->dde_phys[p];
-
 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
 		/*
 		 * If we're using a weak checksum, upgrade to a strong checksum
@@ -3563,31 +3602,227 @@ zio_ddt_write(zio_t *zio)
 		return (zio);
 	}
 
-	ddt_alloc_entry_io(dde);
+	int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
+	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+	ddt_univ_phys_t *ddp = dde->dde_phys;
 
-	if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) {
-		if (ddp->ddp_phys_birth != 0)
-			ddt_bp_fill(ddp, bp, txg);
-		if (dde->dde_io->dde_lead_zio[p] != NULL)
-			zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
-		else
-			ddt_phys_addref(ddp);
-	} else if (zio->io_bp_override) {
-		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
-		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
-		ddt_phys_fill(ddp, bp);
-		ddt_phys_addref(ddp);
+	/*
+	 * In the common cases, at this point we have a regular BP with no
+	 * allocated DVAs, and the corresponding DDT entry for its checksum.
+	 * Our goal is to fill the BP with enough DVAs to satisfy its copies=
+	 * requirement.
+	 *
+	 * One of three things needs to happen to fulfill this:
+	 *
+	 * - if the DDT entry has enough DVAs to satisfy the BP, we just copy
+	 *   them out of the entry and return;
+	 *
+	 * - if the DDT entry has no DVAs (i.e. it's brand new), then we have to
+	 *   issue the write as normal so that DVAs can be allocated and the
+	 *   data land on disk. We then copy the DVAs into the DDT entry on
+	 *   return.
+	 *
+	 * - if the DDT entry has some DVAs, but too few, we have to issue the
+	 *   write, adjusted to allocate fewer copies. When it returns, we
+	 *   add the new DVAs to the DDT entry, and update the BP to have the
+	 *   full amount it originally requested.
+	 *
+	 * In all cases, if there's already a writing IO in flight, we need to
+	 * defer the action until after the write is done. If our action is to
+	 * write, we need to adjust our request for additional DVAs to match
+	 * what will be in the DDT entry after it completes. In this way every
+	 * IO can be guaranteed to receive enough DVAs simply by joining the
+	 * end of the chain and letting the sequence play out.
+	 */
+
+	/*
+	 * Number of DVAs in the DDT entry. If the BP is encrypted we ignore
+	 * the third one as normal.
+	 */
+	int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
+	IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
+
+	/* Number of DVAs requested by the IO. */
+	uint8_t need_dvas = zp->zp_copies;
+
+	/*
+	 * What we do next depends on whether or not there's IO outstanding that
+	 * will update this entry.
+	 */
+	if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
+		/*
+		 * No IO outstanding, so we only need to worry about ourselves.
+		 */
+
+		/*
+		 * Override BPs bring their own DVAs and their own problems.
+		 */
+		if (zio->io_bp_override) {
+			/*
+			 * For a brand-new entry, all the work has been done
+			 * for us, and we can just fill it out from the provided
+			 * block and leave.
+			 */
+			if (have_dvas == 0) {
+				ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
+				ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+				ddt_phys_extend(ddp, v, bp);
+				ddt_phys_addref(ddp, v);
+				ddt_exit(ddt);
+				return (zio);
+			}
+
+			/*
+			 * If we already have this entry, then we want to treat
+			 * it like a regular write. To do this we just wipe
+			 * them out and proceed like a regular write.
+			 *
+			 * Even if there are some DVAs in the entry, we still
+			 * have to clear them out. We can't use them to fill
+			 * out the dedup entry, as they are all referenced
+			 * together by a bp already on disk, and will be freed
+			 * as a group.
+			 */
+			BP_ZERO_DVAS(bp);
+			BP_SET_BIRTH(bp, 0, 0);
+		}
+
+		/*
+		 * If there are enough DVAs in the entry to service our request,
+		 * then we can just use them as-is.
+		 */
+		if (have_dvas >= need_dvas) {
+			ddt_bp_fill(ddp, v, bp, txg);
+			ddt_phys_addref(ddp, v);
+			ddt_exit(ddt);
+			return (zio);
+		}
+
+		/*
+		 * Otherwise, we have to issue IO to fill the entry up to the
+		 * amount we need.
+		 */
+		need_dvas -= have_dvas;
 	} else {
-		cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
-		    zio->io_orig_size, zio->io_orig_size, zp,
-		    zio_ddt_child_write_ready, NULL,
-		    zio_ddt_child_write_done, dde, zio->io_priority,
-		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+		/*
+		 * There's a write in-flight. If there's already enough DVAs on
+		 * the entry, then either there were already enough to start
+		 * with, or the in-flight IO is between READY and DONE, and so
+		 * has extended the entry with new DVAs. Either way, we don't
+		 * need to do anything, we can just slot in behind it.
+		 */
 
-		zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
-		dde->dde_io->dde_lead_zio[p] = cio;
+		if (zio->io_bp_override) {
+			/*
+			 * If there's a write out, then we're soon going to
+			 * have our own copies of this block, so clear out the
+			 * override block and treat it as a regular dedup
+			 * write. See comment above.
+			 */
+			BP_ZERO_DVAS(bp);
+			BP_SET_BIRTH(bp, 0, 0);
+		}
+
+		if (have_dvas >= need_dvas) {
+			/*
+			 * A minor point: there might already be enough
+			 * committed DVAs in the entry to service our request,
+			 * but we don't know which are completed and which are
+			 * allocated but not yet written. In this case, should
+			 * the IO for the new DVAs fail, we will be on the end
+			 * of the IO chain and will also receive an error, even
+			 * though our request could have been serviced.
+			 *
+			 * This is an extremely rare case, as it requires the
+			 * original block to be copied with a request for a
+			 * larger number of DVAs, then copied again requesting
+			 * the same (or already fulfilled) number of DVAs while
+			 * the first request is active, and then that first
+			 * request errors. In return, the logic required to
+			 * catch and handle it is complex. For now, I'm just
+			 * not going to bother with it.
+			 */
+
+			/*
+			 * We always fill the bp here as we may have arrived
+			 * after the in-flight write has passed READY, and so
+			 * missed out.
+			 */
+			ddt_bp_fill(ddp, v, bp, txg);
+			zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
+			ddt_exit(ddt);
+			return (zio);
+		}
+
+		/*
+		 * There's not enough in the entry yet, so we need to look at
+		 * the write in-flight and see how many DVAs it will have once
+		 * it completes.
+		 *
+		 * The in-flight write has potentially had its copies request
+		 * reduced (if we're filling out an existing entry), so we need
+		 * to reach in and get the original write to find out what it is
+		 * expecting.
+		 *
+		 * Note that the parent of the lead zio will always have the
+		 * highest zp_copies of any zio in the chain, because ones that
+		 * can be serviced without additional IO are always added to
+		 * the back of the chain.
+		 */
+		zio_link_t *zl = NULL;
+		zio_t *pio =
+		    zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
+		ASSERT(pio);
+		uint8_t parent_dvas = pio->io_prop.zp_copies;
+
+		if (parent_dvas >= need_dvas) {
+			zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
+			ddt_exit(ddt);
+			return (zio);
+		}
+
+		/*
+		 * Still not enough, so we will need to issue to get the
+		 * shortfall.
+		 */
+		need_dvas -= parent_dvas;
 	}
 
+	/*
+	 * We need to write. We will create a new write with the copies
+	 * property adjusted to match the number of DVAs we need to grow the
+	 * DDT entry by to satisfy the request.
+	 */
+	zio_prop_t czp = *zp;
+	czp.zp_copies = need_dvas;
+	zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
+	    zio->io_orig_size, zio->io_orig_size, &czp,
+	    zio_ddt_child_write_ready, NULL,
+	    zio_ddt_child_write_done, dde, zio->io_priority,
+	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+	zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
+
+	/*
+	 * We are the new lead zio, because our parent has the highest
+	 * zp_copies that has been requested for this entry so far.
+	 */
+	ddt_alloc_entry_io(dde);
+	if (dde->dde_io->dde_lead_zio[p] == NULL) {
+		/*
+		 * First time out, take a copy of the stable entry to revert
+		 * to if there's an error (see zio_ddt_child_write_done())
+		 */
+		ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
+	} else {
+		/*
+		 * Make the existing chain our child, because it cannot
+		 * complete until we have.
+		 */
+		zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
+	}
+	dde->dde_io->dde_lead_zio[p] = cio;
+
 	ddt_exit(ddt);
 
 	zio_nowait(cio);
@@ -3603,8 +3838,7 @@ zio_ddt_free(zio_t *zio)
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(spa, bp);
-	ddt_entry_t *dde;
-	ddt_phys_t *ddp;
+	ddt_entry_t *dde = NULL;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
@@ -3612,9 +3846,9 @@ zio_ddt_free(zio_t *zio)
 	ddt_enter(ddt);
 	freedde = dde = ddt_lookup(ddt, bp);
 	if (dde) {
-		ddp = ddt_phys_select(ddt, dde, bp);
-		if (ddp)
-			ddt_phys_decref(ddp);
+		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
+		if (v != DDT_PHYS_NONE)
+			ddt_phys_decref(dde->dde_phys, v);
 	}
 	ddt_exit(ddt);
 

From 27e9cb5f8022bef72553cbe12f7ec292535e4c0b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 15 Jun 2023 17:19:41 +1000
Subject: [PATCH 14/59] ddt: cleanup the stats & histogram code

Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.

Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.

Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.

Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operator's mental model - pool usage is managed elsewhere.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
---
 cmd/zdb/zdb.c          |  22 ++++-----
 include/sys/ddt.h      |   8 ++-
 include/sys/ddt_impl.h |   4 --
 module/zfs/ddt.c       |  24 +++++++--
 module/zfs/ddt_stats.c | 107 +++++++++++++++++++++++++++++------------
 5 files changed, 114 insertions(+), 51 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 142f55b299e..250052adfb1 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -7357,29 +7357,27 @@ dump_simulated_ddt(spa_t *spa)
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
-		ddt_stat_t dds;
 		uint64_t refcnt = zdde->zdde_ref_blocks;
 		ASSERT(refcnt != 0);
 
-		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
-		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
-		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
-		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+		ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1];
 
-		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
-		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
-		dds.dds_ref_psize = zdde->zdde_ref_psize;
-		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+		dds->dds_blocks += zdde->zdde_ref_blocks / refcnt;
+		dds->dds_lsize += zdde->zdde_ref_lsize / refcnt;
+		dds->dds_psize += zdde->zdde_ref_psize / refcnt;
+		dds->dds_dsize += zdde->zdde_ref_dsize / refcnt;
 
-		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
-		    &dds, 0);
+		dds->dds_ref_blocks += zdde->zdde_ref_blocks;
+		dds->dds_ref_lsize += zdde->zdde_ref_lsize;
+		dds->dds_ref_psize += zdde->zdde_ref_psize;
+		dds->dds_ref_dsize += zdde->zdde_ref_dsize;
 
 		umem_free(zdde, sizeof (*zdde));
 	}
 
 	avl_destroy(&t);
 
-	ddt_histogram_stat(&dds_total, &ddh_total);
+	ddt_histogram_total(&dds_total, &ddh_total);
 
 	(void) printf("Simulated DDT histogram:\n");
 
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 11e09eef3bc..2dd18526dbb 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -318,9 +318,15 @@ extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
 extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
     boolean_t encrypted);
 
+extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
+    const ddt_lightweight_entry_t *ddlwe);
+extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
+    const ddt_lightweight_entry_t *ddlwe);
+
 extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
-extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh);
 extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+
 extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
 extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
 extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index c4e681fb117..ce4bc559ddb 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -77,8 +77,6 @@ typedef struct {
 
 extern const ddt_ops_t ddt_zap_ops;
 
-extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
-
 /*
  * These are only exposed so that zdb can access them. Try not to use them
  * outside of the DDT implementation proper, and if you do, consider moving
@@ -95,8 +93,6 @@ extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde);
 
 extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
 
-extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
-
 extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
     char *name);
 extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 59526394bd0..f3b34732611 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -992,7 +992,18 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		/* Flag cleanup required */
 		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
 	} else if (error == 0) {
-		ddt_stat_update(ddt, dde, -1ULL);
+		/*
+		 * The histograms only track inactive (stored) blocks.
+		 * We've just put an entry onto the live list, so we need to
+		 * remove its counts. When its synced back, it'll be re-added
+		 * to the right one.
+		 */
+		ddt_histogram_t *ddh =
+		    &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+		ddt_lightweight_entry_t ddlwe;
+		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+		ddt_histogram_sub_entry(ddt, ddh, &ddlwe);
 	}
 
 	/* Entry loaded, everyone can proceed now */
@@ -1527,11 +1538,18 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 	if (total_refcnt != 0) {
 		dde->dde_type = ntype;
 		dde->dde_class = nclass;
-		ddt_stat_update(ddt, dde, 0);
+
 		if (!ddt_object_exists(ddt, ntype, nclass))
 			ddt_object_create(ddt, ntype, nclass, tx);
 		VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx));
 
+		ddt_lightweight_entry_t ddlwe;
+		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+
+		ddt_histogram_t *ddh =
+		    &ddt->ddt_histogram[ntype][nclass];
+		ddt_histogram_add_entry(ddt, ddh, &ddlwe);
+
 		/*
 		 * If the class changes, the order that we scan this bp
 		 * changes.  If it decreases, we could miss it, so
@@ -1540,8 +1558,6 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 		 * traversing.)
 		 */
 		if (nclass < oclass) {
-			ddt_lightweight_entry_t ddlwe;
-			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
 			dsl_scan_ddt_entry(dp->dp_scan,
 			    ddt->ddt_checksum, ddt, &ddlwe, tx);
 		}
diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c
index 6da77bbca5c..9316200f21f 100644
--- a/module/zfs/ddt_stats.c
+++ b/module/zfs/ddt_stats.c
@@ -33,24 +33,24 @@
 #include <sys/ddt_impl.h>
 
 static void
-ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
+    ddt_stat_t *dds)
 {
 	spa_t *spa = ddt->ddt_spa;
-	ddt_key_t *ddk = &dde->dde_key;
-	uint64_t lsize = DDK_GET_LSIZE(ddk);
-	uint64_t psize = DDK_GET_PSIZE(ddk);
+	uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key);
+	uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key);
 
 	memset(dds, 0, sizeof (*dds));
 
-	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		const ddt_univ_phys_t *ddp = dde->dde_phys;
+	for (int p = 0; p < ddlwe->ddlwe_nphys; p++) {
+		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
 		if (ddt_phys_birth(ddp, v) == 0)
 			continue;
 
 		int ndvas = ddt_phys_dva_count(ddp, v,
-		    DDK_GET_CRYPT(&dde->dde_key));
+		    DDK_GET_CRYPT(&ddlwe->ddlwe_key));
 		const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
 		    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
 
@@ -72,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
 	}
 }
 
-void
-ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+static void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src)
 {
-	const uint64_t *s = (const uint64_t *)src;
-	uint64_t *d = (uint64_t *)dst;
-	uint64_t *d_end = (uint64_t *)(dst + 1);
+	dst->dds_blocks		+= src->dds_blocks;
+	dst->dds_lsize		+= src->dds_lsize;
+	dst->dds_psize		+= src->dds_psize;
+	dst->dds_dsize		+= src->dds_dsize;
+	dst->dds_ref_blocks	+= src->dds_ref_blocks;
+	dst->dds_ref_lsize	+= src->dds_ref_lsize;
+	dst->dds_ref_psize	+= src->dds_ref_psize;
+	dst->dds_ref_dsize	+= src->dds_ref_dsize;
+}
 
-	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */
+static void
+ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src)
+{
+	/* This caught more during development than you might expect... */
+	ASSERT3U(dst->dds_blocks, >=, src->dds_blocks);
+	ASSERT3U(dst->dds_lsize, >=, src->dds_lsize);
+	ASSERT3U(dst->dds_psize, >=, src->dds_psize);
+	ASSERT3U(dst->dds_dsize, >=, src->dds_dsize);
+	ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks);
+	ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize);
+	ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize);
+	ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize);
 
-	for (int i = 0; i < d_end - d; i++)
-		d[i] += (s[i] ^ neg) - neg;
+	dst->dds_blocks		-= src->dds_blocks;
+	dst->dds_lsize		-= src->dds_lsize;
+	dst->dds_psize		-= src->dds_psize;
+	dst->dds_dsize		-= src->dds_dsize;
+	dst->dds_ref_blocks	-= src->dds_ref_blocks;
+	dst->dds_ref_lsize	-= src->dds_ref_lsize;
+	dst->dds_ref_psize	-= src->dds_ref_psize;
+	dst->dds_ref_dsize	-= src->dds_ref_dsize;
 }
 
 void
-ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
+    const ddt_lightweight_entry_t *ddlwe)
 {
 	ddt_stat_t dds;
-	ddt_histogram_t *ddh;
 	int bucket;
 
-	ddt_stat_generate(ddt, dde, &dds);
+	ddt_stat_generate(ddt, ddlwe, &dds);
 
 	bucket = highbit64(dds.dds_ref_blocks) - 1;
-	ASSERT3U(bucket, >=, 0);
+	if (bucket < 0)
+		return;
 
-	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+	ddt_stat_add(&ddh->ddh_stat[bucket], &dds);
+}
 
-	ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+void
+ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
+    const ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_stat_t dds;
+	int bucket;
+
+	ddt_stat_generate(ddt, ddlwe, &dds);
+
+	bucket = highbit64(dds.dds_ref_blocks) - 1;
+	if (bucket < 0)
+		return;
+
+	ddt_stat_sub(&ddh->ddh_stat[bucket], &dds);
 }
 
 void
 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 {
 	for (int h = 0; h < 64; h++)
-		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]);
 }
 
 void
-ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 {
 	memset(dds, 0, sizeof (*dds));
 
 	for (int h = 0; h < 64; h++)
-		ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+		ddt_stat_add(dds, &ddh->ddh_stat[h]);
 }
 
 boolean_t
 ddt_histogram_empty(const ddt_histogram_t *ddh)
 {
-	const uint64_t *s = (const uint64_t *)ddh;
-	const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+	for (int h = 0; h < 64; h++) {
+		const ddt_stat_t *dds = &ddh->ddh_stat[h];
 
-	while (s < s_end)
-		if (*s++ != 0)
-			return (B_FALSE);
+		if (dds->dds_blocks == 0 &&
+		    dds->dds_lsize == 0 &&
+		    dds->dds_psize == 0 &&
+		    dds->dds_dsize == 0 &&
+		    dds->dds_ref_blocks == 0 &&
+		    dds->dds_ref_lsize == 0 &&
+		    dds->dds_ref_psize == 0 &&
+		    dds->dds_ref_dsize == 0)
+			continue;
+
+		return (B_FALSE);
+	}
 
 	return (B_TRUE);
 }
@@ -222,7 +269,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 
 	ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 	ddt_get_dedup_histogram(spa, ddh_total);
-	ddt_histogram_stat(dds_total, ddh_total);
+	ddt_histogram_total(dds_total, ddh_total);
 	kmem_free(ddh_total, sizeof (ddt_histogram_t));
 }
 

From 592f38900dc21ff86ca9c821c72b55e4ace347af Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 11 Oct 2023 12:46:55 +1100
Subject: [PATCH 15/59] ddt: compare keys 64-bits at a time, trying to match
 ZAP order

This yields substantial performance improvements when we only write out
some small % of entries at a time, as it will cause entries that will go
into "nearby" ZAP leaf nodes to be grouped closer together in the AVL, and
so touch fewer blocks. Without this, the distribution is an even spread,
so we touch a lot more ZAP leaf nodes for any given number of entries.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
---
 module/zfs/ddt.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index f3b34732611..26e127d61ac 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -1038,29 +1038,25 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 }
 
 /*
- * Key comparison. Any struct wanting to make use of this function must have
- * the key as the first element.
+ * ddt_key_t comparison. Any struct wanting to make use of this function must
+ * have the key as the first element. Casts it to N uint64_ts, and checks until
+ * we find there's a difference. This is intended to match how ddt_zap.c drives
+ * the ZAPs (first uint64_t as the key prehash), which will minimise the number
+ * of ZAP blocks touched when flushing logged entries from an AVL walk. This is
+ * not an invariant for this function though, should you wish to change it.
  */
-#define	DDT_KEY_CMP_LEN	(sizeof (ddt_key_t) / sizeof (uint16_t))
-
-typedef struct ddt_key_cmp {
-	uint16_t	u16[DDT_KEY_CMP_LEN];
-} ddt_key_cmp_t;
-
 int
 ddt_key_compare(const void *x1, const void *x2)
 {
-	const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1;
-	const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2;
-	int32_t cmp = 0;
+	const uint64_t *k1 = (const uint64_t *)x1;
+	const uint64_t *k2 = (const uint64_t *)x2;
 
-	for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
-		cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i];
-		if (likely(cmp))
-			break;
-	}
+	int cmp;
+	for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++)
+		if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0))
+			return (cmp);
 
-	return (TREE_ISIGN(cmp));
+	return (0);
 }
 
 /* Create the containing dir for this DDT and bump the feature count */

From cbb9ef0a4c8e04358f7d5ddae0eb99d0f703ee21 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 6 Oct 2023 17:06:34 +1100
Subject: [PATCH 16/59] ddt: tuneable to override copies= on dedup metadata
 objects

All objects stored in the MOS get copies=3. For a large dedup table,
this requires significant extra IO and disk space, when it's not really
necessary - the dedup table itself isn't needed to read or write data,
only to keep data usage down. Losing the dedup table does not render the
pool unusable, it just messes up the accounting somewhat.

This adds a dmu_ddt_copies tuneable. When set to 0, the existing
behaviour is used. When set higher, dedup table blocks (ZAP and log)
will have this many copies rather than the usual 3, while indirect
blocks will have one more again.

This is a tuneable for now mostly for testing. Losing a dedup table can
cause blocks to be leaked, and we currently have no facilities to repair
that.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
---
 module/zfs/dmu.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3dcf49ceb64..b3eda8ea509 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -95,6 +95,12 @@ uint_t dmu_prefetch_max = 8 * 1024 * 1024;
 uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
 #endif
 
+/*
+ * Override copies= for dedup state objects. 0 means the traditional behaviour
+ * (ie the default for the containing objset ie 3 for the MOS).
+ */
+uint_t dmu_ddt_copies = 0;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},
@@ -2272,6 +2278,28 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 		case ZFS_REDUNDANT_METADATA_NONE:
 			break;
 		}
+
+		if (dmu_ddt_copies > 0) {
+			/*
+			 * If this tuneable is set, and this is a write for a
+			 * dedup entry store (zap or log), then we treat it
+			 * something like ZFS_REDUNDANT_METADATA_MOST on a
+			 * regular dataset: this many copies, and one more for
+			 * "higher" indirect blocks. This specific exception is
+			 * necessary because dedup objects are stored in the
+			 * MOS, which always has the highest possible copies.
+			 */
+			dmu_object_type_t stype =
+			    dn ? dn->dn_storage_type : DMU_OT_NONE;
+			if (stype == DMU_OT_NONE)
+				stype = type;
+			if (stype == DMU_OT_DDT_ZAP) {
+				copies = dmu_ddt_copies;
+				if (level >=
+				    zfs_redundant_metadata_most_ditto_level)
+					copies++;
+			}
+		}
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
@@ -2824,3 +2852,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
 /* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
 	"Limit one prefetch call to this size");
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
+	"Override copies= for dedup objects");

From cd69ba3d49cdb939cba87e7fd6814608532df92f Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 22 Jun 2023 17:46:22 +1000
Subject: [PATCH 17/59] ddt: dedup log

Adds a log/journal to dedup. At the end of txg, instead of writing the
entry directly to the ZAP, it is added to an in-memory tree and
appended to an on-disk object. The on-disk object is only read at
import, to reload the in-memory tree.

Lookups first go to the log tree before going to the ZAP, so
recently-used entries will remain close by in memory. This vastly
reduces overhead from dedup IO, as it will not have to do so many
read/update/write cycles on ZAP leaf nodes.

A flushing facility is added at end of txg, to push logged entries out
to the ZAP. There's actually two separate "logs" (in-memory tree and
on-disk object), one active (receiving updated entries) and one flushing
(writing out to disk). These are swapped (ie flushing begins) based on
memory used by the in-memory log trees and time since we last flushed
something.

The flushing facility monitors the amount of entries coming in and being
flushed out, and calibrates itself to try to flush enough each txg to
keep up with the ingest rate without competing too much with other IO.
Multiple tuneables are provided to control the flushing facility.

All the histograms and stats are updated to accommodate the log as a
separate entry store. zdb gains knowledge of how to count them and dump
them. Documentation included!

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
---
 cmd/zdb/zdb.c                                 |  33 +-
 include/sys/ddt.h                             |  39 +-
 include/sys/ddt_impl.h                        | 131 ++-
 include/sys/dmu.h                             |   1 +
 lib/libzpool/Makefile.am                      |   1 +
 man/man4/zfs.4                                |  82 ++
 module/Kbuild.in                              |   1 +
 module/Makefile.bsd                           |   2 +
 module/zfs/ddt.c                              | 646 ++++++++++++---
 module/zfs/ddt_log.c                          | 760 ++++++++++++++++++
 module/zfs/ddt_stats.c                        |   9 +-
 tests/zfs-tests/include/tunables.cfg          |   1 +
 .../functional/dedup/dedup_fdt_create.ksh     |   7 +
 .../functional/dedup/dedup_fdt_import.ksh     |   7 +
 .../dedup/dedup_legacy_fdt_mixed.ksh          |   7 +
 .../dedup/dedup_legacy_fdt_upgrade.ksh        |   7 +
 .../tests/functional/dedup/dedup_quota.ksh    |  18 +-
 17 files changed, 1621 insertions(+), 131 deletions(-)
 create mode 100644 module/zfs/ddt_log.c

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 250052adfb1..c72df390935 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1958,6 +1958,32 @@ dump_dedup_ratio(const ddt_stat_t *dds)
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
+static void
+dump_ddt_log(ddt_t *ddt)
+{
+	for (int n = 0; n < 2; n++) {
+		ddt_log_t *ddl = &ddt->ddt_log[n];
+		/* Report each log separately; empty ones are skipped. */
+		unsigned long long count = avl_numnodes(&ddl->ddl_tree);
+		if (count == 0)
+			continue;
+		/* %llu matches the count's type on 32- and 64-bit builds. */
+		printf(DMU_POOL_DDT_LOG ": %llu log entries\n",
+		    zio_checksum_table[ddt->ddt_checksum].ci_name, n, count);
+		/* Individual entries are only dumped when 'D' is >= 4. */
+		if (dump_opt['D'] < 4)
+			continue;
+
+		ddt_lightweight_entry_t ddlwe;
+		uint64_t index = 0;
+		for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+		    ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {
+			DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+			dump_ddt_entry(ddt, &ddlwe, index++);
+		}
+	}
+}
+
 static void
 dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
@@ -2027,6 +2053,7 @@ dump_all_ddts(spa_t *spa)
 				dump_ddt(ddt, type, class);
 			}
 		}
+		dump_ddt_log(ddt);
 	}
 
 	ddt_get_dedup_stats(spa, &dds_total);
@@ -5743,7 +5770,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));
 
 		/* Consume a reference for this block. */
-		VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0);
+		VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
 		ddt_phys_decref(dde->dde_phys, v);
 
 		/*
@@ -8120,6 +8147,10 @@ dump_mos_leaks(spa_t *spa)
 
 		/* FDT container */
 		mos_obj_refd(ddt->ddt_dir_object);
+
+		/* FDT log objects */
+		mos_obj_refd(ddt->ddt_log[0].ddl_object);
+		mos_obj_refd(ddt->ddt_log[1].ddl_object);
 	}
 
 	if (spa->spa_brt != NULL) {
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 2dd18526dbb..2fc798725ed 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -43,7 +43,8 @@ struct abd;
  * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
  */
 #define	DDT_FLAG_FLAT	(1 << 0)	/* single extensible phys */
-#define	DDT_FLAG_MASK	(DDT_FLAG_FLAT)
+#define	DDT_FLAG_LOG	(1 << 1)	/* dedup log (journal) */
+#define	DDT_FLAG_MASK	(DDT_FLAG_FLAT|DDT_FLAG_LOG)
 
 /*
  * DDT on-disk storage object types. Each one corresponds to specific
@@ -209,6 +210,7 @@ typedef enum {
 /* State flags for dde_flags */
 #define	DDE_FLAG_LOADED		(1 << 0)	/* entry ready for use */
 #define	DDE_FLAG_OVERQUOTA	(1 << 1)	/* entry unusable, no space */
+#define	DDE_FLAG_LOGGED		(1 << 2)	/* loaded from log */
 
 /*
  * Additional data to support entry update or repair. This is fixed size
@@ -254,6 +256,19 @@ typedef struct {
 	ddt_univ_phys_t	ddlwe_phys;
 } ddt_lightweight_entry_t;
 
+/*
+ * In-core DDT log. A separate struct to make it easier to switch between the
+ * appending and flushing logs.
+ */
+typedef struct {
+	avl_tree_t	ddl_tree;	/* logged entries */
+	uint32_t	ddl_flags;	/* flags for this log */
+	uint64_t	ddl_object;	/* log object id */
+	uint64_t	ddl_length;	/* on-disk log size */
+	uint64_t	ddl_first_txg;	/* txg log became active */
+	ddt_key_t	ddl_checkpoint;	/* last checkpoint */
+} ddt_log_t;
+
 /*
  * In-core DDT object. This covers all entries and stats for a the whole pool
  * for a given checksum type.
@@ -262,8 +277,22 @@ typedef struct {
 	kmutex_t	ddt_lock;	/* protects changes to all fields */
 
 	avl_tree_t	ddt_tree;	/* "live" (changed) entries this txg */
+	avl_tree_t	ddt_log_tree;	/* logged entries */
 
-	avl_tree_t	ddt_repair_tree; /* entries being repaired */
+	avl_tree_t	ddt_repair_tree;	/* entries being repaired */
+
+	ddt_log_t	ddt_log[2];		/* active/flushing logs */
+	ddt_log_t	*ddt_log_active;	/* pointers into ddt_log */
+	ddt_log_t	*ddt_log_flushing;	/* swapped when flush starts */
+
+	hrtime_t	ddt_flush_start;	/* log flush start this txg */
+	uint32_t	ddt_flush_pass;		/* log flush pass this txg */
+
+	int32_t		ddt_flush_count;	/* entries flushed this txg */
+	int32_t		ddt_flush_min;		/* min rem entries to flush */
+	int32_t		ddt_log_ingest_rate;	/* rolling log ingest rate */
+	int32_t		ddt_log_flush_rate;	/* rolling log flush rate */
+	int32_t		ddt_log_flush_time_rate; /* avg time spent flushing */
 
 	enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
 	spa_t		*ddt_spa;	/* pool this ddt is on */
@@ -276,13 +305,17 @@ typedef struct {
 	/* per-type/per-class entry store objects */
 	uint64_t	ddt_object[DDT_TYPES][DDT_CLASSES];
 
-	/* object ids for whole-ddt and per-type/per-class stats */
+	/* object ids for stored, logged and per-type/per-class stats */
 	uint64_t	ddt_stat_object;
+	ddt_object_t	ddt_log_stats;
 	ddt_object_t	ddt_object_stats[DDT_TYPES][DDT_CLASSES];
 
 	/* type/class stats by power-2-sized referenced blocks */
 	ddt_histogram_t	ddt_histogram[DDT_TYPES][DDT_CLASSES];
 	ddt_histogram_t	ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+
+	/* log stats power-2-sized referenced blocks */
+	ddt_histogram_t	ddt_log_histogram;
 } ddt_t;
 
 /*
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index ce4bc559ddb..6f11cd90c1d 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -28,6 +28,7 @@
 #define	_SYS_DDT_IMPL_H
 
 #include <sys/ddt.h>
+#include <sys/bitops.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -50,6 +51,106 @@ extern "C" {
 	memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
 } while (0)
 
+#define	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do {             \
+	memset((ddlwe), 0, sizeof (*ddlwe));                            \
+	(ddlwe)->ddlwe_key = (ddle)->ddle_key;                          \
+	(ddlwe)->ddlwe_type = (ddle)->ddle_type;                        \
+	(ddlwe)->ddlwe_class = (ddle)->ddle_class;                      \
+	memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
+} while (0)
+
+/*
+ * An entry on the log tree. These are "frozen", and a record of what's in
+ * the on-disk log. They can't be used in place, but can be "loaded" back into
+ * the live tree.
+ */
+typedef struct {
+	ddt_key_t	ddle_key;	/* ddt_log_tree key */
+	avl_node_t	ddle_node;	/* ddt_log_tree node */
+
+	ddt_type_t	ddle_type;	/* storage type */
+	ddt_class_t	ddle_class;	/* storage class */
+
+	/* extra allocation for flat/trad phys */
+	ddt_univ_phys_t	ddle_phys[];
+} ddt_log_entry_t;
+
+/* On-disk log record types. */
+typedef enum {
+	DLR_INVALID	= 0,	/* end of block marker */
+	DLR_ENTRY	= 1,	/* an entry to add or replace in the log tree */
+} ddt_log_record_type_t;
+
+/* On-disk log record header. */
+typedef struct {
+	/*
+	 * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
+	 * access it.
+	 *
+	 * bits  0-7:  record type (ddt_log_record_type_t)
+	 * bits  8-23: length of record header+payload
+	 * bits 24-47: reserved, all zero
+	 * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
+	 *             otherwise all zero
+	 * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
+	 *             otherwise all zero
+	 */
+	uint64_t	dlr_info;
+	uint8_t		dlr_payload[];
+} ddt_log_record_t;
+
+#define	DLR_GET_TYPE(dlr)		BF64_GET((dlr)->dlr_info, 0, 8)
+#define	DLR_SET_TYPE(dlr, v)		BF64_SET((dlr)->dlr_info, 0, 8, v)
+#define	DLR_GET_RECLEN(dlr)		BF64_GET((dlr)->dlr_info, 8, 16)
+#define	DLR_SET_RECLEN(dlr, v)		BF64_SET((dlr)->dlr_info, 8, 16, v)
+#define	DLR_GET_ENTRY_TYPE(dlr)		BF64_GET((dlr)->dlr_info, 48, 8)
+#define	DLR_SET_ENTRY_TYPE(dlr, v)	BF64_SET((dlr)->dlr_info, 48, 8, v)
+#define	DLR_GET_ENTRY_CLASS(dlr)	BF64_GET((dlr)->dlr_info, 56, 8)
+#define	DLR_SET_ENTRY_CLASS(dlr, v)	BF64_SET((dlr)->dlr_info, 56, 8, v)
+
+/* Payload for DLR_ENTRY. */
+typedef struct {
+	ddt_key_t	dlre_key;
+	ddt_univ_phys_t	dlre_phys[];
+} ddt_log_record_entry_t;
+
+/* Log flags (ddl_flags, dlh_flags) */
+#define	DDL_FLAG_FLUSHING	(1 << 0)	/* this log is being flushed */
+#define	DDL_FLAG_CHECKPOINT	(1 << 1)	/* header has a checkpoint */
+
+/* On-disk log header, stored in the bonus buffer. */
+typedef struct {
+	/*
+	 * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
+	 * access it.
+	 *
+	 * bits 0-7:   log version
+	 * bits 8-15:  log flags
+	 * bits 16-63: reserved, all zero
+	 */
+	uint64_t	dlh_info;
+
+	uint64_t	dlh_length;	/* log size in bytes */
+	uint64_t	dlh_first_txg;	/* txg this log went active */
+	ddt_key_t	dlh_checkpoint;	/* last checkpoint */
+} ddt_log_header_t;
+
+#define	DLH_GET_VERSION(dlh)	BF64_GET((dlh)->dlh_info, 0, 8)
+#define	DLH_SET_VERSION(dlh, v)	BF64_SET((dlh)->dlh_info, 0, 8, v)
+#define	DLH_GET_FLAGS(dlh)	BF64_GET((dlh)->dlh_info, 8, 8)
+#define	DLH_SET_FLAGS(dlh, v)	BF64_SET((dlh)->dlh_info, 8, 8, v)
+
+/* DDT log update state */
+typedef struct {
+	dmu_tx_t	*dlu_tx;	/* tx the update is being applied to */
+	dnode_t		*dlu_dn;	/* log object dnode */
+	dmu_buf_t	**dlu_dbp;	/* array of block buffer pointers */
+	int		dlu_ndbp;	/* number of block buffer pointers */
+	uint16_t	dlu_reclen;	/* cached length of record */
+	uint64_t	dlu_block;	/* block for next entry */
+	uint64_t	dlu_offset;	/* offset for next entry */
+} ddt_log_update_t;
+
 /*
  * Ops vector to access a specific DDT object type.
  */
@@ -77,6 +178,33 @@ typedef struct {
 
 extern const ddt_ops_t ddt_zap_ops;
 
+/* Dedup log API */
+extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
+    ddt_log_update_t *dlu);
+extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
+    ddt_log_update_t *dlu);
+extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
+
+extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
+    ddt_lightweight_entry_t *ddlwe);
+extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
+    const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);
+
+extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
+    dmu_tx_t *tx);
+extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);
+
+extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);
+
+extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);
+
+extern int ddt_log_load(ddt_t *ddt);
+extern void ddt_log_alloc(ddt_t *ddt);
+extern void ddt_log_free(ddt_t *ddt);
+
+extern void ddt_log_init(void);
+extern void ddt_log_fini(void);
+
 /*
  * These are only exposed so that zdb can access them. Try not to use them
  * outside of the DDT implementation proper, and if you do, consider moving
@@ -89,7 +217,8 @@ extern const ddt_ops_t ddt_zap_ops;
  */
 #define	DDT_NAMELEN	32
 
-extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde);
+extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
+    const ddt_univ_phys_t *ddp);
 
 extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
 
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 5b80dc31594..928f5f2b4fd 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -375,6 +375,7 @@ typedef struct dmu_buf {
 #define	DMU_POOL_L2CACHE		"l2cache"
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
+#define	DMU_POOL_DDT_LOG		"DDT-log-%s-%u"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_DDT_DIR		"DDT-%s"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 42f3404db5a..070dc0132f2 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -79,6 +79,7 @@ nodist_libzpool_la_SOURCES = \
 	module/zfs/dbuf.c \
 	module/zfs/dbuf_stats.c \
 	module/zfs/ddt.c \
+	module/zfs/ddt_log.c \
 	module/zfs/ddt_stats.c \
 	module/zfs/ddt_zap.c \
 	module/zfs/dmu.c \
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 45b6c338aa9..aae3d7dfb5f 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -974,6 +974,88 @@ milliseconds until the operation completes.
 .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Enable prefetching dedup-ed blocks which are going to be freed.
 .
+.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
+Maximum number of dedup log flush passes (iterations) each transaction.
+.Pp
+At the start of each transaction, OpenZFS will estimate how many entries it
+needs to flush out to keep up with the change rate, taking the amount and time
+taken to flush on previous txgs into account (see
+.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
+It will spread this amount into a number of passes.
+At each pass, it will use the amount already flushed and the total time taken
+by flushing and by other IO to recompute how much it should do for the remainder
+of the txg.
+.Pp
+Reducing the max number of passes will make flushing more aggressive, flushing
+out more entries on each pass.
+This can be faster, but also more likely to compete with other IO.
+Increasing the max number of passes will put fewer entries onto each pass,
+keeping the overhead of dedup changes to a minimum but possibly causing a large
+number of changes to be dumped on the last pass, which can blow out the txg
+sync time beyond
+.Sy zfs_txg_timeout .
+.
+.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
+Minimum time to spend on dedup log flush each transaction.
+.Pp
+At least this long will be spent flushing dedup log entries each transaction,
+up to
+.Sy zfs_txg_timeout .
+This occurs even if doing so would delay the transaction, that is, other IO
+completes under this time.
+.
+.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
+Flush at least this many entries each transaction.
+.Pp
+OpenZFS will estimate how many entries it needs to flush each transaction to
+keep up with the ingest rate (see
+.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
+This sets the minimum for that estimate.
+Raising it can force OpenZFS to flush more aggressively, keeping the log small
+and so reducing pool import times, but can make it less able to back off if
+log flushing would compete with other IO too much.
+.
+.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
+Number of transactions to use to compute the flow rate.
+.Pp
+OpenZFS will estimate how many entries it needs to flush each transaction by
+monitoring the number of entries changed (ingest rate), number of entries
+flushed (flush rate) and time spent flushing (flush time rate) and combining
+these into an overall "flow rate".
+It will use an exponential weighted moving average over some number of recent
+transactions to compute these rates.
+This sets the number of transactions to compute these averages over.
+Setting it higher can help to smooth out the flow rate in the face of spiky
+workloads, but will take longer for the flow rate to adjust to a sustained
+change in the ingress rate.
+.
+.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint
+Max transactions to accumulate before starting to flush dedup logs.
+.Pp
+OpenZFS maintains two dedup logs, one receiving new changes, one flushing.
+If there is nothing to flush, it will accumulate changes for no more than this
+many transactions before switching the logs and starting to flush entries out.
+.
+.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64
+Max memory to use for dedup logs.
+.Pp
+OpenZFS will spend no more than this much memory on maintaining the in-memory
+dedup log.
+Flushing will begin when around half this amount is being spent on logs.
+The default value of
+.Sy 0
+will cause it to be set by
+.Sy zfs_dedup_log_mem_max_percent
+instead.
+.
+.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint
+Max memory to use for dedup logs, as a percentage of total memory.
+.Pp
+If
+.Sy zfs_dedup_log_mem_max
+is not set, it will be initialised as a percentage of the total memory in the
+system.
+.
 .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
 Start to delay each transaction once there is this amount of dirty data,
 expressed as a percentage of
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 57682214dfd..a119198dbfc 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -322,6 +322,7 @@ ZFS_OBJS := \
 	dbuf.o \
 	dbuf_stats.o \
 	ddt.o \
+	ddt_log.o \
 	ddt_stats.o \
 	ddt_zap.o \
 	dmu.o \
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index d9d31564d09..534f3257132 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -252,6 +252,7 @@ SRCS+=	abd.c \
 	dbuf.c \
 	dbuf_stats.c \
 	ddt.c \
+	ddt_log.c \
 	ddt_stats.c \
 	ddt_zap.c \
 	dmu.c \
@@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast
 
 CFLAGS.abd.c= -Wno-cast-qual
 CFLAGS.ddt.c= -Wno-cast-qual
+CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith
 CFLAGS.ddt_zap.c= -Wno-cast-qual
 CFLAGS.dmu.c= -Wno-cast-qual
 CFLAGS.dmu_traverse.c= -Wno-cast-qual
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 26e127d61ac..ce5c4efb51e 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -125,6 +125,28 @@
  * without which, no space would be recovered and the DDT would continue to be
  * considered "over quota". See zap_shrink_enabled.
  *
+ * ## Dedup log
+ *
+ * Historically, all entries modified on a txg were written back to dedup
+ * storage objects at the end of every txg. This could cause significant
+ * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
+ * and so required reading the whole node, updating the entry, and writing it
+ * back. On busy pools, this could add serious IO and memory overheads.
+ *
+ * To address this, the dedup log was added. If the "fast_dedup" feature is
+ * enabled, at the end of each txg, modified entries will be copied to an
+ * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
+ * same block is requested again, the in-memory object will be checked first,
+ * and if it's there, the entry is inflated back onto the live tree without
+ * going to storage. The on-disk log is only read at pool import time, to
+ * reload the in-memory log.
+ *
+ * Each txg, some amount of the in-memory log will be flushed out to a DDT
+ * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to
+ * keep up with the rate of change on dedup entries, but not so much that it
+ * would impact overall throughput, and not using too much memory. See the
+ * zfs_dedup_log_* tunables in zfs(4) for more details.
+ *
  * ## Repair IO
  *
  * If a read on a dedup block fails, but there are other copies of the block in
@@ -201,6 +223,26 @@ int zfs_dedup_prefetch = 0;
 uint_t dedup_class_wait_txgs = 5;
 
 
+/*
+ * Don't do more than this many incremental flush passes per txg.
+ */
+uint_t zfs_dedup_log_flush_passes_max = 8;
+
+/*
+ * Minimum time to flush per txg.
+ */
+uint_t zfs_dedup_log_flush_min_time_ms = 1000;
+
+/*
+ * Minimum entries to flush per txg.
+ */
+uint_t zfs_dedup_log_flush_entries_min = 1000;
+
+/*
+ * Number of txgs to average flow rates across.
+ */
+uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;
+
 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
 	&ddt_zap_ops,
 };
@@ -217,7 +259,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
  */
 static const uint64_t ddt_version_flags[] = {
 	[DDT_VERSION_LEGACY] = 0,
-	[DDT_VERSION_FDT] = DDT_FLAG_FLAT,
+	[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
 };
 
 /* Dummy version to signal that configure is still necessary */
@@ -405,13 +447,13 @@ ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 
 static int
 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
-    ddt_entry_t *dde, dmu_tx_t *tx)
+    const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));
 
 	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
-	    ddt->ddt_object[type][class], &dde->dde_key,
-	    dde->dde_phys, DDT_PHYS_SIZE(ddt), tx));
+	    ddt->ddt_object[type][class], &ddlwe->ddlwe_key,
+	    &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
 }
 
 static int
@@ -701,16 +743,15 @@ ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
 }
 
 uint64_t
-ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
+ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp)
 {
 	uint64_t refcnt = 0;
 
-	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
-		refcnt = dde->dde_phys->ddp_flat.ddp_refcnt;
-	} else {
-		for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
-			refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt;
-	}
+	if (ddt->ddt_flags & DDT_FLAG_FLAT)
+		refcnt = ddp->ddp_flat.ddp_refcnt;
+	else
+		for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++)
+			refcnt += ddp->ddp_trad[v].ddp_refcnt;
 
 	return (refcnt);
 }
@@ -743,11 +784,15 @@ ddt_init(void)
 	    DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
 	ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
 	    DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	ddt_log_init();
 }
 
 void
 ddt_fini(void)
 {
+	ddt_log_fini();
+
 	kmem_cache_destroy(ddt_entry_trad_cache);
 	kmem_cache_destroy(ddt_entry_flat_cache);
 	kmem_cache_destroy(ddt_cache);
@@ -805,6 +850,13 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 {
 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 
+	/* Entry is still in the log, so charge the entry back to it */
+	if (dde->dde_flags & DDE_FLAG_LOGGED) {
+		ddt_lightweight_entry_t ddlwe;
+		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
+	}
+
 	avl_remove(&ddt->ddt_tree, dde);
 	ddt_free(ddt, dde);
 }
@@ -951,6 +1003,25 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 
 	avl_insert(&ddt->ddt_tree, dde, where);
 
+	/* If it's in the log tree, we can "load" it from there */
+	if (ddt->ddt_flags & DDT_FLAG_LOG) {
+		ddt_lightweight_entry_t ddlwe;
+
+		if (ddt_log_take_key(ddt, ddt->ddt_log_active,
+		    &search, &ddlwe) ||
+		    ddt_log_take_key(ddt, ddt->ddt_log_flushing,
+		    &search, &ddlwe)) {
+			dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
+
+			dde->dde_type = ddlwe.ddlwe_type;
+			dde->dde_class = ddlwe.ddlwe_class;
+			memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
+			    DDT_PHYS_SIZE(ddt));
+
+			return (dde);
+		}
+	}
+
 	/*
 	 * ddt_tree is now stable, so unlock and let everyone else keep moving.
 	 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
@@ -993,10 +1064,14 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
 	} else if (error == 0) {
 		/*
-		 * The histograms only track inactive (stored) blocks.
+		 * The histograms only track inactive (stored or logged) blocks.
 		 * We've just put an entry onto the live list, so we need to
 		 * remove its counts. When its synced back, it'll be re-added
 		 * to the right one.
+		 *
+		 * We only do this when we successfully found it in the store.
+		 * error == ENOENT means this is a new entry, and so it's
+		 * already not counted.
 		 */
 		ddt_histogram_t *ddh =
 		    &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
@@ -1099,6 +1174,8 @@ ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
 		}
 	}
 
+	ddt_log_destroy(ddt, tx);
+
 	uint64_t count;
 	ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
 	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
@@ -1241,23 +1318,26 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 
 	ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
 	memset(ddt, 0, sizeof (ddt_t));
-
 	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&ddt->ddt_tree, ddt_key_compare,
 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 	avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+
 	ddt->ddt_checksum = c;
 	ddt->ddt_spa = spa;
 	ddt->ddt_os = spa->spa_meta_objset;
 	ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
 
+	ddt_log_alloc(ddt);
+
 	return (ddt);
 }
 
 static void
 ddt_table_free(ddt_t *ddt)
 {
+	ddt_log_free(ddt);
 	ASSERT0(avl_numnodes(&ddt->ddt_tree));
 	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
 	avl_destroy(&ddt->ddt_tree);
@@ -1310,6 +1390,10 @@ ddt_load(spa_t *spa)
 			}
 		}
 
+		error = ddt_log_load(ddt);
+		if (error != 0 && error != ENOENT)
+			return (error);
+
 		/*
 		 * Seed the cached histograms.
 		 */
@@ -1483,109 +1567,15 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio)
 }
 
 static void
-ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx)
 {
-	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
-	ddt_key_t *ddk = &dde->dde_key;
-	ddt_type_t otype = dde->dde_type;
-	ddt_type_t ntype = DDT_TYPE_DEFAULT;
-	ddt_class_t oclass = dde->dde_class;
-	ddt_class_t nclass;
-	uint64_t total_refcnt = 0;
-
-	ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
-
-	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		ASSERT(dde->dde_io == NULL ||
-		    dde->dde_io->dde_lead_zio[p] == NULL);
-		ddt_univ_phys_t *ddp = dde->dde_phys;
-		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
-		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
-
-		if (ddt_phys_birth(ddp, v) == 0) {
-			ASSERT0(phys_refcnt);
-			continue;
-		}
-		if (DDT_PHYS_IS_DITTO(ddt, p)) {
-			/*
-			 * Note, we no longer create DDT-DITTO blocks, but we
-			 * don't want to leak any written by older software.
-			 */
-			ddt_phys_free(ddt, ddk, ddp, v, txg);
-			continue;
-		}
-		if (phys_refcnt == 0)
-			ddt_phys_free(ddt, ddk, ddp, v, txg);
-		total_refcnt += phys_refcnt;
-	}
-
-	if (total_refcnt > 1)
-		nclass = DDT_CLASS_DUPLICATE;
-	else
-		nclass = DDT_CLASS_UNIQUE;
-
-	if (otype != DDT_TYPES &&
-	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
-		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
-		ASSERT3U(
-		    ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT);
-	}
-
-	if (total_refcnt != 0) {
-		dde->dde_type = ntype;
-		dde->dde_class = nclass;
-
-		if (!ddt_object_exists(ddt, ntype, nclass))
-			ddt_object_create(ddt, ntype, nclass, tx);
-		VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx));
-
-		ddt_lightweight_entry_t ddlwe;
-		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
-
-		ddt_histogram_t *ddh =
-		    &ddt->ddt_histogram[ntype][nclass];
-		ddt_histogram_add_entry(ddt, ddh, &ddlwe);
-
-		/*
-		 * If the class changes, the order that we scan this bp
-		 * changes.  If it decreases, we could miss it, so
-		 * scan it right now.  (This covers both class changing
-		 * while we are doing ddt_walk(), and when we are
-		 * traversing.)
-		 */
-		if (nclass < oclass) {
-			dsl_scan_ddt_entry(dp->dp_scan,
-			    ddt->ddt_checksum, ddt, &ddlwe, tx);
-		}
-	}
-}
-
-static void
-ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
-{
-	spa_t *spa = ddt->ddt_spa;
-	ddt_entry_t *dde;
-	void *cookie = NULL;
-
-	if (avl_numnodes(&ddt->ddt_tree) == 0)
-		return;
-
-	ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP);
-
-	if (spa->spa_ddt_stat_object == 0) {
-		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
-		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_DDT_STATS, tx);
-	}
-
-	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
-		ddt_create_dir(ddt, tx);
-
-	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
-		ddt_sync_entry(ddt, dde, tx, txg);
-		ddt_free(ddt, dde);
-	}
-
+	/*
+	 * Count all the entries stored for each type/class, and update the
+	 * stats within (ddt_object_sync()). If there are no entries for the
+	 * type/class, the whole object is removed. If all objects for the DDT
+	 * are removed, its containing dir is removed, effectively resetting
+	 * the entire DDT to an empty slate.
+	 */
 	uint64_t count = 0;
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		uint64_t add, tcount = 0;
@@ -1604,6 +1594,12 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 		count += tcount;
 	}
 
+	if (ddt->ddt_flags & DDT_FLAG_LOG) {
+		/* Include logged entries in the total count */
+		count += avl_numnodes(&ddt->ddt_log_active->ddl_tree);
+		count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
+	}
+
 	if (count == 0) {
 		/*
 		 * No entries left on the DDT, so reset the version for next
@@ -1620,8 +1616,398 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 
 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
 	    sizeof (ddt->ddt_histogram));
-	spa->spa_dedup_dspace = ~0ULL;
-	spa->spa_dedup_dsize = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+}
+
+static void
+ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+
+	/*
+	 * Compute the target class, so we can decide whether or not to inform
+	 * the scrub traversal (below). Note that we don't store this in the
+	 * entry, as it might change multiple times before finally being
+	 * committed (if we're logging). Instead, we recompute it in
+	 * ddt_sync_entry().
+	 */
+	uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys);
+	ddt_class_t nclass =
+	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
+
+	/*
+	 * If the class changes, the order that we scan this bp changes. If it
+	 * decreases, we could miss it, so scan it right now. (This covers both
+	 * class changing while we are doing ddt_walk(), and when we are
+	 * traversing.)
+	 *
+	 * We also do this when the refcnt goes to zero, because that change is
+	 * only in the log so far; the blocks on disk won't be freed until
+	 * the log is flushed, and the refcnt might increase before that. If it
+	 * does, then we could miss it in the same way.
+	 */
+	if (refcnt == 0 || nclass < ddlwe->ddlwe_class)
+		dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt,
+		    ddlwe, tx);
+}
+
+static void
+ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
+    ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx)
+{
+	ddt_key_t *ddk = &ddlwe->ddlwe_key;
+	ddt_type_t ntype = DDT_TYPE_DEFAULT;
+	uint64_t refcnt = 0;
+
+	/*
+	 * Compute the total refcnt. Along the way, issue frees for any DVAs
+	 * we no longer want.
+	 */
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
+
+		if (ddt_phys_birth(ddp, v) == 0) {
+			ASSERT3U(phys_refcnt, ==, 0);
+			continue;
+		}
+		if (DDT_PHYS_IS_DITTO(ddt, p)) {
+			/*
+			 * We don't want to keep any obsolete slots (eg ditto),
+			 * regardless of their refcount, but we don't want to
+			 * leak them either. So, free them.
+			 */
+			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
+			continue;
+		}
+		if (phys_refcnt == 0)
+			/* No remaining references, free it! */
+			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
+		refcnt += phys_refcnt;
+	}
+
+	/* Select the best class for the entry. */
+	ddt_class_t nclass =
+	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
+
+	/*
+	 * If an existing entry changed type or class, or its refcount reached
+	 * zero, delete it from the DDT object
+	 */
+	if (otype != DDT_TYPES &&
+	    (otype != ntype || oclass != nclass || refcnt == 0)) {
+		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
+		ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT);
+	}
+
+	/*
+	 * Add or update the entry
+	 */
+	if (refcnt != 0) {
+		ddt_histogram_t *ddh =
+		    &ddt->ddt_histogram[ntype][nclass];
+
+		ddt_histogram_add_entry(ddt, ddh, ddlwe);
+
+		if (!ddt_object_exists(ddt, ntype, nclass))
+			ddt_object_create(ddt, ntype, nclass, tx);
+		VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx));
+	}
+}
+
+/* Calculate an exponential weighted moving average, lower limited to zero */
+static inline int32_t
+_ewma(int32_t val, int32_t prev, uint32_t weight)
+{
+	ASSERT3S(val, >=, 0);
+	ASSERT3S(prev, >=, 0);
+	const int32_t new =
+	    MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
+	ASSERT3S(new, >=, 0);
+	return (new);
+}
+
+/* Flush some of the flushing log; returns true if done for this txg */
+static boolean_t
+ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
+{
+	if (ddt->ddt_flush_pass == 0) {
+		if (spa_sync_pass(ddt->ddt_spa) == 1) {
+			/* First run this txg, get set up */
+			ddt->ddt_flush_start = gethrtime();
+			ddt->ddt_flush_count = 0;
+
+			/*
+			 * How many entries we need to flush. We want to at
+			 * least match the ingest rate.
+			 */
+			ddt->ddt_flush_min = MAX(
+			    ddt->ddt_log_ingest_rate,
+			    zfs_dedup_log_flush_entries_min);
+		} else {
+			/* We already decided we're done for this txg */
+			return (B_FALSE);
+		}
+	} else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) {
+		/*
+		 * We already did some flushing on this pass, skip it. This
+		 * happens when dsl_process_async_destroys() runs during a scan
+		 * (on pass 1) and does an additional ddt_sync() to update
+		 * freed blocks.
+		 */
+		return (B_FALSE);
+	}
+
+	if (spa_sync_pass(ddt->ddt_spa) >
+	    MAX(zfs_dedup_log_flush_passes_max, 1)) {
+		/* Too many passes this txg, defer until next. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
+		/* Nothing to flush, done for this txg. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ?
+	    MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
+	    SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout);
+
+	uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start;
+
+	if (elapsed_time >= target_time) {
+		/* Too long since we started, done for this txg. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	ddt->ddt_flush_pass++;
+	ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass);
+
+	/*
+	 * Estimate how much time we'll need to flush the remaining entries
+	 * based on how long it normally takes.
+	 */
+	uint64_t want_time;
+	if (ddt->ddt_flush_pass == 1) {
+		/* First pass, use the average time/entries */
+		if (ddt->ddt_log_flush_rate == 0)
+			/* Zero rate, just assume the whole time */
+			want_time = target_time;
+		else
+			want_time = ddt->ddt_flush_min *
+			    ddt->ddt_log_flush_time_rate /
+			    ddt->ddt_log_flush_rate;
+	} else {
+		/* Later pass, calculate from this txg so far */
+		want_time = ddt->ddt_flush_min *
+		    elapsed_time / ddt->ddt_flush_count;
+	}
+
+	/* Figure out how much time we have left */
+	uint64_t remain_time = target_time - elapsed_time;
+
+	/* Smear the remaining entries over the remaining passes. */
+	uint32_t nentries = ddt->ddt_flush_min /
+	    (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass);
+	if (want_time > remain_time) {
+		/*
+		 * We're behind; try to catch up a bit by doubling the amount
+		 * this pass. If we're behind that means we're in a later
+		 * pass and likely have most of the remaining time to
+		 * ourselves. If we're in the last couple of passes, then
+		 * doubling might just take us over the timeout, but probably
+		 * not by much, and it stops us falling behind. If we're
+		 * in the middle passes, there'll be more to do, but it
+		 * might just help us catch up a bit and we'll recalculate on
+		 * the next pass anyway.
+		 */
+		nentries = MIN(ddt->ddt_flush_min, nentries*2);
+	}
+
+	ddt_lightweight_entry_t ddlwe;
+	uint32_t count = 0;
+	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
+		ddt_sync_flush_entry(ddt, &ddlwe,
+		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
+
+		/* End this pass if we've synced as much as we need to. */
+		if (++count >= nentries)
+			break;
+	}
+	ddt->ddt_flush_count += count;
+	ddt->ddt_flush_min -= count;
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
+		/* We emptied it, so truncate on-disk */
+		ddt_log_truncate(ddt, tx);
+		/* No more passes needed this txg */
+		ddt->ddt_flush_pass = 0;
+	} else
+		/* More to do next time, save checkpoint */
+		ddt_log_checkpoint(ddt, &ddlwe, tx);
+
+	ddt_sync_update_stats(ddt, tx);
+
+	return (ddt->ddt_flush_pass == 0);
+}
+
+/* Flush logged entries to the store, then do end-of-txg log housekeeping */
+static void
+ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ASSERT(avl_is_empty(&ddt->ddt_tree));
+
+	/* Don't do any flushing when the pool is ready to shut down */
+	if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa))
+		return;
+
+	/* Try to flush some. */
+	if (!ddt_sync_flush_log_incremental(ddt, tx))
+		/* More to do next time */
+		return;
+
+	/* No more flushing this txg, so we can do end-of-txg housekeeping */
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
+	    !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
+		/*
+		 * No more to flush, and the active list has stuff, so
+		 * try to swap the logs for next time.
+		 */
+		(void) ddt_log_swap(ddt, tx);
+	}
+
+	/*
+	 * Update flush rate. This is an exponential weighted moving average of
+	 * the number of entries flushed over recent txgs.
+	 */
+	ddt->ddt_log_flush_rate = _ewma(
+	    ddt->ddt_flush_count, ddt->ddt_log_flush_rate,
+	    zfs_dedup_log_flush_flow_rate_txgs);
+
+	/*
+	 * Update flush time rate. This is an exponential weighted moving
+	 * average of the total time (in ms) taken to flush over recent txgs.
+	 */
+	ddt->ddt_log_flush_time_rate = _ewma(
+	    ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))),
+	    ddt->ddt_log_flush_time_rate,
+	    zfs_dedup_log_flush_flow_rate_txgs);
+}
+
+static void
+ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
+{
+	uint64_t count = avl_numnodes(&ddt->ddt_tree);
+
+	if (count > 0) {
+		ddt_log_update_t dlu = {0};
+		ddt_log_begin(ddt, count, tx, &dlu);
+
+		ddt_entry_t *dde;
+		void *cookie = NULL;
+		ddt_lightweight_entry_t ddlwe;
+		while ((dde =
+		    avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
+			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+			ddt_log_entry(ddt, &ddlwe, &dlu);
+			ddt_sync_scan_entry(ddt, &ddlwe, tx);
+			ddt_free(ddt, dde);
+		}
+
+		ddt_log_commit(ddt, &dlu);
+
+		/*
+		 * Sync the stats for the store objects. Even though we haven't
+		 * modified anything on those objects, they're no longer the
+		 * source of truth for entries that are now in the log, and we
+		 * need the on-disk counts to reflect that, otherwise we'll
+		 * miscount later when importing.
+		 */
+		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+			for (ddt_class_t class = 0;
+			    class < DDT_CLASSES; class++) {
+				if (ddt_object_exists(ddt, type, class))
+					ddt_object_sync(ddt, type, class, tx);
+			}
+		}
+
+		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
+		    sizeof (ddt->ddt_histogram));
+		ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+		ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+	}
+
+	if (spa_sync_pass(ddt->ddt_spa) == 1)
+		/*
+		 * Update ingest rate. This is an exponential weighted moving
+		 * average of the number of entries changed over recent txgs.
+		 * The ramp-up cost shouldn't matter too much because the
+		 * flusher will be trying to take at least the minimum anyway.
+		 */
+		ddt->ddt_log_ingest_rate = _ewma(
+		    count, ddt->ddt_log_ingest_rate,
+		    zfs_dedup_log_flush_flow_rate_txgs);
+}
+
+static void
+ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
+{
+	if (avl_numnodes(&ddt->ddt_tree) == 0)
+		return;
+
+	ddt_entry_t *dde;
+	void *cookie = NULL;
+	while ((dde = avl_destroy_nodes(
+	    &ddt->ddt_tree, &cookie)) != NULL) {
+		ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
+
+		ddt_lightweight_entry_t ddlwe;
+		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+		ddt_sync_flush_entry(ddt, &ddlwe,
+		    dde->dde_type, dde->dde_class, tx);
+		ddt_sync_scan_entry(ddt, &ddlwe, tx);
+		ddt_free(ddt, dde);
+	}
+
+	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
+	    sizeof (ddt->ddt_histogram));
+	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+	ddt_sync_update_stats(ddt, tx);
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
+{
+	spa_t *spa = ddt->ddt_spa;
+
+	if (ddt->ddt_version == UINT64_MAX)
+		return;
+
+	if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
+		ASSERT0(avl_numnodes(&ddt->ddt_tree));
+		return;
+	}
+
+	if (spa->spa_ddt_stat_object == 0) {
+		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_DDT_STATS, tx);
+	}
+
+	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
+		ddt_create_dir(ddt, tx);
+
+	if (ddt->ddt_flags & DDT_FLAG_LOG)
+		ddt_sync_table_log(ddt, tx);
+	else
+		ddt_sync_table_flush(ddt, tx);
 }
 
 void
@@ -1651,7 +2037,9 @@ ddt_sync(spa_t *spa, uint64_t txg)
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (ddt == NULL)
 			continue;
-		ddt_sync_table(ddt, tx, txg);
+		ddt_sync_table(ddt, tx);
+		if (ddt->ddt_flags & DDT_FLAG_LOG)
+			ddt_sync_flush_log(ddt, tx);
 		ddt_repair_table(ddt, rio);
 	}
 
@@ -1719,9 +2107,12 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 		return (B_FALSE);
 	}
 
-	if (dde->dde_type < DDT_TYPES) {
-		ASSERT3S(dde->dde_class, <, DDT_CLASSES);
-
+	if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
+		/*
+		 * This entry was either synced to a store object (dde_type is
+		 * real) or was logged. It must be properly on disk at this
+		 * point, so we can just bump its refcount.
+		 */
 		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
@@ -1748,7 +2139,6 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 		 * we may have a block with the DEDUP set, but which doesn't
 		 * have a corresponding entry in the DDT. Be ready.
 		 */
-		ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
 		ddt_remove(ddt, dde);
 		result = B_FALSE;
 	}
@@ -1761,3 +2151,15 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 
 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
 	"Enable prefetching dedup-ed blks");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW,
+	"Max number of incremental dedup log flush passes per transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
+	"Min time to spend on incremental dedup log flush each transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
+	"Min number of log entries to flush each transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
+	"Number of txgs to average flow rates across");
diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c
new file mode 100644
index 00000000000..7e7ff9e5b89
--- /dev/null
+++ b/module/zfs/ddt_log.c
@@ -0,0 +1,760 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2023, Klara Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/ddt.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu.h>
+#include <sys/ddt_impl.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * No more than this many txgs before swapping logs.
+ */
+uint_t zfs_dedup_log_txg_max = 8;
+
+/*
+ * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
+ * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
+ */
+uint64_t zfs_dedup_log_mem_max = 0;
+uint_t zfs_dedup_log_mem_max_percent = 1;
+
+
+static kmem_cache_t *ddt_log_entry_flat_cache;
+static kmem_cache_t *ddt_log_entry_trad_cache;
+
+#define	DDT_LOG_ENTRY_FLAT_SIZE	\
+	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
+#define	DDT_LOG_ENTRY_TRAD_SIZE	\
+	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)
+
+#define	DDT_LOG_ENTRY_SIZE(ddt)	\
+	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
+
+void
+ddt_log_init(void)
+{
+	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
+	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
+	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	/*
+	 * Max memory for log AVL entries. At least 1M, because we need
+	 * something (that's ~3800 entries per tree). They can say 100% if they
+	 * want; it just means they're at the mercy of the txg flush limit.
+	 */
+	if (zfs_dedup_log_mem_max == 0) {
+		zfs_dedup_log_mem_max_percent =
+		    MIN(zfs_dedup_log_mem_max_percent, 100);
+		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
+		    zfs_dedup_log_mem_max_percent / 100;
+	}
+	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
+}
+
+void
+ddt_log_fini(void)
+{
+	kmem_cache_destroy(ddt_log_entry_trad_cache);
+	kmem_cache_destroy(ddt_log_entry_flat_cache);
+}
+
+static void
+ddt_log_name(ddt_t *ddt, char *name, uint_t n)
+{
+	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
+	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
+}
+
+static void
+ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+
+	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
+	DLH_SET_VERSION(hdr, 1);
+	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
+	hdr->dlh_length = ddl->ddl_length;
+	hdr->dlh_first_txg = ddl->ddl_first_txg;
+	hdr->dlh_checkpoint = ddl->ddl_checkpoint;
+
+	dmu_buf_rele(db, FTAG);
+}
+
+static void
+ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
+{
+	ASSERT3U(ddt->ddt_dir_object, >, 0);
+	ASSERT3U(ddl->ddl_object, ==, 0);
+
+	char name[DDT_NAMELEN];
+	ddt_log_name(ddt, name, n);
+
+	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
+	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
+	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
+	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
+	ddl->ddl_length = 0;
+	ddl->ddl_first_txg = tx->tx_txg;
+	ddt_log_update_header(ddt, ddl, tx);
+}
+
+static void
+ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
+	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
+}
+
+static void
+ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
+{
+	ASSERT3U(ddt->ddt_dir_object, >, 0);
+
+	if (ddl->ddl_object == 0)
+		return;
+
+	ASSERT0(ddl->ddl_length);
+
+	char name[DDT_NAMELEN];
+	ddt_log_name(ddt, name, n);
+
+	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
+	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
+
+	ddl->ddl_object = 0;
+}
+
+void
+ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
+	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
+}
+
+static void
+ddt_log_update_stats(ddt_t *ddt)
+{
+	/*
+	 * Log object stats. We count the number of live entries in the log
+	 * tree, even if there are more than on disk, and even if the same
+	 * entry is on both append and flush trees, because that's more what
+	 * the user expects to see. This does mean the on-disk size is not
+	 * really correlated with the number of entries, but I don't think
+	 * that's reasonable to expect anyway.
+	 */
+	dmu_object_info_t doi;
+	uint64_t nblocks;
+	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
+	nblocks = doi.doi_physical_blocks_512;
+	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
+	nblocks += doi.doi_physical_blocks_512;
+
+	ddt_object_t *ddo = &ddt->ddt_log_stats;
+	ddo->ddo_count =
+	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
+	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
+	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
+	ddo->ddo_dspace = nblocks << 9;
+}
+
+void
+ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
+{
+	ASSERT3U(nentries, >, 0);
+	ASSERT3P(dlu->dlu_dbp, ==, NULL);
+
+	if (ddt->ddt_log_active->ddl_object == 0)
+		ddt_log_create(ddt, tx);
+
+	/*
+	 * We want to store as many entries as we can in a block, but never
+	 * split an entry across block boundaries.
+	 */
+	size_t reclen = P2ALIGN_TYPED(
+	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
+	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
+	ASSERT3U(reclen, <=, UINT16_MAX);
+	dlu->dlu_reclen = reclen;
+
+	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
+	    &dlu->dlu_dn));
+	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
+
+	uint64_t nblocks = howmany(nentries,
+	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
+	uint64_t offset = ddt->ddt_log_active->ddl_length;
+	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
+
+	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
+	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
+	    DMU_READ_NO_PREFETCH));
+
+	dlu->dlu_tx = tx;
+	dlu->dlu_block = dlu->dlu_offset = 0;
+}
+
+static ddt_log_entry_t *
+ddt_log_alloc_entry(ddt_t *ddt)
+{
+	ddt_log_entry_t *ddle;
+
+	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
+		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
+	} else {
+		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
+		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
+	}
+
+	return (ddle);
+}
+
+static void
+ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
+{
+	/* Create the log tree entry from a live or stored entry */
+	avl_index_t where;
+	ddt_log_entry_t *ddle =
+	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
+	if (ddle == NULL) {
+		ddle = ddt_log_alloc_entry(ddt);
+		ddle->ddle_key = ddlwe->ddlwe_key;
+		avl_insert(&ddl->ddl_tree, ddle, where);
+	}
+	ddle->ddle_type = ddlwe->ddlwe_type;
+	ddle->ddle_class = ddlwe->ddlwe_class;
+	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
+}
+
+/* Add an entry to the active log tree and write its record via dlu */
+void
+ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
+{
+	ASSERT3P(dlu->dlu_dbp, !=, NULL);
+
+	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
+	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+	/* Get our block */
+	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
+	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
+
+	/*
+	 * If this would take us past the end of the block, finish it and
+	 * move to the next one.
+	 */
+	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
+		ASSERT3U(dlu->dlu_offset, >, 0);
+		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
+		dlu->dlu_block++;
+		dlu->dlu_offset = 0;
+		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
+		db = dlu->dlu_dbp[dlu->dlu_block];
+	}
+
+	/*
+	 * If this is the first time touching the block, inform the DMU that
+	 * we will fill it, and zero it out.
+	 */
+	if (dlu->dlu_offset == 0) {
+		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
+		memset(db->db_data, 0, db->db_size);
+	}
+
+	/* Create the log record directly in the buffer */
+	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
+	DLR_SET_TYPE(dlr, DLR_ENTRY);
+	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
+	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
+	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
+
+	ddt_log_record_entry_t *dlre =
+	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
+	dlre->dlre_key = ddlwe->ddlwe_key;
+	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
+	/* Advance offset for next record. */
+	dlu->dlu_offset += dlu->dlu_reclen;
+}
+
+/* Finish a log update: close the last block, release holds, update header */
+void
+ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
+{
+	ASSERT3P(dlu->dlu_dbp, !=, NULL);
+	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
+	ASSERT3U(dlu->dlu_offset, >, 0);
+
+	/*
+	 * Close out the last block. Whatever we haven't used will be zeroed,
+	 * which matches DLR_INVALID, so we can detect this during load.
+	 */
+	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
+	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
+
+	ddt->ddt_log_active->ddl_length +=
+	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
+	dnode_rele(dlu->dlu_dn, FTAG);
+
+	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
+
+	memset(dlu, 0, sizeof (ddt_log_update_t));
+
+	ddt_log_update_stats(ddt);
+}
+
+boolean_t
+ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+	if (ddle == NULL)
+		return (B_FALSE);
+
+	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
+
+	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+	avl_remove(&ddl->ddl_tree, ddle);
+	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+
+	return (B_TRUE);
+}
+
+boolean_t
+ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
+    ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
+	if (ddle == NULL)
+		return (B_FALSE);
+
+	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
+
+	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+	avl_remove(&ddl->ddl_tree, ddle);
+	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+
+	return (B_TRUE);
+}
+
+void
+ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
+{
+	ddt_log_t *ddl = ddt->ddt_log_flushing;
+
+	ASSERT3U(ddl->ddl_object, !=, 0);
+
+#ifdef ZFS_DEBUG
+	/*
+	 * There should not be any entries on the log tree before the given
+	 * checkpoint. Assert that this is the case.
+	 */
+	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+	if (ddle != NULL)
+		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
+		    >, 0);
+#endif
+
+	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
+	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
+	ddt_log_update_header(ddt, ddl, tx);
+
+	ddt_log_update_stats(ddt);
+}
+
+void
+ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_t *ddl = ddt->ddt_log_flushing;
+
+	if (ddl->ddl_object == 0)
+		return;
+
+	ASSERT(avl_is_empty(&ddl->ddl_tree));
+
+	/* Eject the entire object */
+	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);
+
+	ddl->ddl_length = 0;
+	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
+	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
+	ddt_log_update_header(ddt, ddl, tx);
+
+	ddt_log_update_stats(ddt);
+}
+
+boolean_t
+ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
+{
+	/* Swap the logs. The old flushing one must be empty */
+	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
+
+	/*
+	 * If there are still blocks on the flushing log, truncate it first.
+	 * This can happen if there were entries on the flushing log that were
+	 * removed in memory via ddt_lookup(); their vestigial remains are
+	 * on disk.
+	 */
+	if (ddt->ddt_log_flushing->ddl_length > 0)
+		ddt_log_truncate(ddt, tx);
+
+	/*
+	 * Swap policy. We swap the logs (and so begin flushing) when the
+	 * active tree grows too large, or when we haven't swapped it in
+	 * some amount of time.
+	 */
+
+	/*
+	 * The log tree is too large if the memory usage of its entries is over
+	 * half of the memory limit. This effectively gives each log tree half
+	 * the available memory.
+	 */
+	const boolean_t too_large =
+	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
+	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
+
+	const boolean_t too_old =
+	    tx->tx_txg >=
+	    (ddt->ddt_log_active->ddl_first_txg +
+	    MAX(1, zfs_dedup_log_txg_max));
+
+	if (!(too_large || too_old))
+		return (B_FALSE);
+
+	ddt_log_t *swap = ddt->ddt_log_active;
+	ddt->ddt_log_active = ddt->ddt_log_flushing;
+	ddt->ddt_log_flushing = swap;
+
+	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
+	ddt->ddt_log_active->ddl_flags &=
+	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
+
+	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
+	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
+
+	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
+
+	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
+	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
+
+	ddt_log_update_stats(ddt);
+
+	return (B_TRUE);
+}
+
+static inline void
+ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
+    const ddt_key_t *checkpoint)
+{
+	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
+
+	ddt_log_record_entry_t *dlre =
+	    (ddt_log_record_entry_t *)dlr->dlr_payload;
+	if (checkpoint != NULL &&
+	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
+		/* Skip pre-checkpoint entries; they're already flushed. */
+		return;
+	}
+
+	ddt_lightweight_entry_t ddlwe;
+	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
+	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
+
+	ddlwe.ddlwe_key = dlre->dlre_key;
+	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
+
+	ddt_log_update_entry(ddt, ddl, &ddlwe);
+}
+
+static void
+ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
+{
+	void *cookie = NULL;
+	ddt_log_entry_t *ddle;
+	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
+	while ((ddle =
+	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
+		kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+		    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+	}
+	ASSERT(avl_is_empty(&ddl->ddl_tree));
+}
+
+static int
+ddt_log_load_one(ddt_t *ddt, uint_t n)
+{
+	ASSERT3U(n, <, 2);
+
+	ddt_log_t *ddl = &ddt->ddt_log[n];
+
+	char name[DDT_NAMELEN];
+	ddt_log_name(ddt, name, n);
+
+	uint64_t obj;
+	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
+	    sizeof (uint64_t), 1, &obj);
+	if (err == ENOENT)
+		return (0);
+	if (err != 0)
+		return (err);
+
+	dnode_t *dn;
+	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+
+	ddt_log_header_t hdr;
+	dmu_buf_t *db;
+	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0) {
+		dnode_rele(dn, FTAG);
+		return (err);
+	}
+	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
+	dmu_buf_rele(db, FTAG);
+
+	if (DLH_GET_VERSION(&hdr) != 1) {
+		dnode_rele(dn, FTAG);
+		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
+		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
+		    (u_longlong_t)DLH_GET_VERSION(&hdr));
+		return (SET_ERROR(EINVAL));
+	}
+
+	ddt_key_t *checkpoint = NULL;
+	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
+		/*
+		 * If the log has a checkpoint, then we can ignore any entries
+		 * that have already been flushed.
+		 */
+		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
+		checkpoint = &hdr.dlh_checkpoint;
+	}
+
+	if (hdr.dlh_length > 0) {
+		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
+		    ZIO_PRIORITY_SYNC_READ);
+
+		for (uint64_t offset = 0; offset < hdr.dlh_length;
+		    offset += dn->dn_datablksz) {
+			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
+			    DMU_READ_PREFETCH);
+			if (err != 0) {
+				dnode_rele(dn, FTAG);
+				ddt_log_empty(ddt, ddl);
+				return (err);
+			}
+
+			uint64_t boffset = 0;
+			while (boffset < db->db_size) {
+				ddt_log_record_t *dlr =
+				    (ddt_log_record_t *)(db->db_data + boffset);
+
+				/* Partially-filled block, skip the rest */
+				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
+					break;
+
+				switch (DLR_GET_TYPE(dlr)) {
+				case DLR_ENTRY:
+					ddt_log_load_entry(ddt, ddl, dlr,
+					    checkpoint);
+					break;
+
+				default:
+					dmu_buf_rele(db, FTAG);
+					dnode_rele(dn, FTAG);
+					ddt_log_empty(ddt, ddl);
+					return (SET_ERROR(EINVAL));
+				}
+
+				boffset += DLR_GET_RECLEN(dlr);
+			}
+
+			dmu_buf_rele(db, FTAG);
+		}
+	}
+
+	dnode_rele(dn, FTAG);
+
+	ddl->ddl_object = obj;
+	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
+	ddl->ddl_length = hdr.dlh_length;
+	ddl->ddl_first_txg = hdr.dlh_first_txg;
+
+	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
+		ddt->ddt_log_flushing = ddl;
+	else
+		ddt->ddt_log_active = ddl;
+
+	return (0);
+}
+
+int
+ddt_log_load(ddt_t *ddt)
+{
+	int err;
+
+	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
+		/*
+		 * The DDT is going to be freed again in a moment, so there's
+		 * no point loading the log; it'll just slow down import.
+		 */
+		return (0);
+	}
+
+	ASSERT0(ddt->ddt_log[0].ddl_object);
+	ASSERT0(ddt->ddt_log[1].ddl_object);
+	if (ddt->ddt_dir_object == 0) {
+		/*
+		 * If we're configured but the containing dir doesn't exist
+		 * yet, then the log object can't possibly exist either.
+		 */
+		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
+		return (SET_ERROR(ENOENT));
+	}
+
+	if ((err = ddt_log_load_one(ddt, 0)) != 0)
+		return (err);
+	if ((err = ddt_log_load_one(ddt, 1)) != 0)
+		return (err);
+
+	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
+	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
+
+	/*
+	 * We have two finalisation tasks:
+	 *
+	 * - rebuild the histogram. We do this at the end rather than while
+	 *   we're loading so we don't need to uncount and recount entries that
+	 *   appear multiple times in the log.
+	 *
+	 * - remove entries from the flushing tree that are on both trees. This
+	 *   happens when ddt_lookup() rehydrates an entry from the flushing
+	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
+	 *   tree but doesn't remove it from disk.
+	 */
+
+	/*
+	 * We don't technically need a config lock here, since there shouldn't
+	 * be pool config changes during DDT load. dva_get_dsize_sync() via
+	 * ddt_stat_generate() is expecting it though, and it won't hurt
+	 * anything, so we take it.
+	 */
+	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
+
+	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
+	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
+	ddt_log_entry_t *ae = avl_first(al);
+	ddt_log_entry_t *fe = avl_first(fl);
+	while (ae != NULL || fe != NULL) {
+		ddt_log_entry_t *ddle;
+		if (ae == NULL) {
+			/* active exhausted, take flushing */
+			ddle = fe;
+			fe = AVL_NEXT(fl, fe);
+		} else if (fe == NULL) {
+			/* flushing exhausted, take active */
+			ddle = ae;
+			ae = AVL_NEXT(al, ae);
+		} else {
+			/* compare active and flushing */
+			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
+			if (c < 0) {
+				/* active behind, take and advance */
+				ddle = ae;
+				ae = AVL_NEXT(al, ae);
+			} else if (c > 0) {
+				/* flushing behind, take and advance */
+				ddle = fe;
+				fe = AVL_NEXT(fl, fe);
+			} else {
+				/* match. remove from flushing, take active */
+				ddle = fe;
+				fe = AVL_NEXT(fl, fe);
+				avl_remove(fl, ddle);
+
+				ddle = ae;
+				ae = AVL_NEXT(al, ae);
+			}
+		}
+
+		ddt_lightweight_entry_t ddlwe;
+		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
+	}
+
+	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
+
+	ddt_log_update_stats(ddt);
+
+	return (0);
+}
+
+void
+ddt_log_alloc(ddt_t *ddt)
+{
+	ASSERT3P(ddt->ddt_log_active, ==, NULL);
+	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
+
+	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
+	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
+	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
+	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
+	ddt->ddt_log_active = &ddt->ddt_log[0];
+	ddt->ddt_log_flushing = &ddt->ddt_log[1];
+	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
+}
+
+void
+ddt_log_free(ddt_t *ddt)
+{
+	ddt_log_empty(ddt, &ddt->ddt_log[0]);
+	ddt_log_empty(ddt, &ddt->ddt_log[1]);
+	avl_destroy(&ddt->ddt_log[0].ddl_tree);
+	avl_destroy(&ddt->ddt_log[1].ddl_tree);
+}
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
+	"Max transactions before starting to flush dedup logs");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
+	"Max memory for dedup logs");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
+	"Max memory for dedup logs, as % of total memory");
diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c
index 9316200f21f..8f55bc24f0f 100644
--- a/module/zfs/ddt_stats.c
+++ b/module/zfs/ddt_stats.c
@@ -42,7 +42,7 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
 
 	memset(dds, 0, sizeof (*dds));
 
-	for (int p = 0; p < ddlwe->ddlwe_nphys; p++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
@@ -222,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 				ddo_total->ddo_mspace += ddo->ddo_mspace;
 			}
 		}
+
+		ddt_object_t *ddo = &ddt->ddt_log_stats;
+		ddo_total->ddo_count += ddo->ddo_count;
+		ddo_total->ddo_dspace += ddo->ddo_dspace;
+		ddo_total->ddo_mspace += ddo->ddo_mspace;
 	}
 
 	/*
@@ -259,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 				    &ddt->ddt_histogram_cache[type][class]);
 			}
 		}
+
+		ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
 	}
 }
 
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 3de316a1250..96943421f84 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT		dbuf.cache_shift		dbuf_cache_shift
 DDT_ZAP_DEFAULT_BS		dedup.ddt_zap_default_bs	ddt_zap_default_bs
 DDT_ZAP_DEFAULT_IBS		dedup.ddt_zap_default_ibs	ddt_zap_default_ibs
 DDT_DATA_IS_SPECIAL		ddt_data_is_special		zfs_ddt_data_is_special
+DEDUP_LOG_TXG_MAX		dedup.log_txg_max		zfs_dedup_log_txg_max
 DEADMAN_CHECKTIME_MS		deadman.checktime_ms		zfs_deadman_checktime_ms
 DEADMAN_EVENTS_PER_SECOND	deadman_events_per_second	zfs_deadman_events_per_second
 DEADMAN_FAILMODE		deadman.failmode		zfs_deadman_failmode
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
index 83c4d7c8e2a..4f6e5805bb3 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
@@ -29,9 +29,16 @@
 
 log_assert "basic dedup (FDT) operations work"
 
+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	destroy_pool $TESTPOOL
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
index f0f20671b95..259eaddc084 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
@@ -29,9 +29,16 @@
 
 log_assert "dedup (FDT) retains version after import"
 
+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	destroy_pool $TESTPOOL
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
index 049ccaae3dc..114cf0266e1 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
@@ -30,9 +30,16 @@
 
 log_assert "legacy and FDT dedup tables on the same pool can happily coexist"
 
+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	destroy_pool $TESTPOOL
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
index d563fade88a..c36463134fd 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
@@ -30,9 +30,16 @@
 
 log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT"
 
+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	destroy_pool $TESTPOOL
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh
index 5b83a1ca396..326152b510a 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh
@@ -51,6 +51,12 @@ POOL="dedup_pool"
 
 save_tunable TXG_TIMEOUT
 
+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	if poolexists $POOL ; then
@@ -58,6 +64,7 @@ function cleanup
 	fi
 	log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR
 	log_must restore_tunable TXG_TIMEOUT
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }
 
 
@@ -206,10 +213,15 @@ function ddt_dedup_vdev_limit
 
 	#
 	# With no DDT quota in place, the above workload will produce over
-	# 800,000 entries by using space in the normal class. With a quota,
-	# it will be well below 500,000 entries.
+	# 800,000 entries by using space in the normal class. With a quota, it
+	# should be well under 500,000. However, logged entries are hard to
+	# account for because they can appear on both logs, and can also
+	# represent an eventual removal. This isn't easily visible from
+	# outside, and even internally can result in going slightly over quota.
+	# Here, we just set the entry limit a little higher than what we'd
+	# expect, to allow for some instability.
 	#
-	log_must test $(ddt_entries) -le 500000
+	log_must test $(ddt_entries) -le 600000
 
 	do_clean
 }

From a1902f49509b66a475c7b4b0d081792f33f1dc52 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 16 Oct 2023 11:52:17 +1100
Subject: [PATCH 18/59] ddt: block scan until log is flushed, and flush
 aggressively

The dedup log does not have a stable cursor, so it's not possible to
persist our current scan location within it across pool reloads.
Because of this, when walking (scanning), we can't treat it like just
another source of dedup entries.

Instead, when a scan is wanted, we switch to an aggressive flushing
mode, pushing out entries older than the scan start txg as fast as we
can, before starting the scan proper.

Entries after the scan start txg will be handled via other methods; the
DDT ZAPs and logs will be written as normal, and blocks not seen yet
will be offered to the scan machinery as normal.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
---
 include/sys/ddt.h                             |  5 ++
 module/zfs/ddt.c                              | 89 +++++++++++++++++++
 module/zfs/ddt_log.c                          |  8 +-
 module/zfs/dsl_scan.c                         | 25 +++++-
 .../zpool_prefetch/zpool_prefetch_001_pos.ksh |  4 +
 5 files changed, 125 insertions(+), 6 deletions(-)

diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 2fc798725ed..a7920e65806 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -294,6 +294,8 @@ typedef struct {
 	int32_t		ddt_log_flush_rate;	/* rolling log flush rate */
 	int32_t		ddt_log_flush_time_rate; /* avg time spent flushing */
 
+	uint64_t	ddt_flush_force_txg;	/* flush hard before this txg */
+
 	enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
 	spa_t		*ddt_spa;	/* pool this ddt is on */
 	objset_t	*ddt_os;	/* ddt objset (always MOS) */
@@ -393,6 +395,9 @@ extern void ddt_create(spa_t *spa);
 extern int ddt_load(spa_t *spa);
 extern void ddt_unload(spa_t *spa);
 extern void ddt_sync(spa_t *spa, uint64_t txg);
+
+extern void ddt_walk_init(spa_t *spa, uint64_t txg);
+extern boolean_t ddt_walk_ready(spa_t *spa);
 extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
     ddt_lightweight_entry_t *ddlwe);
 
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index ce5c4efb51e..051005f137b 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -183,6 +183,12 @@
  * position on the object even if the object changes, the pool is exported, or
  * OpenZFS is upgraded.
  *
+ * If the "fast_dedup" feature is enabled and the table has a log, the scan
+ * cannot begin until entries on the log are flushed, as the on-disk log has no
+ * concept of a "stable position". Instead, the log flushing process will enter
+ * a more aggressive mode, to flush out as much as is necessary as soon as
+ * possible, in order to begin the scan as soon as possible.
+ *
  * ## Interaction with block cloning
  *
  * If block cloning and dedup are both enabled on a pool, BRT will look for the
@@ -1746,6 +1752,16 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
 			ddt->ddt_flush_min = MAX(
 			    ddt->ddt_log_ingest_rate,
 			    zfs_dedup_log_flush_entries_min);
+
+			/*
+			 * If we've been asked to flush everything in a hurry,
+			 * try to dump as much as possible on this txg. In
+			 * this case we're only limited by time, not amount.
+			 */
+			if (ddt->ddt_flush_force_txg > 0)
+				ddt->ddt_flush_min =
+				    MAX(ddt->ddt_flush_min, avl_numnodes(
+				    &ddt->ddt_log_flushing->ddl_tree));
 		} else {
 			/* We already decided we're done for this txg */
 			return (B_FALSE);
@@ -1856,6 +1872,40 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
 	return (ddt->ddt_flush_pass == 0);
 }
 
+static inline void
+ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
+{
+	/*
+	 * If we're not forcing flush, and not being asked to start, then
+	 * there's nothing more to do.
+	 */
+	if (txg == 0) {
+		/* Update requested, are we currently forcing flush? */
+		if (ddt->ddt_flush_force_txg == 0)
+			return;
+		txg = ddt->ddt_flush_force_txg;
+	}
+
+	/*
+	 * If either of the logs has unflushed entries from at or before
+	 * the wanted txg, set the force txg, otherwise clear it.
+	 */
+
+	if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
+	    ddt->ddt_log_active->ddl_first_txg <= txg) ||
+	    (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
+	    ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
+		ddt->ddt_flush_force_txg = txg;
+		return;
+	}
+
+	/*
+	 * Nothing to flush behind the given txg, so we can clear force flush
+	 * state.
+	 */
+	ddt->ddt_flush_force_txg = 0;
+}
+
 static void
 ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 {
@@ -1881,6 +1931,9 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 		(void) ddt_log_swap(ddt, tx);
 	}
 
+	/* If force flush is no longer necessary, turn it off. */
+	ddt_flush_force_update_txg(ddt, 0);
+
 	/*
 	 * Update flush rate. This is an exponential weighted moving average of
 	 * the number of entries flushed over recent txgs.
@@ -2049,6 +2102,38 @@ ddt_sync(spa_t *spa, uint64_t txg)
 	dmu_tx_commit(tx);
 }
 
+void
+ddt_walk_init(spa_t *spa, uint64_t txg)
+{
+	if (txg == 0)
+		txg = spa_syncing_txg(spa);
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
+			continue;
+
+		ddt_enter(ddt);
+		ddt_flush_force_update_txg(ddt, txg);
+		ddt_exit(ddt);
+	}
+}
+
+boolean_t
+ddt_walk_ready(spa_t *spa)
+{
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
+			continue;
+
+		if (ddt->ddt_flush_force_txg > 0)
+			return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
 int
 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
 {
@@ -2058,6 +2143,10 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
 				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
 				if (ddt == NULL)
 					continue;
+
+				if (ddt->ddt_flush_force_txg > 0)
+					return (EAGAIN);
+
 				int error = ENOENT;
 				if (ddt_object_exists(ddt, ddb->ddb_type,
 				    ddb->ddb_class)) {
diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c
index 7e7ff9e5b89..a367d0cd02f 100644
--- a/module/zfs/ddt_log.c
+++ b/module/zfs/ddt_log.c
@@ -435,7 +435,8 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
 	/*
 	 * Swap policy. We swap the logs (and so begin flushing) when the
 	 * active tree grows too large, or when we haven't swapped it in
-	 * some amount of time.
+	 * some amount of time, or if something has requested the logs be
+	 * flushed ASAP (see ddt_walk_init()).
 	 */
 
 	/*
@@ -452,7 +453,10 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
 	    (ddt->ddt_log_active->ddl_first_txg +
 	    MAX(1, zfs_dedup_log_txg_max));
 
-	if (!(too_large || too_old))
+	const boolean_t force =
+	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
+
+	if (!(too_large || too_old || force))
 		return (B_FALSE);
 
 	ddt_log_t *swap = ddt->ddt_log_active;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index daf1bd5d637..9d040e14630 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 		zap_cursor_fini(&zc);
 	}
 
+	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
+
 	spa_scan_stat_init(spa);
 	vdev_scan_stat_init(spa->spa_root_vdev);
 
@@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
 
+	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
+
 	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 
 	spa_history_log_internal(spa, "scan setup", tx,
@@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 	    txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa) ||
-	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
+	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
+	    !ddt_walk_ready(scn->scn_dp->dp_spa)) {
 		if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
 			dprintf("suspending at first available bookmark "
 			    "%llx/%llx/%llx/%llx\n",
@@ -3029,9 +3034,21 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 			break;
 	}
 
-	zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
-	    "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
-	    (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
+	if (error == EAGAIN) {
+		dsl_scan_check_suspend(scn, NULL);
+		error = 0;
+
+		zfs_dbgmsg("waiting for ddt to become ready for scan "
+		    "on %s with class_max = %u; suspending=%u",
+		    scn->scn_dp->dp_spa->spa_name,
+		    (int)scn->scn_phys.scn_ddt_class_max,
+		    (int)scn->scn_suspending);
+	} else
+		zfs_dbgmsg("scanned %llu ddt entries on %s with "
+		    "class_max = %u; suspending=%u", (longlong_t)n,
+		    scn->scn_dp->dp_spa->spa_name,
+		    (int)scn->scn_phys.scn_ddt_class_max,
+		    (int)scn->scn_suspending);
 
 	ASSERT(error == 0 || error == ENOENT);
 	ASSERT(error != ENOENT ||
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
index a96a38ff178..474f41eae8f 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
@@ -95,6 +95,10 @@ while (( i < 16384 )); do
 	done
 	((i += 1))
 done
+
+# Force the DDT logs to disk with a scrub so they can be prefetched
+log_must zpool scrub -w $TESTPOOL
+
 log_note "Dataset generation completed."
 
 typeset -A generated

From 0d2707815d34177ffa79e3c78512bb1d4237b1ad Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 25 Sep 2023 11:02:46 +1000
Subject: [PATCH 19/59] ddt: lookup and log stats

Adds per-DDT stats counting lookups and where they were serviced from
(either log or backing zap), number of log entries in memory, and flow
rates.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
---
 include/sys/ddt.h |   2 +
 module/zfs/ddt.c  | 163 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 159 insertions(+), 6 deletions(-)

diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index a7920e65806..93abad85af4 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -296,6 +296,8 @@ typedef struct {
 
 	uint64_t	ddt_flush_force_txg;	/* flush hard before this txg */
 
+	kstat_t		*ddt_ksp;	/* kstats context */
+
 	enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
 	spa_t		*ddt_spa;	/* pool this ddt is on */
 	objset_t	*ddt_os;	/* ddt objset (always MOS) */
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 051005f137b..bd1941f43ad 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -271,6 +271,78 @@ static const uint64_t ddt_version_flags[] = {
 /* Dummy version to signal that configure is still necessary */
 #define	DDT_VERSION_UNCONFIGURED	(UINT64_MAX)
 
+#ifdef _KERNEL
+/* per-DDT kstats */
+typedef struct {
+	/* total lookups and whether they returned new or existing entries */
+	kstat_named_t dds_lookup;
+	kstat_named_t dds_lookup_new;
+	kstat_named_t dds_lookup_existing;
+
+	/* entries found on live tree, and if we had to wait for load */
+	kstat_named_t dds_lookup_live_hit;
+	kstat_named_t dds_lookup_live_wait;
+	kstat_named_t dds_lookup_live_miss;
+
+	/* entries found on log trees */
+	kstat_named_t dds_lookup_log_hit;
+	kstat_named_t dds_lookup_log_active_hit;
+	kstat_named_t dds_lookup_log_flushing_hit;
+	kstat_named_t dds_lookup_log_miss;
+
+	/* entries found on store objects */
+	kstat_named_t dds_lookup_stored_hit;
+	kstat_named_t dds_lookup_stored_miss;
+
+	/* number of entries on log trees */
+	kstat_named_t dds_log_active_entries;
+	kstat_named_t dds_log_flushing_entries;
+
+	/* avg updated/flushed entries per txg */
+	kstat_named_t dds_log_ingest_rate;
+	kstat_named_t dds_log_flush_rate;
+	kstat_named_t dds_log_flush_time_rate;
+} ddt_kstats_t;
+
+static const ddt_kstats_t ddt_kstats_template = {
+	{ "lookup",			KSTAT_DATA_UINT64 },
+	{ "lookup_new",			KSTAT_DATA_UINT64 },
+	{ "lookup_existing",		KSTAT_DATA_UINT64 },
+	{ "lookup_live_hit",		KSTAT_DATA_UINT64 },
+	{ "lookup_live_wait",		KSTAT_DATA_UINT64 },
+	{ "lookup_live_miss",		KSTAT_DATA_UINT64 },
+	{ "lookup_log_hit",		KSTAT_DATA_UINT64 },
+	{ "lookup_log_active_hit",	KSTAT_DATA_UINT64 },
+	{ "lookup_log_flushing_hit",	KSTAT_DATA_UINT64 },
+	{ "lookup_log_miss",		KSTAT_DATA_UINT64 },
+	{ "lookup_stored_hit",		KSTAT_DATA_UINT64 },
+	{ "lookup_stored_miss",		KSTAT_DATA_UINT64 },
+	{ "log_active_entries",		KSTAT_DATA_UINT64 },
+	{ "log_flushing_entries",	KSTAT_DATA_UINT64 },
+	{ "log_ingest_rate",		KSTAT_DATA_UINT32 },
+	{ "log_flush_rate",		KSTAT_DATA_UINT32 },
+	{ "log_flush_time_rate",	KSTAT_DATA_UINT32 },
+};
+
+#define	_DDT_KSTAT_STAT(ddt, stat) \
+	&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
+#define	DDT_KSTAT_BUMP(ddt, stat) \
+	do { atomic_inc_64(_DDT_KSTAT_STAT(ddt, stat)); } while (0)
+#define	DDT_KSTAT_ADD(ddt, stat, val) \
+	do { atomic_add_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
+#define	DDT_KSTAT_SUB(ddt, stat, val) \
+	do { atomic_sub_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
+#define	DDT_KSTAT_SET(ddt, stat, val) \
+	do { atomic_store_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
+#define	DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0)
+#else
+#define	DDT_KSTAT_BUMP(ddt, stat) do {} while (0)
+#define	DDT_KSTAT_ADD(ddt, stat, val) do {} while (0)
+#define	DDT_KSTAT_SUB(ddt, stat, val) do {} while (0)
+#define	DDT_KSTAT_SET(ddt, stat, val) do {} while (0)
+#define	DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
+#endif /* _KERNEL */
+
 static void
 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     dmu_tx_t *tx)
@@ -969,6 +1041,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
 	}
 
+	DDT_KSTAT_BUMP(ddt, dds_lookup);
+
 	ddt_key_fill(&search, bp);
 
 	/* Find an existing live entry */
@@ -979,11 +1053,13 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 			return (NULL);
 
 		/* If it's already loaded, we can just return it. */
+		DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
 		if (dde->dde_flags & DDE_FLAG_LOADED)
 			return (dde);
 
 		/* Someone else is loading it, wait for it. */
 		dde->dde_waiters++;
+		DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait);
 		while (!(dde->dde_flags & DDE_FLAG_LOADED))
 			cv_wait(&dde->dde_cv, &ddt->ddt_lock);
 		dde->dde_waiters--;
@@ -997,8 +1073,10 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 			return (NULL);
 		}
 
+		DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
 		return (dde);
-	}
+	} else
+		DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
 
 	/* Time to make a new entry. */
 	dde = ddt_alloc(ddt, &search);
@@ -1012,11 +1090,19 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 	/* If its in the log tree, we can "load" it from there */
 	if (ddt->ddt_flags & DDT_FLAG_LOG) {
 		ddt_lightweight_entry_t ddlwe;
+		boolean_t found = B_FALSE;
 
 		if (ddt_log_take_key(ddt, ddt->ddt_log_active,
-		    &search, &ddlwe) ||
-		    ddt_log_take_key(ddt, ddt->ddt_log_flushing,
 		    &search, &ddlwe)) {
+			DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
+			found = B_TRUE;
+		} else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing,
+		    &search, &ddlwe)) {
+			DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit);
+			found = B_TRUE;
+		}
+
+		if (found) {
 			dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
 
 			dde->dde_type = ddlwe.ddlwe_type;
@@ -1024,8 +1110,13 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 			memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
 			    DDT_PHYS_SIZE(ddt));
 
+			DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
+			DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
+
 			return (dde);
 		}
+
+		DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
 	}
 
 	/*
@@ -1069,6 +1160,9 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		/* Flag cleanup required */
 		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
 	} else if (error == 0) {
+		DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
+		DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
+
 		/*
 		 * The histograms only track inactive (stored or logged) blocks.
 		 * We've just put an entry onto the live list, so we need to
@@ -1085,6 +1179,9 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		ddt_lightweight_entry_t ddlwe;
 		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
 		ddt_histogram_sub_entry(ddt, ddh, &ddlwe);
+	} else {
+		DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss);
+		DDT_KSTAT_BUMP(ddt, dds_lookup_new);
 	}
 
 	/* Entry loaded, everyone can proceed now */
@@ -1317,6 +1414,30 @@ not_found:
 	return (0);
 }
 
+static void
+ddt_table_alloc_kstats(ddt_t *ddt)
+{
+#ifdef _KERNEL
+	char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
+	char *name = kmem_asprintf("ddt_stats_%s",
+	    zio_checksum_table[ddt->ddt_checksum].ci_name);
+
+	ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED,
+	    sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (ddt->ddt_ksp != NULL) {
+		ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP);
+		memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t));
+		ddt->ddt_ksp->ks_data = dds;
+		kstat_install(ddt->ddt_ksp);
+	}
+
+	kmem_strfree(name);
+	kmem_strfree(mod);
+#else
+	(void) ddt;
+#endif /* _KERNEL */
+}
+
 static ddt_t *
 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 {
@@ -1336,6 +1457,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 	ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
 
 	ddt_log_alloc(ddt);
+	ddt_table_alloc_kstats(ddt);
 
 	return (ddt);
 }
@@ -1343,6 +1465,14 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 static void
 ddt_table_free(ddt_t *ddt)
 {
+#ifdef _KERNEL
+	if (ddt->ddt_ksp != NULL) {
+		kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
+		ddt->ddt_ksp->ks_data = NULL;
+		kstat_delete(ddt->ddt_ksp);
+	}
+#endif /* _KERNEL */
+
 	ddt_log_free(ddt);
 	ASSERT0(avl_numnodes(&ddt->ddt_tree));
 	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
@@ -1400,6 +1530,11 @@ ddt_load(spa_t *spa)
 		if (error != 0 && error != ENOENT)
 			return (error);
 
+		DDT_KSTAT_SET(ddt, dds_log_active_entries,
+		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));
+		DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
+		    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
+
 		/*
 		 * Seed the cached histograms.
 		 */
@@ -1860,12 +1995,15 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
 
 	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
 		/* We emptied it, so truncate on-disk */
+		DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
 		ddt_log_truncate(ddt, tx);
 		/* No more passes needed this txg */
 		ddt->ddt_flush_pass = 0;
-	} else
+	} else {
 		/* More to do next time, save checkpoint */
+		DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
 		ddt_log_checkpoint(ddt, &ddlwe, tx);
+	}
 
 	ddt_sync_update_stats(ddt, tx);
 
@@ -1928,7 +2066,11 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 		 * No more to flush, and the active list has stuff, so
 		 * try to swap the logs for next time.
 		 */
-		(void) ddt_log_swap(ddt, tx);
+		if (ddt_log_swap(ddt, tx)) {
+			DDT_KSTAT_ZERO(ddt, dds_log_active_entries);
+			DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
+			    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
+		}
 	}
 
 	/* If force flush is no longer necessary, turn it off. */
@@ -1941,6 +2083,7 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 	ddt->ddt_log_flush_rate = _ewma(
 	    ddt->ddt_flush_count, ddt->ddt_log_flush_rate,
 	    zfs_dedup_log_flush_flow_rate_txgs);
+	DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);
 
 	/*
 	 * Update flush time rate. This is an exponential weighted moving
@@ -1950,6 +2093,8 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 	    ddt->ddt_log_flush_time_rate,
 	    ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))),
 	    zfs_dedup_log_flush_flow_rate_txgs);
+	DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
+	    ddt->ddt_log_flush_time_rate);
 }
 
 static void
@@ -1975,6 +2120,9 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
 
 		ddt_log_commit(ddt, &dlu);
 
+		DDT_KSTAT_SET(ddt, dds_log_active_entries,
+		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));
+
 		/*
 		 * Sync the stats for the store objects. Even though we haven't
 		 * modified anything on those objects, they're no longer the
@@ -1996,7 +2144,7 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
 		ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
 	}
 
-	if (spa_sync_pass(ddt->ddt_spa) == 1)
+	if (spa_sync_pass(ddt->ddt_spa) == 1) {
 		/*
 		 * Update ingest rate. This is an exponential weighted moving
 		 * average of the number of entries changed over recent txgs.
@@ -2006,6 +2154,9 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
 		ddt->ddt_log_ingest_rate = _ewma(
 		    count, ddt->ddt_log_ingest_rate,
 		    zfs_dedup_log_flush_flow_rate_txgs);
+		DDT_KSTAT_SET(ddt, dds_log_ingest_rate,
+		    ddt->ddt_log_ingest_rate);
+	}
 }
 
 static void

From a60e15d6b980c7c029c4c3da1f922a39ea24eac5 Mon Sep 17 00:00:00 2001
From: Allan Jude <allan@klarasystems.com>
Date: Tue, 23 Jul 2024 20:51:01 +0000
Subject: [PATCH 20/59] Man page updates for dmu_ddt_copies

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Closes #15895
---
 man/man4/zfs.4 | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index aae3d7dfb5f..07564187267 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -77,6 +77,17 @@ the array is dynamically sized based on total system memory.
 dnode slots allocated in a single operation as a power of 2.
 The default value minimizes lock contention for the bulk operation performed.
 .
+.It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint
+Controls the number of copies stored for DeDup Table
+.Pq DDT
+objects.
+Reducing the number of copies to 1 from the previous default of 3
+can reduce the write inflation caused by deduplication.
+This assumes redundancy for this data is provided by the vdev layer.
+If the DDT is damaged, space may be leaked
+.Pq not freed
+when the DDT can not report the correct reference count.
+.
 .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
 Limit the amount we can prefetch with one call to this amount in bytes.
 This helps to limit the amount of memory that can be used by prefetching.

From 77a797a3823c12268e7f1d73f5e024a77b2f582a Mon Sep 17 00:00:00 2001
From: shodanshok <g.danti@assyoma.it>
Date: Fri, 16 Aug 2024 22:34:07 +0200
Subject: [PATCH 21/59] Enable L2 cache of all (MRU+MFU) metadata but MFU data
 only

`l2arc_mfuonly` was added to avoid wasting L2 ARC on read-once MRU
data and metadata. However it can be useful to cache as much
metadata as possible while, at the same time, restricting data
cache to MFU buffers only.

This patch allows such behavior by setting `l2arc_mfuonly` to 2
(or higher). The list of possible values is the following:
0: cache both MRU and MFU for both data and metadata;
1: cache only MFU for both data and metadata;
2: cache both MRU and MFU for metadata, but only MFU for data.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Gionatan Danti <g.danti@assyoma.it>
Closes #16343
Closes #16402
---
 man/man4/zfs.4   | 14 ++++++++++----
 module/zfs/arc.c | 11 ++++++++---
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 07564187267..2be3a8414aa 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -132,20 +132,26 @@ Controls whether buffers present on special vdevs are eligible for caching
 into L2ARC.
 If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
 .
-.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq  int
+.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
 Controls whether only MFU metadata and data are cached from ARC into L2ARC.
 This may be desired to avoid wasting space on L2ARC when reading/writing large
 amounts of data that are not expected to be accessed more than once.
 .Pp
-The default is off,
+The default is 0,
 meaning both MRU and MFU data and metadata are cached.
-When turning off this feature, some MRU buffers will still be present
-in ARC and eventually cached on L2ARC.
+When turning off this feature (setting it to 0), some MRU buffers will
+still be present in ARC and eventually cached on L2ARC.
 .No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
 some prefetched buffers will be cached to L2ARC, and those might later
 transition to MRU, in which case the
 .Sy l2arc_mru_asize No arcstat will not be Sy 0 .
 .Pp
+Setting it to 1 means to L2 cache only MFU data and metadata.
+.Pp
+Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
+only MFU data (ie: MRU data are not cached). This can be the right setting
+to cache as much metadata as possible even when having high data turnover.
+.Pp
 Regardless of
 .Sy l2arc_noprefetch ,
 some MFU buffers might be evicted from ARC,
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 78c2cf8ec5c..3c657c979cd 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -9158,12 +9158,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	 */
 	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
 		/*
-		 * If pass == 1 or 3, we cache MRU metadata and data
-		 * respectively.
+		 * pass == 0: MFU meta
+		 * pass == 1: MRU meta
+		 * pass == 2: MFU data
+		 * pass == 3: MRU data
 		 */
-		if (l2arc_mfuonly) {
+		if (l2arc_mfuonly == 1) {
 			if (pass == 1 || pass == 3)
 				continue;
+		} else if (l2arc_mfuonly > 1) {
+			if (pass == 3)
+				continue;
 		}
 
 		uint64_t passed_sz = 0;

From 06a7b123acaaedc36926ab45b3cf61396702dc1d Mon Sep 17 00:00:00 2001
From: Chunwei Chen <tuxoko@gmail.com>
Date: Mon, 19 Aug 2024 09:42:17 -0700
Subject: [PATCH 22/59] Skip ro check for snaps when multi-mount

Skip ro check for snapshots since they are always ro regardless if ro
flag is passed by mount or not. This allows multi-mounting snapshots
without requiring to specify ro flag.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #16299
---
 module/os/linux/zfs/zpl_super.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index d98d32c1f9f..0a82b8858eb 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -292,6 +292,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
 {
 	struct super_block *s;
 	objset_t *os;
+	boolean_t issnap = B_FALSE;
 	int err;
 
 	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
@@ -323,6 +324,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
 		if (zpl_enter(zfsvfs, FTAG) == 0) {
 			if (os != zfsvfs->z_os)
 				err = -SET_ERROR(EBUSY);
+			issnap = zfsvfs->z_issnap;
 			zpl_exit(zfsvfs, FTAG);
 		} else {
 			err = -SET_ERROR(EBUSY);
@@ -346,7 +348,11 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
 			return (ERR_PTR(err));
 		}
 		s->s_flags |= SB_ACTIVE;
-	} else if ((flags ^ s->s_flags) & SB_RDONLY) {
+	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
+		/*
+		 * Skip ro check for snap since snap is always ro regardless
+		 * ro flag is passed by mount or not.
+		 */
 		deactivate_locked_super(s);
 		return (ERR_PTR(-EBUSY));
 	}

From f0ad031cd9236e0b8d9a42ea6b61c14a512a9b70 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 7 May 2024 10:18:22 +1000
Subject: [PATCH 23/59] spl-generic: bring up kstats subsystem before taskq

For spl-taskq to use the kstats infrastructure, it has to be available
first.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Syneto
Closes #16171
---
 module/os/linux/spl/spl-generic.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c
index 986db151845..6ee0236d289 100644
--- a/module/os/linux/spl/spl-generic.c
+++ b/module/os/linux/spl/spl-generic.c
@@ -868,16 +868,16 @@ spl_init(void)
 	if ((rc = spl_tsd_init()))
 		goto out2;
 
-	if ((rc = spl_taskq_init()))
+	if ((rc = spl_proc_init()))
 		goto out3;
 
-	if ((rc = spl_kmem_cache_init()))
+	if ((rc = spl_kstat_init()))
 		goto out4;
 
-	if ((rc = spl_proc_init()))
+	if ((rc = spl_taskq_init()))
 		goto out5;
 
-	if ((rc = spl_kstat_init()))
+	if ((rc = spl_kmem_cache_init()))
 		goto out6;
 
 	if ((rc = spl_zlib_init()))
@@ -891,13 +891,13 @@ spl_init(void)
 out8:
 	spl_zlib_fini();
 out7:
-	spl_kstat_fini();
-out6:
-	spl_proc_fini();
-out5:
 	spl_kmem_cache_fini();
-out4:
+out6:
 	spl_taskq_fini();
+out5:
+	spl_kstat_fini();
+out4:
+	spl_proc_fini();
 out3:
 	spl_tsd_fini();
 out2:
@@ -913,10 +913,10 @@ spl_fini(void)
 {
 	spl_zone_fini();
 	spl_zlib_fini();
-	spl_kstat_fini();
-	spl_proc_fini();
 	spl_kmem_cache_fini();
 	spl_taskq_fini();
+	spl_kstat_fini();
+	spl_proc_fini();
 	spl_tsd_fini();
 	spl_kvmem_fini();
 	spl_random_fini();

From db40fe4cf6254e59459c7c9969a204c540523192 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 3 May 2024 14:42:51 +1000
Subject: [PATCH 24/59] spl-taskq: per-taskq kstats

This exposes a variety of per-taskq stats under /proc/spl/kstat/taskq,
one file per taskq, named for the taskq name.instance.

These include a small amount of info about the taskq config, the current
state of the threads and queues, and various counters for thread and
queue activity since the taskq was created.

To assist with decrementing queue size counters, the list an entry is on
is encoded in spare bits in the entry flags.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Syneto
Closes #16171
---
 include/os/linux/spl/sys/taskq.h |  42 +++++
 module/os/linux/spl/spl-taskq.c  | 314 +++++++++++++++++++++++++++++--
 2 files changed, 342 insertions(+), 14 deletions(-)

diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h
index b73dab631e0..8051de36ba8 100644
--- a/include/os/linux/spl/sys/taskq.h
+++ b/include/os/linux/spl/sys/taskq.h
@@ -20,6 +20,10 @@
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */
+/*
+ * Copyright (c) 2024, Klara Inc.
+ * Copyright (c) 2024, Syneto
+ */
 
 #ifndef _SPL_TASKQ_H
 #define	_SPL_TASKQ_H
@@ -33,6 +37,9 @@
 #include <sys/thread.h>
 #include <sys/rwlock.h>
 #include <sys/wait.h>
+#include <sys/wmsum.h>
+
+typedef struct kstat_s kstat_t;
 
 #define	TASKQ_NAMELEN		31
 
@@ -74,6 +81,32 @@ typedef enum tq_lock_role {
 typedef unsigned long taskqid_t;
 typedef void (task_func_t)(void *);
 
+typedef struct taskq_sums {
+	/* gauges (inc/dec counters, current value) */
+	wmsum_t tqs_threads_active;		/* threads running a task */
+	wmsum_t tqs_threads_idle;		/* threads waiting for work */
+	wmsum_t tqs_threads_total;		/* total threads */
+	wmsum_t tqs_tasks_pending;		/* tasks waiting to execute */
+	wmsum_t tqs_tasks_priority;		/* hi-pri tasks waiting */
+	wmsum_t tqs_tasks_total;		/* total waiting tasks */
+	wmsum_t tqs_tasks_delayed;		/* tasks deferred to future */
+	wmsum_t tqs_entries_free;		/* task entries on free list */
+
+	/* counters (inc only, since taskq creation) */
+	wmsum_t tqs_threads_created;		/* threads created */
+	wmsum_t tqs_threads_destroyed;		/* threads destroyed */
+	wmsum_t tqs_tasks_dispatched;		/* tasks dispatched */
+	wmsum_t tqs_tasks_dispatched_delayed;	/* tasks delayed to future */
+	wmsum_t tqs_tasks_executed_normal;	/* normal pri tasks executed */
+	wmsum_t tqs_tasks_executed_priority;	/* high pri tasks executed */
+	wmsum_t tqs_tasks_executed;		/* total tasks executed */
+	wmsum_t tqs_tasks_delayed_requeued;	/* delayed tasks requeued */
+	wmsum_t tqs_tasks_cancelled;		/* tasks cancelled before run */
+	wmsum_t tqs_thread_wakeups;		/* total thread wakeups */
+	wmsum_t tqs_thread_wakeups_nowork;	/* thread woken but no tasks */
+	wmsum_t tqs_thread_sleeps;		/* total thread sleeps */
+} taskq_sums_t;
+
 typedef struct taskq {
 	spinlock_t		tq_lock;	/* protects taskq_t */
 	char			*tq_name;	/* taskq name */
@@ -105,6 +138,8 @@ typedef struct taskq {
 	struct hlist_node	tq_hp_cb_node;
 	boolean_t		tq_hp_support;
 	unsigned long		lastspawnstop;	/* when to purge dynamic */
+	taskq_sums_t		tq_sums;
+	kstat_t			*tq_ksp;
 } taskq_t;
 
 typedef struct taskq_ent {
@@ -123,6 +158,13 @@ typedef struct taskq_ent {
 #define	TQENT_FLAG_PREALLOC	0x1
 #define	TQENT_FLAG_CANCEL	0x2
 
+/* bits 2-3 are which list tqent is on */
+#define	TQENT_LIST_NONE		0x0
+#define	TQENT_LIST_PENDING	0x4
+#define	TQENT_LIST_PRIORITY	0x8
+#define	TQENT_LIST_DELAY	0xc
+#define	TQENT_LIST_MASK		0xc
+
 typedef struct taskq_thread {
 	struct list_head	tqt_thread_list;
 	struct list_head	tqt_active_list;
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index e7b812c3b5b..61012bfb36d 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -22,16 +22,98 @@
  *
  *  Solaris Porting Layer (SPL) Task Queue Implementation.
  */
+/*
+ * Copyright (c) 2024, Klara Inc.
+ * Copyright (c) 2024, Syneto
+ */
 
 #include <sys/timer.h>
 #include <sys/taskq.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
 #include <sys/trace_spl.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/kstat.h>
 #ifdef HAVE_CPU_HOTPLUG
 #include <linux/cpuhotplug.h>
 #endif
 
+typedef struct taskq_kstats {
+	/* static values, for completeness */
+	kstat_named_t tqks_threads_max;
+	kstat_named_t tqks_entry_pool_min;
+	kstat_named_t tqks_entry_pool_max;
+
+	/* gauges (inc/dec counters, current value) */
+	kstat_named_t tqks_threads_active;
+	kstat_named_t tqks_threads_idle;
+	kstat_named_t tqks_threads_total;
+	kstat_named_t tqks_tasks_pending;
+	kstat_named_t tqks_tasks_priority;
+	kstat_named_t tqks_tasks_total;
+	kstat_named_t tqks_tasks_delayed;
+	kstat_named_t tqks_entries_free;
+
+	/* counters (inc only, since taskq creation) */
+	kstat_named_t tqks_threads_created;
+	kstat_named_t tqks_threads_destroyed;
+	kstat_named_t tqks_tasks_dispatched;
+	kstat_named_t tqks_tasks_dispatched_delayed;
+	kstat_named_t tqks_tasks_executed_normal;
+	kstat_named_t tqks_tasks_executed_priority;
+	kstat_named_t tqks_tasks_executed;
+	kstat_named_t tqks_tasks_delayed_requeued;
+	kstat_named_t tqks_tasks_cancelled;
+	kstat_named_t tqks_thread_wakeups;
+	kstat_named_t tqks_thread_wakeups_nowork;
+	kstat_named_t tqks_thread_sleeps;
+} taskq_kstats_t;
+
+static taskq_kstats_t taskq_kstats_template = {
+	{ "threads_max",		KSTAT_DATA_UINT64 },
+	{ "entry_pool_min",		KSTAT_DATA_UINT64 },
+	{ "entry_pool_max",		KSTAT_DATA_UINT64 },
+	{ "threads_active",		KSTAT_DATA_UINT64 },
+	{ "threads_idle",		KSTAT_DATA_UINT64 },
+	{ "threads_total",		KSTAT_DATA_UINT64 },
+	{ "tasks_pending",		KSTAT_DATA_UINT64 },
+	{ "tasks_priority",		KSTAT_DATA_UINT64 },
+	{ "tasks_total",		KSTAT_DATA_UINT64 },
+	{ "tasks_delayed",		KSTAT_DATA_UINT64 },
+	{ "entries_free",		KSTAT_DATA_UINT64 },
+
+	{ "threads_created",		KSTAT_DATA_UINT64 },
+	{ "threads_destroyed",		KSTAT_DATA_UINT64 },
+	{ "tasks_dispatched",		KSTAT_DATA_UINT64 },
+	{ "tasks_dispatched_delayed",	KSTAT_DATA_UINT64 },
+	{ "tasks_executed_normal",	KSTAT_DATA_UINT64 },
+	{ "tasks_executed_priority",	KSTAT_DATA_UINT64 },
+	{ "tasks_executed",		KSTAT_DATA_UINT64 },
+	{ "tasks_delayed_requeued",	KSTAT_DATA_UINT64 },
+	{ "tasks_cancelled",		KSTAT_DATA_UINT64 },
+	{ "thread_wakeups",		KSTAT_DATA_UINT64 },
+	{ "thread_wakeups_nowork",	KSTAT_DATA_UINT64 },
+	{ "thread_sleeps",		KSTAT_DATA_UINT64 },
+};
+
+#define	TQSTAT_INC(tq, stat)	wmsum_add(&tq->tq_sums.tqs_##stat, 1)
+#define	TQSTAT_DEC(tq, stat)	wmsum_add(&tq->tq_sums.tqs_##stat, -1)
+
+#define	_TQSTAT_MOD_LIST(mod, tq, t) do { \
+	switch (t->tqent_flags & TQENT_LIST_MASK) {			\
+	case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\
+	case TQENT_LIST_PENDING: mod(tq, tasks_pending); break;		\
+	case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break;	\
+	case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break;		\
+	}								\
+} while (0)
+#define	TQSTAT_INC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_INC, tq, t)
+#define	TQSTAT_DEC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t)
+/* Tag t with the list (TQENT_LIST_*) it is on; usable as one statement. */
+#define	TQENT_SET_LIST(t, l)	\
+	((t)->tqent_flags = ((t)->tqent_flags & ~TQENT_LIST_MASK) | (l))
+
 static int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
@@ -134,6 +216,7 @@ retry:
 		ASSERT(!timer_pending(&t->tqent_timer));
 
 		list_del_init(&t->tqent_list);
+		TQSTAT_DEC(tq, entries_free);
 		return (t);
 	}
 
@@ -204,12 +287,11 @@ task_done(taskq_t *tq, taskq_ent_t *t)
 {
 	ASSERT(tq);
 	ASSERT(t);
+	ASSERT(list_empty(&t->tqent_list));
 
 	/* Wake tasks blocked in taskq_wait_id() */
 	wake_up_all(&t->tqent_waitq);
 
-	list_del_init(&t->tqent_list);
-
 	if (tq->tq_nalloc <= tq->tq_minalloc) {
 		t->tqent_id = TASKQID_INVALID;
 		t->tqent_func = NULL;
@@ -217,6 +299,7 @@ task_done(taskq_t *tq, taskq_ent_t *t)
 		t->tqent_flags = 0;
 
 		list_add_tail(&t->tqent_list, &tq->tq_free_list);
+		TQSTAT_INC(tq, entries_free);
 	} else {
 		task_free(tq, t);
 	}
@@ -263,6 +346,8 @@ task_expire_impl(taskq_ent_t *t)
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	wake_up(&tq->tq_work_waitq);
+
+	TQSTAT_INC(tq, tasks_delayed_requeued);
 }
 
 static void
@@ -534,7 +619,10 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
 	t = taskq_find(tq, id);
 	if (t && t != ERR_PTR(-EBUSY)) {
 		list_del_init(&t->tqent_list);
+		TQSTAT_DEC_LIST(tq, t);
+
 		t->tqent_flags |= TQENT_FLAG_CANCEL;
+		TQSTAT_INC(tq, tasks_cancelled);
 
 		/*
 		 * When canceling the lowest outstanding task id we
@@ -604,13 +692,19 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 	spin_lock(&t->tqent_lock);
 
 	/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
-	if (flags & TQ_NOQUEUE)
+	if (flags & TQ_NOQUEUE) {
+		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add(&t->tqent_list, &tq->tq_prio_list);
 	/* Queue to the priority list instead of the pending list */
-	else if (flags & TQ_FRONT)
+	} else if (flags & TQ_FRONT) {
+		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
-	else
+	} else {
+		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
 		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+	}
+	TQSTAT_INC_LIST(tq, t);
+	TQSTAT_INC(tq, tasks_total);
 
 	t->tqent_id = rc = tq->tq_next_id;
 	tq->tq_next_id++;
@@ -629,6 +723,8 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 
 	wake_up(&tq->tq_work_waitq);
 
+	TQSTAT_INC(tq, tasks_dispatched);
+
 	/* Spawn additional taskq threads if required. */
 	if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
@@ -662,6 +758,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
 
 	/* Queue to the delay list for subsequent execution */
 	list_add_tail(&t->tqent_list, &tq->tq_delay_list);
+	TQENT_SET_LIST(t, TQENT_LIST_DELAY);
+	TQSTAT_INC_LIST(tq, t);
 
 	t->tqent_id = rc = tq->tq_next_id;
 	tq->tq_next_id++;
@@ -676,6 +774,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
 
 	spin_unlock(&t->tqent_lock);
 
+	TQSTAT_INC(tq, tasks_dispatched_delayed);
+
 	/* Spawn additional taskq threads if required. */
 	if (tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
@@ -724,10 +824,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 	t->tqent_flags |= TQENT_FLAG_PREALLOC;
 
 	/* Queue to the priority list instead of the pending list */
-	if (flags & TQ_FRONT)
+	if (flags & TQ_FRONT) {
+		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
-	else
+	} else {
+		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
 		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+	}
+	TQSTAT_INC_LIST(tq, t);
+	TQSTAT_INC(tq, tasks_total);
 
 	t->tqent_id = tq->tq_next_id;
 	tq->tq_next_id++;
@@ -742,6 +847,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 
 	wake_up(&tq->tq_work_waitq);
 
+	TQSTAT_INC(tq, tasks_dispatched);
+
 	/* Spawn additional taskq threads if required. */
 	if (tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
@@ -908,6 +1015,8 @@ taskq_thread(void *args)
 	wake_up(&tq->tq_wait_waitq);
 	set_current_state(TASK_INTERRUPTIBLE);
 
+	TQSTAT_INC(tq, threads_total);
+
 	while (!kthread_should_stop()) {
 
 		if (list_empty(&tq->tq_pend_list) &&
@@ -919,9 +1028,15 @@ taskq_thread(void *args)
 			add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
 			spin_unlock_irqrestore(&tq->tq_lock, flags);
 
+			TQSTAT_INC(tq, thread_sleeps);
+			TQSTAT_INC(tq, threads_idle);
+
 			schedule();
 			seq_tasks = 0;
 
+			TQSTAT_DEC(tq, threads_idle);
+			TQSTAT_INC(tq, thread_wakeups);
+
 			spin_lock_irqsave_nested(&tq->tq_lock, flags,
 			    tq->tq_lock_class);
 			remove_wait_queue(&tq->tq_work_waitq, &wait);
@@ -931,6 +1046,8 @@ taskq_thread(void *args)
 
 		if ((t = taskq_next_ent(tq)) != NULL) {
 			list_del_init(&t->tqent_list);
+			TQSTAT_DEC_LIST(tq, t);
+			TQSTAT_DEC(tq, tasks_total);
 
 			/*
 			 * A TQENT_FLAG_PREALLOC task may be reused or freed
@@ -955,6 +1072,7 @@ taskq_thread(void *args)
 			tq->tq_nactive++;
 			spin_unlock_irqrestore(&tq->tq_lock, flags);
 
+			TQSTAT_INC(tq, threads_active);
 			DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
 
 			/* Perform the requested task */
@@ -962,8 +1080,17 @@ taskq_thread(void *args)
 
 			DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
 
+			TQSTAT_DEC(tq, threads_active);
+			if ((t->tqent_flags & TQENT_LIST_MASK) ==
+			    TQENT_LIST_PENDING)
+				TQSTAT_INC(tq, tasks_executed_normal);
+			else
+				TQSTAT_INC(tq, tasks_executed_priority);
+			TQSTAT_INC(tq, tasks_executed);
+
 			spin_lock_irqsave_nested(&tq->tq_lock, flags,
 			    tq->tq_lock_class);
+
 			tq->tq_nactive--;
 			list_del_init(&tqt->tqt_active_list);
 			tqt->tqt_task = NULL;
@@ -989,7 +1116,8 @@ taskq_thread(void *args)
 			tqt->tqt_id = TASKQID_INVALID;
 			tqt->tqt_flags = 0;
 			wake_up_all(&tq->tq_wait_waitq);
-		}
+		} else
+			TQSTAT_INC(tq, thread_wakeups_nowork);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -998,6 +1126,10 @@ taskq_thread(void *args)
 	__set_current_state(TASK_RUNNING);
 	tq->tq_nthreads--;
 	list_del_init(&tqt->tqt_thread_list);
+
+	TQSTAT_DEC(tq, threads_total);
+	TQSTAT_INC(tq, threads_destroyed);
+
 error:
 	kmem_free(tqt, sizeof (taskq_thread_t));
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
@@ -1037,9 +1169,156 @@ taskq_thread_create(taskq_t *tq)
 
 	wake_up_process(tqt->tqt_thread);
 
+	TQSTAT_INC(tq, threads_created);
+
 	return (tqt);
 }
 
+static void
+taskq_stats_init(taskq_t *tq)
+{
+	taskq_sums_t *tqs = &tq->tq_sums;
+	wmsum_init(&tqs->tqs_threads_active, 0);
+	wmsum_init(&tqs->tqs_threads_idle, 0);
+	wmsum_init(&tqs->tqs_threads_total, 0);
+	wmsum_init(&tqs->tqs_tasks_pending, 0);
+	wmsum_init(&tqs->tqs_tasks_priority, 0);
+	wmsum_init(&tqs->tqs_tasks_total, 0);
+	wmsum_init(&tqs->tqs_tasks_delayed, 0);
+	wmsum_init(&tqs->tqs_entries_free, 0);
+	wmsum_init(&tqs->tqs_threads_created, 0);
+	wmsum_init(&tqs->tqs_threads_destroyed, 0);
+	wmsum_init(&tqs->tqs_tasks_dispatched, 0);
+	wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0);
+	wmsum_init(&tqs->tqs_tasks_executed_normal, 0);
+	wmsum_init(&tqs->tqs_tasks_executed_priority, 0);
+	wmsum_init(&tqs->tqs_tasks_executed, 0);
+	wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0);
+	wmsum_init(&tqs->tqs_tasks_cancelled, 0);
+	wmsum_init(&tqs->tqs_thread_wakeups, 0);
+	wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0);
+	wmsum_init(&tqs->tqs_thread_sleeps, 0);
+}
+
+static void
+taskq_stats_fini(taskq_t *tq)
+{
+	taskq_sums_t *tqs = &tq->tq_sums;
+	wmsum_fini(&tqs->tqs_threads_active);
+	wmsum_fini(&tqs->tqs_threads_idle);
+	wmsum_fini(&tqs->tqs_threads_total);
+	wmsum_fini(&tqs->tqs_tasks_pending);
+	wmsum_fini(&tqs->tqs_tasks_priority);
+	wmsum_fini(&tqs->tqs_tasks_total);
+	wmsum_fini(&tqs->tqs_tasks_delayed);
+	wmsum_fini(&tqs->tqs_entries_free);
+	wmsum_fini(&tqs->tqs_threads_created);
+	wmsum_fini(&tqs->tqs_threads_destroyed);
+	wmsum_fini(&tqs->tqs_tasks_dispatched);
+	wmsum_fini(&tqs->tqs_tasks_dispatched_delayed);
+	wmsum_fini(&tqs->tqs_tasks_executed_normal);
+	wmsum_fini(&tqs->tqs_tasks_executed_priority);
+	wmsum_fini(&tqs->tqs_tasks_executed);
+	wmsum_fini(&tqs->tqs_tasks_delayed_requeued);
+	wmsum_fini(&tqs->tqs_tasks_cancelled);
+	wmsum_fini(&tqs->tqs_thread_wakeups);
+	wmsum_fini(&tqs->tqs_thread_wakeups_nowork);
+	wmsum_fini(&tqs->tqs_thread_sleeps);
+}
+
+static int
+taskq_kstats_update(kstat_t *ksp, int rw)
+{
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	taskq_t *tq = ksp->ks_private;
+	taskq_kstats_t *tqks = ksp->ks_data;
+
+	tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads;
+	tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc;
+	tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc;
+
+	taskq_sums_t *tqs = &tq->tq_sums;
+
+	tqks->tqks_threads_active.value.ui64 =
+	    wmsum_value(&tqs->tqs_threads_active);
+	tqks->tqks_threads_idle.value.ui64 =
+	    wmsum_value(&tqs->tqs_threads_idle);
+	tqks->tqks_threads_total.value.ui64 =
+	    wmsum_value(&tqs->tqs_threads_total);
+	tqks->tqks_tasks_pending.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_pending);
+	tqks->tqks_tasks_priority.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_priority);
+	tqks->tqks_tasks_total.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_total);
+	tqks->tqks_tasks_delayed.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_delayed);
+	tqks->tqks_entries_free.value.ui64 =
+	    wmsum_value(&tqs->tqs_entries_free);
+	tqks->tqks_threads_created.value.ui64 =
+	    wmsum_value(&tqs->tqs_threads_created);
+	tqks->tqks_threads_destroyed.value.ui64 =
+	    wmsum_value(&tqs->tqs_threads_destroyed);
+	tqks->tqks_tasks_dispatched.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_dispatched);
+	tqks->tqks_tasks_dispatched_delayed.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_dispatched_delayed);
+	tqks->tqks_tasks_executed_normal.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_executed_normal);
+	tqks->tqks_tasks_executed_priority.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_executed_priority);
+	tqks->tqks_tasks_executed.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_executed);
+	tqks->tqks_tasks_delayed_requeued.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_delayed_requeued);
+	tqks->tqks_tasks_cancelled.value.ui64 =
+	    wmsum_value(&tqs->tqs_tasks_cancelled);
+	tqks->tqks_thread_wakeups.value.ui64 =
+	    wmsum_value(&tqs->tqs_thread_wakeups);
+	tqks->tqks_thread_wakeups_nowork.value.ui64 =
+	    wmsum_value(&tqs->tqs_thread_wakeups_nowork);
+	tqks->tqks_thread_sleeps.value.ui64 =
+	    wmsum_value(&tqs->tqs_thread_sleeps);
+
+	return (0);
+}
+
+static void
+taskq_kstats_init(taskq_t *tq)
+{
+	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
+	snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance);
+
+	kstat_t *ksp = kstat_create("taskq", 0, name, "misc",
+	    KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (ksp == NULL)
+		return;
+
+	ksp->ks_private = tq;
+	ksp->ks_update = taskq_kstats_update;
+	ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP);
+	memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t));
+	kstat_install(ksp);
+
+	tq->tq_ksp = ksp;
+}
+
+static void
+taskq_kstats_fini(taskq_t *tq)
+{
+	if (tq->tq_ksp == NULL)
+		return;
+
+	kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t));
+	kstat_delete(tq->tq_ksp);
+
+	tq->tq_ksp = NULL;
+}
+
 taskq_t *
 taskq_create(const char *name, int threads_arg, pri_t pri,
     int minalloc, int maxalloc, uint_t flags)
@@ -1104,6 +1383,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
 	init_waitqueue_head(&tq->tq_wait_waitq);
 	tq->tq_lock_class = TQ_LOCK_GENERAL;
 	INIT_LIST_HEAD(&tq->tq_taskqs);
+	taskq_stats_init(tq);
 
 	if (flags & TASKQ_PREPOPULATE) {
 		spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
@@ -1137,14 +1417,17 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
 
 	if (rc) {
 		taskq_destroy(tq);
-		tq = NULL;
-	} else {
-		down_write(&tq_list_sem);
-		tq->tq_instance = taskq_find_by_name(name) + 1;
-		list_add_tail(&tq->tq_taskqs, &tq_list);
-		up_write(&tq_list_sem);
+		return (NULL);
 	}
 
+	down_write(&tq_list_sem);
+	tq->tq_instance = taskq_find_by_name(name) + 1;
+	list_add_tail(&tq->tq_taskqs, &tq_list);
+	up_write(&tq_list_sem);
+
+	/* Install kstats late, because the name includes tq_instance */
+	taskq_kstats_init(tq);
+
 	return (tq);
 }
 EXPORT_SYMBOL(taskq_create);
@@ -1177,6 +1460,8 @@ taskq_destroy(taskq_t *tq)
 
 	taskq_wait(tq);
 
+	taskq_kstats_fini(tq);
+
 	/* remove taskq from global list used by the kstats */
 	down_write(&tq_list_sem);
 	list_del(&tq->tq_taskqs);
@@ -1230,6 +1515,7 @@ taskq_destroy(taskq_t *tq)
 
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
+	taskq_stats_fini(tq);
 	kmem_strfree(tq->tq_name);
 	kmem_free(tq, sizeof (taskq_t));
 }

From 3f8fd3cae081fc13608e30e25b2e9df73fc59de9 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 7 May 2024 10:26:20 +1000
Subject: [PATCH 25/59] spl-taskq: summary stats for all taskqs

This adds /proc/spl/kstat/taskq/summary, which attempts to show a
useful subset of stats for all taskqs in the system.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Syneto
Closes #16171
---
 module/os/linux/spl/spl-taskq.c | 98 +++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index 61012bfb36d..29b8f542650 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -1557,6 +1557,100 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri,
 }
 EXPORT_SYMBOL(taskq_create_synced);
 
+static kstat_t *taskq_summary_ksp = NULL;
+
+static int
+spl_taskq_kstat_headers(char *buf, size_t size)
+{
+	size_t n = snprintf(buf, size,
+	    "%-20s | %-17s | %-23s\n"
+	    "%-20s | %-17s | %-23s\n"
+	    "%-20s | %-17s | %-23s\n",
+	    "", "threads", "tasks on queue",
+	    "taskq name", "tot [act idl] max", " pend [ norm  high] dly",
+	    "--------------------", "-----------------",
+	    "-----------------------");
+	return (n >= size ? ENOMEM : 0);
+}
+
+/* Raw-kstat data callback: write one summary row per taskq into buf. */
+static int
+spl_taskq_kstat_data(char *buf, size_t size, void *data)
+{
+	struct list_head *tql = NULL;
+	taskq_t *tq;
+	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
+	char threads[25];
+	char tasks[30];
+	size_t n;
+	int err = 0;
+
+	down_read(&tq_list_sem);
+	list_for_each_prev(tql, &tq_list) {
+		tq = list_entry(tql, taskq_t, tq_taskqs);
+		if (tq->tq_ksp == NULL)	/* no kstat installed for this taskq */
+			continue;
+
+		mutex_enter(tq->tq_ksp->ks_lock);
+		taskq_kstats_update(tq->tq_ksp, KSTAT_READ);
+		taskq_kstats_t *tqks = tq->tq_ksp->ks_data;
+		snprintf(name, sizeof (name), "%s.%d", tq->tq_name,
+		    tq->tq_instance);
+		snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu",
+		    tqks->tqks_threads_total.value.ui64,
+		    tqks->tqks_threads_active.value.ui64,
+		    tqks->tqks_threads_idle.value.ui64,
+		    tqks->tqks_threads_max.value.ui64);
+		snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu",
+		    tqks->tqks_tasks_total.value.ui64,
+		    tqks->tqks_tasks_pending.value.ui64,
+		    tqks->tqks_tasks_priority.value.ui64,
+		    tqks->tqks_tasks_delayed.value.ui64);
+		mutex_exit(tq->tq_ksp->ks_lock);
+
+		/* Stop with ENOMEM if the row does not fit in what is left. */
+		n = snprintf(buf, size, "%-20s | %-17s | %-23s\n",
+		    name, threads, tasks);
+		if (n >= size) {
+			err = ENOMEM;
+			break;
+		}
+		buf = &buf[n];
+		size -= n;
+	}
+	up_read(&tq_list_sem);
+
+	return (err);
+}
+
+static void
+spl_taskq_kstat_init(void)
+{
+	kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+	if (ksp == NULL)
+		return;
+
+	ksp->ks_data = (void *)(uintptr_t)1;
+	ksp->ks_ndata = 1;
+	kstat_set_raw_ops(ksp, spl_taskq_kstat_headers,
+	    spl_taskq_kstat_data, NULL);
+	kstat_install(ksp);
+
+	taskq_summary_ksp = ksp;
+}
+
+static void
+spl_taskq_kstat_fini(void)
+{
+	if (taskq_summary_ksp == NULL)
+		return;
+
+	kstat_delete(taskq_summary_ksp);
+	taskq_summary_ksp = NULL;
+}
+
 static unsigned int spl_taskq_kick = 0;
 
 /*
@@ -1737,12 +1831,16 @@ spl_taskq_init(void)
 	 */
 	dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
 
+	spl_taskq_kstat_init();
+
 	return (0);
 }
 
 void
 spl_taskq_fini(void)
 {
+	spl_taskq_kstat_fini();
+
 	taskq_destroy(dynamic_taskq);
 	dynamic_taskq = NULL;
 

From 816d2b2bfc2591b951f32aeb7c00e14e27ee624c Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 7 May 2024 10:17:12 +1000
Subject: [PATCH 26/59] spl-proc: remove old taskq stats

These had minimal useful information for the admin, didn't work properly
in some places, and knew far too much about taskq internals.

With the new stats available, these should never be needed anymore.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Syneto
Closes #16171
---
 man/man4/spl.4                 |  11 --
 module/os/linux/spl/spl-proc.c | 268 ---------------------------------
 2 files changed, 279 deletions(-)

diff --git a/man/man4/spl.4 b/man/man4/spl.4
index 5cc12764e18..22832c492db 100644
--- a/man/man4/spl.4
+++ b/man/man4/spl.4
@@ -175,17 +175,6 @@ Increasing this value will
 result in a slower thread creation rate which may be preferable for some
 configurations.
 .
-.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint
-The maximum number of tasks per pending list in each taskq shown in
-.Pa /proc/spl/taskq{,-all} .
-Write
-.Sy 0
-to turn off the limit.
-The proc file will walk the lists with lock held,
-reading it could cause a lock-up if the list grow too large
-without limiting the output.
-"(truncated)" will be shown if the list is larger than the limit.
-.
 .It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint
 Minimum idle threads exit interval for dynamic taskqs.
 Smaller values allow idle threads exit more often and potentially be
diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c
index 2c0cdd9febf..9fefcd03c41 100644
--- a/module/os/linux/spl/spl-proc.c
+++ b/module/os/linux/spl/spl-proc.c
@@ -31,7 +31,6 @@
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/vmem.h>
-#include <sys/taskq.h>
 #include <sys/proc.h>
 #include <linux/ctype.h>
 #include <linux/kmod.h>
@@ -63,8 +62,6 @@ static struct ctl_table_header *spl_kstat = NULL;
 static struct proc_dir_entry *proc_spl = NULL;
 static struct proc_dir_entry *proc_spl_kmem = NULL;
 static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
-static struct proc_dir_entry *proc_spl_taskq_all = NULL;
-static struct proc_dir_entry *proc_spl_taskq = NULL;
 struct proc_dir_entry *proc_spl_kstat = NULL;
 
 #ifdef DEBUG_KMEM
@@ -177,195 +174,6 @@ proc_dohostid(CONST_CTL_TABLE *table, int write,
 	return (0);
 }
 
-static void
-taskq_seq_show_headers(struct seq_file *f)
-{
-	seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
-	    "taskq", "act", "nthr", "spwn", "maxt", "pri",
-	    "mina", "maxa", "cura", "flags");
-}
-
-/* indices into the lheads array below */
-#define	LHEAD_PEND	0
-#define	LHEAD_PRIO	1
-#define	LHEAD_DELAY	2
-#define	LHEAD_WAIT	3
-#define	LHEAD_ACTIVE	4
-#define	LHEAD_SIZE	5
-
-static unsigned int spl_max_show_tasks = 512;
-/* CSTYLED */
-module_param(spl_max_show_tasks, uint, 0644);
-MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
-
-static int
-taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
-{
-	taskq_t *tq = p;
-	taskq_thread_t *tqt = NULL;
-	spl_wait_queue_entry_t *wq;
-	struct task_struct *tsk;
-	taskq_ent_t *tqe;
-	char name[100];
-	struct list_head *lheads[LHEAD_SIZE], *lh;
-	static char *list_names[LHEAD_SIZE] =
-	    {"pend", "prio", "delay", "wait", "active" };
-	int i, j, have_lheads = 0;
-	unsigned long wflags, flags;
-
-	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
-	spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
-
-	/* get the various lists and check whether they're empty */
-	lheads[LHEAD_PEND] = &tq->tq_pend_list;
-	lheads[LHEAD_PRIO] = &tq->tq_prio_list;
-	lheads[LHEAD_DELAY] = &tq->tq_delay_list;
-#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
-	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
-#else
-	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
-#endif
-	lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
-
-	for (i = 0; i < LHEAD_SIZE; ++i) {
-		if (list_empty(lheads[i]))
-			lheads[i] = NULL;
-		else
-			++have_lheads;
-	}
-
-	/* early return in non-"all" mode if lists are all empty */
-	if (!allflag && !have_lheads) {
-		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
-		spin_unlock_irqrestore(&tq->tq_lock, flags);
-		return (0);
-	}
-
-	/* unlock the waitq quickly */
-	if (!lheads[LHEAD_WAIT])
-		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
-
-	/* show the base taskq contents */
-	snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
-	seq_printf(f, "%-25s ", name);
-	seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
-	    tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
-	    tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
-	    tq->tq_nalloc, tq->tq_flags);
-
-	/* show the active list */
-	if (lheads[LHEAD_ACTIVE]) {
-		j = 0;
-		list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
-			if (j == 0)
-				seq_printf(f, "\t%s:",
-				    list_names[LHEAD_ACTIVE]);
-			else if (j == 2) {
-				seq_printf(f, "\n\t       ");
-				j = 0;
-			}
-			seq_printf(f, " [%d]%pf(%ps)",
-			    tqt->tqt_thread->pid,
-			    tqt->tqt_task->tqent_func,
-			    tqt->tqt_task->tqent_arg);
-			++j;
-		}
-		seq_printf(f, "\n");
-	}
-
-	for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
-		if (lheads[i]) {
-			j = 0;
-			list_for_each(lh, lheads[i]) {
-				if (spl_max_show_tasks != 0 &&
-				    j >= spl_max_show_tasks) {
-					seq_printf(f, "\n\t(truncated)");
-					break;
-				}
-				/* show the wait waitq list */
-				if (i == LHEAD_WAIT) {
-#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
-					wq = list_entry(lh,
-					    spl_wait_queue_entry_t, entry);
-#else
-					wq = list_entry(lh,
-					    spl_wait_queue_entry_t, task_list);
-#endif
-					if (j == 0)
-						seq_printf(f, "\t%s:",
-						    list_names[i]);
-					else if (j % 8 == 0)
-						seq_printf(f, "\n\t     ");
-
-					tsk = wq->private;
-					seq_printf(f, " %d", tsk->pid);
-				/* pend, prio and delay lists */
-				} else {
-					tqe = list_entry(lh, taskq_ent_t,
-					    tqent_list);
-					if (j == 0)
-						seq_printf(f, "\t%s:",
-						    list_names[i]);
-					else if (j % 2 == 0)
-						seq_printf(f, "\n\t     ");
-
-					seq_printf(f, " %pf(%ps)",
-					    tqe->tqent_func,
-					    tqe->tqent_arg);
-				}
-				++j;
-			}
-			seq_printf(f, "\n");
-		}
-	if (lheads[LHEAD_WAIT])
-		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
-	spin_unlock_irqrestore(&tq->tq_lock, flags);
-
-	return (0);
-}
-
-static int
-taskq_all_seq_show(struct seq_file *f, void *p)
-{
-	return (taskq_seq_show_impl(f, p, B_TRUE));
-}
-
-static int
-taskq_seq_show(struct seq_file *f, void *p)
-{
-	return (taskq_seq_show_impl(f, p, B_FALSE));
-}
-
-static void *
-taskq_seq_start(struct seq_file *f, loff_t *pos)
-{
-	struct list_head *p;
-	loff_t n = *pos;
-
-	down_read(&tq_list_sem);
-	if (!n)
-		taskq_seq_show_headers(f);
-
-	p = tq_list.next;
-	while (n--) {
-		p = p->next;
-		if (p == &tq_list)
-		return (NULL);
-	}
-
-	return (list_entry(p, taskq_t, tq_taskqs));
-}
-
-static void *
-taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
-{
-	taskq_t *tq = p;
-
-	++*pos;
-	return ((tq->tq_taskqs.next == &tq_list) ?
-	    NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
-}
-
 static void
 slab_seq_show_headers(struct seq_file *f)
 {
@@ -501,66 +309,6 @@ static const kstat_proc_op_t proc_slab_operations = {
 #endif
 };
 
-static void
-taskq_seq_stop(struct seq_file *f, void *v)
-{
-	up_read(&tq_list_sem);
-}
-
-static const struct seq_operations taskq_all_seq_ops = {
-	.show	= taskq_all_seq_show,
-	.start	= taskq_seq_start,
-	.next	= taskq_seq_next,
-	.stop	= taskq_seq_stop,
-};
-
-static const struct seq_operations taskq_seq_ops = {
-	.show	= taskq_seq_show,
-	.start	= taskq_seq_start,
-	.next	= taskq_seq_next,
-	.stop	= taskq_seq_stop,
-};
-
-static int
-proc_taskq_all_open(struct inode *inode, struct file *filp)
-{
-	return (seq_open(filp, &taskq_all_seq_ops));
-}
-
-static int
-proc_taskq_open(struct inode *inode, struct file *filp)
-{
-	return (seq_open(filp, &taskq_seq_ops));
-}
-
-static const kstat_proc_op_t proc_taskq_all_operations = {
-#ifdef HAVE_PROC_OPS_STRUCT
-	.proc_open	= proc_taskq_all_open,
-	.proc_read	= seq_read,
-	.proc_lseek	= seq_lseek,
-	.proc_release	= seq_release,
-#else
-	.open		= proc_taskq_all_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-#endif
-};
-
-static const kstat_proc_op_t proc_taskq_operations = {
-#ifdef HAVE_PROC_OPS_STRUCT
-	.proc_open	= proc_taskq_open,
-	.proc_read	= seq_read,
-	.proc_lseek	= seq_lseek,
-	.proc_release	= seq_release,
-#else
-	.open		= proc_taskq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-#endif
-};
-
 static struct ctl_table spl_kmem_table[] = {
 #ifdef DEBUG_KMEM
 	{
@@ -677,8 +425,6 @@ static void spl_proc_cleanup(void)
 	remove_proc_entry("kstat", proc_spl);
 	remove_proc_entry("slab", proc_spl_kmem);
 	remove_proc_entry("kmem", proc_spl);
-	remove_proc_entry("taskq-all", proc_spl);
-	remove_proc_entry("taskq", proc_spl);
 	remove_proc_entry("spl", NULL);
 
 #ifndef HAVE_REGISTER_SYSCTL_TABLE
@@ -761,20 +507,6 @@ spl_proc_init(void)
 		goto out;
 	}
 
-	proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
-	    &proc_taskq_all_operations, NULL);
-	if (proc_spl_taskq_all == NULL) {
-		rc = -EUNATCH;
-		goto out;
-	}
-
-	proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
-	    &proc_taskq_operations, NULL);
-	if (proc_spl_taskq == NULL) {
-		rc = -EUNATCH;
-		goto out;
-	}
-
 	proc_spl_kmem = proc_mkdir("kmem", proc_spl);
 	if (proc_spl_kmem == NULL) {
 		rc = -EUNATCH;

From 8e6a9aabb1e4038b1893d5eba5ebc2318988bd9c Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Tue, 20 Aug 2024 01:30:57 +0500
Subject: [PATCH 27/59] linux/zvol_os.c: Fix max_discard_sectors limit for 6.8+
 kernel

In kernels 6.8 and later, the zvol block device is allocated with
qlimits passed during initialization. However, the zvol driver does not
set `max_hw_discard_sectors`, which is necessary to properly
initialize `max_discard_sectors`. This causes the `zvol_misc_trim` test
to fail on 6.8+ kernels when invoking the `blkdiscard` command. Setting
`max_hw_discard_sectors` in the `HAVE_BLK_ALLOC_DISK_2ARG` case resolves
the issue.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #16462
---
 module/os/linux/zfs/zvol_os.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index e04f64e232a..1ac079cc686 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1213,6 +1213,7 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits,
 	qlimits->io_opt = limits->zql_io_opt;
 	qlimits->physical_block_size = limits->zql_physical_block_size;
 	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
+	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
 	qlimits->discard_granularity = limits->zql_discard_granularity;
 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
 	qlimits->features =

From a2c4e95cfdf60e8350884ff77a0df00d5ecdd275 Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Tue, 20 Aug 2024 18:45:26 +0500
Subject: [PATCH 28/59] linux/zvol_os.c: cleanup limits for non-blk mq case

Rob Norris suggested that we could clean up redundant limits for the
non-blk mq case.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #16462
---
 module/os/linux/zfs/zvol_os.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 1ac079cc686..d1e3061b50e 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1252,7 +1252,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
-	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
 	struct queue_limits qlimits;
 	zvol_queue_limits_convert(limits, &qlimits);
@@ -1266,10 +1265,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
 
-#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
-	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE);
-#endif
-
 #else
 	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)

From bbe8512a93b0078c43fb5aa6f265059376647bc7 Mon Sep 17 00:00:00 2001
From: shodanshok <g.danti@assyoma.it>
Date: Wed, 21 Aug 2024 19:00:33 +0200
Subject: [PATCH 29/59] Ignore zfs_arc_shrinker_limit in direct reclaim mode

zfs_arc_shrinker_limit (default: 10000) avoids ARC collapse
due to excessive memory reclaim. However, when the kernel is
in direct reclaim mode (ie: low on memory), limiting ARC reclaim
increases OOM risk. This is especially true on systems without
(or with inadequate) swap.

This patch ignores zfs_arc_shrinker_limit when the kernel is in
direct reclaim mode, avoiding most OOMs. It also restores the
ability of "echo 3 > /proc/sys/vm/drop_caches" to correctly drop
(almost) all of the ARC.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Adam Moss <c@yotes.com>
Signed-off-by: Gionatan Danti <g.danti@assyoma.it>
Closes #16313
---
 man/man4/zfs.4               | 1 +
 module/os/linux/zfs/arc_os.c | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 2be3a8414aa..20bb95c1aee 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -838,6 +838,7 @@ This is a limit on how many pages the ARC shrinker makes available for
 eviction in response to one page allocation attempt.
 Note that in practice, the kernel's shrinker can ask us to evict
 up to about four times this for one allocation attempt.
+To reduce OOM risk, this limit is applied for kswapd reclaims only.
 .Pp
 The default limit of
 .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 75a9ea53225..c6b9cb2ddb3 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -201,9 +201,9 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
 	 * See also the comment above zfs_arc_shrinker_limit.
 	 */
 	int64_t can_free = btop(arc_evictable_memory());
-	int64_t limit = zfs_arc_shrinker_limit != 0 ?
-	    zfs_arc_shrinker_limit : INT64_MAX;
-	return (MIN(can_free, limit));
+	if (current_is_kswapd() && zfs_arc_shrinker_limit)
+		can_free = MIN(can_free, zfs_arc_shrinker_limit);
+	return (can_free);
 }
 
 static unsigned long

From b3f4e4e1ec930be85ebdf3c7d23f0be23800491c Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 7 Jun 2024 18:00:31 +1000
Subject: [PATCH 30/59] abd: remove ABD_FLAG_ZEROS

Nothing ever checks it.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16253
---
 include/sys/abd.h              | 3 +--
 module/os/freebsd/zfs/abd_os.c | 2 +-
 module/os/linux/zfs/abd_os.c   | 2 +-
 module/zfs/abd.c               | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/sys/abd.h b/include/sys/abd.h
index 7b7d84b528c..daa247e0cb1 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -44,8 +44,7 @@ typedef enum abd_flags {
 	ABD_FLAG_LINEAR_PAGE 	= 1 << 5, /* linear but allocd from page */
 	ABD_FLAG_GANG		= 1 << 6, /* mult ABDs chained together */
 	ABD_FLAG_GANG_FREE	= 1 << 7, /* gang ABD is responsible for mem */
-	ABD_FLAG_ZEROS		= 1 << 8, /* ABD for zero-filled buffer */
-	ABD_FLAG_ALLOCD		= 1 << 9, /* we allocated the abd_t */
+	ABD_FLAG_ALLOCD		= 1 << 8, /* we allocated the abd_t */
 } abd_flags_t;
 
 typedef struct abd {
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
index fb5c46ecf7c..ce8c30025f3 100644
--- a/module/os/freebsd/zfs/abd_os.c
+++ b/module/os/freebsd/zfs/abd_os.c
@@ -250,7 +250,7 @@ abd_alloc_zero_scatter(void)
 
 	n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
 	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
-	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
+	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
 	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
 
 	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index f7af20c619a..c4cc2ce01d6 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -509,7 +509,7 @@ abd_alloc_zero_scatter(void)
 	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
 	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
 	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
-	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
+	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 
 	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
 		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 94f492522f0..f1df6082f04 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -113,7 +113,7 @@ abd_verify(abd_t *abd)
 	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
 	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
 	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
-	    ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
+	    ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
 	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
 	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
 	if (abd_is_linear(abd)) {

From 2b7d9a786346f70799fdc043f2455b870e924330 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 4 Jun 2024 13:13:05 -0400
Subject: [PATCH 31/59] zio: no alloc canary in userspace

The canary makes it harder to use memory debuggers like valgrind
directly, because they can't see canary overruns.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16253
---
 module/zfs/zio.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 1f3acb9b921..73252c2da97 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -299,10 +299,13 @@ zio_fini(void)
  * ==========================================================================
  */
 
-#ifdef ZFS_DEBUG
-static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
+#if defined(ZFS_DEBUG) && defined(_KERNEL)
+#define	ZFS_ZIO_BUF_CANARY	1
 #endif
 
+#ifdef ZFS_ZIO_BUF_CANARY
+static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
+
 /*
  * Use empty space after the buffer to detect overflows.
  *
@@ -314,7 +317,6 @@ static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
 static void
 zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
-#ifdef ZFS_DEBUG
 	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
 	ulong_t *canary = p + off / sizeof (ulong_t);
 	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -323,13 +325,11 @@ zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 		asize = (c + 2) << SPA_MINBLOCKSHIFT;
 	for (; off < asize; canary++, off += sizeof (ulong_t))
 		*canary = zio_buf_canary;
-#endif
 }
 
 static void
 zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
-#ifdef ZFS_DEBUG
 	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
 	ulong_t *canary = p + off / sizeof (ulong_t);
 	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -343,8 +343,8 @@ zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 			    *canary, zio_buf_canary);
 		}
 	}
-#endif
 }
+#endif
 
 /*
  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
@@ -363,7 +363,9 @@ zio_buf_alloc(size_t size)
 #endif
 
 	void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
+#ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_put_canary(p, size, zio_buf_cache, c);
+#endif
 	return (p);
 }
 
@@ -381,7 +383,9 @@ zio_data_buf_alloc(size_t size)
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
+#ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_put_canary(p, size, zio_data_buf_cache, c);
+#endif
 	return (p);
 }
 
@@ -395,7 +399,9 @@ zio_buf_free(void *buf, size_t size)
 	atomic_add_64(&zio_buf_cache_frees[c], 1);
 #endif
 
+#ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_check_canary(buf, size, zio_buf_cache, c);
+#endif
 	kmem_cache_free(zio_buf_cache[c], buf);
 }
 
@@ -406,7 +412,9 @@ zio_data_buf_free(void *buf, size_t size)
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
+#ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
+#endif
 	kmem_cache_free(zio_data_buf_cache[c], buf);
 }
 

From 7a5b4355e2e3b3cdedcc75300323db35c98e78df Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 25 Dec 2023 22:25:48 +1100
Subject: [PATCH 32/59] abd_os: split userspace and Linux kernel code

The Linux abd_os.c serves double-duty as the userspace scatter abd
implementation, by carrying an emulation of kernel scatterlists. This
commit lifts common and userspace-specific parts out into a separate
abd_os.c for libzpool.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16253
---
 include/sys/abd.h            |   2 +
 lib/libzpool/Makefile.am     |   2 +-
 lib/libzpool/abd_os.c        | 492 +++++++++++++++++++++++++++++++++++
 module/os/linux/zfs/abd_os.c | 151 +----------
 4 files changed, 498 insertions(+), 149 deletions(-)
 create mode 100644 lib/libzpool/abd_os.c

diff --git a/include/sys/abd.h b/include/sys/abd.h
index daa247e0cb1..ed008465c89 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -68,7 +68,9 @@ typedef struct abd {
 		} abd_scatter;
 		struct abd_linear {
 			void		*abd_buf;
+#if defined(__linux__) && defined(_KERNEL)
 			struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
+#endif
 		} abd_linear;
 		struct abd_gang {
 			list_t abd_gang_chain;
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 070dc0132f2..eb0dd0ace1f 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -9,6 +9,7 @@ lib_LTLIBRARIES += libzpool.la
 CPPCHECKTARGETS += libzpool.la
 
 dist_libzpool_la_SOURCES = \
+	%D%/abd_os.c \
 	%D%/kernel.c \
 	%D%/taskq.c \
 	%D%/util.c
@@ -39,7 +40,6 @@ nodist_libzpool_la_SOURCES = \
 	module/lua/lvm.c \
 	module/lua/lzio.c \
 	\
-	module/os/linux/zfs/abd_os.c \
 	module/os/linux/zfs/arc_os.c \
 	module/os/linux/zfs/trace.c \
 	module/os/linux/zfs/vdev_file.c \
diff --git a/lib/libzpool/abd_os.c b/lib/libzpool/abd_os.c
new file mode 100644
index 00000000000..de93f99a556
--- /dev/null
+++ b/lib/libzpool/abd_os.c
@@ -0,0 +1,492 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+/*
+ * See abd.c for a general overview of the arc buffered data (ABD).
+ *
+ * Linear buffers act exactly like normal buffers and are always mapped into the
+ * kernel's virtual memory space, while scattered ABD data chunks are allocated
+ * as physical pages and then mapped in only while they are actually being
+ * accessed through one of the abd_* library functions. Using scattered ABDs
+ * provides several benefits:
+ *
+ *  (1) They avoid use of kmem_*, preventing performance problems where running
+ *      kmem_reap on very large memory systems never finishes and causes
+ *      constant TLB shootdowns.
+ *
+ *  (2) Fragmentation is less of an issue since when we are at the limit of
+ *      allocatable space, we won't have to search around for a long free
+ *      hole in the VA space for large ARC allocations. Each chunk is mapped in
+ *      individually, so even if we are using HIGHMEM (see next point) we
+ *      wouldn't need to worry about finding a contiguous address range.
+ *
+ *  (3) If we are not using HIGHMEM, then all physical memory is always
+ *      mapped into the kernel's address space, so we also avoid the map /
+ *      unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space.  See abd_alloc_chunks() for details.
+ */
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+
+#define	abd_for_each_sg(abd, sg, n, i)	\
+	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+
+/*
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's.  Smaller allocations will use linear ABD's, which use
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page).  Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations.  This can
+ * improve memory efficiency, but it also makes it much harder for ARC
+ * evictions to actually free pages, because all the buffers on one slab need
+ * to be freed in order for the slab (and underlying pages) to be freed.
+ * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
+ * possible for them to actually waste more memory than scatter (one page per
+ * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
+ *
+ * Spill blocks are typically 512B and are heavily used on systems running
+ * selinux with the default dnode size and the `xattr=sa` property set.
+ *
+ * By default we use linear allocations for 512B and 1KB, and scatter
+ * allocations for larger (1.5KB and up).
+ */
+static int zfs_abd_scatter_min_size = 512 * 3;
+
+/*
+ * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
+ * just a single zero'd page. This allows us to conserve memory by
+ * only using a single zero page for the scatterlist.
+ */
+abd_t *abd_zero_scatter = NULL;
+
+struct page;
+/*
+ * abd_zero_page will be allocated with a zero'ed PAGESIZE buffer, which is
+ * assigned to each of the pages of abd_zero_scatter.
+ */
+static struct page *abd_zero_page = NULL;
+
+static kmem_cache_t *abd_cache = NULL;
+
+static uint_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
+}
+
+abd_t *
+abd_alloc_struct_impl(size_t size)
+{
+	/*
+	 * In Linux we do not use the size passed in during ABD
+	 * allocation, so we just ignore it.
+	 */
+	(void) size;
+	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+
+	return (abd);
+}
+
+void
+abd_free_struct_impl(abd_t *abd)
+{
+	kmem_cache_free(abd_cache, abd);
+}
+
+#define	nth_page(pg, i) \
+	((struct page *)((void *)(pg) + (i) * PAGESIZE))
+
+struct scatterlist {
+	struct page *page;
+	int length;
+	int end;
+};
+
+static void
+sg_init_table(struct scatterlist *sg, int nr)
+{
+	memset(sg, 0, nr * sizeof (struct scatterlist));
+	sg[nr - 1].end = 1;
+}
+
+/*
+ * This must be called if any of the sg_table allocation functions
+ * are called.
+ */
+static void
+abd_free_sg_table(abd_t *abd)
+{
+	int nents = ABD_SCATTER(abd).abd_nents;
+	vmem_free(ABD_SCATTER(abd).abd_sgl,
+	    nents * sizeof (struct scatterlist));
+}
+
+#define	for_each_sg(sgl, sg, nr, i)	\
+	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
+
+static inline void
+sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
+    unsigned int offset)
+{
+	/* currently we don't use offset */
+	ASSERT(offset == 0);
+	sg->page = page;
+	sg->length = len;
+}
+
+static inline struct page *
+sg_page(struct scatterlist *sg)
+{
+	return (sg->page);
+}
+
+static inline struct scatterlist *
+sg_next(struct scatterlist *sg)
+{
+	if (sg->end)
+		return (NULL);
+
+	return (sg + 1);
+}
+
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
+	struct scatterlist *sg;
+	int i;
+
+	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
+	    sizeof (struct scatterlist), KM_SLEEP);
+	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
+
+	abd_for_each_sg(abd, sg, nr_pages, i) {
+		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+		sg_set_page(sg, p, PAGESIZE, 0);
+	}
+	ABD_SCATTER(abd).abd_nents = nr_pages;
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+	int i, n = ABD_SCATTER(abd).abd_nents;
+	struct scatterlist *sg;
+
+	abd_for_each_sg(abd, sg, n, i) {
+		struct page *p = nth_page(sg_page(sg), 0);
+		umem_free_aligned(p, PAGESIZE);
+	}
+	abd_free_sg_table(abd);
+}
+
+static void
+abd_alloc_zero_scatter(void)
+{
+	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+	struct scatterlist *sg;
+	int i;
+
+	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+	memset(abd_zero_page, 0, PAGESIZE);
+	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
+	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
+	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
+	    sizeof (struct scatterlist), KM_SLEEP);
+
+	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
+
+	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
+		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
+	}
+}
+
+boolean_t
+abd_size_alloc_linear(size_t size)
+{
+	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
+}
+
+void
+abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
+{
+	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
+	if (op == ABDSTAT_INCR) {
+		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+	} else {
+		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+	}
+}
+
+void
+abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
+{
+	(void) abd;
+	(void) op;
+	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+}
+
+void
+abd_verify_scatter(abd_t *abd)
+{
+	size_t n;
+	int i = 0;
+	struct scatterlist *sg = NULL;
+
+	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
+	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+	    ABD_SCATTER(abd).abd_sgl->length);
+	n = ABD_SCATTER(abd).abd_nents;
+	abd_for_each_sg(abd, sg, n, i) {
+		ASSERT3P(sg_page(sg), !=, NULL);
+	}
+}
+
+static void
+abd_free_zero_scatter(void)
+{
+	abd_free_sg_table(abd_zero_scatter);
+	abd_free_struct(abd_zero_scatter);
+	abd_zero_scatter = NULL;
+	ASSERT3P(abd_zero_page, !=, NULL);
+	umem_free_aligned(abd_zero_page, PAGESIZE);
+}
+
+void
+abd_init(void)
+{
+	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	abd_alloc_zero_scatter();
+}
+
+void
+abd_fini(void)
+{
+	abd_free_zero_scatter();
+
+	if (abd_cache) {
+		kmem_cache_destroy(abd_cache);
+		abd_cache = NULL;
+	}
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+	(void) abd;
+	__builtin_unreachable();
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * On Linux the optimal thing to do would be to use abd_get_offset() and
+ * construct a new ABD which shares the original pages thereby eliminating
+ * the copy.  But for the moment a new linear ABD is allocated until this
+ * performance optimization can be implemented.
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc(size, is_metadata));
+}
+
+abd_t *
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
+    size_t size)
+{
+	(void) size;
+	int i = 0;
+	struct scatterlist *sg = NULL;
+
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+
+	if (abd == NULL)
+		abd = abd_alloc_struct(0);
+
+	/*
+	 * Even if this buf is filesystem metadata, we only track that
+	 * if we own the underlying data buffer, which is not true in
+	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+	 */
+
+	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
+		if (new_offset < sg->length)
+			break;
+		new_offset -= sg->length;
+	}
+
+	ABD_SCATTER(abd).abd_sgl = sg;
+	ABD_SCATTER(abd).abd_offset = new_offset;
+	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+
+	return (abd);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+	ASSERT(!abd_is_gang(abd));
+	abd_verify(abd);
+	memset(aiter, 0, sizeof (struct abd_iter));
+	aiter->iter_abd = abd;
+	if (!abd_is_linear(abd)) {
+		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
+		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
+	}
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
+	return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already exhausted, in
+ * which case this does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	/*
+	 * Ensure that last chunk is not in use. abd_iterate_*() must clear
+	 * this state (directly or abd_iter_unmap()) before advancing.
+	 */
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+	ASSERT3P(aiter->iter_page, ==, NULL);
+	ASSERT0(aiter->iter_page_doff);
+	ASSERT0(aiter->iter_page_dsize);
+
+	/* There's nothing left to advance to, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	aiter->iter_pos += amount;
+	aiter->iter_offset += amount;
+	if (!abd_is_linear(aiter->iter_abd)) {
+		while (aiter->iter_offset >= aiter->iter_sg->length) {
+			aiter->iter_offset -= aiter->iter_sg->length;
+			aiter->iter_sg = sg_next(aiter->iter_sg);
+			if (aiter->iter_sg == NULL) {
+				ASSERT0(aiter->iter_offset);
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+	void *paddr;
+	size_t offset = 0;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to iterate over, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	if (abd_is_linear(aiter->iter_abd)) {
+		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+		offset = aiter->iter_offset;
+		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+	} else {
+		offset = aiter->iter_offset;
+		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
+		    aiter->iter_abd->abd_size - aiter->iter_pos);
+
+		paddr = sg_page(aiter->iter_sg);
+	}
+
+	aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+	/* There's nothing left to unmap, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+	ASSERT3U(aiter->iter_mapsize, >, 0);
+
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+void
+abd_cache_reap_now(void)
+{
+}
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index c4cc2ce01d6..60287ccdda9 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -58,22 +58,16 @@
 #include <sys/arc.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_znode.h>
-#ifdef _KERNEL
 #include <linux/kmap_compat.h>
 #include <linux/mm_compat.h>
 #include <linux/scatterlist.h>
 #include <linux/version.h>
-#endif
 
-#ifdef _KERNEL
 #if defined(MAX_ORDER)
 #define	ABD_MAX_ORDER	(MAX_ORDER)
 #elif defined(MAX_PAGE_ORDER)
 #define	ABD_MAX_ORDER	(MAX_PAGE_ORDER)
 #endif
-#else
-#define	ABD_MAX_ORDER	(1)
-#endif
 
 typedef struct abd_stats {
 	kstat_named_t abdstat_struct_size;
@@ -193,11 +187,9 @@ abd_t *abd_zero_scatter = NULL;
 
 struct page;
 /*
- * _KERNEL   - Will point to ZERO_PAGE if it is available or it will be
- *             an allocated zero'd PAGESIZE buffer.
- * Userspace - Will be an allocated zero'ed PAGESIZE buffer.
- *
- * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
+ * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
+ * point to ZERO_PAGE if it is available or it will be an allocated zero'd
+ * PAGESIZE buffer.
  */
 static struct page *abd_zero_page = NULL;
 
@@ -232,7 +224,6 @@ abd_free_struct_impl(abd_t *abd)
 	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
 }
 
-#ifdef _KERNEL
 static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
 
 /*
@@ -520,134 +511,6 @@ abd_alloc_zero_scatter(void)
 	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 }
 
-#else /* _KERNEL */
-
-#ifndef PAGE_SHIFT
-#define	PAGE_SHIFT (highbit64(PAGESIZE)-1)
-#endif
-
-#define	zfs_kmap_local(chunk)		((void *)chunk)
-#define	zfs_kunmap_local(addr)		do { (void)(addr); } while (0)
-#define	local_irq_save(flags)		do { (void)(flags); } while (0)
-#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
-#define	nth_page(pg, i) \
-	((struct page *)((void *)(pg) + (i) * PAGESIZE))
-
-struct scatterlist {
-	struct page *page;
-	int length;
-	int end;
-};
-
-static void
-sg_init_table(struct scatterlist *sg, int nr)
-{
-	memset(sg, 0, nr * sizeof (struct scatterlist));
-	sg[nr - 1].end = 1;
-}
-
-/*
- * This must be called if any of the sg_table allocation functions
- * are called.
- */
-static void
-abd_free_sg_table(abd_t *abd)
-{
-	int nents = ABD_SCATTER(abd).abd_nents;
-	vmem_free(ABD_SCATTER(abd).abd_sgl,
-	    nents * sizeof (struct scatterlist));
-}
-
-#define	for_each_sg(sgl, sg, nr, i)	\
-	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
-
-static inline void
-sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
-    unsigned int offset)
-{
-	/* currently we don't use offset */
-	ASSERT(offset == 0);
-	sg->page = page;
-	sg->length = len;
-}
-
-static inline struct page *
-sg_page(struct scatterlist *sg)
-{
-	return (sg->page);
-}
-
-static inline struct scatterlist *
-sg_next(struct scatterlist *sg)
-{
-	if (sg->end)
-		return (NULL);
-
-	return (sg + 1);
-}
-
-void
-abd_alloc_chunks(abd_t *abd, size_t size)
-{
-	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
-	struct scatterlist *sg;
-	int i;
-
-	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
-	    sizeof (struct scatterlist), KM_SLEEP);
-	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
-
-	abd_for_each_sg(abd, sg, nr_pages, i) {
-		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
-		sg_set_page(sg, p, PAGESIZE, 0);
-	}
-	ABD_SCATTER(abd).abd_nents = nr_pages;
-}
-
-void
-abd_free_chunks(abd_t *abd)
-{
-	int i, n = ABD_SCATTER(abd).abd_nents;
-	struct scatterlist *sg;
-
-	abd_for_each_sg(abd, sg, n, i) {
-		struct page *p = nth_page(sg_page(sg), 0);
-		umem_free_aligned(p, PAGESIZE);
-	}
-	abd_free_sg_table(abd);
-}
-
-static void
-abd_alloc_zero_scatter(void)
-{
-	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
-	struct scatterlist *sg;
-	int i;
-
-	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
-	memset(abd_zero_page, 0, PAGESIZE);
-	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
-	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
-	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
-	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
-	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
-	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
-	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
-	    sizeof (struct scatterlist), KM_SLEEP);
-
-	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
-
-	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
-		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
-	}
-
-	ABDSTAT_BUMP(abdstat_scatter_cnt);
-	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
-	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
-}
-
-#endif /* _KERNEL */
-
 boolean_t
 abd_size_alloc_linear(size_t size)
 {
@@ -712,14 +575,10 @@ abd_free_zero_scatter(void)
 	abd_free_struct(abd_zero_scatter);
 	abd_zero_scatter = NULL;
 	ASSERT3P(abd_zero_page, !=, NULL);
-#if defined(_KERNEL)
 #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
 	abd_unmark_zfs_page(abd_zero_page);
 	__free_page(abd_zero_page);
 #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
-#else
-	umem_free_aligned(abd_zero_page, PAGESIZE);
-#endif /* _KERNEL */
 }
 
 static int
@@ -1014,8 +873,6 @@ abd_cache_reap_now(void)
 {
 }
 
-#if defined(_KERNEL)
-
 /*
  * This is abd_iter_page(), the function underneath abd_iterate_page_func().
  * It yields the next page struct and data offset and size within it, without
@@ -1297,5 +1154,3 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
 module_param(zfs_abd_scatter_max_order, uint, 0644);
 MODULE_PARM_DESC(zfs_abd_scatter_max_order,
 	"Maximum order allocation used for a scatter ABD.");
-
-#endif /* _KERNEL */

From 5b9e69539249bb823de65c182dd225e8edaf408b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 1 Apr 2024 15:37:34 +1100
Subject: [PATCH 33/59] abd_os: break out platform-specific header parts

Removing the platform #ifdefs from shared headers in favour of
per-platform headers. Makes abd_t much leaner, among other things.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16253
---
 config/Rules.am                          |  3 +-
 include/os/freebsd/Makefile.am           |  2 +
 include/os/freebsd/zfs/sys/abd_impl_os.h | 41 ++++++++++++++++
 include/os/freebsd/zfs/sys/abd_os.h      | 46 ++++++++++++++++++
 include/os/linux/Makefile.am             |  2 +
 include/os/linux/zfs/sys/abd_impl_os.h   | 41 ++++++++++++++++
 include/os/linux/zfs/sys/abd_os.h        | 62 ++++++++++++++++++++++++
 include/sys/abd.h                        | 35 ++-----------
 include/sys/abd_impl.h                   | 14 +-----
 lib/libzpool/Makefile.am                 |  2 +
 lib/libzpool/include/Makefile.am         |  4 ++
 lib/libzpool/include/sys/abd_impl_os.h   | 41 ++++++++++++++++
 lib/libzpool/include/sys/abd_os.h        | 47 ++++++++++++++++++
 module/os/freebsd/zfs/abd_os.c           |  2 -
 14 files changed, 294 insertions(+), 48 deletions(-)
 create mode 100644 include/os/freebsd/zfs/sys/abd_impl_os.h
 create mode 100644 include/os/freebsd/zfs/sys/abd_os.h
 create mode 100644 include/os/linux/zfs/sys/abd_impl_os.h
 create mode 100644 include/os/linux/zfs/sys/abd_os.h
 create mode 100644 lib/libzpool/include/Makefile.am
 create mode 100644 lib/libzpool/include/sys/abd_impl_os.h
 create mode 100644 lib/libzpool/include/sys/abd_os.h

diff --git a/config/Rules.am b/config/Rules.am
index 00ac890e230..b462826e2c8 100644
--- a/config/Rules.am
+++ b/config/Rules.am
@@ -10,7 +10,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/module/icp/include \
 	-I$(top_srcdir)/lib/libspl/include \
-	-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@
+	-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ \
+	-I$(top_srcdir)/lib/libzpool/include
 
 AM_LIBTOOLFLAGS = --silent
 
diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am
index 292f79b8ce7..d975c4fe69f 100644
--- a/include/os/freebsd/Makefile.am
+++ b/include/os/freebsd/Makefile.am
@@ -77,6 +77,8 @@ noinst_HEADERS = \
 	%D%/spl/sys/zmod.h \
 	%D%/spl/sys/zone.h \
 	\
+	%D%/zfs/sys/abd_os.h \
+	%D%/zfs/sys/abd_impl_os.h \
 	%D%/zfs/sys/arc_os.h \
 	%D%/zfs/sys/freebsd_crypto.h \
 	%D%/zfs/sys/freebsd_event.h \
diff --git a/include/os/freebsd/zfs/sys/abd_impl_os.h b/include/os/freebsd/zfs/sys/abd_impl_os.h
new file mode 100644
index 00000000000..309e77110d3
--- /dev/null
+++ b/include/os/freebsd/zfs/sys/abd_impl_os.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#ifndef _ABD_IMPL_OS_H
+#define	_ABD_IMPL_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	abd_enter_critical(flags)	critical_enter()
+#define	abd_exit_critical(flags)	critical_exit()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_IMPL_OS_H */
diff --git a/include/os/freebsd/zfs/sys/abd_os.h b/include/os/freebsd/zfs/sys/abd_os.h
new file mode 100644
index 00000000000..57122ee83e8
--- /dev/null
+++ b/include/os/freebsd/zfs/sys/abd_os.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_OS_H
+#define	_ABD_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct abd_scatter {
+	uint_t		abd_offset;
+	void		*abd_chunks[1]; /* actually variable-length */
+};
+
+struct abd_linear {
+	void		*abd_buf;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_OS_H */
diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
index f31ae50b96a..9100aebb541 100644
--- a/include/os/linux/Makefile.am
+++ b/include/os/linux/Makefile.am
@@ -20,6 +20,8 @@ kernel_linux_HEADERS = \
 
 kernel_sysdir = $(kerneldir)/sys
 kernel_sys_HEADERS = \
+	%D%/zfs/sys/abd_os.h \
+	%D%/zfs/sys/abd_impl_os.h \
 	%D%/zfs/sys/policy.h \
 	%D%/zfs/sys/trace_acl.h \
 	%D%/zfs/sys/trace_arc.h \
diff --git a/include/os/linux/zfs/sys/abd_impl_os.h b/include/os/linux/zfs/sys/abd_impl_os.h
new file mode 100644
index 00000000000..8192522cd22
--- /dev/null
+++ b/include/os/linux/zfs/sys/abd_impl_os.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#ifndef _ABD_IMPL_OS_H
+#define	_ABD_IMPL_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	abd_enter_critical(flags)	local_irq_save(flags)
+#define	abd_exit_critical(flags)	local_irq_restore(flags)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_IMPL_OS_H */
diff --git a/include/os/linux/zfs/sys/abd_os.h b/include/os/linux/zfs/sys/abd_os.h
new file mode 100644
index 00000000000..ce4f5a2bdf9
--- /dev/null
+++ b/include/os/linux/zfs/sys/abd_os.h
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_OS_H
+#define	_ABD_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct abd_scatter {
+	uint_t		abd_offset;
+	uint_t		abd_nents;
+	struct scatterlist *abd_sgl;
+};
+
+struct abd_linear {
+	void		*abd_buf;
+	struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
+};
+
+typedef struct abd abd_t;
+
+typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
+int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
+    void *);
+
+/*
+ * Linux ABD bio functions
+ * Note: these are only needed to support vdev_classic. See comment in
+ * vdev_disk.c.
+ */
+unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
+unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_OS_H */
diff --git a/include/sys/abd.h b/include/sys/abd.h
index ed008465c89..67bf5e802c8 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -30,6 +30,7 @@
 #include <sys/debug.h>
 #include <sys/zfs_refcount.h>
 #include <sys/uio.h>
+#include <sys/abd_os.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -57,21 +58,8 @@ typedef struct abd {
 #endif
 	kmutex_t	abd_mtx;
 	union {
-		struct abd_scatter {
-			uint_t		abd_offset;
-#if defined(__FreeBSD__) && defined(_KERNEL)
-			void    *abd_chunks[1]; /* actually variable-length */
-#else
-			uint_t		abd_nents;
-			struct scatterlist *abd_sgl;
-#endif
-		} abd_scatter;
-		struct abd_linear {
-			void		*abd_buf;
-#if defined(__linux__) && defined(_KERNEL)
-			struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
-#endif
-		} abd_linear;
+		struct abd_scatter	abd_scatter;
+		struct abd_linear	abd_linear;
 		struct abd_gang {
 			list_t abd_gang_chain;
 		} abd_gang;
@@ -80,9 +68,6 @@ typedef struct abd {
 
 typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
 typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
-#if defined(__linux__) && defined(_KERNEL)
-typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
-#endif
 
 extern int zfs_abd_scatter_enabled;
 
@@ -129,10 +114,6 @@ void abd_release_ownership_of_buf(abd_t *);
 int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
 int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
     abd_iter_func2_t *, void *);
-#if defined(__linux__) && defined(_KERNEL)
-int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
-    void *);
-#endif
 void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
 void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
 void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
@@ -226,16 +207,6 @@ abd_get_size(abd_t *abd)
 void abd_init(void);
 void abd_fini(void);
 
-/*
- * Linux ABD bio functions
- * Note: these are only needed to support vdev_classic. See comment in
- * vdev_disk.c.
- */
-#if defined(__linux__) && defined(_KERNEL)
-unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
-unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
index f88ea25e245..1eb25d94adc 100644
--- a/include/sys/abd_impl.h
+++ b/include/sys/abd_impl.h
@@ -28,6 +28,7 @@
 #define	_ABD_IMPL_H
 
 #include <sys/abd.h>
+#include <sys/abd_impl_os.h>
 #include <sys/wmsum.h>
 
 #ifdef __cplusplus
@@ -111,19 +112,6 @@ void abd_iter_page(struct abd_iter *);
 #define	ABD_LINEAR_BUF(abd)	(abd->abd_u.abd_linear.abd_buf)
 #define	ABD_GANG(abd)		(abd->abd_u.abd_gang)
 
-#if defined(_KERNEL)
-#if defined(__FreeBSD__)
-#define	abd_enter_critical(flags)	critical_enter()
-#define	abd_exit_critical(flags)	critical_exit()
-#else
-#define	abd_enter_critical(flags)	local_irq_save(flags)
-#define	abd_exit_critical(flags)	local_irq_restore(flags)
-#endif
-#else /* !_KERNEL */
-#define	abd_enter_critical(flags)	((void)0)
-#define	abd_exit_critical(flags)	((void)0)
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index eb0dd0ace1f..6989fefc666 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -1,3 +1,5 @@
+include $(srcdir)/%D%/include/Makefile.am
+
 libzpool_la_CFLAGS  = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS)
 libzpool_la_CFLAGS += $(ZLIB_CFLAGS)
 
diff --git a/lib/libzpool/include/Makefile.am b/lib/libzpool/include/Makefile.am
new file mode 100644
index 00000000000..2e0c4c5610b
--- /dev/null
+++ b/lib/libzpool/include/Makefile.am
@@ -0,0 +1,4 @@
+libzpooldir = $(includedir)/libzpool
+libzpool_HEADERS = \
+	%D%/sys/abd_os.h \
+	%D%/sys/abd_impl_os.h
diff --git a/lib/libzpool/include/sys/abd_impl_os.h b/lib/libzpool/include/sys/abd_impl_os.h
new file mode 100644
index 00000000000..3137346f3bb
--- /dev/null
+++ b/lib/libzpool/include/sys/abd_impl_os.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#ifndef _ABD_IMPL_OS_H
+#define	_ABD_IMPL_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	abd_enter_critical(flags)	((void)0)
+#define	abd_exit_critical(flags)	((void)0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_IMPL_OS_H */
diff --git a/lib/libzpool/include/sys/abd_os.h b/lib/libzpool/include/sys/abd_os.h
new file mode 100644
index 00000000000..67f7e5606be
--- /dev/null
+++ b/lib/libzpool/include/sys/abd_os.h
@@ -0,0 +1,47 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_OS_H
+#define	_ABD_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct abd_scatter {
+	uint_t		abd_offset;
+	uint_t		abd_nents;
+	struct scatterlist *abd_sgl;
+};
+
+struct abd_linear {
+	void		*abd_buf;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_OS_H */
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
index ce8c30025f3..f24ea3dc768 100644
--- a/module/os/freebsd/zfs/abd_os.c
+++ b/module/os/freebsd/zfs/abd_os.c
@@ -95,14 +95,12 @@ struct {
  */
 static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;
 
-#if defined(_KERNEL)
 SYSCTL_DECL(_vfs_zfs);
 
 SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
 	&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
 SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
 	&zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
-#endif
 
 kmem_cache_t *abd_chunk_cache;
 static kstat_t *abd_ksp;

From b69bebb535572ef905b065182d8c80d2fff5a8b4 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sun, 21 Apr 2024 16:37:06 +1000
Subject: [PATCH 34/59] libzpool/abd_os: iovec-based scatter abd

This is intended to be a simple userspace scatter abd based on struct
iovec. It's not very sophisticated as-is, but sets a base for something
much more interesting.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16253
---
 lib/libzpool/abd_os.c             | 471 +++++++++++-------------------
 lib/libzpool/include/sys/abd_os.h |   4 +-
 2 files changed, 174 insertions(+), 301 deletions(-)

diff --git a/lib/libzpool/abd_os.c b/lib/libzpool/abd_os.c
index de93f99a556..5a91605b2fe 100644
--- a/lib/libzpool/abd_os.c
+++ b/lib/libzpool/abd_os.c
@@ -24,34 +24,6 @@
  * Copyright (c) 2023, 2024, Klara Inc.
  */
 
-/*
- * See abd.c for a general overview of the arc buffered data (ABD).
- *
- * Linear buffers act exactly like normal buffers and are always mapped into the
- * kernel's virtual memory space, while scattered ABD data chunks are allocated
- * as physical pages and then mapped in only while they are actually being
- * accessed through one of the abd_* library functions. Using scattered ABDs
- * provides several benefits:
- *
- *  (1) They avoid use of kmem_*, preventing performance problems where running
- *      kmem_reap on very large memory systems never finishes and causes
- *      constant TLB shootdowns.
- *
- *  (2) Fragmentation is less of an issue since when we are at the limit of
- *      allocatable space, we won't have to search around for a long free
- *      hole in the VA space for large ARC allocations. Each chunk is mapped in
- *      individually, so even if we are using HIGHMEM (see next point) we
- *      wouldn't need to worry about finding a contiguous address range.
- *
- *  (3) If we are not using HIGHMEM, then all physical memory is always
- *      mapped into the kernel's address space, so we also avoid the map /
- *      unmap costs on each ABD access.
- *
- * If we are not using HIGHMEM, scattered buffers which have only one chunk
- * can be treated as linear buffers, because they are contiguous in the
- * kernel's virtual address space.  See abd_alloc_chunks() for details.
- */
-
 #include <sys/abd_impl.h>
 #include <sys/param.h>
 #include <sys/zio.h>
@@ -59,199 +31,112 @@
 #include <sys/zfs_context.h>
 #include <sys/zfs_znode.h>
 
-
-#define	abd_for_each_sg(abd, sg, n, i)	\
-	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+/*
+ * We're simulating scatter/gather with 4K allocations, since that's more like
+ * what a typical kernel does.
+ */
+#define	ABD_PAGESIZE	(4096)
+#define	ABD_PAGESHIFT	(12)
+#define	ABD_PAGEMASK	(ABD_PAGESIZE-1)
 
 /*
- * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
- * ABD's.  Smaller allocations will use linear ABD's which uses
- * zio_[data_]buf_alloc().
- *
- * Scatter ABD's use at least one page each, so sub-page allocations waste
- * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
- * half of each page).  Using linear ABD's for small allocations means that
- * they will be put on slabs which contain many allocations.  This can
- * improve memory efficiency, but it also makes it much harder for ARC
- * evictions to actually free pages, because all the buffers on one slab need
- * to be freed in order for the slab (and underlying pages) to be freed.
- * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
- * possible for them to actually waste more memory than scatter (one page per
- * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
- *
- * Spill blocks are typically 512B and are heavily used on systems running
- * selinux with the default dnode size and the `xattr=sa` property set.
- *
- * By default we use linear allocations for 512B and 1KB, and scatter
- * allocations for larger (1.5KB and up).
+ * See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is
+ * mostly useful to get a mix of linear and scatter ABDs for testing.
  */
-static int zfs_abd_scatter_min_size = 512 * 3;
+#define	ABD_SCATTER_MIN_SIZE	(512 * 3)
 
-/*
- * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
- * just a single zero'd page. This allows us to conserve memory by
- * only using a single zero page for the scatterlist.
- */
 abd_t *abd_zero_scatter = NULL;
 
-struct page;
-/*
- * abd_zero_page will be allocated with a zero'ed PAGESIZE buffer, which is
- * assigned to each of the pages of abd_zero_scatter.
- */
-static struct page *abd_zero_page = NULL;
-
-static kmem_cache_t *abd_cache = NULL;
-
 static uint_t
-abd_chunkcnt_for_bytes(size_t size)
+abd_iovcnt_for_bytes(size_t size)
 {
-	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
+	/*
+	 * Each iovec points to a 4K page. There's no real reason to do this
+	 * in userspace, but our whole point here is to make it feel a bit
+	 * more like a real paged memory model.
+	 */
+	return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE);
 }
 
 abd_t *
 abd_alloc_struct_impl(size_t size)
 {
 	/*
-	 * In Linux we do not use the size passed in during ABD
-	 * allocation, so we just ignore it.
+	 * Zero-sized means it will be used for a linear or gang abd, so just
+	 * allocate the abd itself and return.
 	 */
-	(void) size;
-	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
-	ASSERT3P(abd, !=, NULL);
+	if (size == 0)
+		return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL));
 
+	/*
+	 * Allocating for a scatter abd, so compute how many ABD_PAGESIZE
+	 * iovecs we will need to hold this size. Append that allocation to the
+	 * end. Note that struct abd_scatter includes abd_iov[1], so we
+	 * allocate one less iovec than we need.
+	 *
+	 * Note we're not allocating the pages proper, just the iovec pointers.
+	 * That's down in abd_alloc_chunks. We _could_ do it here in a single
+	 * allocation, but it's fiddly and harder to read for no real gain.
+	 */
+	uint_t n = abd_iovcnt_for_bytes(size);
+	abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec),
+	    UMEM_NOFAIL);
+	ABD_SCATTER(abd).abd_offset = 0;
+	ABD_SCATTER(abd).abd_iovcnt = n;
 	return (abd);
 }
 
 void
 abd_free_struct_impl(abd_t *abd)
 {
-	kmem_cache_free(abd_cache, abd);
-}
-
-#define	nth_page(pg, i) \
-	((struct page *)((void *)(pg) + (i) * PAGESIZE))
-
-struct scatterlist {
-	struct page *page;
-	int length;
-	int end;
-};
-
-static void
-sg_init_table(struct scatterlist *sg, int nr)
-{
-	memset(sg, 0, nr * sizeof (struct scatterlist));
-	sg[nr - 1].end = 1;
-}
-
-/*
- * This must be called if any of the sg_table allocation functions
- * are called.
- */
-static void
-abd_free_sg_table(abd_t *abd)
-{
-	int nents = ABD_SCATTER(abd).abd_nents;
-	vmem_free(ABD_SCATTER(abd).abd_sgl,
-	    nents * sizeof (struct scatterlist));
-}
-
-#define	for_each_sg(sgl, sg, nr, i)	\
-	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
-
-static inline void
-sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
-    unsigned int offset)
-{
-	/* currently we don't use offset */
-	ASSERT(offset == 0);
-	sg->page = page;
-	sg->length = len;
-}
-
-static inline struct page *
-sg_page(struct scatterlist *sg)
-{
-	return (sg->page);
-}
-
-static inline struct scatterlist *
-sg_next(struct scatterlist *sg)
-{
-	if (sg->end)
-		return (NULL);
-
-	return (sg + 1);
+	/* For scatter, compute the extra amount we need to free */
+	uint_t iovcnt =
+	    abd_is_linear(abd) || abd_is_gang(abd) ?
+	    0 : (ABD_SCATTER(abd).abd_iovcnt - 1);
+	umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec));
 }
 
 void
 abd_alloc_chunks(abd_t *abd, size_t size)
 {
-	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
-	struct scatterlist *sg;
-	int i;
+	/*
+	 * We've already allocated the iovec array; ensure that the wanted size
+	 * actually matches, otherwise the caller has made a mistake somewhere.
+	 */
+	uint_t n = ABD_SCATTER(abd).abd_iovcnt;
+	ASSERT3U(n, ==, abd_iovcnt_for_bytes(size));
 
-	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
-	    sizeof (struct scatterlist), KM_SLEEP);
-	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
-
-	abd_for_each_sg(abd, sg, nr_pages, i) {
-		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
-		sg_set_page(sg, p, PAGESIZE, 0);
+	/*
+	 * Allocate an ABD_PAGESIZE region for each iovec.
+	 */
+	struct iovec *iov = ABD_SCATTER(abd).abd_iov;
+	for (int i = 0; i < n; i++) {
+		iov[i].iov_base =
+		    umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
+		iov[i].iov_len = ABD_PAGESIZE;
 	}
-	ABD_SCATTER(abd).abd_nents = nr_pages;
 }
 
 void
 abd_free_chunks(abd_t *abd)
 {
-	int i, n = ABD_SCATTER(abd).abd_nents;
-	struct scatterlist *sg;
-
-	abd_for_each_sg(abd, sg, n, i) {
-		struct page *p = nth_page(sg_page(sg), 0);
-		umem_free_aligned(p, PAGESIZE);
-	}
-	abd_free_sg_table(abd);
-}
-
-static void
-abd_alloc_zero_scatter(void)
-{
-	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
-	struct scatterlist *sg;
-	int i;
-
-	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
-	memset(abd_zero_page, 0, PAGESIZE);
-	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
-	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
-	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
-	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
-	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
-	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
-	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
-	    sizeof (struct scatterlist), KM_SLEEP);
-
-	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
-
-	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
-		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
-	}
+	uint_t n = ABD_SCATTER(abd).abd_iovcnt;
+	struct iovec *iov = ABD_SCATTER(abd).abd_iov;
+	for (int i = 0; i < n; i++)
+		umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE);
 }
 
 boolean_t
 abd_size_alloc_linear(size_t size)
 {
-	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
+	return (size < ABD_SCATTER_MIN_SIZE);
 }
 
 void
 abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
 {
 	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
-	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
+	int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size;
 	if (op == ABDSTAT_INCR) {
 		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
 	} else {
@@ -270,67 +155,72 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
 void
 abd_verify_scatter(abd_t *abd)
 {
-	size_t n;
-	int i = 0;
-	struct scatterlist *sg = NULL;
+#ifdef ZFS_DEBUG
+	/*
+	 * scatter abds shall have:
+	 * - at least one iovec
+	 * - all iov_base point somewhere
+	 * - all iov_len are ABD_PAGESIZE
+	 * - offset set within the abd pages somewhere
+	 */
+	uint_t n = ABD_SCATTER(abd).abd_iovcnt;
+	ASSERT3U(n, >, 0);
 
-	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
-	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
-	    ABD_SCATTER(abd).abd_sgl->length);
-	n = ABD_SCATTER(abd).abd_nents;
-	abd_for_each_sg(abd, sg, n, i) {
-		ASSERT3P(sg_page(sg), !=, NULL);
+	uint_t len = 0;
+	for (int i = 0; i < n; i++) {
+		ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL);
+		ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE);
+		len += ABD_PAGESIZE;
 	}
-}
 
-static void
-abd_free_zero_scatter(void)
-{
-	abd_free_sg_table(abd_zero_scatter);
-	abd_free_struct(abd_zero_scatter);
-	abd_zero_scatter = NULL;
-	ASSERT3P(abd_zero_page, !=, NULL);
-	umem_free_aligned(abd_zero_page, PAGESIZE);
+	ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len);
+#endif
 }
 
 void
 abd_init(void)
 {
-	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
-	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	/*
+	 * Create the "zero" scatter abd. This is always the size of the
+	 * largest possible block, but only actually has a single allocated
+	 * page, which all iovecs in the abd point to.
+	 */
+	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
+	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
 
-	abd_alloc_zero_scatter();
+	void *zero =
+	    umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
+	memset(zero, 0, ABD_PAGESIZE);
+
+	uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE);
+	struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov;
+	for (int i = 0; i < n; i++) {
+		iov[i].iov_base = zero;
+		iov[i].iov_len = ABD_PAGESIZE;
+	}
 }
 
 void
 abd_fini(void)
 {
-	abd_free_zero_scatter();
-
-	if (abd_cache) {
-		kmem_cache_destroy(abd_cache);
-		abd_cache = NULL;
-	}
+	umem_free_aligned(
+	    ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE);
+	abd_free_struct(abd_zero_scatter);
+	abd_zero_scatter = NULL;
 }
 
 void
 abd_free_linear_page(abd_t *abd)
 {
+	/*
+	 * LINEAR_PAGE is specific to the Linux kernel; we never set this
+	 * flag, so this will never be called.
+	 */
 	(void) abd;
-	__builtin_unreachable();
+	PANIC("unreachable");
 }
 
-/*
- * If we're going to use this ABD for doing I/O using the block layer, the
- * consumer of the ABD data doesn't care if it's scattered or not, and we don't
- * plan to store this ABD in memory for a long period of time, we should
- * allocate the ABD type that requires the least data copying to do the I/O.
- *
- * On Linux the optimal thing to do would be to use abd_get_offset() and
- * construct a new ABD which shares the original pages thereby eliminating
- * the copy.  But for the moment a new linear ABD is allocated until this
- * performance optimization can be implemented.
- */
 abd_t *
 abd_alloc_for_io(size_t size, boolean_t is_metadata)
 {
@@ -338,43 +228,60 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
 }
 
 abd_t *
-abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
-    size_t size)
+abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size)
 {
-	(void) size;
-	int i = 0;
-	struct scatterlist *sg = NULL;
-
-	abd_verify(sabd);
-	ASSERT3U(off, <=, sabd->abd_size);
-
-	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
-
-	if (abd == NULL)
-		abd = abd_alloc_struct(0);
 
 	/*
-	 * Even if this buf is filesystem metadata, we only track that
-	 * if we own the underlying data buffer, which is not true in
-	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+	 * Create a new scatter dabd by borrowing data pages from sabd to cover
+	 * off+size.
+	 *
+	 * sabd is an existing scatter abd with a set of iovecs, each covering
+	 * an ABD_PAGESIZE (4K) allocation. Its "zero" is at abd_offset.
+	 *
+	 *   [........][........][........][........]
+	 *      ^- sabd_offset
+	 *
+	 * We want to produce a new abd, referencing those allocations at the
+	 * given offset.
+	 *
+	 *   [........][........][........][........]
+	 *                    ^- dabd_offset = sabd_offset + off
+	 *                                        ^- dabd_offset + size
+	 *
+	 * In this example, dabd needs three iovecs. The first iovec is offset
+	 * 0, so the final dabd_offset is masked back into the first iovec.
+	 *
+	 *             [........][........][........]
+	 *                    ^- dabd_offset
 	 */
+	size_t soff = ABD_SCATTER(sabd).abd_offset + off;
+	size_t doff = soff & ABD_PAGEMASK;
+	size_t iovcnt = abd_iovcnt_for_bytes(doff + size);
 
-	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
-		if (new_offset < sg->length)
-			break;
-		new_offset -= sg->length;
-	}
+	/*
+	 * If the passed-in abd has enough allocated iovecs already, reuse it.
+	 * Otherwise, make a new one. The caller will free the original if the
+	 * one it gets back is not the same.
+	 *
+	 * Note that it's ok if we reuse an abd with more iovecs than we need.
+	 * abd_size has the usable amount of data, and the abd does not own the
+	 * pages referenced by the iovecs. At worst, they're holding dangling
+	 * pointers that we'll never use anyway.
+	 */
+	if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt)
+		dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT);
 
-	ABD_SCATTER(abd).abd_sgl = sg;
-	ABD_SCATTER(abd).abd_offset = new_offset;
-	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+	/* Set offset into first page in view */
+	ABD_SCATTER(dabd).abd_offset = doff;
 
-	return (abd);
+	/* Copy the wanted iovecs from the source to the dest */
+	memcpy(&ABD_SCATTER(dabd).abd_iov[0],
+	    &ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT],
+	    iovcnt * sizeof (struct iovec));
+
+	return (dabd);
 }
 
-/*
- * Initialize the abd_iter.
- */
 void
 abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 {
@@ -382,16 +289,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 	abd_verify(abd);
 	memset(aiter, 0, sizeof (struct abd_iter));
 	aiter->iter_abd = abd;
-	if (!abd_is_linear(abd)) {
-		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
-		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
-	}
 }
 
-/*
- * This is just a helper function to see if we have exhausted the
- * abd_iter and reached the end.
- */
 boolean_t
 abd_iter_at_end(struct abd_iter *aiter)
 {
@@ -399,83 +298,57 @@ abd_iter_at_end(struct abd_iter *aiter)
 	return (aiter->iter_pos == aiter->iter_abd->abd_size);
 }
 
-/*
- * Advance the iterator by a certain amount. Cannot be called when a chunk is
- * in use. This can be safely called when the aiter has already exhausted, in
- * which case this does nothing.
- */
 void
 abd_iter_advance(struct abd_iter *aiter, size_t amount)
 {
-	/*
-	 * Ensure that last chunk is not in use. abd_iterate_*() must clear
-	 * this state (directly or abd_iter_unmap()) before advancing.
-	 */
 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 	ASSERT0(aiter->iter_mapsize);
-	ASSERT3P(aiter->iter_page, ==, NULL);
-	ASSERT0(aiter->iter_page_doff);
-	ASSERT0(aiter->iter_page_dsize);
 
-	/* There's nothing left to advance to, so do nothing */
 	if (abd_iter_at_end(aiter))
 		return;
 
 	aiter->iter_pos += amount;
-	aiter->iter_offset += amount;
-	if (!abd_is_linear(aiter->iter_abd)) {
-		while (aiter->iter_offset >= aiter->iter_sg->length) {
-			aiter->iter_offset -= aiter->iter_sg->length;
-			aiter->iter_sg = sg_next(aiter->iter_sg);
-			if (aiter->iter_sg == NULL) {
-				ASSERT0(aiter->iter_offset);
-				break;
-			}
-		}
-	}
+	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
 }
 
-/*
- * Map the current chunk into aiter. This can be safely called when the aiter
- * has already exhausted, in which case this does nothing.
- */
 void
 abd_iter_map(struct abd_iter *aiter)
 {
-	void *paddr;
-	size_t offset = 0;
-
 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 	ASSERT0(aiter->iter_mapsize);
 
-	/* There's nothing left to iterate over, so do nothing */
 	if (abd_iter_at_end(aiter))
 		return;
 
 	if (abd_is_linear(aiter->iter_abd)) {
-		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
-		offset = aiter->iter_offset;
-		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
-		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
-	} else {
-		offset = aiter->iter_offset;
-		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
-		    aiter->iter_abd->abd_size - aiter->iter_pos);
-
-		paddr = sg_page(aiter->iter_sg);
+		aiter->iter_mapaddr =
+		    ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
+		aiter->iter_mapsize =
+		    aiter->iter_abd->abd_size - aiter->iter_pos;
+		return;
 	}
 
-	aiter->iter_mapaddr = (char *)paddr + offset;
+	/*
+	 * For scatter, we index into the appropriate iovec, and return the
+	 * smaller of the amount requested, or up to the end of the page.
+	 */
+	size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset;
+
+	ASSERT3U(poff >> ABD_PAGESHIFT, <=,
+	    ABD_SCATTER(aiter->iter_abd).abd_iovcnt);
+	struct iovec *iov = &ABD_SCATTER(aiter->iter_abd).
+	    abd_iov[poff >> ABD_PAGESHIFT];
+
+	aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK),
+	    aiter->iter_abd->abd_size - aiter->iter_pos);
+	ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE);
+
+	aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK);
 }
 
-/*
- * Unmap the current chunk from aiter. This can be safely called when the aiter
- * has already exhausted, in which case this does nothing.
- */
 void
 abd_iter_unmap(struct abd_iter *aiter)
 {
-	/* There's nothing left to unmap, so do nothing */
 	if (abd_iter_at_end(aiter))
 		return;
 
diff --git a/lib/libzpool/include/sys/abd_os.h b/lib/libzpool/include/sys/abd_os.h
index 67f7e5606be..8ff6aa1e9e4 100644
--- a/lib/libzpool/include/sys/abd_os.h
+++ b/lib/libzpool/include/sys/abd_os.h
@@ -32,8 +32,8 @@ extern "C" {
 
 struct abd_scatter {
 	uint_t		abd_offset;
-	uint_t		abd_nents;
-	struct scatterlist *abd_sgl;
+	uint_t		abd_iovcnt;
+	struct iovec	abd_iov[1]; /* actually variable-length */
 };
 
 struct abd_linear {

From 9e15877dfb3e80021551301aac71976216b3fe1b Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Wed, 21 Aug 2024 17:38:06 -0700
Subject: [PATCH 35/59] Linux 6.10 compat: META

Update the META file to reflect compatibility with the 6.10 kernel.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #16466
---
 META | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/META b/META
index 7aac80c541b..76ca22cbae0 100644
--- a/META
+++ b/META
@@ -6,5 +6,5 @@ Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 6.9
+Linux-Maximum: 6.10
 Linux-Minimum: 3.10

From ba2209ec9e2166dd9c6d80b61b4aed3dd457be4b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 11 Jul 2024 07:37:30 +1000
Subject: [PATCH 36/59] abd_get_from_buf_struct: wrap existing buf with ABD
 stored on stack

This allows a simple "wrapping" ABD for an existing linear buffer to be
allocated on the stack, avoiding an allocation.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 include/sys/abd.h |  1 +
 module/zfs/abd.c  | 22 +++++++++++++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/include/sys/abd.h b/include/sys/abd.h
index 67bf5e802c8..567b88c0fc0 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -93,6 +93,7 @@ abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
 abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t);
 abd_t *abd_get_zeros(size_t);
 abd_t *abd_get_from_buf(void *, size_t);
+abd_t *abd_get_from_buf_struct(abd_t *, void *, size_t);
 void abd_cache_reap_now(void);
 
 /*
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index f1df6082f04..c8c4d2270fa 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -603,13 +603,11 @@ abd_get_zeros(size_t size)
 }
 
 /*
- * Allocate a linear ABD structure for buf.
+ * Create a linear ABD for an existing buf.
  */
-abd_t *
-abd_get_from_buf(void *buf, size_t size)
+static abd_t *
+abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
 {
-	abd_t *abd = abd_alloc_struct(0);
-
 	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
 
 	/*
@@ -625,6 +623,20 @@ abd_get_from_buf(void *buf, size_t size)
 	return (abd);
 }
 
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+	abd_t *abd = abd_alloc_struct(0);
+	return (abd_get_from_buf_impl(abd, buf, size));
+}
+
+abd_t *
+abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
+{
+	abd_init_struct(abd);
+	return (abd_get_from_buf_impl(abd, buf, size));
+}
+
 /*
  * Get the raw buffer associated with a linear ABD.
  */

From 5eede0d5fde556107321fae6b41d6f83eeaf28a1 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 4 Jul 2024 16:11:12 +1000
Subject: [PATCH 37/59] compress: rework callers to always use the zio_compress
 calls

This will make future refactoring easier.

There are two we can't change for the moment, because zio_compress_data
does hole detection & collapsing which zio_decompress_data does not
actually know how to handle.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 module/zfs/blkptr.c      |  7 +++++--
 module/zfs/ddt_zap.c     | 13 +++++++++----
 module/zfs/dsl_dataset.c |  1 +
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c
index d85f0737f6f..6a6f06c7357 100644
--- a/module/zfs/blkptr.c
+++ b/module/zfs/blkptr.c
@@ -142,8 +142,11 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 		uint8_t dstbuf[BPE_PAYLOAD_SIZE];
 		decode_embedded_bp_compressed(bp, dstbuf);
-		VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
-		    dstbuf, buf, psize, buflen, NULL));
+		abd_t dstabd;
+		abd_get_from_buf_struct(&dstabd, dstbuf, psize);
+		VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &dstabd,
+		    buf, psize, buflen, NULL));
+		abd_free(&dstabd);
 	} else {
 		ASSERT3U(lsize, ==, psize);
 		decode_embedded_bp_compressed(bp, buf);
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
index 4e01624f368..8e78ec3277c 100644
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@@ -52,6 +52,7 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)
 
 	ASSERT3U(d_len, >=, s_len + 1);	/* no compression plus version byte */
 
+	/* Call compress function directly to avoid hole detection. */
 	c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1,
 	    ci->ci_level);
 
@@ -72,12 +73,16 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
 {
 	uchar_t version = *src++;
 	int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;
-	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 
-	if (ci->ci_decompress != NULL)
-		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
-	else
+	if (zio_compress_table[cpfunc].ci_decompress == NULL) {
 		memcpy(dst, src, d_len);
+		return;
+	}
+
+	abd_t sabd;
+	abd_get_from_buf_struct(&sabd, src, s_len);
+	VERIFY0(zio_decompress_data(cpfunc, &sabd, dst, s_len, d_len, NULL));
+	abd_free(&sabd);
 
 	if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
 	    (ZFS_HOST_BYTEORDER != 0))
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 45d8a290d67..28e07259ddd 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2425,6 +2425,7 @@ get_receive_resume_token_impl(dsl_dataset_t *ds)
 	fnvlist_free(token_nv);
 	compressed = kmem_alloc(packed_size, KM_SLEEP);
 
+	/* Call compress function directly to avoid hole detection. */
 	compressed_size = gzip_compress(packed, compressed,
 	    packed_size, packed_size, 6);
 

From b4d81b1a6a8c1254910f7e8b48e2f58fe77b769a Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 5 Jul 2024 15:01:57 +1000
Subject: [PATCH 38/59] zstream: use zio_compress calls for compression

This is updating zstream to use the zio_compress calls rather than using
its own dispatch. Since that was fairly entangled, some refactoring is
included.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 cmd/zstream/zstream_decompress.c | 127 +++++++++++++------------------
 cmd/zstream/zstream_recompress.c |  97 ++++++++++++-----------
 2 files changed, 102 insertions(+), 122 deletions(-)

diff --git a/cmd/zstream/zstream_decompress.c b/cmd/zstream/zstream_decompress.c
index 0cef36c0441..f5f66080d06 100644
--- a/cmd/zstream/zstream_decompress.c
+++ b/cmd/zstream/zstream_decompress.c
@@ -22,6 +22,8 @@
 /*
  * Copyright 2022 Axcient.  All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2024, Klara, Inc.
  */
 
 #include <err.h>
@@ -257,83 +259,64 @@ zstream_do_decompress(int argc, char *argv[])
 			ENTRY e = {.key = key};
 
 			p = hsearch(e, FIND);
-			if (p != NULL) {
-				zio_decompress_func_t *xfunc = NULL;
-				switch ((enum zio_compress)(intptr_t)p->data) {
-				case ZIO_COMPRESS_OFF:
-					xfunc = NULL;
-					break;
-				case ZIO_COMPRESS_LZJB:
-					xfunc = lzjb_decompress;
-					break;
-				case ZIO_COMPRESS_GZIP_1:
-					xfunc = gzip_decompress;
-					break;
-				case ZIO_COMPRESS_ZLE:
-					xfunc = zle_decompress;
-					break;
-				case ZIO_COMPRESS_LZ4:
-					xfunc = lz4_decompress_zfs;
-					break;
-				case ZIO_COMPRESS_ZSTD:
-					xfunc = zfs_zstd_decompress;
-					break;
-				default:
-					assert(B_FALSE);
-				}
-
-
-				/*
-				 * Read and decompress the block
-				 */
-				char *lzbuf = safe_calloc(payload_size);
-				(void) sfread(lzbuf, payload_size, stdin);
-				if (xfunc == NULL) {
-					memcpy(buf, lzbuf, payload_size);
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-					if (verbose)
-						fprintf(stderr, "Resetting "
-						    "compression type to off "
-						    "for ino %llu offset "
-						    "%llu\n",
-						    (u_longlong_t)
-						    drrw->drr_object,
-						    (u_longlong_t)
-						    drrw->drr_offset);
-				} else if (0 != xfunc(lzbuf, buf,
-				    payload_size, payload_size, 0)) {
-					/*
-					 * The block must not be compressed,
-					 * at least not with this compression
-					 * type, possibly because it gets
-					 * written multiple times in this
-					 * stream.
-					 */
-					warnx("decompression failed for "
-					    "ino %llu offset %llu",
-					    (u_longlong_t)drrw->drr_object,
-					    (u_longlong_t)drrw->drr_offset);
-					memcpy(buf, lzbuf, payload_size);
-				} else if (verbose) {
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-					fprintf(stderr, "successfully "
-					    "decompressed ino %llu "
-					    "offset %llu\n",
-					    (u_longlong_t)drrw->drr_object,
-					    (u_longlong_t)drrw->drr_offset);
-				} else {
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-				}
-				free(lzbuf);
-			} else {
+			if (p == NULL) {
 				/*
 				 * Read the contents of the block unaltered
 				 */
 				(void) sfread(buf, payload_size, stdin);
+				break;
 			}
+
+			/*
+			 * Read and decompress the block
+			 */
+			enum zio_compress c =
+			    (enum zio_compress)(intptr_t)p->data;
+
+			if (c == ZIO_COMPRESS_OFF) {
+				(void) sfread(buf, payload_size, stdin);
+				drrw->drr_compressiontype = ZIO_COMPRESS_OFF;
+				if (verbose)
+					fprintf(stderr,
+					    "Resetting compression type to "
+					    "off for ino %llu offset %llu\n",
+					    (u_longlong_t)drrw->drr_object,
+					    (u_longlong_t)drrw->drr_offset);
+				break;
+			}
+
+			char *lzbuf = safe_calloc(payload_size);
+			(void) sfread(lzbuf, payload_size, stdin);
+
+			abd_t sabd;
+			abd_get_from_buf_struct(&sabd, lzbuf, payload_size);
+			int err = zio_decompress_data(c, &sabd, buf,
+			    payload_size, payload_size, NULL);
+			abd_free(&sabd);
+
+			if (err != 0) {
+				/*
+				 * The block must not be compressed, at least
+				 * not with this compression type, possibly
+				 * because it gets written multiple times in
+				 * this stream.
+				 */
+				warnx("decompression failed for "
+				    "ino %llu offset %llu",
+				    (u_longlong_t)drrw->drr_object,
+				    (u_longlong_t)drrw->drr_offset);
+				memcpy(buf, lzbuf, payload_size);
+			} else if (verbose) {
+				drrw->drr_compressiontype = ZIO_COMPRESS_OFF;
+				fprintf(stderr, "successfully decompressed "
+				    "ino %llu offset %llu\n",
+				    (u_longlong_t)drrw->drr_object,
+				    (u_longlong_t)drrw->drr_offset);
+			} else {
+				drrw->drr_compressiontype = ZIO_COMPRESS_OFF;
+			}
+
+			free(lzbuf);
 			break;
 		}
 
diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c
index f9e01d1aa4c..0e5cc9cd815 100644
--- a/cmd/zstream/zstream_recompress.c
+++ b/cmd/zstream/zstream_recompress.c
@@ -22,10 +22,9 @@
 /*
  * Copyright 2022 Axcient.  All rights reserved.
  * Use is subject to license terms.
- */
-
-/*
+ *
  * Copyright (c) 2022 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Klara, Inc.
  */
 
 #include <err.h>
@@ -72,7 +71,7 @@ zstream_do_recompress(int argc, char *argv[])
 	dmu_replay_record_t *drr = &thedrr;
 	zio_cksum_t stream_cksum;
 	int c;
-	int level = -1;
+	int level = 0;
 
 	while ((c = getopt(argc, argv, "l:")) != -1) {
 		switch (c) {
@@ -97,34 +96,22 @@ zstream_do_recompress(int argc, char *argv[])
 
 	if (argc != 1)
 		zstream_usage();
-	int type = 0;
-	zio_compress_info_t *cinfo = NULL;
-	if (0 == strcmp(argv[0], "off")) {
-		type = ZIO_COMPRESS_OFF;
-		cinfo = &zio_compress_table[type];
-	} else if (0 == strcmp(argv[0], "inherit") ||
-	    0 == strcmp(argv[0], "empty") ||
-	    0 == strcmp(argv[0], "on")) {
-		// Fall through to invalid compression type case
-	} else {
-		for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
-			if (0 == strcmp(zio_compress_table[i].ci_name,
-			    argv[0])) {
-				cinfo = &zio_compress_table[i];
-				type = i;
-				break;
-			}
-		}
-	}
-	if (cinfo == NULL) {
-		fprintf(stderr, "Invalid compression type %s.\n",
-		    argv[0]);
-		exit(2);
-	}
 
-	if (cinfo->ci_compress == NULL) {
-		type = 0;
-		cinfo = &zio_compress_table[0];
+	enum zio_compress ctype;
+	if (strcmp(argv[0], "off") == 0) {
+		ctype = ZIO_COMPRESS_OFF;
+	} else {
+		for (ctype = 0; ctype < ZIO_COMPRESS_FUNCTIONS; ctype++) {
+			if (strcmp(argv[0],
+			    zio_compress_table[ctype].ci_name) == 0)
+				break;
+		}
+		if (ctype == ZIO_COMPRESS_FUNCTIONS ||
+		    zio_compress_table[ctype].ci_compress == NULL) {
+			fprintf(stderr, "Invalid compression type %s.\n",
+			    argv[0]);
+			exit(2);
+		}
 	}
 
 	if (isatty(STDIN_FILENO)) {
@@ -135,6 +122,7 @@ zstream_do_recompress(int argc, char *argv[])
 		exit(1);
 	}
 
+	abd_init();
 	fletcher_4_init();
 	zio_init();
 	zstd_init();
@@ -247,53 +235,60 @@ zstream_do_recompress(int argc, char *argv[])
 				(void) sfread(buf, payload_size, stdin);
 				break;
 			}
-			if (drrw->drr_compressiontype >=
-			    ZIO_COMPRESS_FUNCTIONS) {
+			enum zio_compress dtype = drrw->drr_compressiontype;
+			if (dtype >= ZIO_COMPRESS_FUNCTIONS) {
 				fprintf(stderr, "Invalid compression type in "
-				    "stream: %d\n", drrw->drr_compressiontype);
+				    "stream: %d\n", dtype);
 				exit(3);
 			}
-			zio_compress_info_t *dinfo =
-			    &zio_compress_table[drrw->drr_compressiontype];
+			if (zio_compress_table[dtype].ci_decompress == NULL)
+				dtype = ZIO_COMPRESS_OFF;
 
 			/* Set up buffers to minimize memcpys */
 			char *cbuf, *dbuf;
-			if (cinfo->ci_compress == NULL)
+			if (ctype == ZIO_COMPRESS_OFF)
 				dbuf = buf;
 			else
 				dbuf = safe_calloc(bufsz);
 
-			if (dinfo->ci_decompress == NULL)
+			if (dtype == ZIO_COMPRESS_OFF)
 				cbuf = dbuf;
 			else
 				cbuf = safe_calloc(payload_size);
 
 			/* Read and decompress the payload */
 			(void) sfread(cbuf, payload_size, stdin);
-			if (dinfo->ci_decompress != NULL) {
-				if (0 != dinfo->ci_decompress(cbuf, dbuf,
-				    payload_size, MIN(bufsz,
-				    drrw->drr_logical_size), dinfo->ci_level)) {
+			if (dtype != ZIO_COMPRESS_OFF) {
+				abd_t cabd;
+				abd_get_from_buf_struct(&cabd,
+				    cbuf, payload_size);
+				if (zio_decompress_data(dtype, &cabd, dbuf,
+				    payload_size,
+				    MIN(bufsz, drrw->drr_logical_size),
+				    NULL) != 0) {
 					warnx("decompression type %d failed "
 					    "for ino %llu offset %llu",
-					    type,
+					    dtype,
 					    (u_longlong_t)drrw->drr_object,
 					    (u_longlong_t)drrw->drr_offset);
 					exit(4);
 				}
 				payload_size = drrw->drr_logical_size;
+				abd_free(&cabd);
 				free(cbuf);
 			}
 
 			/* Recompress the payload */
-			if (cinfo->ci_compress != NULL) {
-				payload_size = P2ROUNDUP(cinfo->ci_compress(
-				    dbuf, buf, drrw->drr_logical_size,
-				    MIN(payload_size, bufsz), (level == -1 ?
-				    cinfo->ci_level : level)),
+			if (ctype != ZIO_COMPRESS_OFF) {
+				abd_t dabd;
+				abd_get_from_buf_struct(&dabd,
+				    dbuf, drrw->drr_logical_size);
+				payload_size = P2ROUNDUP(zio_compress_data(
+				    ctype, &dabd, (void **)&buf,
+				    drrw->drr_logical_size, level),
 				    SPA_MINBLOCKSIZE);
 				if (payload_size != drrw->drr_logical_size) {
-					drrw->drr_compressiontype = type;
+					drrw->drr_compressiontype = ctype;
 					drrw->drr_compressed_size =
 					    payload_size;
 				} else {
@@ -301,9 +296,10 @@ zstream_do_recompress(int argc, char *argv[])
 					drrw->drr_compressiontype = 0;
 					drrw->drr_compressed_size = 0;
 				}
+				abd_free(&dabd);
 				free(dbuf);
 			} else {
-				drrw->drr_compressiontype = type;
+				drrw->drr_compressiontype = ctype;
 				drrw->drr_compressed_size = 0;
 			}
 			break;
@@ -371,6 +367,7 @@ zstream_do_recompress(int argc, char *argv[])
 	fletcher_4_fini();
 	zio_fini();
 	zstd_fini();
+	abd_fini();
 
 	return (0);
 }

From e119483a95e8fece4097419689c7803754ca5c75 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 29 Jun 2024 11:47:16 +1000
Subject: [PATCH 39/59] compress: remove zio_decompress_data_buf

Nothing uses it anymore!

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 include/sys/zio_compress.h |  2 --
 module/zfs/zio_compress.c  | 24 ++++++++++--------------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h
index 691d7b62448..d0caee279f8 100644
--- a/include/sys/zio_compress.h
+++ b/include/sys/zio_compress.h
@@ -187,8 +187,6 @@ extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst,
     size_t s_len, uint8_t level);
 extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
     size_t s_len, size_t d_len, uint8_t *level);
-extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
-    size_t s_len, size_t d_len, uint8_t *level);
 extern int zio_compress_to_feature(enum zio_compress comp);
 
 #ifdef	__cplusplus
diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index c3bceababa3..1d448b00261 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -161,26 +161,22 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
 }
 
 int
-zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
+zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
     size_t s_len, size_t d_len, uint8_t *level)
 {
 	zio_compress_info_t *ci = &zio_compress_table[c];
 	if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
 		return (SET_ERROR(EINVAL));
 
+	void *sbuf = abd_borrow_buf_copy(src, s_len);
+
+	int err;
 	if (ci->ci_decompress_level != NULL && level != NULL)
-		return (ci->ci_decompress_level(src, dst, s_len, d_len, level));
+		err = ci->ci_decompress_level(sbuf, dst, s_len, d_len, level);
+	else
+		err = ci->ci_decompress(sbuf, dst, s_len, d_len, ci->ci_level);
 
-	return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
-}
-
-int
-zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
-    size_t s_len, size_t d_len, uint8_t *level)
-{
-	void *tmp = abd_borrow_buf_copy(src, s_len);
-	int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level);
-	abd_return_buf(src, tmp, s_len);
+	abd_return_buf(src, sbuf, s_len);
 
 	/*
 	 * Decompression shouldn't fail, because we've already verified
@@ -189,9 +185,9 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
 	 */
 	if (zio_decompress_fail_fraction != 0 &&
 	    random_in_range(zio_decompress_fail_fraction) == 0)
-		ret = SET_ERROR(EINVAL);
+		err = SET_ERROR(EINVAL);
 
-	return (ret);
+	return (err);
 }
 
 int

From dd0c08f9c65ccf9d9c0c08a29de9fc21e136c47d Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 29 Jun 2024 11:19:10 +1000
Subject: [PATCH 40/59] compress: remove unused abd compress prototype

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 include/sys/zio_compress.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h
index d0caee279f8..a7d19b633ef 100644
--- a/include/sys/zio_compress.h
+++ b/include/sys/zio_compress.h
@@ -130,17 +130,7 @@ typedef int zio_decompress_func_t(void *src, void *dst,
 /* Common signature for all zio decompress and get level functions. */
 typedef int zio_decompresslevel_func_t(void *src, void *dst,
     size_t s_len, size_t d_len, uint8_t *level);
-/* Common signature for all zio get-compression-level functions. */
-typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level);
 
-
-/*
- * Common signature for all zio decompress functions using an ABD as input.
- * This is helpful if you have both compressed ARC and scatter ABDs enabled,
- * but is not a requirement for all compression algorithms.
- */
-typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
-    size_t s_len, size_t d_len, int);
 /*
  * Information about each compression function.
  */

From 522816498c0ea0d8dfa449cd18e2032b8ac0a9b8 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 29 Jun 2024 11:16:50 +1000
Subject: [PATCH 41/59] compress: standardise names of compression functions

This is mostly to make searching easier.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 include/sys/zio_compress.h |  32 +++++-----
 include/sys/zstd/zstd.h    |   2 -
 module/zfs/dsl_dataset.c   |   2 +-
 module/zfs/gzip.c          |   6 +-
 module/zfs/lz4_zfs.c       |   4 +-
 module/zfs/lzjb.c          |   6 +-
 module/zfs/zio_compress.c  |  48 +++++++++------
 module/zfs/zle.c           |   6 +-
 module/zstd/zfs_zstd.c     | 122 ++++++++++++++++++-------------------
 9 files changed, 122 insertions(+), 106 deletions(-)

diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h
index a7d19b633ef..56376fdd10a 100644
--- a/include/sys/zio_compress.h
+++ b/include/sys/zio_compress.h
@@ -153,22 +153,22 @@ extern void lz4_fini(void);
 /*
  * Compression routines.
  */
-extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
-    int level);
-extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
-    int level);
-extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
-    int level);
-extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
-    int level);
-extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
-    int level);
-extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
-    int level);
-extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
-    int level);
-extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
-    int level);
+extern size_t zfs_lzjb_compress(void *src, void *dst, size_t s_len,
+    size_t d_len, int level);
+extern int zfs_lzjb_decompress(void *src, void *dst, size_t s_len,
+    size_t d_len, int level);
+extern size_t zfs_gzip_compress(void *src, void *dst, size_t s_len,
+    size_t d_len, int level);
+extern int zfs_gzip_decompress(void *src, void *dst, size_t s_len,
+    size_t d_len, int level);
+extern size_t zfs_zle_compress(void *src, void *dst, size_t s_len,
+    size_t d_len, int level);
+extern int zfs_zle_decompress(void *src, void *dst, size_t s_len,
+    size_t d_len, int level);
+extern size_t zfs_lz4_compress(void *src, void *dst, size_t s_len,
+    size_t d_len, int level);
+extern int zfs_lz4_decompress(void *src, void *dst, size_t s_len,
+    size_t d_len, int level);
 
 /*
  * Compress and decompress data if necessary.
diff --git a/include/sys/zstd/zstd.h b/include/sys/zstd/zstd.h
index d8c3fa86dce..f9e7ac0b32e 100644
--- a/include/sys/zstd/zstd.h
+++ b/include/sys/zstd/zstd.h
@@ -92,8 +92,6 @@ void zstd_fini(void);
 
 size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int level);
-size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len,
-    size_t d_len, int level);
 int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level);
 int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
     size_t d_len, uint8_t *level);
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 28e07259ddd..e62ecdb259f 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2426,7 +2426,7 @@ get_receive_resume_token_impl(dsl_dataset_t *ds)
 	compressed = kmem_alloc(packed_size, KM_SLEEP);
 
 	/* Call compress function directly to avoid hole detection. */
-	compressed_size = gzip_compress(packed, compressed,
+	compressed_size = zfs_gzip_compress(packed, compressed,
 	    packed_size, packed_size, 6);
 
 	zio_cksum_t cksum;
diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c
index f3b19446352..0ca66c2bd65 100644
--- a/module/zfs/gzip.c
+++ b/module/zfs/gzip.c
@@ -48,7 +48,8 @@ typedef uLongf zlen_t;
 #endif
 
 size_t
-gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+zfs_gzip_compress(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int n)
 {
 	int ret;
 	zlen_t dstlen = d_len;
@@ -83,7 +84,8 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
 }
 
 int
-gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+zfs_gzip_decompress(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int n)
 {
 	(void) n;
 	zlen_t dstlen = d_len;
diff --git a/module/zfs/lz4_zfs.c b/module/zfs/lz4_zfs.c
index a3b9e707037..698ed69956e 100644
--- a/module/zfs/lz4_zfs.c
+++ b/module/zfs/lz4_zfs.c
@@ -53,7 +53,7 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
 static kmem_cache_t *lz4_cache;
 
 size_t
-lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
+zfs_lz4_compress(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	(void) n;
@@ -81,7 +81,7 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
 }
 
 int
-lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
+zfs_lz4_decompress(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	(void) n;
diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c
index a24f17e0fe7..b246693120a 100644
--- a/module/zfs/lzjb.c
+++ b/module/zfs/lzjb.c
@@ -46,7 +46,8 @@
 #define	LEMPEL_SIZE	1024
 
 size_t
-lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+zfs_lzjb_compress(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int n)
 {
 	(void) n;
 	uchar_t *src = s_start;
@@ -101,7 +102,8 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
 }
 
 int
-lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+zfs_lzjb_decompress(void *s_start, void *d_start,
+    size_t s_len, size_t d_len, int n)
 {
 	(void) s_len, (void) n;
 	uchar_t *src = s_start;
diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index 1d448b00261..9553a9377c3 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -54,24 +54,36 @@ static unsigned long zio_decompress_fail_fraction = 0;
  * PART OF THE ON-DISK FORMAT.
  */
 zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
-	{"inherit",	0,	NULL,		NULL, NULL},
-	{"on",		0,	NULL,		NULL, NULL},
-	{"uncompressed", 0,	NULL,		NULL, NULL},
-	{"lzjb",	0,	lzjb_compress,	lzjb_decompress, NULL},
-	{"empty",	0,	NULL,		NULL, NULL},
-	{"gzip-1",	1,	gzip_compress,	gzip_decompress, NULL},
-	{"gzip-2",	2,	gzip_compress,	gzip_decompress, NULL},
-	{"gzip-3",	3,	gzip_compress,	gzip_decompress, NULL},
-	{"gzip-4",	4,	gzip_compress,	gzip_decompress, NULL},
-	{"gzip-5",	5,	gzip_compress,	gzip_decompress, NULL},
-	{"gzip-6",	6,	gzip_compress,	gzip_decompress, NULL},
-	{"gzip-7",	7,	gzip_compress,	gzip_decompress, NULL},
-	{"gzip-8",	8,	gzip_compress,	gzip_decompress, NULL},
-	{"gzip-9",	9,	gzip_compress,	gzip_decompress, NULL},
-	{"zle",		64,	zle_compress,	zle_decompress, NULL},
-	{"lz4",		0,	lz4_compress_zfs, lz4_decompress_zfs, NULL},
-	{"zstd",	ZIO_ZSTD_LEVEL_DEFAULT,	zfs_zstd_compress_wrap,
-	    zfs_zstd_decompress, zfs_zstd_decompress_level},
+	{"inherit",	0,	NULL,	NULL, NULL},
+	{"on",		0,	NULL,	NULL, NULL},
+	{"uncompressed", 0,	NULL,	NULL, NULL},
+	{"lzjb",	0,
+	    zfs_lzjb_compress,	zfs_lzjb_decompress, NULL},
+	{"empty",	0,	NULL,	NULL, NULL},
+	{"gzip-1",	1,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"gzip-2",	2,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"gzip-3",	3,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"gzip-4",	4,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"gzip-5",	5,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"gzip-6",	6,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"gzip-7",	7,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"gzip-8",	8,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"gzip-9",	9,
+	    zfs_gzip_compress,	zfs_gzip_decompress, NULL},
+	{"zle",		64,
+	    zfs_zle_compress,	zfs_zle_decompress, NULL},
+	{"lz4",		0,
+	    zfs_lz4_compress,	zfs_lz4_decompress, NULL},
+	{"zstd",	ZIO_ZSTD_LEVEL_DEFAULT,
+	    zfs_zstd_compress,	zfs_zstd_decompress, zfs_zstd_decompress_level},
 };
 
 uint8_t
diff --git a/module/zfs/zle.c b/module/zfs/zle.c
index 1483a65af80..32b5fe18cec 100644
--- a/module/zfs/zle.c
+++ b/module/zfs/zle.c
@@ -35,7 +35,8 @@
 #include <sys/zio_compress.h>
 
 size_t
-zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+zfs_zle_compress(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int n)
 {
 	uchar_t *src = s_start;
 	uchar_t *dst = d_start;
@@ -65,7 +66,8 @@ zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
 }
 
 int
-zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+zfs_zle_decompress(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int n)
 {
 	uchar_t *src = s_start;
 	uchar_t *dst = d_start;
diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c
index 05120d27b8d..34ab8fd8a42 100644
--- a/module/zstd/zfs_zstd.c
+++ b/module/zstd/zfs_zstd.c
@@ -429,68 +429,9 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
 	return (1);
 }
 
-
-size_t
-zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
-    int level)
-{
-	int16_t zstd_level;
-	if (zstd_enum_to_level(level, &zstd_level)) {
-		ZSTDSTAT_BUMP(zstd_stat_com_inval);
-		return (s_len);
-	}
-	/*
-	 * A zstd early abort heuristic.
-	 *
-	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
-	 *   128k), don't try any of this, just go.
-	 *   (because experimentally that was a reasonable cutoff for a perf win
-	 *   with tiny ratio change)
-	 * - First, we try LZ4 compression, and if it doesn't early abort, we
-	 *   jump directly to whatever compression level we intended to try.
-	 * - Second, we try zstd-1 - if that errors out (usually, but not
-	 *   exclusively, if it would overflow), we give up early.
-	 *
-	 *   If it works, instead we go on and compress anyway.
-	 *
-	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
-	 * compressible data, it was losing up to 8.5% of the compressed
-	 * savings versus no early abort, and all the zstd-fast levels are
-	 * worse indications on their own than LZ4, and don't improve the LZ4
-	 * pass noticably if stacked like this.
-	 */
-	size_t actual_abort_size = zstd_abort_size;
-	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
-	    s_len >= actual_abort_size) {
-		int pass_len = 1;
-		pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
-		if (pass_len < d_len) {
-			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
-			goto keep_trying;
-		}
-		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
-
-		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
-		    ZIO_ZSTD_LEVEL_1);
-		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
-			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
-			return (s_len);
-		}
-		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
-	} else {
-		ZSTDSTAT_BUMP(zstd_stat_passignored);
-		if (s_len < actual_abort_size) {
-			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
-		}
-	}
-keep_trying:
-	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
-
-}
-
 /* Compress block using zstd */
-size_t
-zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+static size_t
+zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
     int level)
 {
 	size_t c_len;
@@ -594,6 +535,65 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
 	return (c_len + sizeof (*hdr));
 }
 
+
+size_t
+zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+    int level)
+{
+	int16_t zstd_level;
+	if (zstd_enum_to_level(level, &zstd_level)) {
+		ZSTDSTAT_BUMP(zstd_stat_com_inval);
+		return (s_len);
+	}
+	/*
+	 * A zstd early abort heuristic.
+	 *
+	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
+	 *   128k), don't try any of this, just go.
+	 *   (because experimentally that was a reasonable cutoff for a perf win
+	 *   with tiny ratio change)
+	 * - First, we try LZ4 compression, and if it doesn't early abort, we
+	 *   jump directly to whatever compression level we intended to try.
+	 * - Second, we try zstd-1 - if that errors out (usually, but not
+	 *   exclusively, if it would overflow), we give up early.
+	 *
+	 *   If it works, instead we go on and compress anyway.
+	 *
+	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
+	 * compressible data, it was losing up to 8.5% of the compressed
+	 * savings versus no early abort, and all the zstd-fast levels are
+	 * worse indications on their own than LZ4, and don't improve the LZ4
+	 * pass noticeably if stacked like this.
+	 */
+	size_t actual_abort_size = zstd_abort_size;
+	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
+	    s_len >= actual_abort_size) {
+		int pass_len = 1;
+		pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0);
+		if (pass_len < d_len) {
+			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
+			goto keep_trying;
+		}
+		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
+
+		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
+		    d_len, ZIO_ZSTD_LEVEL_1);
+		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
+			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
+			return (s_len);
+		}
+		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
+	} else {
+		ZSTDSTAT_BUMP(zstd_stat_passignored);
+		if (s_len < actual_abort_size) {
+			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
+		}
+	}
+keep_trying:
+	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
+
+}
+
 /* Decompress block using zstd and return its stored level */
 int
 zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,

From d3c12383c95cf7988ac00234a42a4da7989c9034 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sun, 30 Jun 2024 12:10:00 +1000
Subject: [PATCH 42/59] compress: change compression providers API to use ABDs

This commit changes the provider compress and decompress API to take ABD
pointers instead of buffer pointers for both data source and
destination. It then updates all providers to match.

This doesn't actually change the providers to do chunked compression,
just changes the API to allow such an update in the future. Helper
macros are added to easily adapt the ABD functions to their buffer-based
implementations.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 cmd/zstream/zstream_decompress.c |  6 ++--
 include/sys/zio_compress.h       | 58 +++++++++++++++++++++++++-------
 include/sys/zstd/zstd.h          |  6 ++--
 module/zfs/ddt_zap.c             |  8 +++--
 module/zfs/dsl_dataset.c         |  7 +++-
 module/zfs/gzip.c                | 11 +++---
 module/zfs/lz4_zfs.c             | 11 +++---
 module/zfs/lzjb.c                | 11 +++---
 module/zfs/zio_compress.c        | 19 ++++++-----
 module/zfs/zle.c                 | 11 +++---
 module/zstd/zfs_zstd.c           | 26 +++++++++-----
 11 files changed, 120 insertions(+), 54 deletions(-)

diff --git a/cmd/zstream/zstream_decompress.c b/cmd/zstream/zstream_decompress.c
index f5f66080d06..f8f439d4626 100644
--- a/cmd/zstream/zstream_decompress.c
+++ b/cmd/zstream/zstream_decompress.c
@@ -288,10 +288,12 @@ zstream_do_decompress(int argc, char *argv[])
 			char *lzbuf = safe_calloc(payload_size);
 			(void) sfread(lzbuf, payload_size, stdin);
 
-			abd_t sabd;
+			abd_t sabd, dabd;
 			abd_get_from_buf_struct(&sabd, lzbuf, payload_size);
-			int err = zio_decompress_data(c, &sabd, buf,
+			abd_get_from_buf_struct(&dabd, buf, payload_size);
+			int err = zio_decompress_data(c, &sabd, &dabd,
 			    payload_size, payload_size, NULL);
+			abd_free(&dabd);
 			abd_free(&sabd);
 
 			if (err != 0) {
diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h
index 56376fdd10a..d41b5dfd447 100644
--- a/include/sys/zio_compress.h
+++ b/include/sys/zio_compress.h
@@ -22,7 +22,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2019, Allan Jude
- * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, 2024, Klara, Inc.
  * Use is subject to license terms.
  * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
  */
@@ -122,13 +122,13 @@ enum zio_zstd_levels {
 struct zio_prop;
 
 /* Common signature for all zio compress functions. */
-typedef size_t zio_compress_func_t(void *src, void *dst,
+typedef size_t zio_compress_func_t(abd_t *src, abd_t *dst,
     size_t s_len, size_t d_len, int);
 /* Common signature for all zio decompress functions. */
-typedef int zio_decompress_func_t(void *src, void *dst,
+typedef int zio_decompress_func_t(abd_t *src, abd_t *dst,
     size_t s_len, size_t d_len, int);
 /* Common signature for all zio decompress and get level functions. */
-typedef int zio_decompresslevel_func_t(void *src, void *dst,
+typedef int zio_decompresslevel_func_t(abd_t *src, abd_t *dst,
     size_t s_len, size_t d_len, uint8_t *level);
 
 /*
@@ -153,21 +153,21 @@ extern void lz4_fini(void);
 /*
  * Compression routines.
  */
-extern size_t zfs_lzjb_compress(void *src, void *dst, size_t s_len,
+extern size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
-extern int zfs_lzjb_decompress(void *src, void *dst, size_t s_len,
+extern int zfs_lzjb_decompress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
-extern size_t zfs_gzip_compress(void *src, void *dst, size_t s_len,
+extern size_t zfs_gzip_compress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
-extern int zfs_gzip_decompress(void *src, void *dst, size_t s_len,
+extern int zfs_gzip_decompress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
-extern size_t zfs_zle_compress(void *src, void *dst, size_t s_len,
+extern size_t zfs_zle_compress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
-extern int zfs_zle_decompress(void *src, void *dst, size_t s_len,
+extern int zfs_zle_decompress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
-extern size_t zfs_lz4_compress(void *src, void *dst, size_t s_len,
+extern size_t zfs_lz4_compress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
-extern int zfs_lz4_decompress(void *src, void *dst, size_t s_len,
+extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
 
 /*
@@ -179,6 +179,40 @@ extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
     size_t s_len, size_t d_len, uint8_t *level);
 extern int zio_compress_to_feature(enum zio_compress comp);
 
+#define	ZFS_COMPRESS_WRAP_DECL(name)					\
+size_t									\
+name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n)		\
+{									\
+	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
+	void *d_buf = abd_borrow_buf(dst, d_len);			\
+	size_t c_len = name##_buf(s_buf, d_buf, s_len, d_len, n);	\
+	abd_return_buf(src, s_buf, s_len);				\
+	abd_return_buf_copy(dst, d_buf, d_len);				\
+	return (c_len);							\
+}
+#define	ZFS_DECOMPRESS_WRAP_DECL(name)					\
+int									\
+name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n)		\
+{									\
+	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
+	void *d_buf = abd_borrow_buf(dst, d_len);			\
+	int err = name##_buf(s_buf, d_buf, s_len, d_len, n);		\
+	abd_return_buf(src, s_buf, s_len);				\
+	abd_return_buf_copy(dst, d_buf, d_len);				\
+	return (err);							\
+}
+#define	ZFS_DECOMPRESS_LEVEL_WRAP_DECL(name)				\
+int									\
+name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *n)	\
+{									\
+	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
+	void *d_buf = abd_borrow_buf(dst, d_len);			\
+	int err = name##_buf(s_buf, d_buf, s_len, d_len, n);		\
+	abd_return_buf(src, s_buf, s_len);				\
+	abd_return_buf_copy(dst, d_buf, d_len);				\
+	return (err);							\
+}
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/include/sys/zstd/zstd.h b/include/sys/zstd/zstd.h
index f9e7ac0b32e..6d212b082f9 100644
--- a/include/sys/zstd/zstd.h
+++ b/include/sys/zstd/zstd.h
@@ -90,12 +90,12 @@ typedef struct zfs_zstd_meta {
 int zstd_init(void);
 void zstd_fini(void);
 
-size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len,
+size_t zfs_zstd_compress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int level);
 int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level);
-int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
+int zfs_zstd_decompress_level(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, uint8_t *level);
-int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
+int zfs_zstd_decompress(abd_t *src, abd_t *dst, size_t s_len,
     size_t d_len, int n);
 void zfs_zstd_cache_reap_now(void);
 
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
index 8e78ec3277c..e96984b86f0 100644
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@@ -53,8 +53,12 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)
 	ASSERT3U(d_len, >=, s_len + 1);	/* no compression plus version byte */
 
 	/* Call compress function directly to avoid hole detection. */
-	c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1,
-	    ci->ci_level);
+	abd_t sabd, dabd;
+	abd_get_from_buf_struct(&sabd, (void *)src, s_len);
+	abd_get_from_buf_struct(&dabd, dst, d_len);
+	c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level);
+	abd_free(&dabd);
+	abd_free(&sabd);
 
 	if (c_len == s_len) {
 		cpfunc = ZIO_COMPRESS_OFF;
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index e62ecdb259f..042725b235d 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2426,8 +2426,13 @@ get_receive_resume_token_impl(dsl_dataset_t *ds)
 	compressed = kmem_alloc(packed_size, KM_SLEEP);
 
 	/* Call compress function directly to avoid hole detection. */
-	compressed_size = zfs_gzip_compress(packed, compressed,
+	abd_t pabd, cabd;
+	abd_get_from_buf_struct(&pabd, packed, packed_size);
+	abd_get_from_buf_struct(&cabd, compressed, packed_size);
+	compressed_size = zfs_gzip_compress(&pabd, &cabd,
 	    packed_size, packed_size, 6);
+	abd_free(&cabd);
+	abd_free(&pabd);
 
 	zio_cksum_t cksum;
 	fletcher_4_native_varsize(compressed, compressed_size, &cksum);
diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c
index 0ca66c2bd65..e7fd6f63c4b 100644
--- a/module/zfs/gzip.c
+++ b/module/zfs/gzip.c
@@ -47,8 +47,8 @@ typedef uLongf zlen_t;
 
 #endif
 
-size_t
-zfs_gzip_compress(void *s_start, void *d_start, size_t s_len,
+static size_t
+zfs_gzip_compress_buf(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	int ret;
@@ -83,8 +83,8 @@ zfs_gzip_compress(void *s_start, void *d_start, size_t s_len,
 	return ((size_t)dstlen);
 }
 
-int
-zfs_gzip_decompress(void *s_start, void *d_start, size_t s_len,
+static int
+zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	(void) n;
@@ -105,3 +105,6 @@ zfs_gzip_decompress(void *s_start, void *d_start, size_t s_len,
 
 	return (0);
 }
+
+ZFS_COMPRESS_WRAP_DECL(zfs_gzip_compress)
+ZFS_DECOMPRESS_WRAP_DECL(zfs_gzip_decompress)
diff --git a/module/zfs/lz4_zfs.c b/module/zfs/lz4_zfs.c
index 698ed69956e..0033b5e50d1 100644
--- a/module/zfs/lz4_zfs.c
+++ b/module/zfs/lz4_zfs.c
@@ -52,8 +52,8 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
 
 static kmem_cache_t *lz4_cache;
 
-size_t
-zfs_lz4_compress(void *s_start, void *d_start, size_t s_len,
+static size_t
+zfs_lz4_compress_buf(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	(void) n;
@@ -80,8 +80,8 @@ zfs_lz4_compress(void *s_start, void *d_start, size_t s_len,
 	return (bufsiz + sizeof (bufsiz));
 }
 
-int
-zfs_lz4_decompress(void *s_start, void *d_start, size_t s_len,
+static int
+zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	(void) n;
@@ -100,6 +100,9 @@ zfs_lz4_decompress(void *s_start, void *d_start, size_t s_len,
 	    d_start, bufsiz, d_len) < 0);
 }
 
+ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress)
+ZFS_DECOMPRESS_WRAP_DECL(zfs_lz4_decompress)
+
 /*
  * LZ4 API Description:
  *
diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c
index b246693120a..2db549b1626 100644
--- a/module/zfs/lzjb.c
+++ b/module/zfs/lzjb.c
@@ -45,8 +45,8 @@
 #define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)
 #define	LEMPEL_SIZE	1024
 
-size_t
-zfs_lzjb_compress(void *s_start, void *d_start, size_t s_len,
+static size_t
+zfs_lzjb_compress_buf(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	(void) n;
@@ -101,8 +101,8 @@ zfs_lzjb_compress(void *s_start, void *d_start, size_t s_len,
 	return (dst - (uchar_t *)d_start);
 }
 
-int
-zfs_lzjb_decompress(void *s_start, void *d_start,
+static int
+zfs_lzjb_decompress_buf(void *s_start, void *d_start,
     size_t s_len, size_t d_len, int n)
 {
 	(void) s_len, (void) n;
@@ -132,3 +132,6 @@ zfs_lzjb_decompress(void *s_start, void *d_start,
 	}
 	return (0);
 }
+
+ZFS_COMPRESS_WRAP_DECL(zfs_lzjb_compress)
+ZFS_DECOMPRESS_WRAP_DECL(zfs_lzjb_decompress)
diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index 9553a9377c3..118003bd295 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -29,7 +29,7 @@
 
 /*
  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, 2024, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  */
 
@@ -160,10 +160,10 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
 	if (*dst == NULL)
 		*dst = zio_buf_alloc(s_len);
 
-	/* No compression algorithms can read from ABDs directly */
-	void *tmp = abd_borrow_buf_copy(src, s_len);
-	c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel);
-	abd_return_buf(src, tmp, s_len);
+	abd_t dabd;
+	abd_get_from_buf_struct(&dabd, dst, d_len);
+	c_len = ci->ci_compress(src, &dabd, s_len, d_len, complevel);
+	abd_free(&dabd);
 
 	if (c_len > d_len)
 		return (s_len);
@@ -180,15 +180,16 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
 	if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
 		return (SET_ERROR(EINVAL));
 
-	void *sbuf = abd_borrow_buf_copy(src, s_len);
+	abd_t dabd;
+	abd_get_from_buf_struct(&dabd, dst, d_len);
 
 	int err;
 	if (ci->ci_decompress_level != NULL && level != NULL)
-		err = ci->ci_decompress_level(sbuf, dst, s_len, d_len, level);
+		err = ci->ci_decompress_level(src, &dabd, s_len, d_len, level);
 	else
-		err = ci->ci_decompress(sbuf, dst, s_len, d_len, ci->ci_level);
+		err = ci->ci_decompress(src, &dabd, s_len, d_len, ci->ci_level);
 
-	abd_return_buf(src, sbuf, s_len);
+	abd_free(&dabd);
 
 	/*
 	 * Decompression shouldn't fail, because we've already verified
diff --git a/module/zfs/zle.c b/module/zfs/zle.c
index 32b5fe18cec..7810161966d 100644
--- a/module/zfs/zle.c
+++ b/module/zfs/zle.c
@@ -34,8 +34,8 @@
 #include <sys/sysmacros.h>
 #include <sys/zio_compress.h>
 
-size_t
-zfs_zle_compress(void *s_start, void *d_start, size_t s_len,
+static size_t
+zfs_zle_compress_buf(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	uchar_t *src = s_start;
@@ -65,8 +65,8 @@ zfs_zle_compress(void *s_start, void *d_start, size_t s_len,
 	return (src == s_end ? dst - (uchar_t *)d_start : s_len);
 }
 
-int
-zfs_zle_decompress(void *s_start, void *d_start, size_t s_len,
+static int
+zfs_zle_decompress_buf(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int n)
 {
 	uchar_t *src = s_start;
@@ -91,3 +91,6 @@ zfs_zle_decompress(void *s_start, void *d_start, size_t s_len,
 	}
 	return (dst == d_end ? 0 : -1);
 }
+
+ZFS_COMPRESS_WRAP_DECL(zfs_zle_compress)
+ZFS_DECOMPRESS_WRAP_DECL(zfs_zle_decompress)
diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c
index 34ab8fd8a42..8d1d53d234b 100644
--- a/module/zstd/zfs_zstd.c
+++ b/module/zstd/zfs_zstd.c
@@ -536,8 +536,8 @@ zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
 }
 
 
-size_t
-zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+static size_t
+zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
     int level)
 {
 	int16_t zstd_level;
@@ -569,7 +569,10 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
 	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
 	    s_len >= actual_abort_size) {
 		int pass_len = 1;
-		pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0);
+		abd_t sabd;
+		abd_get_from_buf_struct(&sabd, s_start, s_len);
+		pass_len = zfs_lz4_compress(&sabd, d_start, s_len, d_len, 0);
+		abd_free(&sabd);
 		if (pass_len < d_len) {
 			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
 			goto keep_trying;
@@ -595,8 +598,8 @@ keep_trying:
 }
 
 /* Decompress block using zstd and return its stored level */
-int
-zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
+static int
+zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
     size_t d_len, uint8_t *level)
 {
 	ZSTD_DCtx *dctx;
@@ -671,15 +674,20 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
 }
 
 /* Decompress datablock using zstd */
-int
-zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
-    int level __maybe_unused)
+static int
+zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int level __maybe_unused)
 {
 
-	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
+	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
 	    NULL));
 }
 
+ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
+ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
+ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)
+
+
 /* Allocator for zstd compression context using mempool_allocator */
 static void *
 zstd_alloc(void *opaque __maybe_unused, size_t size)

From f62e6e1f985b5cc197940dcd2dc839aab0708ca2 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 4 Jul 2024 14:48:38 +1000
Subject: [PATCH 43/59] compress: change zio_compress API to use ABDs

This commit changes the frontend zio_compress_data and
zio_decompress_data APIs to take ABD pointers instead of buffer
pointers.

All callers are updated to match. Any that already have an appropriate
ABD nearby now use it directly, while for the rest we create one.

Internally, the ABDs are passed through to the provider directly.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 cmd/zdb/zdb.c                    | 37 +++++++++-----
 cmd/zstream/zstream_recompress.c | 17 ++++---
 include/sys/zio_compress.h       |  4 +-
 module/zfs/arc.c                 | 84 ++++++++++++++------------------
 module/zfs/blkptr.c              | 12 +++--
 module/zfs/ddt_zap.c             |  6 ++-
 module/zfs/dmu_recv.c            |  7 ++-
 module/zfs/zio.c                 | 35 +++++++------
 module/zfs/zio_compress.c        | 20 +++-----
 module/zstd/zfs_zstd.c           |  6 ++-
 10 files changed, 116 insertions(+), 112 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index c72df390935..41c2b676558 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -4657,7 +4657,6 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
 	l2arc_log_blk_phys_t this_lb;
 	uint64_t asize;
 	l2arc_log_blkptr_t lbps[2];
-	abd_t *abd;
 	zio_cksum_t cksum;
 	int failed = 0;
 	l2arc_dev_t dev;
@@ -4711,20 +4710,25 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
 		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
 		case ZIO_COMPRESS_OFF:
 			break;
-		default:
-			abd = abd_alloc_for_io(asize, B_TRUE);
+		default: {
+			abd_t *abd = abd_alloc_linear(asize, B_TRUE);
 			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
-			if (zio_decompress_data(L2BLK_GET_COMPRESS(
-			    (&lbps[0])->lbp_prop), abd, &this_lb,
-			    asize, sizeof (this_lb), NULL) != 0) {
+			abd_t dabd;
+			abd_get_from_buf_struct(&dabd, &this_lb,
+			    sizeof (this_lb));
+			int err = zio_decompress_data(L2BLK_GET_COMPRESS(
+			    (&lbps[0])->lbp_prop), abd, &dabd,
+			    asize, sizeof (this_lb), NULL);
+			abd_free(&dabd);
+			abd_free(abd);
+			if (err != 0) {
 				(void) printf("L2ARC block decompression "
 				    "failed\n");
-				abd_free(abd);
 				goto out;
 			}
-			abd_free(abd);
 			break;
 		}
+		}
 
 		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 			byteswap_uint64_array(&this_lb, sizeof (this_lb));
@@ -8618,13 +8622,22 @@ try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
 	memset(lbuf, 0x00, lsize);
 	memset(lbuf2, 0xff, lsize);
 
+	abd_t labd, labd2;
+	abd_get_from_buf_struct(&labd, lbuf, lsize);
+	abd_get_from_buf_struct(&labd2, lbuf2, lsize);
+
+	boolean_t ret = B_FALSE;
 	if (zio_decompress_data(cfunc, pabd,
-	    lbuf, psize, lsize, NULL) == 0 &&
+	    &labd, psize, lsize, NULL) == 0 &&
 	    zio_decompress_data(cfunc, pabd,
-	    lbuf2, psize, lsize, NULL) == 0 &&
+	    &labd2, psize, lsize, NULL) == 0 &&
 	    memcmp(lbuf, lbuf2, lsize) == 0)
-		return (B_TRUE);
-	return (B_FALSE);
+		ret = B_TRUE;
+
+	abd_free(&labd2);
+	abd_free(&labd);
+
+	return (ret);
 }
 
 static uint64_t
diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c
index 0e5cc9cd815..32ef6fa5443 100644
--- a/cmd/zstream/zstream_recompress.c
+++ b/cmd/zstream/zstream_recompress.c
@@ -259,12 +259,13 @@ zstream_do_recompress(int argc, char *argv[])
 			/* Read and decompress the payload */
 			(void) sfread(cbuf, payload_size, stdin);
 			if (dtype != ZIO_COMPRESS_OFF) {
-				abd_t cabd;
+				abd_t cabd, dabd;
 				abd_get_from_buf_struct(&cabd,
 				    cbuf, payload_size);
-				if (zio_decompress_data(dtype, &cabd, dbuf,
-				    payload_size,
-				    MIN(bufsz, drrw->drr_logical_size),
+				abd_get_from_buf_struct(&dabd, dbuf,
+				    MIN(bufsz, drrw->drr_logical_size));
+				if (zio_decompress_data(dtype, &cabd, &dabd,
+				    payload_size, abd_get_size(&dabd),
 				    NULL) != 0) {
 					warnx("decompression type %d failed "
 					    "for ino %llu offset %llu",
@@ -274,17 +275,20 @@ zstream_do_recompress(int argc, char *argv[])
 					exit(4);
 				}
 				payload_size = drrw->drr_logical_size;
+				abd_free(&dabd);
 				abd_free(&cabd);
 				free(cbuf);
 			}
 
 			/* Recompress the payload */
 			if (ctype != ZIO_COMPRESS_OFF) {
-				abd_t dabd;
+				abd_t dabd, abd;
 				abd_get_from_buf_struct(&dabd,
 				    dbuf, drrw->drr_logical_size);
+				abd_t *pabd =
+				    abd_get_from_buf_struct(&abd, buf, bufsz);
 				payload_size = P2ROUNDUP(zio_compress_data(
-				    ctype, &dabd, (void **)&buf,
+				    ctype, &dabd, &pabd,
 				    drrw->drr_logical_size, level),
 				    SPA_MINBLOCKSIZE);
 				if (payload_size != drrw->drr_logical_size) {
@@ -296,6 +300,7 @@ zstream_do_recompress(int argc, char *argv[])
 					drrw->drr_compressiontype = 0;
 					drrw->drr_compressed_size = 0;
 				}
+				abd_free(&abd);
 				abd_free(&dabd);
 				free(dbuf);
 			} else {
diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h
index d41b5dfd447..31602039a15 100644
--- a/include/sys/zio_compress.h
+++ b/include/sys/zio_compress.h
@@ -173,9 +173,9 @@ extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len,
 /*
  * Compress and decompress data if necessary.
  */
-extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst,
+extern size_t zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst,
     size_t s_len, uint8_t level);
-extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+extern int zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *abd,
     size_t s_len, size_t d_len, uint8_t *level);
 extern int zio_compress_to_feature(enum zio_compress comp);
 
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 3c657c979cd..714a30e863a 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1767,12 +1767,12 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
 	uint64_t csize;
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
-	void *tmpbuf = NULL;
 	abd_t *abd = hdr->b_l1hdr.b_pabd;
+	boolean_t free_abd = B_FALSE;
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_AUTHENTICATED(hdr));
-	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+	ASSERT3P(abd, !=, NULL);
 
 	/*
 	 * The MAC is calculated on the compressed data that is stored on disk.
@@ -1784,14 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
-
+		abd = NULL;
 		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
-		    hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel);
-		ASSERT3P(tmpbuf, !=, NULL);
+		    hdr->b_l1hdr.b_pabd, &abd, lsize, hdr->b_complevel);
+		ASSERT3P(abd, !=, NULL);
 		ASSERT3U(csize, <=, psize);
-		abd = abd_get_from_buf(tmpbuf, lsize);
-		abd_take_ownership_of_buf(abd, B_TRUE);
 		abd_zero_off(abd, csize, psize - csize);
+		free_abd = B_TRUE;
 	}
 
 	/*
@@ -1810,16 +1809,10 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
 
 	if (ret == 0)
 		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
-	else if (ret != ENOENT)
-		goto error;
+	else if (ret == ENOENT)
+		ret = 0;
 
-	if (tmpbuf != NULL)
-		abd_free(abd);
-
-	return (0);
-
-error:
-	if (tmpbuf != NULL)
+	if (free_abd)
 		abd_free(abd);
 
 	return (ret);
@@ -1836,7 +1829,6 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
 {
 	int ret;
 	abd_t *cabd = NULL;
-	void *tmp = NULL;
 	boolean_t no_crypt = B_FALSE;
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 
@@ -1871,17 +1863,14 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
 		 * linear buffer and wrapping it in an abd later.
 		 */
 		cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
-		tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
 
 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
-		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
+		    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 		if (ret != 0) {
-			abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
 			goto error;
 		}
 
-		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = cabd;
@@ -2123,10 +2112,14 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 			/* Skip byteswapping and checksumming (already done) */
 			return (0);
 		} else {
+			abd_t dabd;
+			abd_get_from_buf_struct(&dabd, buf->b_data,
+			    HDR_GET_LSIZE(hdr));
 			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
-			    hdr->b_l1hdr.b_pabd, buf->b_data,
+			    hdr->b_l1hdr.b_pabd, &dabd,
 			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
 			    &hdr->b_complevel);
+			abd_free(&dabd);
 
 			/*
 			 * Absent hardware errors or software bugs, this should
@@ -8531,18 +8524,15 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
 		    ARC_HDR_USE_RESERVE);
-		void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
 
 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
-		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
+		    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 		if (ret != 0) {
-			abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
 			arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
 			goto error;
 		}
 
-		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = cabd;
@@ -9037,9 +9027,8 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
 	}
 
 	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
-		size_t bufsize = MAX(size, asize);
-		void *buf = zio_buf_alloc(bufsize);
-		uint64_t csize = zio_compress_data(compress, to_write, &buf,
+		cabd = abd_alloc_for_io(MAX(size, asize), ismd);
+		uint64_t csize = zio_compress_data(compress, to_write, &cabd,
 		    size, hdr->b_complevel);
 		if (csize > psize) {
 			/*
@@ -9047,13 +9036,12 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
 			 * psize.  Even if it fits into asize, it does not
 			 * matter, since checksum will never match on read.
 			 */
-			zio_buf_free(buf, bufsize);
+			abd_free(cabd);
 			return (SET_ERROR(EIO));
 		}
 		if (asize > csize)
-			memset((char *)buf + csize, 0, asize - csize);
-		to_write = cabd = abd_get_from_buf(buf, bufsize);
-		abd_take_ownership_of_buf(cabd, B_TRUE);
+			abd_zero_off(cabd, csize, asize - csize);
+		to_write = cabd;
 	}
 
 	if (HDR_ENCRYPTED(hdr)) {
@@ -10184,7 +10172,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
 {
 	int		err = 0;
 	zio_cksum_t	cksum;
-	abd_t		*abd = NULL;
 	uint64_t	asize;
 
 	ASSERT(this_lbp != NULL && next_lbp != NULL);
@@ -10246,16 +10233,22 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
 	switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
 	case ZIO_COMPRESS_OFF:
 		break;
-	case ZIO_COMPRESS_LZ4:
-		abd = abd_alloc_for_io(asize, B_TRUE);
+	case ZIO_COMPRESS_LZ4: {
+		abd_t *abd = abd_alloc_linear(asize, B_TRUE);
 		abd_copy_from_buf_off(abd, this_lb, 0, asize);
-		if ((err = zio_decompress_data(
+		abd_t dabd;
+		abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
+		err = zio_decompress_data(
 		    L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
-		    abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
+		    abd, &dabd, asize, sizeof (*this_lb), NULL);
+		abd_free(&dabd);
+		abd_free(abd);
+		if (err != 0) {
 			err = SET_ERROR(EINVAL);
 			goto cleanup;
 		}
 		break;
+	}
 	default:
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
@@ -10272,8 +10265,6 @@ cleanup:
 		l2arc_log_blk_fetch_abort(*next_io);
 		*next_io = NULL;
 	}
-	if (abd != NULL)
-		abd_free(abd);
 	return (err);
 }
 
@@ -10509,7 +10500,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 	uint64_t		psize, asize;
 	zio_t			*wzio;
 	l2arc_lb_abd_buf_t	*abd_buf;
-	uint8_t			*tmpbuf = NULL;
+	abd_t			*abd = NULL;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 
 	VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
@@ -10532,7 +10523,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 
 	/* try to compress the buffer */
 	psize = zio_compress_data(ZIO_COMPRESS_LZ4,
-	    abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0);
+	    abd_buf->abd, &abd, sizeof (*lb), 0);
 
 	/* a log block is never entirely zero */
 	ASSERT(psize != 0);
@@ -10558,27 +10549,26 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 	    ZIO_CHECKSUM_FLETCHER_4);
 	if (asize < sizeof (*lb)) {
 		/* compression succeeded */
-		memset(tmpbuf + psize, 0, asize - psize);
+		abd_zero_off(abd, psize, asize - psize);
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_LZ4);
 	} else {
 		/* compression failed */
-		memcpy(tmpbuf, lb, sizeof (*lb));
+		abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_OFF);
 	}
 
 	/* checksum what we're about to write */
-	fletcher_4_native(tmpbuf, asize, NULL,
+	abd_fletcher_4_native(abd, asize, NULL,
 	    &l2dhdr->dh_start_lbps[0].lbp_cksum);
 
 	abd_free(abd_buf->abd);
 
 	/* perform the write itself */
-	abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
-	abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
+	abd_buf->abd = abd;
 	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
 	    asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c
index 6a6f06c7357..ac801c2bcf3 100644
--- a/module/zfs/blkptr.c
+++ b/module/zfs/blkptr.c
@@ -142,11 +142,13 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 		uint8_t dstbuf[BPE_PAYLOAD_SIZE];
 		decode_embedded_bp_compressed(bp, dstbuf);
-		abd_t dstabd;
-		abd_get_from_buf_struct(&dstabd, dstbuf, psize);
-		VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &dstabd,
-		    buf, psize, buflen, NULL));
-		abd_free(&dstabd);
+		abd_t cabd, dabd;
+		abd_get_from_buf_struct(&cabd, dstbuf, psize);
+		abd_get_from_buf_struct(&dabd, buf, buflen);
+		VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &cabd,
+		    &dabd, psize, buflen, NULL));
+		abd_free(&dabd);
+		abd_free(&cabd);
 	} else {
 		ASSERT3U(lsize, ==, psize);
 		decode_embedded_bp_compressed(bp, buf);
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
index e96984b86f0..d96dc505cde 100644
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@@ -83,9 +83,11 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
 		return;
 	}
 
-	abd_t sabd;
+	abd_t sabd, dabd;
 	abd_get_from_buf_struct(&sabd, src, s_len);
-	VERIFY0(zio_decompress_data(cpfunc, &sabd, dst, s_len, d_len, NULL));
+	abd_get_from_buf_struct(&dabd, dst, d_len);
+	VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL));
+	abd_free(&dabd);
 	abd_free(&sabd);
 
 	if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 0119191d792..a1752650f3b 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1391,7 +1391,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
 		abd_t *dabd = abd_alloc_linear(
 		    drrw->drr_logical_size, B_FALSE);
 		err = zio_decompress_data(drrw->drr_compressiontype,
-		    abd, abd_to_buf(dabd), abd_get_size(abd),
+		    abd, dabd, abd_get_size(abd),
 		    abd_get_size(dabd), NULL);
 
 		if (err != 0) {
@@ -1407,9 +1407,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
 		/* Recompress the data */
 		abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
 		    B_FALSE);
-		void *buf = abd_to_buf(cabd);
 		uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
-		    abd, &buf, abd_get_size(abd),
+		    abd, &cabd, abd_get_size(abd),
 		    rwa->os->os_complevel);
 		abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
 		/* Swap in newly compressed data into the abd */
@@ -2221,7 +2220,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
 
 				err = zio_decompress_data(
 				    drrw->drr_compressiontype,
-				    abd, abd_to_buf(decomp_abd),
+				    abd, decomp_abd,
 				    abd_get_size(abd),
 				    abd_get_size(decomp_abd), NULL);
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 73252c2da97..a841e0a7910 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -487,11 +487,9 @@ static void
 zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
 {
 	if (zio->io_error == 0) {
-		void *tmp = abd_borrow_buf(data, size);
 		int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
-		    zio->io_abd, tmp, zio->io_size, size,
+		    zio->io_abd, data, zio->io_size, size,
 		    &zio->io_prop.zp_complevel);
-		abd_return_buf_copy(data, tmp, size);
 
 		if (zio_injection_enabled && ret == 0)
 			ret = zio_handle_fault_injection(zio, EINVAL);
@@ -538,17 +536,18 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 			 * from the indirect block. We decompress it now and
 			 * throw away the result after we are finished.
 			 */
-			tmp = zio_buf_alloc(lsize);
+			abd_t *abd = abd_alloc_linear(lsize, B_TRUE);
 			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
-			    zio->io_abd, tmp, zio->io_size, lsize,
+			    zio->io_abd, abd, zio->io_size, lsize,
 			    &zio->io_prop.zp_complevel);
 			if (ret != 0) {
+				abd_free(abd);
 				ret = SET_ERROR(EIO);
 				goto error;
 			}
-			ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
-			    tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
-			zio_buf_free(tmp, lsize);
+			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
+			    abd, lsize, BP_SHOULD_BYTESWAP(bp), mac);
+			abd_free(abd);
 		} else {
 			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
@@ -1866,30 +1865,32 @@ zio_write_compress(zio_t *zio)
 	/* If it's a compressed write that is not raw, compress the buffer. */
 	if (compress != ZIO_COMPRESS_OFF &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
-		void *cbuf = NULL;
+		abd_t *cabd = NULL;
 		if (abd_cmp_zero(zio->io_abd, lsize) == 0)
 			psize = 0;
 		else if (compress == ZIO_COMPRESS_EMPTY)
 			psize = lsize;
 		else
-			psize = zio_compress_data(compress, zio->io_abd, &cbuf,
+			psize = zio_compress_data(compress, zio->io_abd, &cabd,
 			    lsize, zp->zp_complevel);
 		if (psize == 0) {
 			compress = ZIO_COMPRESS_OFF;
 		} else if (psize >= lsize) {
 			compress = ZIO_COMPRESS_OFF;
-			if (cbuf != NULL)
-				zio_buf_free(cbuf, lsize);
+			if (cabd != NULL)
+				abd_free(cabd);
 		} else if (!zp->zp_dedup && !zp->zp_encrypt &&
 		    psize <= BPE_PAYLOAD_SIZE &&
 		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
 		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+			void *cbuf = abd_borrow_buf_copy(cabd, lsize);
 			encode_embedded_bp_compressed(bp,
 			    cbuf, compress, lsize, psize);
 			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
 			BP_SET_TYPE(bp, zio->io_prop.zp_type);
 			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
-			zio_buf_free(cbuf, lsize);
+			abd_return_buf(cabd, cbuf, lsize);
+			abd_free(cabd);
 			BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
 			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 			ASSERT(spa_feature_is_active(spa,
@@ -1908,14 +1909,12 @@ zio_write_compress(zio_t *zio)
 			    psize);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
-				zio_buf_free(cbuf, lsize);
+				abd_free(cabd);
 				psize = lsize;
 			} else {
-				abd_t *cdata = abd_get_from_buf(cbuf, lsize);
-				abd_take_ownership_of_buf(cdata, B_TRUE);
-				abd_zero_off(cdata, psize, rounded - psize);
+				abd_zero_off(cabd, psize, rounded - psize);
 				psize = rounded;
-				zio_push_transform(zio, cdata,
+				zio_push_transform(zio, cabd,
 				    psize, lsize, NULL);
 			}
 		}
diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index 118003bd295..faf43097207 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -128,7 +128,7 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
 }
 
 size_t
-zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
+zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len,
     uint8_t level)
 {
 	size_t c_len, d_len;
@@ -158,12 +158,9 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
 	}
 
 	if (*dst == NULL)
-		*dst = zio_buf_alloc(s_len);
+		*dst = abd_alloc_sametype(src, s_len);
 
-	abd_t dabd;
-	abd_get_from_buf_struct(&dabd, dst, d_len);
-	c_len = ci->ci_compress(src, &dabd, s_len, d_len, complevel);
-	abd_free(&dabd);
+	c_len = ci->ci_compress(src, *dst, s_len, d_len, complevel);
 
 	if (c_len > d_len)
 		return (s_len);
@@ -173,23 +170,18 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
 }
 
 int
-zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *dst,
     size_t s_len, size_t d_len, uint8_t *level)
 {
 	zio_compress_info_t *ci = &zio_compress_table[c];
 	if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
 		return (SET_ERROR(EINVAL));
 
-	abd_t dabd;
-	abd_get_from_buf_struct(&dabd, dst, d_len);
-
 	int err;
 	if (ci->ci_decompress_level != NULL && level != NULL)
-		err = ci->ci_decompress_level(src, &dabd, s_len, d_len, level);
+		err = ci->ci_decompress_level(src, dst, s_len, d_len, level);
 	else
-		err = ci->ci_decompress(src, &dabd, s_len, d_len, ci->ci_level);
-
-	abd_free(&dabd);
+		err = ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
 
 	/*
 	 * Decompression shouldn't fail, because we've already verified
diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c
index 8d1d53d234b..e113962f65b 100644
--- a/module/zstd/zfs_zstd.c
+++ b/module/zstd/zfs_zstd.c
@@ -569,9 +569,11 @@ zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
 	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
 	    s_len >= actual_abort_size) {
 		int pass_len = 1;
-		abd_t sabd;
+		abd_t sabd, dabd;
 		abd_get_from_buf_struct(&sabd, s_start, s_len);
-		pass_len = zfs_lz4_compress(&sabd, d_start, s_len, d_len, 0);
+		abd_get_from_buf_struct(&dabd, d_start, d_len);
+		pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
+		abd_free(&dabd);
 		abd_free(&sabd);
 		if (pass_len < d_len) {
 			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);

From a9c94bea9fb3bef7704d71cd9486fbcebbe6e9c8 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 5 Jul 2024 13:39:33 +1000
Subject: [PATCH 44/59] zio_compress_data: limit dest length to ABD size

Some callers (eg `do_corrective_recv()`) pass in a dest buffer much
smaller than the wanted 87.5% of the source buffer, because the
incoming abd is larger than the source data and they "know" what the
decompressed size will be.

However, `abd_borrow_buf()` rightly asserts if we try to borrow more
than is available, so these callers fail.

Previously when all we had was a dest buffer, we didn't know how big it
was, so we couldn't do anything. Now we have a dest abd, with a size, so
we can clamp dest size to the abd size.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 module/zfs/zio_compress.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index faf43097207..9182917f75e 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -135,13 +135,9 @@ zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len,
 	uint8_t complevel;
 	zio_compress_info_t *ci = &zio_compress_table[c];
 
-	ASSERT3U(c, <, ZIO_COMPRESS_FUNCTIONS);
 	ASSERT3U(ci->ci_compress, !=, NULL);
 	ASSERT3U(s_len, >, 0);
 
-	/* Compress at least 12.5% */
-	d_len = s_len - (s_len >> 3);
-
 	complevel = ci->ci_level;
 
 	if (c == ZIO_COMPRESS_ZSTD) {
@@ -160,6 +156,9 @@ zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len,
 	if (*dst == NULL)
 		*dst = abd_alloc_sametype(src, s_len);
 
+	/* Compress at least 12.5%, but limit to the size of the dest abd. */
+	d_len = MIN(s_len - (s_len >> 3), abd_get_size(*dst));
+
 	c_len = ci->ci_compress(src, *dst, s_len, d_len, complevel);
 
 	if (c_len > d_len)

From a537d90734a16d63c79080cfd2d710745d7c02fd Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 23 Jul 2024 11:43:18 +1000
Subject: [PATCH 45/59] zstream decompress: fix decompress size and output

This was incorrectly using the compressed length for the size of the
decompress buffer, and quietly doing nothing if the decompressor refused
to decompress the block because there wasn't enough space.

After that, it wasn't correctly rewriting the record to indicate
"not compressed".

So that's fixed now. Sigh.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 cmd/zstream/zstream_decompress.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/cmd/zstream/zstream_decompress.c b/cmd/zstream/zstream_decompress.c
index f8f439d4626..c64011e3822 100644
--- a/cmd/zstream/zstream_decompress.c
+++ b/cmd/zstream/zstream_decompress.c
@@ -275,7 +275,8 @@ zstream_do_decompress(int argc, char *argv[])
 
 			if (c == ZIO_COMPRESS_OFF) {
 				(void) sfread(buf, payload_size, stdin);
-				drrw->drr_compressiontype = ZIO_COMPRESS_OFF;
+				drrw->drr_compressiontype = 0;
+				drrw->drr_compressed_size = 0;
 				if (verbose)
 					fprintf(stderr,
 					    "Resetting compression type to "
@@ -285,18 +286,32 @@ zstream_do_decompress(int argc, char *argv[])
 				break;
 			}
 
+			uint64_t lsize = drrw->drr_logical_size;
+			ASSERT3U(payload_size, <=, lsize);
+
 			char *lzbuf = safe_calloc(payload_size);
 			(void) sfread(lzbuf, payload_size, stdin);
 
 			abd_t sabd, dabd;
 			abd_get_from_buf_struct(&sabd, lzbuf, payload_size);
-			abd_get_from_buf_struct(&dabd, buf, payload_size);
+			abd_get_from_buf_struct(&dabd, buf, lsize);
 			int err = zio_decompress_data(c, &sabd, &dabd,
-			    payload_size, payload_size, NULL);
+			    payload_size, lsize, NULL);
 			abd_free(&dabd);
 			abd_free(&sabd);
 
-			if (err != 0) {
+			if (err == 0) {
+				drrw->drr_compressiontype = 0;
+				drrw->drr_compressed_size = 0;
+				payload_size = lsize;
+				if (verbose) {
+					fprintf(stderr,
+					    "successfully decompressed "
+					    "ino %llu offset %llu\n",
+					    (u_longlong_t)drrw->drr_object,
+					    (u_longlong_t)drrw->drr_offset);
+				}
+			} else {
 				/*
 				 * The block must not be compressed, at least
 				 * not with this compression type, possibly
@@ -308,14 +323,6 @@ zstream_do_decompress(int argc, char *argv[])
 				    (u_longlong_t)drrw->drr_object,
 				    (u_longlong_t)drrw->drr_offset);
 				memcpy(buf, lzbuf, payload_size);
-			} else if (verbose) {
-				drrw->drr_compressiontype = ZIO_COMPRESS_OFF;
-				fprintf(stderr, "successfully decompressed "
-				    "ino %llu offset %llu\n",
-				    (u_longlong_t)drrw->drr_object,
-				    (u_longlong_t)drrw->drr_offset);
-			} else {
-				drrw->drr_compressiontype = ZIO_COMPRESS_OFF;
 			}
 
 			free(lzbuf);

From cb36f4f3529473d977189010f41b9a98c644d2d3 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 23 Jul 2024 11:43:18 +1000
Subject: [PATCH 46/59] zstream recompress: fix zero recompressed buffer and
 output

If compression happened, any garbage past the compressed size was not
zeroed out.

If compression didn't happen, then the payload size was still set to
the rounded-up return from zio_compress_data(), which is dependent on
the input, which is not necessarily the logical size.

So that's all fixed too, mostly from stealing the math from zio.c.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 cmd/zstream/zstream_recompress.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c
index 32ef6fa5443..ae2c56320b2 100644
--- a/cmd/zstream/zstream_recompress.c
+++ b/cmd/zstream/zstream_recompress.c
@@ -287,24 +287,26 @@ zstream_do_recompress(int argc, char *argv[])
 				    dbuf, drrw->drr_logical_size);
 				abd_t *pabd =
 				    abd_get_from_buf_struct(&abd, buf, bufsz);
-				payload_size = P2ROUNDUP(zio_compress_data(
-				    ctype, &dabd, &pabd,
-				    drrw->drr_logical_size, level),
-				    SPA_MINBLOCKSIZE);
-				if (payload_size != drrw->drr_logical_size) {
-					drrw->drr_compressiontype = ctype;
-					drrw->drr_compressed_size =
-					    payload_size;
-				} else {
+				size_t csize = zio_compress_data(ctype, &dabd,
+				    &pabd, drrw->drr_logical_size, level);
+				size_t rounded =
+				    P2ROUNDUP(csize, SPA_MINBLOCKSIZE);
+				if (rounded >= drrw->drr_logical_size) {
 					memcpy(buf, dbuf, payload_size);
 					drrw->drr_compressiontype = 0;
 					drrw->drr_compressed_size = 0;
+				} else {
+					abd_zero_off(pabd, csize,
+					    rounded - csize);
+					drrw->drr_compressiontype = ctype;
+					drrw->drr_compressed_size =
+					    payload_size = rounded;
 				}
 				abd_free(&abd);
 				abd_free(&dabd);
 				free(dbuf);
 			} else {
-				drrw->drr_compressiontype = ctype;
+				drrw->drr_compressiontype = 0;
 				drrw->drr_compressed_size = 0;
 			}
 			break;

From 34118eac06fba834f0c934419aec1b386c98665a Mon Sep 17 00:00:00 2001
From: Low-power <msl0000023508@gmail.com>
Date: Sat, 24 Aug 2024 01:39:09 +0800
Subject: [PATCH 47/59] Make mount.zfs(8) calling zfs_mount_at for legacy
 mounts as well

Commit 329e2ffa4bca456e65c3db7f5c5c04931c551b61 has made mount.zfs(8) to
call libzfs function 'zfs_mount_at', in order to propagate dataset
properties into mount options. This fix however, is limited to a special
use case where mount.zfs(8) is used in initrd with option '-o zfsutil'.
If either initrd or the user need to use mount.zfs(8) to mount a file
system with 'mountpoint' set to 'legacy', '-o zfsutil' can't be used and
the original issue #7947 will still happen.

Since the existing code already excluded the possibility of calling
'zfs_mount_at' when it was invoked as a helper program from zfs(8), by
checking 'ZFS_MOUNT_HELPER' environment variable, it makes no sense to
avoid calling 'zfs_mount_at' without '-o zfsutil'.

An exception however, is when mount.zfs(8) was invoked with '-o remount'
to update the mount options for an existing mount point. In this case
call mount(2) directly without modifying the mount options passed from
command line.

Furthermore, don't run mount.zfs(8) helper for automounting snapshot.
The above change to make mount.zfs(8) to call 'zfs_mount_at'
apparently caused it to trigger an automount for the snapshot
directory. When the helper was invoked as a result of a snapshot
automount, an infinite recursion will occur.

Since the need of invoking user mode mount(8) for automounting was to
overcome that the 'vfs_kern_mount' being GPL-only, just run mount(8)
without the mount.zfs(8) helper by adding option '-i'.

Reviewed-by: Umer Saleem <usaleem@ixsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: WHR <whr@rivoreo.one>
Closes #16393
---
 cmd/mount_zfs.c                  | 5 ++---
 module/os/linux/zfs/zfs_ctldir.c | 8 ++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cmd/mount_zfs.c b/cmd/mount_zfs.c
index fc922095064..283074daf71 100644
--- a/cmd/mount_zfs.c
+++ b/cmd/mount_zfs.c
@@ -269,8 +269,7 @@ main(int argc, char **argv)
 		return (MOUNT_USAGE);
 	}
 
-	if (!zfsutil || sloppy ||
-	    libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
+	if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
 		zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
 	}
 
@@ -337,7 +336,7 @@ main(int argc, char **argv)
 		    dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
 
 	if (!fake) {
-		if (zfsutil && !sloppy &&
+		if (!remount && !sloppy &&
 		    !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
 			error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint);
 			if (error) {
diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c
index 54ed70d0394..e042116333f 100644
--- a/module/os/linux/zfs/zfs_ctldir.c
+++ b/module/os/linux/zfs/zfs_ctldir.c
@@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
 	zfsvfs_t *snap_zfsvfs;
 	zfs_snapentry_t *se;
 	char *full_name, *full_path;
-	char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
-	    NULL };
+	char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
+	    NULL, NULL, NULL };
 	char *envp[] = { NULL };
 	int error;
 	struct path spath;
@@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
 	 * value from call_usermodehelper() will be (exitcode << 8 + signal).
 	 */
 	dprintf("mount; name=%s path=%s\n", full_name, full_path);
-	argv[5] = full_name;
-	argv[6] = full_path;
+	argv[6] = full_name;
+	argv[7] = full_path;
 	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 	if (error) {
 		if (!(error & MOUNT_BUSY << 8)) {

From 2420ee6e12cb4bc4918fc88d44d59b486b86e58b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 24 Aug 2024 03:40:45 +1000
Subject: [PATCH 48/59] spl-taskq: fix task counts for delayed and cancelled
 tasks

Dispatched delayed tasks were not added to tasks_total, and cancelled
tasks were not removed. This notably could make tasks_total go to
UINT64_MAX, but just generally meant the count could be wrong. So let's
not!

Sponsored-by: Klara, Inc.
Sponsored-by: Syneto
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16473
---
 module/os/linux/spl/spl-taskq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index 29b8f542650..c16bc9bc640 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -620,6 +620,7 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
 	if (t && t != ERR_PTR(-EBUSY)) {
 		list_del_init(&t->tqent_list);
 		TQSTAT_DEC_LIST(tq, t);
+		TQSTAT_DEC(tq, tasks_total);
 
 		t->tqent_flags |= TQENT_FLAG_CANCEL;
 		TQSTAT_INC(tq, tasks_cancelled);
@@ -760,6 +761,7 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
 	list_add_tail(&t->tqent_list, &tq->tq_delay_list);
 	TQENT_SET_LIST(t, TQENT_LIST_DELAY);
 	TQSTAT_INC_LIST(tq, t);
+	TQSTAT_INC(tq, tasks_total);
 
 	t->tqent_id = rc = tq->tq_next_id;
 	tq->tq_next_id++;

From 6be8bf5552b16475629a15ab62759eb7a6d73e3b Mon Sep 17 00:00:00 2001
From: Mateusz Piotrowski <0mp@FreeBSD.org>
Date: Mon, 26 Aug 2024 18:27:24 +0200
Subject: [PATCH 49/59] zpool: Provide GUID to zpool-reguid(8) with -g (#16239)

This commit extends the zpool-reguid(8) command with a -g flag, which
allows the user to specify the GUID to set.

This change also adds some general tests for zpool-reguid(8).

Sponsored-by: Wasabi Technology, Inc.
Sponsored-by: Klara, Inc.

Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
---
 cmd/zpool/zpool_main.c                        | 23 ++++--
 cmd/ztest.c                                   |  2 +-
 include/libzfs.h                              |  1 +
 include/sys/fs/zfs.h                          |  5 ++
 include/sys/spa.h                             |  2 +-
 lib/libzfs/libzfs.abi                         |  6 ++
 lib/libzfs/libzfs_pool.c                      | 41 ++++++++++-
 man/man8/zpool-reguid.8                       | 14 +++-
 module/zfs/spa.c                              | 25 ++++++-
 module/zfs/zfs_ioctl.c                        | 30 +++++++-
 tests/runfiles/common.run                     |  4 +
 .../cli_root/zpool_reguid/Makefile.am         |  6 ++
 .../cli_root/zpool_reguid/cleanup.ksh         | 32 ++++++++
 .../cli_root/zpool_reguid/setup.ksh           | 34 +++++++++
 .../zpool_reguid/zpool_reguid_001_pos.ksh     | 73 +++++++++++++++++++
 .../zpool_reguid/zpool_reguid_002_neg.ksh     | 60 +++++++++++++++
 16 files changed, 342 insertions(+), 16 deletions(-)
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 620746f8e7b..9cd26a8650a 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -537,7 +537,7 @@ get_usage(zpool_help_t idx)
 		    "\t    [-o property=value] <pool> <newpool> "
 		    "[<device> ...]\n"));
 	case HELP_REGUID:
-		return (gettext("\treguid <pool>\n"));
+		return (gettext("\treguid [-g guid] <pool>\n"));
 	case HELP_SYNC:
 		return (gettext("\tsync [pool] ...\n"));
 	case HELP_VERSION:
@@ -2025,7 +2025,7 @@ zpool_do_create(int argc, char **argv)
 				char *end;
 				u_longlong_t ver;
 
-				ver = strtoull(propval, &end, 10);
+				ver = strtoull(propval, &end, 0);
 				if (*end == '\0' &&
 				    ver < SPA_VERSION_FEATURES) {
 					enable_pool_features = B_FALSE;
@@ -8232,19 +8232,32 @@ zpool_do_clear(int argc, char **argv)
 }
 
 /*
- * zpool reguid <pool>
+ * zpool reguid [-g <guid>] <pool>
  */
 int
 zpool_do_reguid(int argc, char **argv)
 {
+	uint64_t guid;
+	uint64_t *guidp = NULL;
 	int c;
+	char *endptr;
 	char *poolname;
 	zpool_handle_t *zhp;
 	int ret = 0;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "")) != -1) {
+	while ((c = getopt(argc, argv, "g:")) != -1) {
 		switch (c) {
+		case 'g':
+			errno = 0;
+			guid = strtoull(optarg, &endptr, 10);
+			if (errno != 0 || *endptr != '\0') {
+				(void) fprintf(stderr,
+				    gettext("invalid GUID: %s\n"), optarg);
+				usage(B_FALSE);
+			}
+			guidp = &guid;
+			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
@@ -8270,7 +8283,7 @@ zpool_do_reguid(int argc, char **argv)
 	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
 		return (1);
 
-	ret = zpool_reguid(zhp);
+	ret = zpool_set_guid(zhp, guidp);
 
 	zpool_close(zhp);
 	return (ret);
diff --git a/cmd/ztest.c b/cmd/ztest.c
index 6a9264ddcc4..7c9db84d4ea 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -6746,7 +6746,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
 	load = spa_load_guid(spa);
 
 	(void) pthread_rwlock_wrlock(&ztest_name_lock);
-	error = spa_change_guid(spa);
+	error = spa_change_guid(spa, NULL);
 	zs->zs_guid = spa_guid(spa);
 	(void) pthread_rwlock_unlock(&ztest_name_lock);
 
diff --git a/include/libzfs.h b/include/libzfs.h
index bf5579f38fd..2412797541d 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -300,6 +300,7 @@ _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
 
 _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
 _LIBZFS_H int zpool_reguid(zpool_handle_t *);
+_LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *);
 _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
 
 _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index c7e48d1edc0..73d686a002e 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1710,6 +1710,11 @@ typedef enum {
 #define	ZPOOL_INITIALIZE_COMMAND	"initialize_command"
 #define	ZPOOL_INITIALIZE_VDEVS		"initialize_vdevs"
 
+/*
+ * The following are names used when invoking ZFS_IOC_POOL_REGUID.
+ */
+#define	ZPOOL_REGUID_GUID	"guid"
+
 /*
  * The following are names used when invoking ZFS_IOC_POOL_TRIM.
  */
diff --git a/include/sys/spa.h b/include/sys/spa.h
index a70912335b1..93f381affd9 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1092,7 +1092,7 @@ extern void spa_strfree(char *);
 extern uint64_t spa_generate_guid(spa_t *spa);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
-extern int spa_change_guid(spa_t *spa);
+extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 88baa4168c3..87c5c4380be 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -556,6 +556,7 @@
     <elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_set_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_set_vdev_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_skip_pool' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -6639,6 +6640,11 @@
       <parameter type-id='9c313c2d' name='guid'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='zpool_set_guid' mangled-name='zpool_set_guid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_set_guid'>
+      <parameter type-id='4c81de99' name='zhp'/>
+      <parameter type-id='713a56f5' name='guid'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='zpool_reguid' mangled-name='zpool_reguid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_reguid'>
       <parameter type-id='4c81de99' name='zhp'/>
       <return type-id='95e97e5e'/>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index e493e8562a7..dfa7c4db688 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -4310,22 +4310,55 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
 
 /*
  * Change the GUID for a pool.
+ *
+ * Similar to zpool_reguid(), but may take a GUID.
+ *
+ * If the guid argument is NULL, then no GUID is passed in the nvlist to the
+ * ioctl().
  */
 int
-zpool_reguid(zpool_handle_t *zhp)
+zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid)
 {
 	char errbuf[ERRBUFLEN];
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	nvlist_t *nvl = NULL;
 	zfs_cmd_t zc = {"\0"};
+	int error = -1;
+
+	if (guid != NULL) {
+		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+			return (no_memory(hdl));
+
+		if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) {
+			nvlist_free(nvl);
+			return (no_memory(hdl));
+		}
+
+		zcmd_write_src_nvlist(hdl, &zc, nvl);
+	}
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-	if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
-		return (0);
+	error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc);
+	if (error != 0)
+		error = zpool_standard_error(hdl, errno, errbuf);
+	/* Free the source nvlist on both the success and failure paths. */
+	if (guid != NULL) {
+		zcmd_free_nvlists(&zc);
+		nvlist_free(nvl);
+	}
+	return (error);
+}
 
-	return (zpool_standard_error(hdl, errno, errbuf));
+/*
+ * Change the GUID for a pool.
+ */
+int
+zpool_reguid(zpool_handle_t *zhp)
+{
+	return (zpool_set_guid(zhp, NULL));
 }
 
 /*
diff --git a/man/man8/zpool-reguid.8 b/man/man8/zpool-reguid.8
index 1fd4ddd9a77..4fda3f316e3 100644
--- a/man/man8/zpool-reguid.8
+++ b/man/man8/zpool-reguid.8
@@ -25,8 +25,10 @@
 .\" Copyright (c) 2018 George Melikov. All Rights Reserved.
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\" Copyright (c) 2024, Klara Inc.
+.\" Copyright (c) 2024, Mateusz Piotrowski
 .\"
-.Dd May 31, 2021
+.Dd June 21, 2023
 .Dt ZPOOL-REGUID 8
 .Os
 .
@@ -36,6 +38,7 @@
 .Sh SYNOPSIS
 .Nm zpool
 .Cm reguid
+.Op Fl g Ar guid
 .Ar pool
 .
 .Sh DESCRIPTION
@@ -43,6 +46,15 @@ Generates a new unique identifier for the pool.
 You must ensure that all devices in this pool are online and healthy before
 performing this action.
 .
+.Bl -tag -width Ds
+.It Fl g Ar guid
+Set the pool GUID to the provided value.
+The GUID can be any 64-bit value accepted by
+.Xr strtoull 3
+in base 10.
+.Nm
+will return an error if the provided GUID is already in use.
+.El
 .Sh SEE ALSO
 .Xr zpool-export 8 ,
 .Xr zpool-import 8
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 99a8d107eca..d51cc4fcd09 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1040,16 +1040,34 @@ spa_change_guid_sync(void *arg, dmu_tx_t *tx)
  * online when we do this, or else any vdevs that weren't present
  * would be orphaned from our pool.  We are also going to issue a
  * sysevent to update any watchers.
+ *
+ * The GUID of the pool will be changed to the value pointed to by guidp.
+ * The GUID may not be set to the reserved value of 0.
+ * The new GUID will be generated if guidp is NULL.
  */
 int
-spa_change_guid(spa_t *spa)
+spa_change_guid(spa_t *spa, const uint64_t *guidp)
 {
-	int error;
 	uint64_t guid;
+	int error;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
-	guid = spa_generate_guid(NULL);
+
+	if (guidp != NULL) {
+		guid = *guidp;
+		if (guid == 0) {
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+
+		if (spa_guid_exists(guid, 0)) {
+			error = SET_ERROR(EEXIST);
+			goto out;
+		}
+	} else {
+		guid = spa_generate_guid(NULL);
+	}
 
 	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
@@ -1068,6 +1086,7 @@ spa_change_guid(spa_t *spa)
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
 	}
 
+out:
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 897335dd4e4..7ce2d919610 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1794,17 +1794,45 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * zc_nvlist_src	nvlist optionally containing ZPOOL_REGUID_GUID
+ * zc_nvlist_src_size	size of the nvlist
+ */
 static int
 zfs_ioc_pool_reguid(zfs_cmd_t *zc)
 {
+	uint64_t *guidp = NULL;
+	nvlist_t *props = NULL;
 	spa_t *spa;
+	uint64_t guid;
 	int error;
 
+	if (zc->zc_nvlist_src_size != 0) {
+		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+		    zc->zc_iflags, &props);
+		if (error != 0)
+			return (error);
+
+		error = nvlist_lookup_uint64(props, ZPOOL_REGUID_GUID, &guid);
+		if (error == 0)
+			guidp = &guid;
+		else if (error == ENOENT)
+			guidp = NULL;
+		else
+			goto out;
+	}
+
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
-		error = spa_change_guid(spa);
+		error = spa_change_guid(spa, guidp);
 		spa_close(spa, FTAG);
 	}
+
+out:
+	if (props != NULL)
+		nvlist_free(props);
+
 	return (error);
 }
 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index ad131664698..088e46ce578 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -514,6 +514,10 @@ tags = ['functional', 'cli_root', 'zpool_offline']
 tests = ['zpool_online_001_pos', 'zpool_online_002_neg']
 tags = ['functional', 'cli_root', 'zpool_online']
 
+[tests/functional/cli_root/zpool_reguid]
+tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg']
+tags = ['functional', 'cli_root', 'zpool_reguid']
+
 [tests/functional/cli_root/zpool_remove]
 tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos',
     'zpool_remove_003_pos']
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am
new file mode 100644
index 00000000000..87d46b39401
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am
@@ -0,0 +1,6 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_reguid
+dist_pkgdata_SCRIPTS = \
+	setup.ksh \
+	cleanup.ksh \
+	zpool_reguid_001_pos.ksh \
+	zpool_reguid_002_neg.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh
new file mode 100755
index 00000000000..3167a5097b5
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh
@@ -0,0 +1,32 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh
new file mode 100755
index 00000000000..3d866cfd9f2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+DISK=${DISKS%% *}
+
+default_setup $DISK
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh
new file mode 100755
index 00000000000..4e18abd988c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh
@@ -0,0 +1,73 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright 2023 Mateusz Piotrowski
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify 'zpool reguid' can change pool's GUID.
+#
+# STRATEGY:
+# 1. Use zpool get to obtain the initial GUID of a pool.
+# 2. Change pool's GUID with zpool reguid.
+# 3. Verify the GUID has changed to a random GUID.
+#
+# 4. Change pool's GUID with zpool reguid -g.
+# 5. Verify the GUID has changed to the specified GUID.
+#
+
+# set_guid guid [expected_guid]
+set_guid() {
+	gflag_guid="$1"
+	expected_guid="${2:-"$gflag_guid"}"
+
+	initial_guid="$(zpool get -H -o value guid "$TESTPOOL")"
+	log_assert "Verify 'zpool reguid -g \"$gflag_guid\"' sets GUID as expected."
+	log_must zpool reguid -g "$gflag_guid" "$TESTPOOL"
+	retrieved_guid="$(zpool get -H -o value guid "$TESTPOOL")"
+	if [[ "$retrieved_guid" == "" ]]; then
+		log_fail "Unable to obtain the new GUID of pool $TESTPOOL"
+	fi
+	if [[ "$expected_guid" != "$retrieved_guid" ]]; then
+		log_fail "GUID set to '$retrieved_guid' instead of '$expected_guid'"
+	fi
+}
+
+log_assert "Verify 'zpool reguid' picks a new random GUID for the pool."
+initial_guid="$(zpool get -H -o value guid "$TESTPOOL")"
+if [[ $initial_guid == "" ]]; then
+	log_fail "Unable to obtain the initial GUID of pool $TESTPOOL"
+fi
+log_must zpool reguid "$TESTPOOL"
+new_guid="$(zpool get -H -o value guid "$TESTPOOL")"
+if [[ "$new_guid" == "" ]]; then
+	log_fail "Unable to obtain the new GUID of pool $TESTPOOL"
+fi
+if [[ "$initial_guid" == "$new_guid" ]]; then
+	log_fail "GUID change failed; GUID has not changed: $initial_guid"
+fi
+
+for g in "$(bc -e '2^64 - 1')" 314; do
+	set_guid "$g"
+done
+# zpool-reguid(8) will strip the leading 0.
+set_guid 0123 "123"
+# GUID "-1" is effectively 2^64 - 1 in value.
+set_guid -1 "$(bc -e '2^64 - 1')"
+
+log_pass "'zpool reguid' changes GUID as expected."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh
new file mode 100755
index 00000000000..599041e284e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh
@@ -0,0 +1,60 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright 2023 Mateusz Piotrowski
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify 'zpool reguid' does not accept invalid GUIDs.
+#
+# STRATEGY:
+# 1. Call zpool reguid with an invalid GUID.
+# 2. Verify that the call fails.
+# 3. Verify that the pool GUID did not change.
+#
+# 4. Call zpool reguid with a GUID that is already in use.
+# 5. Verify that the call fails.
+#
+
+check_guid() {
+	invalid_guid="$1"
+	initial_guid="$(zpool get -H -o value guid "$TESTPOOL")"
+	log_assert "'zpool reguid' will not accept invalid GUID '$invalid_guid'"
+	if zpool reguid -g "$invalid_guid" "$TESTPOOL"; then
+		log_fail "'zpool reguid' accepted invalid GUID: $invalid_guid"
+	fi
+	final_guid="$(zpool get -H -o value guid "$TESTPOOL")"
+	if [[ "$initial_guid" != "$final_guid" ]]; then
+		log_fail "Invalid GUID change from '$initial_guid' to '$final_guid'"
+	fi
+}
+
+log_assert "Verify 'zpool reguid' does not accept invalid GUIDs"
+
+for ig in "$(bc -e '2^64')" 0xA 0xa 0; do
+	check_guid "$ig"
+done
+
+guid="42"
+log_assert "Verify 'zpool reguid -g' does not accept GUID which are already in use"
+log_must zpool reguid -g "$guid" "$TESTPOOL"
+if zpool reguid -g "$guid" "$TESTPOOL"; then
+	log_fail "'zpool reguid' accepted GUID that was already in use: $guid"
+fi
+
+log_pass "'zpool reguid' does not accept invalid GUIDs."

From 73866cf3468f59e89baba31b93d8fdf503b10b19 Mon Sep 17 00:00:00 2001
From: Jitendra Patidar <jitendra.patidar@nutanix.com>
Date: Tue, 27 Aug 2024 06:06:49 +0530
Subject: [PATCH 50/59] Fix issig() to check signal_pending after dequeue
 SIGSTOP/SIGTSTP

When a process receives SIGSTOP/SIGTSTP, issig() dequeues the signal and
returns 0. However, the process could still have another signal pending
after the dequeue. So, after dequeuing, check signal_pending() and return
1 if another signal is pending.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Jitendra Patidar <jitendra.patidar@nutanix.com>
Closes #16464
---
 module/os/linux/spl/spl-thread.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c
index dbb8eefa7ec..2af766ac204 100644
--- a/module/os/linux/spl/spl-thread.c
+++ b/module/os/linux/spl/spl-thread.c
@@ -186,6 +186,13 @@ issig(void)
 
 		schedule();
 #endif
+		/*
+		 * Dequeued SIGSTOP/SIGTSTP.
+		 * Check if process has other signal pending.
+		 */
+		if (signal_pending(current))
+			return (1);
+
 		return (0);
 	}
 

From 50b32cb925f20ececeff1b500811fa349fb419ba Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 27 Aug 2024 10:39:13 +1000
Subject: [PATCH 51/59] fm: pass io_flags through events & zed as uint64_t

In 4938d01db (#14086) zio_flag_t was converted from an enum (generally
signed 32-bit) to a uint64_t. The corresponding change wasn't made to
the error reporting subsystem, limiting the error flags being delivered
to zed to 32 bits. This bumps the whole pipeline to use uint64s.

A tiny bit of compatibility is added for newer zed working against an
older kernel module, because it's easy to do and misdetecting
scrub/resilver errors and taking action is potentially dangerous. Making
it work for new kernel modules against older zed seems to be far more
invasive for far less benefit, so I have not.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16469
---
 cmd/zed/agents/zfs_diagnosis.c | 15 ++++++++++++---
 module/zfs/zfs_fm.c            |  2 +-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c
index e0ad00800ad..e35cd0756c6 100644
--- a/cmd/zed/agents/zfs_diagnosis.c
+++ b/cmd/zed/agents/zfs_diagnosis.c
@@ -844,7 +844,6 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 		const char *failmode = NULL;
 		boolean_t checkremove = B_FALSE;
 		uint32_t pri = 0;
-		int32_t flags = 0;
 
 		/*
 		 * If this is a checksum or I/O error, then toss it into the
@@ -922,18 +921,28 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 			}
 		} else if (fmd_nvl_class_match(hdl, nvl,
 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+			uint64_t flags = 0;
+			int32_t flags32 = 0;
 			/*
 			 * We ignore ereports for checksum errors generated by
 			 * scrub/resilver I/O to avoid potentially further
 			 * degrading the pool while it's being repaired.
+			 *
+			 * Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to
+			 * be int32. To allow newer zed to work on older
+			 * kernels, if we don't find the flags, we look for
+			 * the older ones too.
 			 */
 			if (((nvlist_lookup_uint32(nvl,
 			    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
 			    (pri == ZIO_PRIORITY_SCRUB ||
 			    pri == ZIO_PRIORITY_REBUILD)) ||
-			    ((nvlist_lookup_int32(nvl,
+			    ((nvlist_lookup_uint64(nvl,
 			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
-			    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
+			    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) ||
+			    ((nvlist_lookup_int32(nvl,
+			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) &&
+			    (flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
 				fmd_hdl_debug(hdl, "ignoring '%s' for "
 				    "scrub/resilver I/O", class);
 				return;
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 2f43c4aa41b..f7cecc9af8a 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -645,7 +645,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
 		    DATA_TYPE_INT32, zio->io_error, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
-		    DATA_TYPE_INT32, zio->io_flags, NULL);
+		    DATA_TYPE_UINT64, zio->io_flags, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
 		    DATA_TYPE_UINT32, zio->io_stage, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,

From 92fca1c2d0ea743c4c92e54df028f1639634b776 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 26 Aug 2024 16:24:59 +1000
Subject: [PATCH 52/59] zstream: build with debug to fix stack overruns

abd_t differs in size depending on whether or not ZFS_DEBUG is set. It
turns out that libzpool is built with FORCEDEBUG_CPPFLAGS, which sets
-DZFS_DEBUG, and so it always has a larger abd_t with extra debug
fields, regardless of whether or not --enable-debug is set.

zdb, ztest and zhack are also all built with FORCEDEBUG_CPPFLAGS, so had
the same idea of the size of abd_t, but zstream was not, and used the
"smaller" abd_t. In practice this didn't matter because it never used
abd_t directly.

This changed in b4d81b1a6, zstream was switched to use stack ABDs for
compression. When built with --enable-debug, zstream implicitly gets
ZFS_DEBUG, and everything was fine. Production builds without that flag
end up with the smaller abd_t, which is now mismatched with libzpool,
and causes stack overruns in zstream recompress.

The simplest fix for now is to compile zstream with FORCEDEBUG_CPPFLAGS
like the other binaries. This commit does that.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Issue #16476
Closes #16477
---
 cmd/zstream/Makefile.am | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am
index 8506b351165..f9d0b0cfd2b 100644
--- a/cmd/zstream/Makefile.am
+++ b/cmd/zstream/Makefile.am
@@ -1,3 +1,5 @@
+zstream_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+
 sbin_PROGRAMS   += zstream
 CPPCHECKTARGETS += zstream
 

From b3b7491615308d80e363854e977387f633ad9327 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 27 Aug 2024 09:44:53 +1000
Subject: [PATCH 53/59] build: rename FORCEDEBUG_CPPFLAGS to LIBZPOOL_CPPFLAGS

This is just a very small attempt to make it more obvious that these
flags aren't optional for libzpool-using programs, by not making it seem
like there's an option to say "well, I don't _want_ to force debugging".

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Issue #16476
Closes #16477
---
 cmd/Makefile.am                 | 4 ++--
 cmd/raidz_test/Makefile.am      | 2 +-
 cmd/zdb/Makefile.am             | 2 +-
 cmd/zstream/Makefile.am         | 2 +-
 config/Rules.am                 | 5 ++++-
 lib/libzpool/Makefile.am        | 2 +-
 tests/zfs-tests/cmd/Makefile.am | 2 +-
 7 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/cmd/Makefile.am b/cmd/Makefile.am
index 2bd9d039f20..96040976e53 100644
--- a/cmd/Makefile.am
+++ b/cmd/Makefile.am
@@ -24,7 +24,7 @@ zfs_ids_to_path_LDADD = \
 	libzfs.la
 
 
-zhack_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 sbin_PROGRAMS   += zhack
 CPPCHECKTARGETS += zhack
@@ -39,7 +39,7 @@ zhack_LDADD = \
 
 
 ztest_CFLAGS    = $(AM_CFLAGS) $(KERNEL_CFLAGS)
-ztest_CPPFLAGS  = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+ztest_CPPFLAGS  = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 sbin_PROGRAMS   += ztest
 CPPCHECKTARGETS += ztest
diff --git a/cmd/raidz_test/Makefile.am b/cmd/raidz_test/Makefile.am
index 3b8b6056832..635216d65d7 100644
--- a/cmd/raidz_test/Makefile.am
+++ b/cmd/raidz_test/Makefile.am
@@ -1,5 +1,5 @@
 raidz_test_CFLAGS   = $(AM_CFLAGS)   $(KERNEL_CFLAGS)
-raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 bin_PROGRAMS    += raidz_test
 CPPCHECKTARGETS += raidz_test
diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am
index ebdc19128e1..8a4388bd188 100644
--- a/cmd/zdb/Makefile.am
+++ b/cmd/zdb/Makefile.am
@@ -1,4 +1,4 @@
-zdb_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zdb_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 zdb_CFLAGS   = $(AM_CFLAGS) $(LIBCRYPTO_CFLAGS)
 
 sbin_PROGRAMS   += zdb
diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am
index f9d0b0cfd2b..be3539fe905 100644
--- a/cmd/zstream/Makefile.am
+++ b/cmd/zstream/Makefile.am
@@ -1,4 +1,4 @@
-zstream_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 sbin_PROGRAMS   += zstream
 CPPCHECKTARGETS += zstream
diff --git a/config/Rules.am b/config/Rules.am
index b462826e2c8..9c0714c8251 100644
--- a/config/Rules.am
+++ b/config/Rules.am
@@ -71,4 +71,7 @@ KERNEL_CFLAGS       = $(FRAME_LARGER_THAN)
 LIBRARY_CFLAGS      = -no-suppress
 
 # Forcibly enable asserts/debugging for libzpool &al.
-FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG
+# Since ZFS_DEBUG can change shared data structures, all libzpool users must
+# be compiled with the same flags.
+# See https://github.com/openzfs/zfs/issues/16476
+LIBZPOOL_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 6989fefc666..81949bf9e5b 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -3,7 +3,7 @@ include $(srcdir)/%D%/include/Makefile.am
 libzpool_la_CFLAGS  = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS)
 libzpool_la_CFLAGS += $(ZLIB_CFLAGS)
 
-libzpool_la_CPPFLAGS  = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+libzpool_la_CPPFLAGS  = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs
 libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD
 
diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am
index 23848a82ffb..a8df06c2e99 100644
--- a/tests/zfs-tests/cmd/Makefile.am
+++ b/tests/zfs-tests/cmd/Makefile.am
@@ -24,7 +24,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/badsend
 
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/btree_test
-%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 %C%_btree_test_LDADD = \
 	libzpool.la \
 	libzfs_core.la

From bf8c61f489e07ddcfed246768059b37808b7f6e5 Mon Sep 17 00:00:00 2001
From: Seth Hoffert <Seth.Hoffert@gmail.com>
Date: Tue, 3 Sep 2024 19:52:33 -0500
Subject: [PATCH 54/59] Remove unused sysctl node

PR #14953 removed vdev-level read cache but accidentally left this
sysctl node behind.

Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Seth Hoffert <seth.hoffert@gmail.com>
Closes #16493
---
 module/os/freebsd/zfs/sysctl_os.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index 30983b13f7d..c84cb7407a9 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -124,7 +124,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
 
 SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
 	"ZFS livelist condense");
-SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
 SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file");
 SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
 	"ZFS VDEV mirror");

From 4a4f7b019fa57e2a196e95492aecbed1f312be3a Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 18 Jun 2024 14:11:11 +1000
Subject: [PATCH 55/59] zdb: rework dedup accounting for log, quota and prune

The simplest thing first: add the FDT and log objects to the list of
objects to be considered when checking for leaks.

The rest is based on a conceptual change in all of this patch stack: a
block on disk with a 'D' bit is not necessarily in the DDT at all
(pruned), or in the DDT ZAPs (still on the log).

As such, walking the DDT up front is difficult (for all the reasons that
walking an unflushed log is difficult) and not really useful, since it's
not a reflection of what's on disk anyway.

Instead, we rework things here to be more like the BRT checks. When we
see a dedup'd block, we look it up in the DDT, consume a refcount, and
for the second-or-later instances, count them as duplicates.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #16277
---
 module/zfs/ddt.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index bd1941f43ad..11fd10fb769 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -789,6 +789,9 @@ ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
 ddt_phys_variant_t
 ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
 {
+	if (dde == NULL)
+		return (DDT_PHYS_NONE);
+
 	const ddt_univ_phys_t *ddp = dde->dde_phys;
 
 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {

From d4d79451cb87aa0d93f9068ce5844098a5ebe3b5 Mon Sep 17 00:00:00 2001
From: Don Brady <don.brady@klarasystems.com>
Date: Mon, 17 Jun 2024 22:35:18 +0000
Subject: [PATCH 56/59] Add DDT prune command

Requires the new 'flat' physical data which has the start
time for a class entry.

The amount to prune can be based on a target percentage of
the unique entries or based on the age (i.e., every entry
older than N days).

Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #16277
---
 cmd/zdb/zdb.c                           |  55 ++-
 cmd/zpool/zpool_main.c                  |  89 +++++
 cmd/ztest.c                             |  28 ++
 contrib/debian/openzfs-zfsutils.install |   1 +
 include/libzfs.h                        |   3 +
 include/libzfs_core.h                   |   3 +
 include/sys/ddt.h                       |   3 +
 include/sys/ddt_impl.h                  |  52 ++-
 include/sys/fs/zfs.h                    |  15 +-
 include/sys/spa_impl.h                  |   1 +
 lib/libzfs/libzfs.abi                   |  67 +++-
 lib/libzfs/libzfs_pool.c                |  28 ++
 lib/libzfs_core/libzfs_core.abi         |  15 +
 lib/libzfs_core/libzfs_core.c           |  22 ++
 man/Makefile.am                         |   1 +
 man/man8/zpool-ddtprune.8               |  48 +++
 man/man8/zpool.8                        |   1 +
 module/zfs/ddt.c                        | 474 +++++++++++++++++++++---
 module/zfs/ddt_log.c                    |  24 +-
 module/zfs/zfs_ioctl.c                  |  50 +++
 module/zfs/zio.c                        |  10 +
 21 files changed, 905 insertions(+), 85 deletions(-)
 create mode 100644 man/man8/zpool-ddtprune.8

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 41c2b676558..8e3b6972ae0 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2045,7 +2045,7 @@ dump_all_ddts(spa_t *spa)
 
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
-		if (!ddt)
+		if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
 			continue;
 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 			for (ddt_class_t class = 0; class < DDT_CLASSES;
@@ -2072,6 +2072,32 @@ dump_all_ddts(spa_t *spa)
 	}
 
 	dump_dedup_ratio(&dds_total);
+
+	/*
+	 * Dump a histogram of unique class entry age
+	 */
+	if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) {
+		ddt_age_histo_t histogram;
+
+		(void) printf("DDT walk unique, building age histogram...\n");
+		ddt_prune_walk(spa, 0, &histogram);
+
+		/*
+		 * print out histogram for unique entry class birth
+		 */
+		if (histogram.dah_entries > 0) {
+			(void) printf("%5s  %9s  %4s\n",
+			    "age", "blocks", "amnt");
+			(void) printf("%5s  %9s  %4s\n",
+			    "-----", "---------", "----");
+			for (int i = 0; i < HIST_BINS; i++) {
+				(void) printf("%5d  %9llu %4d%%\n", 1 << i,
+				    (u_longlong_t)histogram.dah_age_histo[i],
+				    (int)((histogram.dah_age_histo[i] * 100) /
+				    histogram.dah_entries));
+			}
+		}
+	}
 }
 
 static void
@@ -5749,12 +5775,17 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 		ddt_entry_t *dde = ddt_lookup(ddt, bp);
 
 		/*
-		 * ddt_lookup() can only return NULL if this block didn't exist
+		 * ddt_lookup() can return NULL if this block didn't exist
 		 * in the DDT and creating it would take the DDT over its
 		 * quota. Since we got the block from disk, it must exist in
-		 * the DDT, so this can't happen.
+		 * the DDT, so this can't happen. However, when unique entries
+		 * are pruned, the dedup bit can be set with no corresponding
+		 * entry in the DDT.
 		 */
-		VERIFY3P(dde, !=, NULL);
+		if (dde == NULL) {
+			ddt_exit(ddt);
+			goto skipped;
+		}
 
 		/* Get the phys for this variant */
 		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
@@ -5774,8 +5805,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));
 
 		/* Consume a reference for this block. */
-		VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
-		ddt_phys_decref(dde->dde_phys, v);
+		if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
+			ddt_phys_decref(dde->dde_phys, v);
 
 		/*
 		 * If this entry has a single flat phys, it may have been
@@ -5864,6 +5895,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 		}
 	}
 
+skipped:
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
@@ -8138,7 +8170,7 @@ dump_mos_leaks(spa_t *spa)
 
 	for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
-		if (!ddt)
+		if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
 			continue;
 
 		/* DDT store objects */
@@ -8150,11 +8182,14 @@ dump_mos_leaks(spa_t *spa)
 		}
 
 		/* FDT container */
-		mos_obj_refd(ddt->ddt_dir_object);
+		if (ddt->ddt_version == DDT_VERSION_FDT)
+			mos_obj_refd(ddt->ddt_dir_object);
 
 		/* FDT log objects */
-		mos_obj_refd(ddt->ddt_log[0].ddl_object);
-		mos_obj_refd(ddt->ddt_log[1].ddl_object);
+		if (ddt->ddt_flags & DDT_FLAG_LOG) {
+			mos_obj_refd(ddt->ddt_log[0].ddl_object);
+			mos_obj_refd(ddt->ddt_log[1].ddl_object);
+		}
 	}
 
 	if (spa->spa_brt != NULL) {
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 9cd26a8650a..ce859226c21 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -130,6 +130,8 @@ static int zpool_do_version(int, char **);
 
 static int zpool_do_wait(int, char **);
 
+static int zpool_do_ddt_prune(int, char **);
+
 static int zpool_do_help(int argc, char **argv);
 
 static zpool_compat_status_t zpool_do_load_compat(
@@ -170,6 +172,7 @@ typedef enum {
 	HELP_CLEAR,
 	HELP_CREATE,
 	HELP_CHECKPOINT,
+	HELP_DDT_PRUNE,
 	HELP_DESTROY,
 	HELP_DETACH,
 	HELP_EXPORT,
@@ -426,6 +429,8 @@ static zpool_command_t command_table[] = {
 	{ "sync",	zpool_do_sync,		HELP_SYNC		},
 	{ NULL },
 	{ "wait",	zpool_do_wait,		HELP_WAIT		},
+	{ NULL },
+	{ "ddtprune",	zpool_do_ddt_prune,	HELP_DDT_PRUNE		},
 };
 
 #define	NCOMMAND	(ARRAY_SIZE(command_table))
@@ -545,6 +550,8 @@ get_usage(zpool_help_t idx)
 	case HELP_WAIT:
 		return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
 		    "<pool> [interval]\n"));
+	case HELP_DDT_PRUNE:
+		return (gettext("\tddtprune -d|-p <amount> <pool>\n"));
 	default:
 		__builtin_unreachable();
 	}
@@ -13342,6 +13349,88 @@ found:;
 	return (error);
 }
 
+/*
+ * zpool ddtprune -d|-p <amount> <pool>
+ *
+ *       -d <days>	Prune entries <days> old and older
+ *       -p <percent>	Prune <percent> amount of entries
+ *
+ * Prune single reference entries from DDT to satisfy the amount specified.
+ */
+int
+zpool_do_ddt_prune(int argc, char **argv)
+{
+	zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE;
+	uint64_t amount = 0;
+	zpool_handle_t *zhp;
+	char *endptr;
+	int c;
+
+	while ((c = getopt(argc, argv, "d:p:")) != -1) {
+		switch (c) {
+		case 'd':
+			if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
+				(void) fprintf(stderr, gettext("-d cannot be "
+				    "combined with -p option\n"));
+				usage(B_FALSE);
+			}
+			errno = 0;
+			amount = strtoull(optarg, &endptr, 0);
+			if (errno != 0 || *endptr != '\0' || amount == 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid days value\n"));
+				usage(B_FALSE);
+			}
+			amount *= 86400;	/* convert days to seconds */
+			unit = ZPOOL_DDT_PRUNE_AGE;
+			break;
+		case 'p':
+			if (unit == ZPOOL_DDT_PRUNE_AGE) {
+				(void) fprintf(stderr, gettext("-p cannot be "
+				    "combined with -d option\n"));
+				usage(B_FALSE);
+			}
+			errno = 0;
+			amount = strtoull(optarg, &endptr, 0);
+			if (errno != 0 || *endptr != '\0' ||
+			    amount == 0 || amount > 100) {
+				(void) fprintf(stderr,
+				    gettext("invalid percentage value\n"));
+				usage(B_FALSE);
+			}
+			unit = ZPOOL_DDT_PRUNE_PERCENTAGE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (unit == ZPOOL_DDT_PRUNE_NONE) {
+		(void) fprintf(stderr,
+		    gettext("missing amount option (-d|-p <value>)\n"));
+		usage(B_FALSE);
+	} else if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	} else if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+	zhp = zpool_open(g_zfs, argv[0]);
+	if (zhp == NULL)
+		return (-1);
+
+	int error = zpool_ddt_prune(zhp, unit, amount);
+
+	zpool_close(zhp);
+
+	return (error);
+}
+
 static int
 find_command_idx(const char *command, int *idx)
 {
diff --git a/cmd/ztest.c b/cmd/ztest.c
index 7c9db84d4ea..a7843d33883 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -276,6 +276,8 @@ extern unsigned long zio_decompress_fail_fraction;
 extern unsigned long zfs_reconstruct_indirect_damage_fraction;
 extern uint64_t raidz_expand_max_reflow_bytes;
 extern uint_t raidz_expand_pause_point;
+extern boolean_t ddt_prune_artificial_age;
+extern boolean_t ddt_dump_prune_histogram;
 
 
 static ztest_shared_opts_t *ztest_shared_opts;
@@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher;
 ztest_func_t ztest_fletcher_incr;
 ztest_func_t ztest_verify_dnode_bt;
 ztest_func_t ztest_pool_prefetch_ddt;
+ztest_func_t ztest_ddt_prune;
 
 static uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
 static uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
@@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = {
 	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
 	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
+	ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -7288,6 +7292,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id)
 	mutex_exit(&ztest_vdev_lock);
 }
 
+void
+ztest_ddt_prune(ztest_ds_t *zd, uint64_t id)
+{
+	(void) zd, (void) id;
+
+	spa_t *spa = ztest_spa;
+	uint64_t pct = ztest_random(15) + 1;
+
+	(void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct);
+}
+
 /*
  * Verify pool integrity by running zdb.
  */
@@ -7469,6 +7484,13 @@ ztest_resume_thread(void *arg)
 {
 	spa_t *spa = arg;
 
+	/*
+	 * Synthesize aged DDT entries for ddt prune testing
+	 */
+	ddt_prune_artificial_age = B_TRUE;
+	if (ztest_opts.zo_verbose >= 3)
+		ddt_dump_prune_histogram = B_TRUE;
+
 	while (!ztest_exiting) {
 		if (spa_suspended(spa))
 			ztest_resume(spa);
@@ -8587,6 +8609,12 @@ ztest_init(ztest_shared_t *zs)
 		if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
 			continue;
 
+		/*
+		 * split 50/50 between legacy and fast dedup
+		 */
+		if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0)
+			continue;
+
 		VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
 		    spa_feature_table[i].fi_uname));
 		fnvlist_add_uint64(props, buf, 0);
diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install
index 10083351abb..d51e4ef003e 100644
--- a/contrib/debian/openzfs-zfsutils.install
+++ b/contrib/debian/openzfs-zfsutils.install
@@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8
 usr/share/man/man8/zpool-create.8
 usr/share/man/man8/zpool-destroy.8
 usr/share/man/man8/zpool-detach.8
+usr/share/man/man8/zpool-ddtprune.8
 usr/share/man/man8/zpool-events.8
 usr/share/man/man8/zpool-export.8
 usr/share/man/man8/zpool-get.8
diff --git a/include/libzfs.h b/include/libzfs.h
index 2412797541d..01d51999f4e 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -305,6 +305,9 @@ _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
 
 _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
 
+_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
+    uint64_t);
+
 _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int,
     vdev_state_t *);
 _LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index 206e5e5c2bf..b1d74fbbc8f 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
 
 _LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
 
+_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t,
+    uint64_t);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 93abad85af4..4e5ccd46318 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -405,6 +405,9 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
 
 extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
 
+extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
+    uint64_t amount);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index 6f11cd90c1d..4d3c0cae072 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -35,8 +35,11 @@ extern "C" {
 #endif
 
 /* DDT version numbers */
-#define	DDT_VERSION_LEGACY	(0)
-#define	DDT_VERSION_FDT		(1)
+#define	DDT_VERSION_LEGACY		(0)
+#define	DDT_VERSION_FDT			(1)
+
+/* Dummy version to signal that configure is still necessary */
+#define	DDT_VERSION_UNCONFIGURED	(UINT64_MAX)
 
 /* Names of interesting objects in the DDT root dir */
 #define	DDT_DIR_VERSION		"version"
@@ -187,8 +190,11 @@ extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
 
 extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
     ddt_lightweight_entry_t *ddlwe);
-extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
-    const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);
+
+extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
+    ddt_lightweight_entry_t *ddlwe);
+extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
+    const ddt_key_t *ddk);
 
 extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
     dmu_tx_t *tx);
@@ -211,6 +217,44 @@ extern void ddt_log_fini(void);
  * them up.
  */
 
+/*
+ * We use a histogram to convert a percentage request into a
+ * cutoff value where entries older than the cutoff get pruned.
+ *
+ * The histogram bins represent hours in power-of-two increments.
+ * 16 bins covers up to four years.
+ */
+#define	HIST_BINS 16
+
+typedef struct ddt_age_histo {
+	uint64_t dah_entries;
+	uint64_t dah_age_histo[HIST_BINS];
+} ddt_age_histo_t;
+
+void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram);
+
+#if defined(_KERNEL) || !defined(ZFS_DEBUG)
+#define	ddt_dump_age_histogram(histo, cutoff)	((void)0)
+#else
+static inline void
+ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff)
+{
+	if (histogram->dah_entries == 0)
+		return;
+
+	(void) printf("DDT prune unique class age, %llu hour cutoff\n",
+	    (u_longlong_t)(gethrestime_sec() - cutoff)/3600);
+	(void) printf("%5s  %9s  %4s\n", "age", "blocks", "amnt");
+	(void) printf("%5s  %9s  %4s\n", "-----", "---------", "----");
+	for (int i = 0; i < HIST_BINS; i++) {
+		(void) printf("%5d  %9llu %4d%%\n", 1<<i,
+		    (u_longlong_t)histogram->dah_age_histo[i],
+		    (int)((histogram->dah_age_histo[i] * 100) /
+		    histogram->dah_entries));
+	}
+}
+#endif
+
 /*
  * Enough room to expand DMU_POOL_DDT format for all possible DDT
  * checksum/class/type combinations.
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 73d686a002e..fc4f22cd530 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1422,7 +1422,7 @@ typedef enum {
  */
 typedef enum zfs_ioc {
 	/*
-	 * Core features - 88/128 numbers reserved.
+	 * Core features - 89/128 numbers reserved.
 	 */
 #ifdef __FreeBSD__
 	ZFS_IOC_FIRST =	0,
@@ -1519,6 +1519,7 @@ typedef enum zfs_ioc {
 	ZFS_IOC_VDEV_SET_PROPS,			/* 0x5a56 */
 	ZFS_IOC_POOL_SCRUB,			/* 0x5a57 */
 	ZFS_IOC_POOL_PREFETCH,			/* 0x5a58 */
+	ZFS_IOC_DDT_PRUNE,			/* 0x5a59 */
 
 	/*
 	 * Per-platform (Optional) - 8/128 numbers reserved.
@@ -1655,6 +1656,12 @@ typedef enum {
 	ZPOOL_PREFETCH_DDT
 } zpool_prefetch_type_t;
 
+typedef enum {
+	ZPOOL_DDT_PRUNE_NONE,
+	ZPOOL_DDT_PRUNE_AGE,		/* in seconds */
+	ZPOOL_DDT_PRUNE_PERCENTAGE,	/* 1 - 100 */
+} zpool_ddt_prune_unit_t;
+
 /*
  * Bookmark name values.
  */
@@ -1753,6 +1760,12 @@ typedef enum {
  */
 #define	ZPOOL_PREFETCH_TYPE		"prefetch_type"
 
+/*
+ * The following are names used when invoking ZFS_IOC_DDT_PRUNE.
+ */
+#define	DDT_PRUNE_UNIT		"ddt_prune_unit"
+#define	DDT_PRUNE_AMOUNT	"ddt_prune_amount"
+
 /*
  * Flags for ZFS_IOC_VDEV_SET_STATE
  */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 4fc6f22fcb5..7811abbb9ce 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -412,6 +412,7 @@ struct spa {
 	uint64_t	spa_dedup_dspace;	/* Cache get_dedup_dspace() */
 	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
 	uint64_t	spa_dspace;		/* dspace in normal class */
+	boolean_t	spa_active_ddt_prune;	/* ddt prune process active */
 	struct brt	*spa_brt;		/* in-core BRT */
 	kmutex_t	spa_vdev_top_lock;	/* dueling offline/remove */
 	kmutex_t	spa_proc_lock;		/* protects spa_proc* */
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 87c5c4380be..88dd8b3c679 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -183,8 +183,8 @@
     <elf-symbol name='fsleep' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='get_dataset_depth' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='get_system_hostid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='get_timestamp' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='getextmntent' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='getmntany' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='getprop_uint64' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -466,7 +466,9 @@
     <elf-symbol name='zpool_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_clear_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_close' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zpool_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_default_search_paths' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_disable_datasets' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -485,8 +487,8 @@
     <elf-symbol name='zpool_export_force' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_feature_init' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_find_config' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_find_parent_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_find_vdev_by_physpath' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_free_handles' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_get_all_vdev_props' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -529,7 +531,6 @@
     <elf-symbol name='zpool_prefetch' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_prepare_and_label_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_prepare_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_prop_align_right' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_prop_column_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_prop_default_numeric' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -5929,6 +5930,7 @@
       <enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
       <enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
       <enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
+      <enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
       <enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
       <enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
       <enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@@ -5963,6 +5965,13 @@
       <enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
     </enum-decl>
     <typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
+    <enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
+      <enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
+      <enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
+    </enum-decl>
+    <typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
     <enum-decl name='spa_feature' id='33ecb627'>
       <underlying-type type-id='9cac1fee'/>
       <enumerator name='SPA_FEATURE_NONE' value='-1'/>
@@ -6139,6 +6148,12 @@
       <parameter type-id='857bb57e'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='80f4b756'/>
+      <parameter type-id='02e25ab0'/>
+      <parameter type-id='9c313c2d'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='zfs_resolve_shortname' mangled-name='zfs_resolve_shortname' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_resolve_shortname'>
       <parameter type-id='80f4b756'/>
       <parameter type-id='26a90f95'/>
@@ -6798,6 +6813,12 @@
       <parameter type-id='80f4b756' name='propval'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='zpool_ddt_prune' mangled-name='zpool_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_ddt_prune'>
+      <parameter type-id='4c81de99' name='zhp'/>
+      <parameter type-id='02e25ab0' name='unit'/>
+      <parameter type-id='9c313c2d' name='amount'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='lib/libzfs/libzfs_sendrecv.c' language='LANG_C99'>
     <array-type-def dimensions='1' type-id='8901473c' size-in-bits='576' id='f5da478b'>
@@ -7837,7 +7858,7 @@
       </data-member>
     </class-decl>
     <typedef-decl name='vdev_cbdata_t' type-id='b8006be8' id='a9679c94'/>
-    <class-decl name='zprop_get_cbdata' size-in-bits='832' is-struct='yes' visibility='default' id='f3d3c319'>
+    <class-decl name='zprop_get_cbdata' size-in-bits='960' is-struct='yes' visibility='default' id='f3d3c319'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='cb_sources' type-id='95e97e5e' visibility='default'/>
       </data-member>
@@ -7856,6 +7877,9 @@
       <data-member access='public' layout-offset-in-bits='448'>
         <var-decl name='cb_first' type-id='c19b74c3' visibility='default'/>
       </data-member>
+      <data-member access='public' layout-offset-in-bits='480'>
+        <var-decl name='cb_json' type-id='c19b74c3' visibility='default'/>
+      </data-member>
       <data-member access='public' layout-offset-in-bits='512'>
         <var-decl name='cb_proplist' type-id='3a9b2288' visibility='default'/>
       </data-member>
@@ -7865,6 +7889,15 @@
       <data-member access='public' layout-offset-in-bits='640'>
         <var-decl name='cb_vdevs' type-id='a9679c94' visibility='default'/>
       </data-member>
+      <data-member access='public' layout-offset-in-bits='832'>
+        <var-decl name='cb_jsobj' type-id='5ce45b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='896'>
+        <var-decl name='cb_json_as_int' type-id='c19b74c3' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='928'>
+        <var-decl name='cb_json_pool_key_guid' type-id='c19b74c3' visibility='default'/>
+      </data-member>
     </class-decl>
     <typedef-decl name='zprop_get_cbdata_t' type-id='f3d3c319' id='f3d87113'/>
     <typedef-decl name='zprop_func' type-id='2e711a2a' id='1ec3747a'/>
@@ -7968,6 +8001,11 @@
     <qualified-type-def type-id='d33f11cb' restrict='yes' id='5c53ba29'/>
     <pointer-type-def type-id='ffa52b96' size-in-bits='64' id='76c8174b'/>
     <pointer-type-def type-id='f3d87113' size-in-bits='64' id='0d2a0670'/>
+    <function-decl name='nvlist_print_json' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='822cd80b'/>
+      <parameter type-id='5ce45b60'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='zpool_label_disk' mangled-name='zpool_label_disk' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_label_disk'>
       <parameter type-id='b0382bb3'/>
       <parameter type-id='4c81de99'/>
@@ -8075,6 +8113,11 @@
       <parameter type-id='d33f11cb'/>
       <return type-id='48b5725f'/>
     </function-decl>
+    <function-decl name='putc' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='95e97e5e'/>
+      <parameter type-id='822cd80b'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='puts' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='80f4b756'/>
       <return type-id='95e97e5e'/>
@@ -8093,6 +8136,11 @@
       <parameter type-id='95e97e5e'/>
       <return type-id='48b5725f'/>
     </function-decl>
+    <function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='80f4b756'/>
+      <parameter type-id='80f4b756'/>
+      <return type-id='b59d7dce'/>
+    </function-decl>
     <function-decl name='strnlen' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='80f4b756'/>
       <parameter type-id='b59d7dce'/>
@@ -8292,12 +8340,12 @@
     <function-decl name='zfs_version_print' mangled-name='zfs_version_print' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_print'>
       <return type-id='95e97e5e'/>
     </function-decl>
-    <function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
-      <return type-id='95e97e5e'/>
-    </function-decl>
     <function-decl name='zfs_version_nvlist' mangled-name='zfs_version_nvlist' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_nvlist'>
       <return type-id='5ce45b60'/>
     </function-decl>
+    <function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='printf_color' mangled-name='printf_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='printf_color'>
       <parameter type-id='80f4b756' name='color'/>
       <parameter type-id='80f4b756' name='format'/>
@@ -8802,11 +8850,6 @@
       <parameter type-id='78c01427'/>
       <return type-id='13956559'/>
     </function-decl>
-    <function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
-      <parameter type-id='80f4b756'/>
-      <parameter type-id='80f4b756'/>
-      <return type-id='b59d7dce'/>
-    </function-decl>
     <function-decl name='zfs_dirnamelen' mangled-name='zfs_dirnamelen' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_dirnamelen'>
       <parameter type-id='80f4b756' name='path'/>
       <return type-id='79a0948f'/>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index dfa7c4db688..14410b15313 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5649,3 +5649,31 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
 
 	return (ret);
 }
+
+/*
+ * Prune older entries from the DDT to reclaim space under the quota
+ */
+int
+zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit,
+    uint64_t amount)
+{
+	int error = lzc_ddt_prune(zhp->zpool_name, unit, amount);
+	if (error != 0) {
+		libzfs_handle_t *hdl = zhp->zpool_hdl;
+		char errbuf[ERRBUFLEN];
+
+		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+		    "cannot prune dedup table on '%s'"), zhp->zpool_name);
+		/* Use the returned code; errno may have been clobbered. */
+		if (error == EALREADY) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "a prune operation is already in progress"));
+			(void) zfs_error(hdl, EZFS_BUSY, errbuf);
+		} else {
+			(void) zpool_standard_error(hdl, error, errbuf);
+		}
+		return (-1);
+	}
+
+	return (0);
+}
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 1062a6b52df..5ee6b8e09d6 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -162,6 +162,7 @@
     <elf-symbol name='lzc_channel_program_nosync' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_clone' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='lzc_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_destroy_bookmarks' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_destroy_snaps' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -1444,6 +1445,7 @@
       <enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
       <enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
       <enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
+      <enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
       <enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
       <enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
       <enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@@ -1484,6 +1486,13 @@
       <enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
     </enum-decl>
     <typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
+    <enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
+      <enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
+      <enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
+    </enum-decl>
+    <typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
     <enum-decl name='data_type_t' naming-typedef-id='8d0687d2' id='aeeae136'>
       <underlying-type type-id='9cac1fee'/>
       <enumerator name='DATA_TYPE_DONTCARE' value='-1'/>
@@ -3015,6 +3024,12 @@
       <parameter type-id='857bb57e' name='outnvl'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='lzc_ddt_prune' mangled-name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='lzc_ddt_prune'>
+      <parameter type-id='80f4b756' name='pool'/>
+      <parameter type-id='02e25ab0' name='unit'/>
+      <parameter type-id='9c313c2d' name='amount'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-type size-in-bits='64' id='c70fa2e8'>
       <parameter type-id='95e97e5e'/>
       <parameter type-id='eaa32e2f'/>
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index ec8b0ff4f61..d07fca6ceba 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -1927,3 +1927,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl)
 {
 	return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl));
 }
+
+/*
+ * Prune the specified amount from the pool's dedup table.
+ */
+int
+lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount)
+{
+	int error;
+
+	nvlist_t *result = NULL;
+	nvlist_t *args = fnvlist_alloc();
+
+	fnvlist_add_int32(args, DDT_PRUNE_UNIT, unit);
+	fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, amount);
+
+	error = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, args, &result);
+
+	fnvlist_free(args);
+	fnvlist_free(result);
+
+	return (error);
+}
diff --git a/man/Makefile.am b/man/Makefile.am
index 194bb472161..fde70493376 100644
--- a/man/Makefile.am
+++ b/man/Makefile.am
@@ -72,6 +72,7 @@ dist_man_MANS = \
 	%D%/man8/zpool-create.8 \
 	%D%/man8/zpool-destroy.8 \
 	%D%/man8/zpool-detach.8 \
+	%D%/man8/zpool-ddtprune.8 \
 	%D%/man8/zpool-events.8 \
 	%D%/man8/zpool-export.8 \
 	%D%/man8/zpool-get.8 \
diff --git a/man/man8/zpool-ddtprune.8 b/man/man8/zpool-ddtprune.8
new file mode 100644
index 00000000000..1ab7d3982c3
--- /dev/null
+++ b/man/man8/zpool-ddtprune.8
@@ -0,0 +1,48 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2024, Klara Inc.
+.\"
+.Dd June 17, 2024
+.Dt ZPOOL-DDTPRUNE 8
+.Os
+.
+.Sh NAME
+.Nm zpool-ddtprune
+.Nd Prunes the oldest entries from the single reference dedup table(s)
+.Sh SYNOPSIS
+.Nm zpool
+.Cm ddtprune
+.Fl d Ar days | Fl p Ar percentage
+.Ar pool
+.Sh DESCRIPTION
+This command prunes older unique entries from the dedup table.
+As a complement to the dedup quota feature,
+.Sy ddtprune
+allows removal of older non-duplicate entries to make room for
+newer duplicate entries.
+.Pp
+The amount to prune can be based on a target percentage of the unique entries
+or based on the age (i.e., every unique entry older than N days).
+.
+.Sh SEE ALSO
+.Xr zdb 8 ,
+.Xr zpool-status 8
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index c55644d9ece..02a258f6670 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -592,6 +592,7 @@ don't wait.
 .Xr zpool-checkpoint 8 ,
 .Xr zpool-clear 8 ,
 .Xr zpool-create 8 ,
+.Xr zpool-ddtprune 8 ,
 .Xr zpool-destroy 8 ,
 .Xr zpool-detach 8 ,
 .Xr zpool-events 8 ,
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 11fd10fb769..0e12e7e4982 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -125,6 +125,13 @@
  * without which, no space would be recovered and the DDT would continue to be
  * considered "over quota". See zap_shrink_enabled.
  *
+ * ## Dedup table pruning
+ *
+ * As a complement to the dedup quota feature, ddtprune allows removal of older
+ * non-duplicate entries to make room for newer duplicate entries. The amount
+ * to prune can be based on a target percentage of the unique entries or based
+ * on the age (i.e., prune every unique entry older than N days).
+ *
  * ## Dedup log
  *
  * Historically, all entries modified on a txg were written back to dedup
@@ -228,6 +235,19 @@ int zfs_dedup_prefetch = 0;
  */
 uint_t dedup_class_wait_txgs = 5;
 
+/*
+ * How many DDT prune entries to add to the DDT sync AVL tree.
+ * Note these additional entries have a memory footprint of a
+ * ddt_entry_t (216 bytes).
+ */
+static uint32_t zfs_ddt_prunes_per_txg = 50000;
+
+/*
+ * For testing, synthesize aged DDT entries
+ * (in global scope for ztest)
+ */
+boolean_t ddt_prune_artificial_age = B_FALSE;
+boolean_t ddt_dump_prune_histogram = B_FALSE;
 
 /*
  * Don't do more than this many incremental flush passes per txg.
@@ -268,10 +288,6 @@ static const uint64_t ddt_version_flags[] = {
 	[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
 };
 
-/* Dummy version to signal that configure is still necessary */
-#define	DDT_VERSION_UNCONFIGURED	(UINT64_MAX)
-
-#ifdef _KERNEL
 /* per-DDT kstats */
 typedef struct {
 	/* total lookups and whether they returned new or existing entries */
@@ -324,6 +340,7 @@ static const ddt_kstats_t ddt_kstats_template = {
 	{ "log_flush_time_rate",	KSTAT_DATA_UINT32 },
 };
 
+#ifdef _KERNEL
 #define	_DDT_KSTAT_STAT(ddt, stat) \
 	&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
 #define	DDT_KSTAT_BUMP(ddt, stat) \
@@ -343,6 +360,7 @@ static const ddt_kstats_t ddt_kstats_template = {
 #define	DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
 #endif /* _KERNEL */
 
+
 static void
 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     dmu_tx_t *tx)
@@ -715,6 +733,30 @@ ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
 		memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
 }
 
+static uint64_t
+ddt_class_start(void)
+{
+	uint64_t start = gethrestime_sec();
+
+	if (ddt_prune_artificial_age) {
+		/*
+		 * debug aid -- simulate a wider distribution
+		 * so we don't have to wait for an aged DDT
+		 * to test prune.
+		 */
+		int range = 1 << 21;
+		int percent = random_in_range(100);
+		if (percent < 50) {
+			range = range >> 4;
+		} else if (percent > 75) {
+			range /= 2;
+		}
+		start -= random_in_range(range);
+	}
+
+	return (start);
+}
+
 void
 ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
 {
@@ -1022,6 +1064,47 @@ ddt_prefetch_all(spa_t *spa)
 
 static int ddt_configure(ddt_t *ddt, boolean_t new);
 
+/*
+ * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
+ * to the ones in the entry. If they're different, then the passed-in BP is
+ * from a previous generation of this entry (ie was previously pruned) and we
+ * have to act like the entry doesn't exist at all.
+ *
+ * This should only happen during a lookup to free the block (zio_ddt_free()).
+ *
+ * XXX this is similar in spirit to ddt_phys_select(), maybe can combine
+ *       -- robn, 2024-02-09
+ */
+static boolean_t
+ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde)
+{
+	/* If the BP has no DVAs, then this entry is good */
+	uint_t ndvas = BP_GET_NDVAS(bp);
+	if (ndvas == 0)
+		return (B_TRUE);
+
+	/*
+	 * Only checking the phys for the copies. For flat, there's only one;
+	 * for trad it'll be the one that has the matching set of DVAs.
+	 */
+	const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
+	    dde->dde_phys->ddp_flat.ddp_dva :
+	    dde->dde_phys->ddp_trad[ndvas].ddp_dva;
+
+	/*
+	 * Compare entry DVAs with the BP. They should all be there, but
+	 * there's not really anything we can do if its only partial anyway,
+	 * that's an error somewhere else, maybe long ago.
+	 */
+	uint_t d;
+	for (d = 0; d < ndvas; d++)
+		if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d]))
+			return (B_FALSE);
+	ASSERT3U(d, ==, ndvas);
+
+	return (B_TRUE);
+}
+
 ddt_entry_t *
 ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 {
@@ -1057,8 +1140,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 
 		/* If it's already loaded, we can just return it. */
 		DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
-		if (dde->dde_flags & DDE_FLAG_LOADED)
-			return (dde);
+		if (dde->dde_flags & DDE_FLAG_LOADED) {
+			if (ddt_entry_lookup_is_valid(ddt, bp, dde))
+				return (dde);
+			return (NULL);
+		}
 
 		/* Someone else is loading it, wait for it. */
 		dde->dde_waiters++;
@@ -1077,7 +1163,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		}
 
 		DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
-		return (dde);
+
+		/* Make sure the loaded entry matches the BP */
+		if (ddt_entry_lookup_is_valid(ddt, bp, dde))
+			return (dde);
+		return (NULL);
 	} else
 		DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
 
@@ -1086,32 +1176,42 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 
 	/* Record the time this class was created (used by ddt prune) */
 	if (ddt->ddt_flags & DDT_FLAG_FLAT)
-		dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec();
+		dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
 
 	avl_insert(&ddt->ddt_tree, dde, where);
 
 	/* If its in the log tree, we can "load" it from there */
 	if (ddt->ddt_flags & DDT_FLAG_LOG) {
 		ddt_lightweight_entry_t ddlwe;
-		boolean_t found = B_FALSE;
-
-		if (ddt_log_take_key(ddt, ddt->ddt_log_active,
-		    &search, &ddlwe)) {
-			DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
-			found = B_TRUE;
-		} else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing,
-		    &search, &ddlwe)) {
-			DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit);
-			found = B_TRUE;
-		}
-
-		if (found) {
-			dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
 
+		if (ddt_log_find_key(ddt, &search, &ddlwe)) {
+			/*
+			 * See if we have the key first, and if so, set up
+			 * the entry.
+			 */
 			dde->dde_type = ddlwe.ddlwe_type;
 			dde->dde_class = ddlwe.ddlwe_class;
 			memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
 			    DDT_PHYS_SIZE(ddt));
+			/* Whatever we found isn't valid for this BP, eject */
+			if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) {
+				avl_remove(&ddt->ddt_tree, dde);
+				ddt_free(ddt, dde);
+				return (NULL);
+			}
+
+			/* Remove it and count it */
+			if (ddt_log_remove_key(ddt,
+			    ddt->ddt_log_active, &search)) {
+				DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
+			} else {
+				VERIFY(ddt_log_remove_key(ddt,
+				    ddt->ddt_log_flushing, &search));
+				DDT_KSTAT_BUMP(ddt,
+				    dds_lookup_log_flushing_hit);
+			}
+
+			dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
 
 			DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
 			DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
@@ -1150,6 +1250,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
 	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
 
+	boolean_t valid = B_TRUE;
+
 	if (dde->dde_type == DDT_TYPES &&
 	    dde->dde_class == DDT_CLASSES &&
 	    ddt_over_quota(spa)) {
@@ -1163,6 +1265,24 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		/* Flag cleanup required */
 		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
 	} else if (error == 0) {
+		/*
+		 * If what we loaded is no good for this BP and there's no one
+		 * waiting for it, we can just remove it and get out. If it's no
+		 * good but there are waiters, we have to leave it, because we
+		 * don't know what they want. If it's not needed we'll end up
+		 * taking an entry log/sync, but it can only happen if more
+		 * than one previous version of this block is being deleted at
+		 * the same time. This is extremely unlikely to happen and not
+		 * worth the effort to deal with without taking an entry
+		 * update.
+		 */
+		valid = ddt_entry_lookup_is_valid(ddt, bp, dde);
+		if (!valid && dde->dde_waiters == 0) {
+			avl_remove(&ddt->ddt_tree, dde);
+			ddt_free(ddt, dde);
+			return (NULL);
+		}
+
 		DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
 		DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
 
@@ -1191,7 +1311,10 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 	dde->dde_flags |= DDE_FLAG_LOADED;
 	cv_broadcast(&dde->dde_cv);
 
-	return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde);
+	if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid)
+		return (NULL);
+
+	return (dde);
 }
 
 void
@@ -1420,7 +1543,6 @@ not_found:
 static void
 ddt_table_alloc_kstats(ddt_t *ddt)
 {
-#ifdef _KERNEL
 	char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
 	char *name = kmem_asprintf("ddt_stats_%s",
 	    zio_checksum_table[ddt->ddt_checksum].ci_name);
@@ -1436,9 +1558,6 @@ ddt_table_alloc_kstats(ddt_t *ddt)
 
 	kmem_strfree(name);
 	kmem_strfree(mod);
-#else
-	(void) ddt;
-#endif /* _KERNEL */
 }
 
 static ddt_t *
@@ -1468,13 +1587,11 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 static void
 ddt_table_free(ddt_t *ddt)
 {
-#ifdef _KERNEL
 	if (ddt->ddt_ksp != NULL) {
 		kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
 		ddt->ddt_ksp->ks_data = NULL;
 		kstat_delete(ddt->ddt_ksp);
 	}
-#endif /* _KERNEL */
 
 	ddt_log_free(ddt);
 	ASSERT0(avl_numnodes(&ddt->ddt_tree));
@@ -1814,7 +1931,7 @@ ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
 		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
 
 		if (ddt_phys_birth(ddp, v) == 0) {
-			ASSERT3U(phys_refcnt, ==, 0);
+			ASSERT0(phys_refcnt);
 			continue;
 		}
 		if (DDT_PHYS_IS_DITTO(ddt, p)) {
@@ -2288,8 +2405,9 @@ ddt_walk_ready(spa_t *spa)
 	return (B_TRUE);
 }
 
-int
-ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
+static int
+ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
+    uint64_t flags, boolean_t wait)
 {
 	do {
 		do {
@@ -2298,7 +2416,11 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
 				if (ddt == NULL)
 					continue;
 
-				if (ddt->ddt_flush_force_txg > 0)
+				if (flags != 0 &&
+				    (ddt->ddt_flags & flags) != flags)
+					continue;
+
+				if (wait && ddt->ddt_flush_force_txg > 0)
 					return (EAGAIN);
 
 				int error = ENOENT;
@@ -2322,13 +2444,19 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
 	return (SET_ERROR(ENOENT));
 }
 
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
+{
+	return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE));
+}
+
 /*
  * This function is used by Block Cloning (brt.c) to increase reference
  * counter for the DDT entry if the block is already in DDT.
  *
  * Return false if the block, despite having the D bit set, is not present
- * in the DDT. Currently this is not possible but might be in the future.
- * See the comment below.
+ * in the DDT. This is possible when the DDT has been pruned by an admin
+ * or by the DDT quota mechanism.
  */
 boolean_t
 ddt_addref(spa_t *spa, const blkptr_t *bp)
@@ -2359,28 +2487,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
-		/*
-		 * This entry already existed (dde_type is real), so it must
-		 * have refcnt >0 at the start of this txg. We are called from
-		 * brt_pending_apply(), before frees are issued, so the refcnt
-		 * can't be lowered yet. Therefore, it must be >0. We assert
-		 * this because if the order of BRT and DDT interactions were
-		 * ever to change and the refcnt was ever zero here, then
-		 * likely further action is required to fill out the DDT entry,
-		 * and this is a place that is likely to be missed in testing.
-		 */
-		ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0);
-
 		ddt_phys_addref(dde->dde_phys, v);
 		result = B_TRUE;
 	} else {
 		/*
-		 * At the time of implementating this if the block has the
-		 * DEDUP flag set it must exist in the DEDUP table, but
-		 * there are many advocates that want ability to remove
-		 * entries from DDT with refcnt=1. If this will happen,
-		 * we may have a block with the DEDUP set, but which doesn't
-		 * have a corresponding entry in the DDT. Be ready.
+		 * If the block has the DEDUP flag set it still might not
+		 * exist in the DEDUP table due to DDT pruning of entries
+		 * where refcnt=1.
 		 */
 		ddt_remove(ddt, dde);
 		result = B_FALSE;
@@ -2392,6 +2505,261 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 	return (result);
 }
 
+typedef struct ddt_prune_entry {
+	ddt_t		*dpe_ddt;
+	ddt_key_t	dpe_key;
+	list_node_t	dpe_node;
+	ddt_univ_phys_t	dpe_phys[];
+} ddt_prune_entry_t;
+
+typedef struct ddt_prune_info {
+	spa_t		*dpi_spa;
+	uint64_t	dpi_txg_syncs;
+	uint64_t	dpi_pruned;
+	list_t		dpi_candidates;
+} ddt_prune_info_t;
+
+/*
+ * Sync task: process (and free) the prune candidates collected so far.
+ */
+static void
+prune_candidates_sync(void *arg, dmu_tx_t *tx)
+{
+	(void) tx;
+	ddt_prune_info_t *dpi = arg;
+	ddt_prune_entry_t *dpe;
+
+	spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);
+
+	/* Drain the list; entries are freed with their full allocated size */
+	while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
+		blkptr_t blk;
+		ddt_t *ddt = dpe->dpe_ddt;
+
+		ddt_enter(ddt);
+
+		/*
+		 * If it's on the live list, then it was loaded for update
+		 * this txg and is no longer stale; skip it.
+		 */
+		if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) {
+			ddt_exit(ddt);
+			kmem_free(dpe, sizeof (*dpe) + DDT_FLAT_PHYS_SIZE);
+			continue;
+		}
+
+		ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
+		    dpe->dpe_phys, DDT_PHYS_FLAT, &blk);
+
+		ddt_entry_t *dde = ddt_lookup(ddt, &blk);
+		if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) {
+			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
+			/*
+			 * Zero the physical, so we don't try to free DVAs
+			 * at flush nor try to reuse this entry.
+			 */
+			ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);
+
+			dpi->dpi_pruned++;
+		}
+
+		ddt_exit(ddt);
+		kmem_free(dpe, sizeof (*dpe) + DDT_FLAT_PHYS_SIZE);
+	}
+
+	spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
+	dpi->dpi_txg_syncs++;
+}
+
+/*
+ * Prune candidates are collected in open context and processed
+ * in sync context as part of ddt_sync_table().
+ */
+static void
+ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
+    const ddt_univ_phys_t *ddp)
+{
+	ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
+
+	size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;
+	ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP);
+
+	dpe->dpe_ddt = ddt;
+	dpe->dpe_key = *ddk;
+	memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
+	list_insert_head(list, dpe);
+}
+
+/*
+ * Iterate over all the entries in the DDT unique class.
+ * The walk will perform one of the following operations:
+ *  (a) build a histogram that can be used when pruning
+ *  (b) prune entries older than the cutoff
+ *
+ *  Also called by zdb(8) to dump the age histogram
+ */
+void
+ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
+{
+	ddt_bookmark_t ddb = {
+		.ddb_class = DDT_CLASS_UNIQUE,
+		.ddb_type = 0,
+		.ddb_checksum = 0,
+		.ddb_cursor = 0
+	};
+	ddt_lightweight_entry_t ddlwe = {0};
+	int error;
+	int total = 0, valid = 0;
+	int candidates = 0;
+	uint64_t now = gethrestime_sec();
+	ddt_prune_info_t dpi;
+	boolean_t pruning = (cutoff != 0);
+
+	if (pruning) {
+		dpi.dpi_txg_syncs = 0;
+		dpi.dpi_pruned = 0;
+		dpi.dpi_spa = spa;
+		list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
+		    offsetof(ddt_prune_entry_t, dpe_node));
+	}
+
+	if (histogram != NULL)
+		memset(histogram, 0, sizeof (ddt_age_histo_t));
+
+	while ((error =
+	    ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) {
+		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+		VERIFY(ddt);
+
+		if (spa_shutting_down(spa) || issig())
+			break;
+		total++;
+
+		ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
+		ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);
+
+		uint64_t class_start =
+		    ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;
+
+		/*
+		 * If this entry is on the log, then the stored entry is stale
+		 * and we should skip it.
+		 */
+		if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
+			continue;
+
+		/* prune older entries */
+		if (pruning && class_start < cutoff) {
+			if (candidates++ >= zfs_ddt_prunes_per_txg) {
+				/* sync prune candidates in batches */
+				VERIFY0(dsl_sync_task(spa_name(spa),
+				    NULL, prune_candidates_sync,
+				    &dpi, 0, ZFS_SPACE_CHECK_NONE));
+				candidates = 1;
+			}
+			ddt_prune_entry(&dpi.dpi_candidates, ddt,
+			    &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
+		}
+
+		/* build a histogram */
+		if (histogram != NULL) {
+			uint64_t age = MAX(1, (now - class_start) / 3600);
+			int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
+			histogram->dah_entries++;
+			histogram->dah_age_histo[bin]++;
+		}
+
+		valid++;
+	}
+
+	if (pruning && valid > 0) {
+		if (!list_is_empty(&dpi.dpi_candidates)) {
+			/* sync out final batch of prune candidates */
+			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+			    prune_candidates_sync, &dpi, 0,
+			    ZFS_SPACE_CHECK_NONE));
+		}
+		list_destroy(&dpi.dpi_candidates);
+
+		zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs",
+		    (u_longlong_t)dpi.dpi_pruned,
+		    (int)((dpi.dpi_pruned * 100) / valid),
+		    (u_longlong_t)dpi.dpi_txg_syncs);
+	}
+}
+
+static uint64_t
+ddt_total_entries(spa_t *spa)
+{
+	ddt_object_t ddo;
+	ddt_get_dedup_object_stats(spa, &ddo);
+
+	return (ddo.ddo_count);
+}
+
+/* Prune unique DDT entries, selected by age or by percentage of entries. */
+int
+ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
+    uint64_t amount)
+{
+	uint64_t cutoff;
+	uint64_t start_time = gethrtime();
+
+	if (spa->spa_active_ddt_prune)
+		return (SET_ERROR(EALREADY));
+	if (ddt_total_entries(spa) == 0)
+		return (0);
+
+	spa->spa_active_ddt_prune = B_TRUE;
+
+	zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
+	    unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");
+
+	if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
+		ddt_age_histo_t histogram;
+		uint64_t oldest = 0;
+
+		/* Make a pass over DDT to build a histogram */
+		ddt_prune_walk(spa, 0, &histogram);
+
+		uint64_t target = (histogram.dah_entries * amount) / 100;
+
+		/* Figure out our cutoff (i.e., which bins to prune from) */
+		for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
+			if (histogram.dah_age_histo[i] != 0) {
+				/* less than this bucket remaining */
+				if (target < histogram.dah_age_histo[i]) {
+					oldest = MAX(1, (1ULL << i) * 3600);
+					target = 0;
+				} else {
+					target -= histogram.dah_age_histo[i];
+				}
+			}
+		}
+		cutoff = gethrestime_sec() - oldest;
+
+		if (ddt_dump_prune_histogram)
+			ddt_dump_age_histogram(&histogram, cutoff);
+	} else if (unit == ZPOOL_DDT_PRUNE_AGE) {
+		cutoff = gethrestime_sec() - amount;
+	} else {
+		/* Don't leave the busy flag set on a bad unit */
+		spa->spa_active_ddt_prune = B_FALSE;
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
+		/* Traverse DDT to prune entries older than our cutoff */
+		ddt_prune_walk(spa, cutoff, NULL);
+	}
+
+	zfs_dbgmsg("%s: prune completed in %llu ms",
+	    spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
+
+	spa->spa_active_ddt_prune = B_FALSE;
+	return (0);
+}
+
 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
 	"Enable prefetching dedup-ed blks");
 
diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c
index a367d0cd02f..3aa07dc25b9 100644
--- a/module/zfs/ddt_log.c
+++ b/module/zfs/ddt_log.c
@@ -353,16 +353,15 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
 }
 
 boolean_t
-ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
-    ddt_lightweight_entry_t *ddlwe)
+ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
 {
 	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
 	if (ddle == NULL)
 		return (B_FALSE);
 
-	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
-
-	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+	ddt_lightweight_entry_t ddlwe;
+	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
 
 	avl_remove(&ddl->ddl_tree, ddle);
 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
@@ -371,6 +370,21 @@ ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
 	return (B_TRUE);
 }
 
+boolean_t
+ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
+    ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_log_entry_t *ddle =
+	    avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
+	if (!ddle)
+		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
+	if (!ddle)
+		return (B_FALSE);
+	if (ddlwe)
+		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
+	return (B_TRUE);
+}
+
 void
 ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
 {
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 7ce2d919610..55bf9b683f1 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4342,6 +4342,51 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 	return (total_errors > 0 ? SET_ERROR(EINVAL) : 0);
 }
 
+#define	DDT_PRUNE_UNIT		"ddt_prune_unit"
+#define	DDT_PRUNE_AMOUNT	"ddt_prune_amount"
+
+/*
+ * innvl: {
+ *     "ddt_prune_unit" -> int32_t (zpool_ddt_prune_unit_t)
+ *     "ddt_prune_amount" -> uint64_t
+ * }
+ *
+ * outnvl: not used
+ */
+static const zfs_ioc_key_t zfs_keys_ddt_prune[] = {
+	{DDT_PRUNE_UNIT,	DATA_TYPE_INT32,	0},
+	{DDT_PRUNE_AMOUNT,	DATA_TYPE_UINT64,	0},
+};
+
+static int
+zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	int32_t unit;
+	uint64_t amount;
+
+	if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 ||
+	    nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) {
+		return (SET_ERROR(EINVAL));
+	}
+
+	spa_t *spa;
+	int error = spa_open(poolname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	if (!spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP)) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit,
+	    amount);
+
+	spa_close(spa, FTAG);
+
+	return (error);
+}
+
 /*
  * This ioctl waits for activity of a particular type to complete. If there is
  * no activity of that type in progress, it returns immediately, and the
@@ -7430,6 +7475,11 @@ zfs_ioctl_init(void)
 	    POOL_CHECK_NONE, B_FALSE, B_FALSE,
 	    zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props));
 
+	zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE,
+	    zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+	    zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune));
+
 	/* IOCTLS that use the legacy function signature */
 
 	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index a841e0a7910..e4ccd144f09 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3859,6 +3859,16 @@ zio_ddt_free(zio_t *zio)
 	}
 	ddt_exit(ddt);
 
+	/*
+	 * When no entry was found, it must have been pruned,
+	 * so we can free it now instead of decrementing the
+	 * refcount in the DDT.
+	 */
+	if (!dde) {
+		BP_SET_DEDUP(bp, 0);
+		zio->io_pipeline |= ZIO_STAGE_DVA_FREE;
+	}
+
 	return (zio);
 }
 

From 82ff9aafd687d4eebb6041c99fa822e0478a2024 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 29 Feb 2024 11:25:24 +1100
Subject: [PATCH 57/59] value strings: pretty printers for flags and enums

This adds zfs_valstr, a collection of pretty printers for bitfields and
enums. These are useful in debugging, logging and other display contexts
where raw values are difficult for the untrained (or even trained!) eye
to decipher.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 include/Makefile.am         |   1 +
 include/sys/zio.h           |   3 +
 include/sys/zio_impl.h      |   3 +
 include/sys/zio_priority.h  |   4 +
 include/zfs_valstr.h        |  84 +++++++++++
 lib/libzfs/Makefile.am      |   1 +
 lib/libzfs/libzfs.abi       |  51 +++++++
 lib/libzpool/Makefile.am    |   1 +
 module/Kbuild.in            |   1 +
 module/Makefile.bsd         |   1 +
 module/zcommon/zfs_valstr.c | 277 ++++++++++++++++++++++++++++++++++++
 11 files changed, 427 insertions(+)
 create mode 100644 include/zfs_valstr.h
 create mode 100644 module/zcommon/zfs_valstr.c

diff --git a/include/Makefile.am b/include/Makefile.am
index fa725c2e7a5..f173064efc9 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -14,6 +14,7 @@ COMMON_H = \
 	zfs_fletcher.h \
 	zfs_namecheck.h \
 	zfs_prop.h \
+	zfs_valstr.h \
 	\
 	sys/abd.h \
 	sys/abd_impl.h \
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 446b64ccd8a..3a756949a42 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -167,6 +167,9 @@ typedef enum zio_suspend_reason {
  * This was originally an enum type. However, those are 32-bit and there is no
  * way to make a 64-bit enum type. Since we ran out of bits for flags, we were
  * forced to upgrade it to a uint64_t.
+ *
+ * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
+ * FLAG.
  */
 typedef uint64_t zio_flag_t;
 	/*
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index 2b026d48675..2c846a5d41f 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -120,6 +120,9 @@ extern "C" {
 
 /*
  * zio pipeline stage definitions
+ *
+ * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
+ * FLAG.
  */
 enum zio_stage {
 	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCXT */
diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h
index 2d8e7fc36ba..bdf5f9b8ff3 100644
--- a/include/sys/zio_priority.h
+++ b/include/sys/zio_priority.h
@@ -22,6 +22,10 @@
 extern "C" {
 #endif
 
+/*
+ * NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
+ * VALUE.
+ */
 typedef enum zio_priority {
 	ZIO_PRIORITY_SYNC_READ,
 	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
diff --git a/include/zfs_valstr.h b/include/zfs_valstr.h
new file mode 100644
index 00000000000..77c26ce1ae7
--- /dev/null
+++ b/include/zfs_valstr.h
@@ -0,0 +1,84 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2024, Klara Inc.
+ */
+
+#ifndef	_ZFS_VALSTR_H
+#define	_ZFS_VALSTR_H extern __attribute__((visibility("default")))
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * These macros create function prototypes for pretty-printing or stringifying
+ * certain kinds of numeric types.
+ *
+ * _ZFS_VALSTR_DECLARE_BITFIELD(name) creates:
+ *
+ *   size_t zfs_valstr_<name>_bits(uint64_t bits, char *out, size_t outlen);
+ *     expands single char for each set bit, and space for each clear bit
+ *
+ *   size_t zfs_valstr_<name>_pairs(uint64_t bits, char *out, size_t outlen);
+ *     expands two-char mnemonic for each bit set in `bits`, separated by `|`
+ *
+ *   size_t zfs_valstr_<name>(uint64_t bits, char *out, size_t outlen);
+ *     expands full name of each bit set in `bits`, separated by spaces
+ *
+ * _ZFS_VALSTR_DECLARE_ENUM(name) creates:
+ *
+ *   size_t zfs_valstr_<name>(int v, char *out, size_t outlen);
+ *     expands full name of enum value
+ *
+ * Each _ZFS_VALSTR_DECLARE_xxx needs a corresponding _VALSTR_xxx_IMPL string
+ * table in zfs_valstr.c.
+ */
+
+#define	_ZFS_VALSTR_DECLARE_BITFIELD(name)			\
+	_ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _bits(	\
+	    uint64_t bits, char *out, size_t outlen);		\
+	_ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _pairs(	\
+	    uint64_t bits, char *out, size_t outlen);		\
+	_ZFS_VALSTR_H size_t zfs_valstr_ ## name(		\
+	    uint64_t bits, char *out, size_t outlen);		\
+
+#define	_ZFS_VALSTR_DECLARE_ENUM(name)				\
+	_ZFS_VALSTR_H size_t zfs_valstr_ ## name(		\
+	    int v, char *out, size_t outlen);			\
+
+_ZFS_VALSTR_DECLARE_BITFIELD(zio_flag)
+_ZFS_VALSTR_DECLARE_BITFIELD(zio_stage)
+
+_ZFS_VALSTR_DECLARE_ENUM(zio_priority)
+
+#undef _ZFS_VALSTR_DECLARE_BITFIELD
+#undef _ZFS_VALSTR_DECLARE_ENUM
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_VALSTR_H */
diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am
index 5e74d908de3..a976faaf991 100644
--- a/lib/libzfs/Makefile.am
+++ b/lib/libzfs/Makefile.am
@@ -47,6 +47,7 @@ nodist_libzfs_la_SOURCES = \
 	module/zcommon/zfs_fletcher_superscalar4.c \
 	module/zcommon/zfs_namecheck.c \
 	module/zcommon/zfs_prop.c \
+	module/zcommon/zfs_valstr.c \
 	module/zcommon/zpool_prop.c \
 	module/zcommon/zprop_common.c
 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 88dd8b3c679..51b29643ee0 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -454,6 +454,13 @@
     <elf-symbol name='zfs_userns' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_userspace' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_valid_proplist' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_valstr_zio_flag' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_valstr_zio_flag_bits' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_valstr_zio_flag_pairs' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_valstr_zio_priority' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_valstr_zio_stage' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_valstr_zio_stage_bits' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_valstr_zio_stage_pairs' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_version_kernel' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_version_nvlist' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_version_print' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -9831,6 +9838,50 @@
       <return type-id='c19b74c3'/>
     </function-decl>
   </abi-instr>
+  <abi-instr address-size='64' path='module/zcommon/zfs_valstr.c' language='LANG_C99'>
+    <function-decl name='zfs_valstr_zio_flag' mangled-name='zfs_valstr_zio_flag' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag'>
+      <parameter type-id='9c313c2d' name='bits'/>
+      <parameter type-id='26a90f95' name='out'/>
+      <parameter type-id='b59d7dce' name='outlen'/>
+      <return type-id='b59d7dce'/>
+    </function-decl>
+    <function-decl name='zfs_valstr_zio_flag_bits' mangled-name='zfs_valstr_zio_flag_bits' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag_bits'>
+      <parameter type-id='9c313c2d' name='bits'/>
+      <parameter type-id='26a90f95' name='out'/>
+      <parameter type-id='b59d7dce' name='outlen'/>
+      <return type-id='b59d7dce'/>
+    </function-decl>
+    <function-decl name='zfs_valstr_zio_flag_pairs' mangled-name='zfs_valstr_zio_flag_pairs' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag_pairs'>
+      <parameter type-id='9c313c2d' name='bits'/>
+      <parameter type-id='26a90f95' name='out'/>
+      <parameter type-id='b59d7dce' name='outlen'/>
+      <return type-id='b59d7dce'/>
+    </function-decl>
+    <function-decl name='zfs_valstr_zio_stage' mangled-name='zfs_valstr_zio_stage' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage'>
+      <parameter type-id='9c313c2d' name='bits'/>
+      <parameter type-id='26a90f95' name='out'/>
+      <parameter type-id='b59d7dce' name='outlen'/>
+      <return type-id='b59d7dce'/>
+    </function-decl>
+    <function-decl name='zfs_valstr_zio_stage_bits' mangled-name='zfs_valstr_zio_stage_bits' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage_bits'>
+      <parameter type-id='9c313c2d' name='bits'/>
+      <parameter type-id='26a90f95' name='out'/>
+      <parameter type-id='b59d7dce' name='outlen'/>
+      <return type-id='b59d7dce'/>
+    </function-decl>
+    <function-decl name='zfs_valstr_zio_stage_pairs' mangled-name='zfs_valstr_zio_stage_pairs' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage_pairs'>
+      <parameter type-id='9c313c2d' name='bits'/>
+      <parameter type-id='26a90f95' name='out'/>
+      <parameter type-id='b59d7dce' name='outlen'/>
+      <return type-id='b59d7dce'/>
+    </function-decl>
+    <function-decl name='zfs_valstr_zio_priority' mangled-name='zfs_valstr_zio_priority' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_priority'>
+      <parameter type-id='95e97e5e' name='v'/>
+      <parameter type-id='26a90f95' name='out'/>
+      <parameter type-id='b59d7dce' name='outlen'/>
+      <return type-id='b59d7dce'/>
+    </function-decl>
+  </abi-instr>
   <abi-instr address-size='64' path='module/zcommon/zpool_prop.c' language='LANG_C99'>
     <function-decl name='zpool_prop_string_to_index' mangled-name='zpool_prop_string_to_index' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_prop_string_to_index'>
       <parameter type-id='5d0c23fb' name='prop'/>
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 81949bf9e5b..ff30af7d2b9 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -64,6 +64,7 @@ nodist_libzpool_la_SOURCES = \
 	module/zcommon/zfs_fletcher_superscalar4.c \
 	module/zcommon/zfs_namecheck.c \
 	module/zcommon/zfs_prop.c \
+	module/zcommon/zfs_valstr.c \
 	module/zcommon/zpool_prop.c \
 	module/zcommon/zprop_common.c \
 	\
diff --git a/module/Kbuild.in b/module/Kbuild.in
index a119198dbfc..0472a9348c1 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -240,6 +240,7 @@ ZCOMMON_OBJS := \
 	zfs_fletcher_superscalar4.o \
 	zfs_namecheck.o \
 	zfs_prop.o \
+	zfs_valstr.o \
 	zpool_prop.o \
 	zprop_common.o
 
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 534f3257132..9161204c99d 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -233,6 +233,7 @@ SRCS+=	cityhash.c \
 	zfs_fletcher_superscalar.c \
 	zfs_namecheck.c \
 	zfs_prop.c \
+	zfs_valstr.c \
 	zpool_prop.c \
 	zprop_common.c
 
diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c
new file mode 100644
index 00000000000..e2d4d1aefef
--- /dev/null
+++ b/module/zcommon/zfs_valstr.c
@@ -0,0 +1,277 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2024, Klara Inc.
+ */
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/string.h>
+#include <sys/debug.h>
+#include "zfs_valstr.h"
+
+/*
+ * Each bit in a bitfield has three possible string representations:
+ * - single char
+ * - two-char pair
+ * - full name
+ */
+typedef struct {
+	const char	vb_bit;
+	const char	vb_pair[2];
+	const char	*vb_name;
+} valstr_bit_t;
+
+/*
+ * Emits a character for each bit in `bits`, up to the number of elements
+ * in the table. Set bits get the character in vb_bit, clear bits get a
+ * space. This results in all strings having the same width, for easier
+ * visual comparison.
+ */
+static size_t
+valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems,
+    uint64_t bits, char *out, size_t outlen)
+{
+	ASSERT(out);
+	size_t n = 0;
+	for (int b = 0; b < nelems; b++) {
+		if (n == outlen)
+			break;
+		uint64_t mask = (1ULL << b);
+		out[n++] = (bits & mask) ? table[b].vb_bit : ' ';
+	}
+	if (n < outlen)
+		out[n++] = '\0';
+	return (n);
+}
+
+/*
+ * Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and
+ * separated by a `|` character. This gives a concise representation of the
+ * whole value. Output stops at the first pair that does not fit in `out`.
+ */
+static size_t
+valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems,
+    uint64_t bits, char *out, size_t outlen)
+{
+	ASSERT(out);
+	size_t n = 0;
+	for (int b = 0; b < nelems; b++) {
+		ASSERT3U(n, <=, outlen);
+		if (n == outlen)
+			break;
+		uint64_t mask = (1ULL << b);
+		if (bits & mask) {
+			size_t len = (n > 0) ? 3 : 2;
+			if (len > outlen - n)	/* n < outlen here */
+				break;
+			if (n > 0)
+				out[n++] = '|';
+			out[n++] = table[b].vb_pair[0];
+			out[n++] = table[b].vb_pair[1];
+		}
+	}
+	if (n < outlen)
+		out[n++] = '\0';
+	return (n);
+}
+
+/*
+ * Emits the full name for each bit set in `bits`, taken from vb_name, and
+ * separated by a space. This unambiguously shows the entire set of bits, but
+ * can get very long. Output stops at the first name that does not fit.
+ */
+static size_t
+valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems,
+    uint64_t bits, char *out, size_t outlen)
+{
+	ASSERT(out);
+	size_t n = 0;
+	for (int b = 0; b < nelems; b++) {
+		ASSERT3U(n, <=, outlen);
+		if (n == outlen)
+			break;
+		uint64_t mask = (1ULL << b);
+		if (bits & mask) {
+			size_t len = strlen(table[b].vb_name);
+			if (n > 0)
+				len++;
+			if (len > outlen - n)	/* n < outlen here */
+				break;
+			if (n > 0) {
+				out[n++] = ' ';
+				len--;
+			}
+			memcpy(&out[n], table[b].vb_name, len);
+			n += len;
+		}
+	}
+	if (n < outlen)
+		out[n++] = '\0';
+	return (n);
+}
+
+/*
+ * Emits the name of the given enum value in the table.
+ */
+static size_t
+valstr_enum_str(const char **table, const size_t nelems,
+    int v, char *out, size_t outlen)
+{
+	ASSERT(out);
+	ASSERT3U(v, <, nelems);
+	if (v >= nelems)
+		return (0);
+	return (MIN(strlcpy(out, table[v], outlen), outlen));
+}
+
+/*
+ * These macros create the string tables for the given name, and implement
+ * the public functions described in zfs_valstr.h.
+ */
+#define	_VALSTR_BITFIELD_IMPL(name, ...)				\
+static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\
+size_t									\
+zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen)	\
+{									\
+	return (valstr_bitfield_bits(valstr_ ## name ## _table,		\
+	    ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen));	\
+}									\
+									\
+size_t									\
+zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen)	\
+{									\
+	return (valstr_bitfield_pairs(valstr_ ## name ## _table,	\
+	    ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen));	\
+}									\
+									\
+size_t									\
+zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen)		\
+{									\
+	return (valstr_bitfield_str(valstr_ ## name ## _table,		\
+	    ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen));	\
+}									\
+
+#define	_VALSTR_ENUM_IMPL(name, ...)					\
+static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ };	\
+size_t									\
+zfs_valstr_ ## name(int v, char *out, size_t outlen)			\
+{									\
+	return (valstr_enum_str(valstr_ ## name ## _table,		\
+	    ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen));	\
+}									\
+
+
+/* String tables */
+
+/* ZIO flags: zio_flag_t, typically zio->io_flags */
+/* BEGIN CSTYLED */
+_VALSTR_BITFIELD_IMPL(zio_flag,
+	{ '.', "DA", "DONT_AGGREGATE" },
+	{ '.', "RP", "IO_REPAIR" },
+	{ '.', "SH", "SELF_HEAL" },
+	{ '.', "RS", "RESILVER" },
+	{ '.', "SC", "SCRUB" },
+	{ '.', "ST", "SCAN_THREAD" },
+	{ '.', "PH", "PHYSICAL" },
+	{ '.', "CF", "CANFAIL" },
+	{ '.', "SP", "SPECULATIVE" },
+	{ '.', "CW", "CONFIG_WRITER" },
+	{ '.', "DR", "DONT_RETRY" },
+	{ '?', "??", "[UNUSED 11]" },
+	{ '.', "ND", "NODATA" },
+	{ '.', "ID", "INDUCE_DAMAGE" },
+	{ '.', "AL", "IO_ALLOCATING" },
+	{ '.', "RE", "IO_RETRY" },
+	{ '.', "PR", "PROBE" },
+	{ '.', "TH", "TRYHARD" },
+	{ '.', "OP", "OPTIONAL" },
+	{ '.', "DQ", "DONT_QUEUE" },
+	{ '.', "DP", "DONT_PROPAGATE" },
+	{ '.', "BY", "IO_BYPASS" },
+	{ '.', "RW", "IO_REWRITE" },
+	{ '.', "CM", "RAW_COMPRESS" },
+	{ '.', "EN", "RAW_ENCRYPT" },
+	{ '.', "GG", "GANG_CHILD" },
+	{ '.', "DD", "DDT_CHILD" },
+	{ '.', "GF", "GODFATHER" },
+	{ '.', "NP", "NOPWRITE" },
+	{ '.', "EX", "REEXECUTED" },
+	{ '.', "DG", "DELEGATED" },
+)
+/* END CSTYLED */
+
+/*
+ * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or
+ *                        zio->io_pipeline.
+ */
+/* BEGIN CSTYLED */
+_VALSTR_BITFIELD_IMPL(zio_stage,
+	{ 'O', "O ", "OPEN" },
+	{ 'I', "RI", "READ_BP_INIT" },
+	{ 'I', "WI", "WRITE_BP_INIT" },
+	{ 'I', "FI", "FREE_BP_INIT" },
+	{ 'A', "IA", "ISSUE_ASYNC" },
+	{ 'W', "WC", "WRITE_COMPRESS" },
+	{ 'E', "EN", "ENCRYPT" },
+	{ 'C', "CG", "CHECKSUM_GENERATE" },
+	{ 'N', "NW", "NOP_WRITE" },
+	{ 'B', "BF", "BRT_FREE" },
+	{ 'd', "dS", "DDT_READ_START" },
+	{ 'd', "dD", "DDT_READ_DONE" },
+	{ 'd', "dW", "DDT_WRITE" },
+	{ 'd', "dF", "DDT_FREE" },
+	{ 'G', "GA", "GANG_ASSEMBLE" },
+	{ 'G', "GI", "GANG_ISSUE" },
+	{ 'D', "DT", "DVA_THROTTLE" },
+	{ 'D', "DA", "DVA_ALLOCATE" },
+	{ 'D', "DF", "DVA_FREE" },
+	{ 'D', "DC", "DVA_CLAIM" },
+	{ 'R', "R ", "READY" },
+	{ 'V', "VS", "VDEV_IO_START" },
+	{ 'V', "VD", "VDEV_IO_DONE" },
+	{ 'V', "VA", "VDEV_IO_ASSESS" },
+	{ 'C', "CV", "CHECKSUM_VERIFY" },
+	{ 'X', "X ", "DONE" },
+)
+/* END CSTYLED */
+
+/* ZIO priority: zio_priority_t, typically zio->io_priority */
+/* BEGIN CSTYLED */
+_VALSTR_ENUM_IMPL(zio_priority,
+	"SYNC_READ",
+	"SYNC_WRITE",
+	"ASYNC_READ",
+	"ASYNC_WRITE",
+	"SCRUB",
+	"REMOVAL",
+	"INITIALIZING",
+	"TRIM",
+	"REBUILD",
+	"[NUM_QUEUEABLE]",
+	"NOW",
+)
+/* END CSTYLED */
+
+#undef _VALSTR_BITFIELD_IMPL
+#undef _VALSTR_ENUM_IMPL

From 17dd66dedab9f9bebc823cca3eae3405ef28c7ef Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 29 Feb 2024 15:00:25 +1100
Subject: [PATCH 58/59] zpool events: expand value strings for ZIO error values

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 cmd/zpool/zpool_main.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index ce859226c21..349c208c521 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -75,6 +75,7 @@
 #include "zpool_util.h"
 #include "zfs_comutil.h"
 #include "zfeature_common.h"
+#include "zfs_valstr.h"
 
 #include "statcommon.h"
 
@@ -11936,6 +11937,7 @@ static void
 zpool_do_events_nvprint(nvlist_t *nvl, int depth)
 {
 	nvpair_t *nvp;
+	static char flagstr[256];
 
 	for (nvp = nvlist_next_nvpair(nvl, NULL);
 	    nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
@@ -11995,7 +11997,21 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth)
 
 		case DATA_TYPE_UINT32:
 			(void) nvpair_value_uint32(nvp, &i32);
-			printf(gettext("0x%x"), i32);
+			if (strcmp(name,
+			    FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE) == 0 ||
+			    strcmp(name,
+			    FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE) == 0) {
+				zfs_valstr_zio_stage(i32, flagstr,
+				    sizeof (flagstr));
+				printf(gettext("0x%x [%s]"), i32, flagstr);
+			} else if (strcmp(name,
+			    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY) == 0) {
+				zfs_valstr_zio_priority(i32, flagstr,
+				    sizeof (flagstr));
+				printf(gettext("0x%x [%s]"), i32, flagstr);
+			} else {
+				printf(gettext("0x%x"), i32);
+			}
 			break;
 
 		case DATA_TYPE_INT64:
@@ -12016,6 +12032,12 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth)
 				printf(gettext("\"%s\" (0x%llx)"),
 				    zpool_state_to_name(i64, VDEV_AUX_NONE),
 				    (u_longlong_t)i64);
+			} else if (strcmp(name,
+			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS) == 0) {
+				zfs_valstr_zio_flag(i64, flagstr,
+				    sizeof (flagstr));
+				printf(gettext("0x%llx [%s]"),
+				    (u_longlong_t)i64, flagstr);
 			} else {
 				printf(gettext("0x%llx"), (u_longlong_t)i64);
 			}

From b109925820fb79db3e37670c159977f03edd950f Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 7 Sep 2024 01:45:58 +1000
Subject: [PATCH 59/59] spa_prop_get: require caller to supply output nvlist

All callers of spa_prop_get() and spa_prop_get_nvlist() (except ztest)
already supplied their own preallocated nvlist, so we can remove the
option to have these functions allocate one when none is supplied.

This sidesteps a bug in spa_prop_get(), where the error var wasn't
initialised, which could lead to the provided nvlist being freed at the
end.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16505
---
 cmd/ztest.c            |   5 ++-
 include/sys/spa.h      |   4 +-
 module/zfs/spa.c       | 100 ++++++++++++++++++-----------------------
 module/zfs/zfs_ioctl.c |  10 ++---
 4 files changed, 53 insertions(+), 66 deletions(-)

diff --git a/cmd/ztest.c b/cmd/ztest.c
index a7843d33883..ce031632e75 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -6215,13 +6215,14 @@ void
 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
 {
 	(void) zd, (void) id;
-	nvlist_t *props = NULL;
 
 	(void) pthread_rwlock_rdlock(&ztest_name_lock);
 
 	(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2));
 
-	VERIFY0(spa_prop_get(ztest_spa, &props));
+	nvlist_t *props = fnvlist_alloc();
+
+	VERIFY0(spa_prop_get(ztest_spa, props));
 
 	if (ztest_opts.zo_verbose >= 6)
 		dump_nvlist(props, 4);
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 93f381affd9..aa66d489ef1 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1201,9 +1201,9 @@ extern void spa_boot_init(void);
 
 /* properties */
 extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
-extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
+extern int spa_prop_get(spa_t *spa, nvlist_t *nvp);
 extern int spa_prop_get_nvlist(spa_t *spa, char **props,
-    unsigned int n_props, nvlist_t **outnvl);
+    unsigned int n_props, nvlist_t *outnvl);
 extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
 extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index d51cc4fcd09..1a68a095356 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -366,21 +366,15 @@ spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl)
 
 int
 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props,
-    nvlist_t **outnvl)
+    nvlist_t *outnvl)
 {
 	int err = 0;
 
 	if (props == NULL)
 		return (0);
 
-	if (*outnvl == NULL) {
-		err = nvlist_alloc(outnvl, NV_UNIQUE_NAME, KM_SLEEP);
-		if (err)
-			return (err);
-	}
-
 	for (unsigned int i = 0; i < n_props && err == 0; i++) {
-		err = spa_prop_add(spa, props[i], *outnvl);
+		err = spa_prop_add(spa, props[i], outnvl);
 	}
 
 	return (err);
@@ -406,7 +400,7 @@ spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
  * Get property values from the spa configuration.
  */
 static void
-spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
+spa_prop_get_config(spa_t *spa, nvlist_t *nv)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
@@ -428,48 +422,48 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 		size += metaslab_class_get_space(spa_dedup_class(spa));
 		size += metaslab_class_get_space(spa_embedded_log_class(spa));
 
-		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
+		spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
+		spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
+		spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL,
 		    spa->spa_checkpoint_info.sci_dspace, src);
 
-		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL,
 		    metaslab_class_fragmentation(mc), src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL,
 		    metaslab_class_expandable_space(mc), src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == SPA_MODE_READ), src);
 
 		cap = (size == 0) ? 0 : (alloc * 100 / size);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+		spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
-		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL,
 		    brt_get_used(spa), src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL,
 		    brt_get_saved(spa), src);
-		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL,
 		    brt_get_ratio(spa), src);
 
-		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
 		    ddt_get_ddt_dsize(spa), src);
 
-		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
 
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
-			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
+			spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
 			    version, ZPROP_SRC_DEFAULT);
 		} else {
-			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
+			spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
 			    version, ZPROP_SRC_LOCAL);
 		}
-		spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
+		spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID,
 		    NULL, spa_load_guid(spa), src);
 	}
 
@@ -479,62 +473,62 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 		 * when opening pools before this version freedir will be NULL.
 		 */
 		if (pool->dp_free_dir != NULL) {
-			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
+			spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL,
 			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
 			    src);
 		} else {
-			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
+			spa_prop_add_list(nv, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
 
 		if (pool->dp_leak_dir != NULL) {
-			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+			spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL,
 			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
 			    src);
 		} else {
-			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+			spa_prop_add_list(nv, ZPOOL_PROP_LEAKED,
 			    NULL, 0, src);
 		}
 	}
 
-	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
+	spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	if (spa->spa_comment != NULL) {
-		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
+		spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_compatibility != NULL) {
-		spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY,
+		spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY,
 		    spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_root != NULL)
-		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
+		spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
-		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
 	} else {
-		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
 	}
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
-		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
 		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
 	} else {
-		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+		spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
 		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
 	}
 
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
-			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+			spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
 			    "none", 0, ZPROP_SRC_LOCAL);
 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
-			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+			spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
 		}
 	}
@@ -544,19 +538,13 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
  * Get zpool property values.
  */
 int
-spa_prop_get(spa_t *spa, nvlist_t **nvp)
+spa_prop_get(spa_t *spa, nvlist_t *nv)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	dsl_pool_t *dp;
-	int err;
-
-	if (*nvp == NULL) {
-		err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
-		if (err)
-			return (err);
-	}
+	int err = 0;
 
 	dp = spa_get_dsl(spa);
 	dsl_pool_config_enter(dp, FTAG);
@@ -565,7 +553,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
 	/*
 	 * Get properties from the spa config.
 	 */
-	spa_prop_get_config(spa, nvp);
+	spa_prop_get_config(spa, nv);
 
 	/* If no pool property object, no more prop to get. */
 	if (mos == NULL || spa->spa_pool_props_object == 0)
@@ -610,7 +598,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
 				intval = za.za_first_integer;
 			}
 
-			spa_prop_add_list(*nvp, prop, strval, intval, src);
+			spa_prop_add_list(nv, prop, strval, intval, src);
 
 			if (strval != NULL)
 				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
@@ -627,10 +615,10 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
 				break;
 			}
 			if (prop != ZPOOL_PROP_INVAL) {
-				spa_prop_add_list(*nvp, prop, strval, 0, src);
+				spa_prop_add_list(nv, prop, strval, 0, src);
 			} else {
 				src = ZPROP_SRC_LOCAL;
-				spa_prop_add_user(*nvp, za.za_name, strval,
+				spa_prop_add_user(nv, za.za_name, strval,
 				    src);
 			}
 			kmem_free(strval, za.za_num_integers);
@@ -644,11 +632,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
 out:
 	mutex_exit(&spa->spa_props_lock);
 	dsl_pool_config_exit(dp, FTAG);
-	if (err && err != ENOENT) {
-		nvlist_free(*nvp);
-		*nvp = NULL;
+
+	if (err && err != ENOENT)
 		return (err);
-	}
 
 	return (0);
 }
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 55bf9b683f1..53366ad4978 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3050,7 +3050,6 @@ static const zfs_ioc_key_t zfs_keys_get_props[] = {
 static int
 zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
 {
-	nvlist_t *nvp = outnvl;
 	spa_t *spa;
 	char **props = NULL;
 	unsigned int n_props = 0;
@@ -3069,16 +3068,17 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
 		 */
 		mutex_enter(&spa_namespace_lock);
 		if ((spa = spa_lookup(pool)) != NULL) {
-			error = spa_prop_get(spa, &nvp);
+			error = spa_prop_get(spa, outnvl);
 			if (error == 0 && props != NULL)
 				error = spa_prop_get_nvlist(spa, props, n_props,
-				    &nvp);
+				    outnvl);
 		}
 		mutex_exit(&spa_namespace_lock);
 	} else {
-		error = spa_prop_get(spa, &nvp);
+		error = spa_prop_get(spa, outnvl);
 		if (error == 0 && props != NULL)
-			error = spa_prop_get_nvlist(spa, props, n_props, &nvp);
+			error = spa_prop_get_nvlist(spa, props, n_props,
+			    outnvl);
 		spa_close(spa, FTAG);
 	}