From 04bae5ec95f7273105237159a882d5b72ec2b998 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 3 May 2024 12:53:34 -0400 Subject: [PATCH 01/43] Disable high priority ZIO threads on FreeBSD and Linux High priority threads are handling ZIL writes. While there is no ZIL compression, there is encryption, checksuming and RAIDZ math. We've found that on large systems 1 taskq with 5 threads can be a bottleneck for throughput, IOPS or both. Instead of just bumping number of threads with a risk of overloading CPUs and increasing latency, switch to using TQ_FRONT mechanism to increase sync write requests priority within standard write threads. Do not do it on Illumos, since its TQ_FRONT implementation is inherently unfair. FreeBSD and Linux don't have this problem, so we can do it there. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #16146 --- man/man4/zfs.4 | 2 +- module/zfs/spa.c | 11 ++++++++--- module/zfs/zio.c | 12 +++++++----- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5edd80659e0..6895a2a6d79 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2367,7 +2367,7 @@ This is an advanced debugging parameter. Don't change this unless you understand what it does. Set values only apply to pools imported/created after that. . -.It Sy zio_taskq_write Ns = Ns Sy sync fixed,1,5 scale fixed,1,5 Pq charp +.It Sy zio_taskq_write Ns = Ns Sy sync null scale null Pq charp Set the queue and thread configuration for the IO write queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index ec2b674fb7e..560fd67087b 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -170,14 +170,19 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue - * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that - * need to be handled with minimum delay. + * and interrupt) and then to reserve threads for high priority I/Os that + * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT + * implementation, so separate high priority threads are used there. */ static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ +#ifdef illumos { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ +#else + { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ +#endif { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ @@ -1217,7 +1222,7 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) * * Example (the defaults for READ and WRITE) * zio_taskq_read='fixed,1,8 null scale null' - * zio_taskq_write='sync fixed,1,5 scale fixed,1,5' + * zio_taskq_write='sync null scale null' * * Each sets the entire row at a time. * diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 870343bf4fa..65a0afaaa21 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2041,12 +2041,14 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) /* * If this is a high priority I/O, then use the high priority taskq if - * available. + * available or cut the line otherwise. */ - if ((zio->io_priority == ZIO_PRIORITY_NOW || - zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && - spa->spa_zio_taskq[t][q + 1].stqs_count != 0) - q++; + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) { + if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) + q++; + else + flags |= TQ_FRONT; + } ASSERT3U(q, <, ZIO_TASKQ_TYPES); From 2dff7527d4a40310f589045f5ab3a07b02963516 Mon Sep 17 00:00:00 2001 From: Daniel Perry Date: Thu, 9 May 2024 10:30:28 -0400 Subject: [PATCH 02/43] Replace usage of schedule_timeout with schedule_timeout_interruptible (#16150) This commit replaces current usages of schedule_timeout() with schedule_timeout_interruptible() in code paths that expect the running task to sleep for a short period of time. When schedule_timeout() is called without previously calling set_current_state(), the running task never sleeps because the task state remains in TASK_RUNNING. By calling schedule_timeout_interruptible() to set the task state to TASK_INTERRUPTIBLE before calling schedule_timeout() we achieve the intended/desired behavior of putting the task to sleep for the specified timeout. Reviewed-by: Brian Behlendorf Signed-off-by: Daniel Perry Closes #16150 --- module/os/linux/spl/spl-taskq.c | 2 +- module/os/linux/zfs/vdev_disk.c | 2 +- module/os/linux/zfs/zvol_os.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index c384b7b378c..e7b812c3b5b 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -158,7 +158,7 @@ retry: * throttling the task dispatch rate. */ spin_unlock_irqrestore(&tq->tq_lock, *irqflags); - schedule_timeout(HZ / 100); + schedule_timeout_interruptible(HZ / 100); spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); if (count < 100) { diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 463c5f70510..7284b922b3b 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -397,7 +397,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, if (v->vdev_removed) break; - schedule_timeout(MSEC_TO_TICK(10)); + schedule_timeout_interruptible(MSEC_TO_TICK(10)); } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 2a036dc5136..3012423e9f2 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -798,7 +798,8 @@ retry: if ((gethrtime() - start) > timeout) return (SET_ERROR(-ERESTARTSYS)); - schedule_timeout(MSEC_TO_TICK(10)); + schedule_timeout_interruptible( + MSEC_TO_TICK(10)); goto retry; #endif } else { From a0f3c8aaf1e8c1196282e91cca603f877d7a618b Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 9 May 2024 19:31:57 +0500 Subject: [PATCH 03/43] zdb: add missing cleanup for early return Reviewed-by: Brian Behlendorf Reviewed-by: Don Brady Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16152 --- cmd/zdb/zdb.c | 78 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 449b6bf2ccb..ce80c0aa590 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -120,6 +120,9 @@ static int flagbits[256]; static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; static range_tree_t *mos_refd_objs; +static spa_t *spa; +static objset_t *os; +static boolean_t kernel_init_done; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); @@ -131,6 +134,7 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, static void zdb_print_blkptr(const blkptr_t *bp, int flags); +static void zdb_exit(int reason); typedef struct sublivelist_verify_block_refcnt { /* block pointer entry in livelist being verified */ @@ -818,7 +822,7 @@ usage(void) (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); - exit(1); + zdb_exit(1); } static void @@ -849,7 +853,7 @@ fatal(const char *fmt, ...) dump_debug_buffer(); - exit(1); + zdb_exit(1); } static void @@ -2276,7 +2280,7 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); - exit(1); + zdb_exit(1); } decode_embedded_bp_compressed(bp, buf); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); @@ -3231,6 +3235,23 @@ fuid_table_destroy(void) } } +static void +zdb_exit(int reason) +{ + if (os != NULL) { + close_objset(os, FTAG); + } else if (spa != NULL) { + spa_close(spa, FTAG); + } + + fuid_table_destroy(); + + if (kernel_init_done) + kernel_fini(); + + exit(reason); +} + /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. @@ -4161,32 +4182,32 @@ dump_cachefile(const char *cachefile) if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); - exit(1); + zdb_exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); - exit(1); + zdb_exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); - exit(1); + zdb_exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); - exit(1); + zdb_exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); - exit(1); + zdb_exit(1); } free(buf); @@ -5102,14 +5123,14 @@ dump_label(const char *dev) if ((fd = open64(path, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", path, strerror(errno)); - exit(1); + zdb_exit(1); } if (fstat64_blk(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); - exit(1); + zdb_exit(1); } if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) @@ -8221,7 +8242,7 @@ dump_zpool(spa_t *spa) if (rc != 0) { dump_debug_buffer(); - exit(rc); + zdb_exit(rc); } } @@ -8825,18 +8846,18 @@ zdb_embedded_block(char *thing) words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); - exit(1); + zdb_exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); - exit(1); + zdb_exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); - exit(1); + zdb_exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); @@ -8863,8 +8884,6 @@ int main(int argc, char **argv) { int c; - spa_t *spa = NULL; - objset_t *os = NULL; int dump_all = 1; int verbose = 0; int error = 0; @@ -9093,6 +9112,7 @@ main(int argc, char **argv) spa_mode_readable_spacemaps = B_TRUE; kernel_init(SPA_MODE_READ); + kernel_init_done = B_TRUE; if (dump_all) verbose = MAX(verbose, 1); @@ -9116,19 +9136,23 @@ main(int argc, char **argv) if (argc != 1) usage(); zdb_embedded_block(argv[0]); - return (0); + error = 0; + goto fini; } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); - return (0); + error = 0; + goto fini; } usage(); } - if (dump_opt['l']) - return (dump_label(argv[0])); + if (dump_opt['l']) { + error = dump_label(argv[0]); + goto fini; + } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | @@ -9183,7 +9207,8 @@ main(int argc, char **argv) } else if (objset_str && !zdb_numeric(objset_str + 1) && dump_opt['N']) { printf("Supply a numeric objset ID with -N\n"); - exit(1); + error = 1; + goto fini; } } else { target_pool = target; @@ -9240,7 +9265,8 @@ main(int argc, char **argv) if (argc != 2) usage(); dump_opt['v'] = verbose + 3; - return (dump_path(argv[0], argv[1], NULL)); + error = dump_path(argv[0], argv[1], NULL); + goto fini; } if (dump_opt['r']) { @@ -9328,7 +9354,7 @@ main(int argc, char **argv) fatal("can't dump '%s': %s", target, strerror(error)); } - return (error); + goto fini; } else { target_pool = strdup(target); if (strpbrk(target, "/@") != NULL) @@ -9458,9 +9484,10 @@ retry_lookup: free(checkpoint_target); } +fini: if (os != NULL) { close_objset(os, FTAG); - } else { + } else if (spa != NULL) { spa_close(spa, FTAG); } @@ -9468,7 +9495,8 @@ retry_lookup: dump_debug_buffer(); - kernel_fini(); + if (kernel_init_done) + kernel_fini(); return (error); } From af5dbed3193eb91e1302e1b976606b64fb9c557b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 9 May 2024 10:32:59 -0400 Subject: [PATCH 04/43] Fix scn_queue races on very old pools Code for pools before version 11 uses dmu_objset_find_dp() to scan for children datasets/clones. It calls enqueue_clones_cb() and enqueue_cb() callbacks in parallel from multiple taskq threads. It ends up bad for scan_ds_queue_insert(), corrupting scn_queue AVL-tree. Fix it by introducing a mutex to protect those two scan_ds_queue_insert() calls. All other calls are done from the sync thread and so serialized. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16162 --- include/sys/dsl_scan.h | 1 + module/zfs/dsl_scan.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 2e3452e5eba..f32f59a2bed 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -173,6 +173,7 @@ typedef struct dsl_scan { dsl_scan_phys_t scn_phys; /* on disk representation of scan */ dsl_scan_phys_t scn_phys_cached; avl_tree_t scn_queue; /* queue of datasets to scan */ + kmutex_t scn_queue_lock; /* serializes scn_queue inserts */ uint64_t scn_queues_pending; /* outstanding data to issue */ /* members needed for syncing error scrub status to disk */ dsl_errorscrub_phys_t errorscrub_phys; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 55e89b89f06..085cfd3c569 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -491,6 +491,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); + mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); @@ -646,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp) scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); + mutex_destroy(&scn->scn_queue_lock); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); @@ -2723,8 +2725,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) return (err); ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } @@ -2915,8 +2919,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } From 3400127a75fda737bc59ae52f1f8ecedd6201117 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 9 May 2024 10:39:57 -0400 Subject: [PATCH 05/43] Fix ZIL clone records for legacy holes Previous code overengineered cloned range calculation by using BP_GET_LSIZE(). The problem is that legacy holes don't have the logical size, so result will be wrong. But we also don't need to look on every block size, since they all must be identical. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16165 --- module/zfs/zfs_log.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 433a653e550..fa4e7093ca4 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -895,7 +895,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, itx_t *itx; lr_clone_range_t *lr; uint64_t partlen, max_log_data; - size_t i, partnbps; + size_t partnbps; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; @@ -904,10 +904,8 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, while (nbps > 0) { partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); - partlen = 0; - for (i = 0; i < partnbps; i++) { - partlen += BP_GET_LSIZE(&bps[i]); - } + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); partlen = MIN(partlen, len); itx = zil_itx_create(txtype, From 414acbd37e0a1121e93310e88956e30554ad1dae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Thu, 9 May 2024 16:42:51 +0200 Subject: [PATCH 06/43] Unbreak FreeBSD cross-build on MacOS broken in 051460b8b MacOS used FreeBSD-compatible getprogname() and pthread_getname_np(). But pthread_getthreadid_np() does not exist on MacOS. This implements libspl_gettid() using pthread_threadid_np() to get the thread id of the current thread. Tested with FreeBSD GitHub actions freebsd-src/.github/workflows/cross-bootstrap-tools.yml Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Martin Matuska Closes #16167 --- lib/libspl/assert.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index e6e3008f0aa..5b12c14acd6 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -41,9 +41,11 @@ #define libspl_getprogname() (program_invocation_short_name) #define libspl_getthreadname(buf, len) \ prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0) -#elif defined(__FreeBSD__) +#elif defined(__FreeBSD__) || defined(__APPLE__) +#if !defined(__APPLE__) #include #define libspl_gettid() pthread_getthreadid_np() +#endif #define libspl_getprogname() getprogname() #define libspl_getthreadname(buf, len) \ pthread_getname_np(pthread_self(), buf, len); @@ -98,6 +100,19 @@ libspl_dump_backtrace(void) #define libspl_dump_backtrace() #endif +#if defined(__APPLE__) +static inline uint64_t +libspl_gettid(void) +{ + uint64_t tid; + + if (pthread_threadid_np(NULL, &tid) != 0) + tid = 0; + + return (tid); +} +#endif + static boolean_t libspl_assert_ok = B_FALSE; void @@ -128,7 +143,11 @@ libspl_assertf(const char *file, const char *func, int line, fprintf(stderr, "\n" " PID: %-8u COMM: %s\n" +#if defined(__APPLE__) + " TID: %-8" PRIu64 " NAME: %s\n", +#else " TID: %-8u NAME: %s\n", +#endif getpid(), libspl_getprogname(), libspl_gettid(), tname); From 1ede0c716beeee4a720ff5c361121021555d7e3c Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 10 May 2024 00:43:48 +1000 Subject: [PATCH 07/43] libspl_assert: always link -lpthread on FreeBSD The pthread_* functions are in -lpthread on FreeBSD. Some of them are implicitly linked through libc, but on FreeBSD 13 at least pthread_getname_np() is not. Just be explicit, since -lpthread is the documented interface anyway. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16168 --- lib/libspl/Makefile.am | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index eb2377305ac..94be416d46a 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -45,3 +45,7 @@ libspl_la_LIBADD = \ libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS) + +if BUILD_FREEBSD +libspl_assert_la_LIBADD += -lpthread +endif From 41ae864b69991f7e13d5171f54a42c721b297233 Mon Sep 17 00:00:00 2001 From: chenqiuhao1997 Date: Fri, 10 May 2024 23:47:21 +0800 Subject: [PATCH 08/43] Replace P2ALIGN with P2ALIGN_TYPED and delete P2ALIGN. In P2ALIGN, the result would be incorrect when align is unsigned integer and x is larger than max value of the type of align. In that case, -(align) would be a positive integer, which means high bits would be zero and finally stay zero after '&' when align is converted to a larger integer type. Reviewed-by: Brian Behlendorf Reviewed-by: Youzhong Yang Signed-off-by: Qiuhao Chen Closes #15940 --- cmd/zdb/zdb.c | 2 +- cmd/ztest.c | 15 +++++++++------ include/os/freebsd/spl/sys/ccompile.h | 3 ++- include/os/freebsd/spl/sys/sysmacros.h | 3 ++- include/os/linux/spl/sys/sysmacros.h | 3 ++- lib/libefi/rdwr_efi.c | 4 ++-- lib/libspl/include/os/linux/sys/sysmacros.h | 3 ++- lib/libzfs/os/linux/libzfs_pool_os.c | 3 ++- module/os/freebsd/zfs/vdev_geom.c | 2 +- module/os/linux/zfs/zvol_os.c | 2 +- module/zcommon/zfs_fletcher.c | 8 +++++--- module/zfs/btree.c | 2 +- module/zfs/dmu.c | 5 +++-- module/zfs/dmu_object.c | 2 +- module/zfs/metaslab.c | 4 ++-- module/zfs/vdev.c | 11 ++++++----- module/zfs/vdev_raidz.c | 3 ++- 17 files changed, 44 insertions(+), 31 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index ce80c0aa590..797ae34b6e5 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5143,7 +5143,7 @@ dump_label(const char *dev) sizeof (cksum_record_t), offsetof(cksum_record_t, link)); psize = statbuf.st_size; - psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); ashift = SPA_MINBLOCKSHIFT; /* diff --git a/cmd/ztest.c b/cmd/ztest.c index b0fea8b3cfb..56eb01618c2 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2529,7 +2529,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, ASSERT3P(zio, !=, NULL); size = doi.doi_data_block_size; if (ISP2(size)) { - offset = P2ALIGN(offset, size); + offset = P2ALIGN_TYPED(offset, size, uint64_t); } else { ASSERT3U(offset, <, size); offset = 0; @@ -3978,7 +3978,8 @@ raidz_scratch_verify(void) raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); offset = RRSS_GET_OFFSET(&spa->spa_uberblock); state = RRSS_GET_STATE(&spa->spa_uberblock); - write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift); + write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, + uint64_t); logical_size = write_size * raidvd->vdev_children; switch (state) { @@ -5016,7 +5017,8 @@ ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) */ mutex_enter(&os->os_obj_lock); object = ztest_random(os->os_obj_next_chunk); - os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); + os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, + uint64_t); mutex_exit(&os->os_obj_lock); } @@ -6638,7 +6640,8 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). */ - uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); + uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), + uint64_t); if ((leaf & 1) == 1 && offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; @@ -6962,8 +6965,8 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) size_t inc = 64 * ztest_random(size / 67); /* sometimes add few bytes to test non-simd */ if (ztest_random(100) < 10) - inc += P2ALIGN(ztest_random(64), - sizeof (uint32_t)); + inc += P2ALIGN_TYPED(ztest_random(64), + sizeof (uint32_t), uint64_t); if (inc > (size - pos)) inc = size - pos; diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h index 26cf4db87ae..bebeb0db245 100644 --- a/include/os/freebsd/spl/sys/ccompile.h +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -138,7 +138,8 @@ typedef int enum_t; #define readdir64 readdir #define dirent64 dirent #endif -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2PHASE(x, align) ((x) & ((align) - 1)) diff --git a/include/os/freebsd/spl/sys/sysmacros.h b/include/os/freebsd/spl/sys/sysmacros.h index 3e8841ae66b..2c9f4438d76 100644 --- a/include/os/freebsd/spl/sys/sysmacros.h +++ b/include/os/freebsd/spl/sys/sysmacros.h @@ -191,7 +191,8 @@ extern unsigned char bcd_to_byte[256]; * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align) * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align) */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) /* * return x % (mod) align diff --git a/include/os/linux/spl/sys/sysmacros.h b/include/os/linux/spl/sys/sysmacros.h index 99e3a6fb41c..0e839073630 100644 --- a/include/os/linux/spl/sys/sysmacros.h +++ b/include/os/linux/spl/sys/sysmacros.h @@ -159,7 +159,8 @@ makedev(unsigned int major, unsigned int minor) /* * Compatibility macros/typedefs needed for Solaris -> Linux port */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2PHASE(x, align) ((x) & ((align) - 1)) diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c index 739219e0410..63c91059ae5 100644 --- a/lib/libefi/rdwr_efi.c +++ b/lib/libefi/rdwr_efi.c @@ -1175,8 +1175,8 @@ efi_use_whole_disk(int fd) * (for performance reasons). The alignment should match the * alignment used by the "zpool_label_disk" function. */ - limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE, - PARTITION_END_ALIGNMENT); + limit = P2ALIGN_TYPED(efi_label->efi_last_lba - nblocks - + EFI_MIN_RESV_SIZE, PARTITION_END_ALIGNMENT, diskaddr_t); if (data_start + data_size != limit || resv_start != limit) sync_needed = B_TRUE; diff --git a/lib/libspl/include/os/linux/sys/sysmacros.h b/lib/libspl/include/os/linux/sys/sysmacros.h index 5765ee25c6c..26e1c87a35a 100644 --- a/lib/libspl/include/os/linux/sys/sysmacros.h +++ b/lib/libspl/include/os/linux/sys/sysmacros.h @@ -52,7 +52,8 @@ /* * Compatibility macros/typedefs needed for Solaris -> Linux port */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2BOUNDARY(off, len, align) \ diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c index 86eef3255bc..7b18e31c867 100644 --- a/lib/libzfs/os/linux/libzfs_pool_os.c +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -268,7 +268,8 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) if (start_block == MAXOFFSET_T) start_block = NEW_START_BLOCK; slice_size -= start_block; - slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT); + slice_size = P2ALIGN_TYPED(slice_size, PARTITION_END_ALIGNMENT, + uint64_t); vtoc->efi_parts[0].p_start = start_block; vtoc->efi_parts[0].p_size = slice_size; diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 9d88971919d..38c1d8e9e46 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -457,7 +457,7 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; - psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); size = sizeof (*vdev_lists[0]) + pp->sectorsize - ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 3012423e9f2..3e020e53226 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -384,7 +384,7 @@ zvol_discard(zv_request_t *zvr) */ if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN(end, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); size = end - start; } diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 619ddef0243..74b8c2a475a 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -471,7 +471,8 @@ fletcher_4_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; - const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, + uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); @@ -519,7 +520,8 @@ fletcher_4_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; - const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, + uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); @@ -878,7 +880,7 @@ abd_fletcher_4_iter(void *data, size_t size, void *private) fletcher_4_ctx_t *ctx = cdp->acd_ctx; fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE; - uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + uint64_t asize = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); diff --git a/module/zfs/btree.c b/module/zfs/btree.c index af2b94a850b..9c52083603f 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -218,7 +218,7 @@ zfs_btree_create_custom(zfs_btree_t *tree, zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; tree->bt_leaf_size = lsize; - tree->bt_leaf_cap = P2ALIGN(esize / size, 2); + tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t); tree->bt_height = -1; tree->bt_bulk = NULL; } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 6ef149aab9a..8b440aafba4 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -537,7 +537,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; + P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t)) + >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " @@ -854,7 +855,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) } /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); + *start = P2ALIGN_TYPED(*start, iblkrange, uint64_t); } if (*start < minimum) *start = minimum; diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index d0e39a423bb..56986ea4344 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -160,7 +160,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, * is not suitably aligned. */ os->os_obj_next_chunk = - P2ALIGN(object, dnodes_per_chunk) + + P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) + dnodes_per_chunk; (void) atomic_swap_64(cpuobj, object); mutex_exit(&os->os_obj_lock); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 9e762357b72..cb004930d28 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -627,8 +627,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc) * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ - space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, + 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index c5551eb6cf6..414bf84f6f7 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -348,7 +348,8 @@ vdev_get_min_asize(vdev_t *vd) * to the nearest metaslab. */ if (vd == vd->vdev_top) - return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); + return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, + uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } @@ -2115,8 +2116,8 @@ vdev_open(vdev_t *vd) } } - osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); - max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); + osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); + max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { @@ -4764,9 +4765,9 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { - vs->vs_esize = P2ALIGN( + vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index de7d0fa7947..15c8b8ca601 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4039,7 +4039,8 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); int ashift = raidvd->vdev_ashift; - uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); + uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, + uint64_t); uint64_t logical_size = write_size * raidvd->vdev_children; uint64_t read_size = P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), From 136c05321140ecefa81a830754c64a7867d033e0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 10 May 2024 15:35:20 -0400 Subject: [PATCH 09/43] ZAP: Fix leaf references on zap_expand_leaf() errors Depending on kind of error zap_expand_leaf() may return with or without valid leaf reference held. Make sure it returns NULL if due to error it has no leaf to return. Make its callers to check the returned leaf pointer, and release the leaf if it is not NULL. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #12366 Closes #16159 --- module/zfs/zap.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 1b6b16fc666..81dab80daf8 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -711,6 +711,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, uint64_t object = zap->zap_object; zap_put_leaf(l); + *lp = l = NULL; zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); @@ -920,21 +921,17 @@ retry: } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) { + if (err == 0) goto retry; - } else if (err == ENOSPC) { - /* - * If we failed to expand the leaf, then bailout - * as there is no point trying - * zap_put_leaf_maybe_grow_ptrtbl(). - */ - return (err); - } } out: - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } @@ -991,8 +988,12 @@ retry: goto retry; } - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } From abec7dcd30acfb195bca36334cec4fe82b082b1d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 13 May 2024 17:12:07 -0500 Subject: [PATCH 10/43] Linux: disable lockdep for a couple of locks When running a debug kernel with lockdep enabled there are several locks which report false positives. Set MUTEX_NOLOCKDEP/RW_NOLOCKDEP to disable these warnings. Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #16188 --- module/os/linux/spl/spl-procfs-list.c | 2 +- module/zfs/dbuf.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/module/os/linux/spl/spl-procfs-list.c b/module/os/linux/spl/spl-procfs-list.c index 5e073950d61..91840ed2ca2 100644 --- a/module/os/linux/spl/spl-procfs-list.c +++ b/module/os/linux/spl/spl-procfs-list.c @@ -234,7 +234,7 @@ procfs_list_install(const char *module, modulestr = kmem_asprintf("%s/%s", module, submodule); else modulestr = kmem_asprintf("%s", module); - mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_NOLOCKDEP, NULL); list_create(&procfs_list->pl_list, procfs_list_node_off + sizeof (procfs_list_node_t), procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index bb913f55637..806ebcfc578 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -292,8 +292,8 @@ dbuf_cons(void *vdb, void *unused, int kmflag) dmu_buf_impl_t *db = vdb; memset(db, 0, sizeof (dmu_buf_impl_t)); - mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); - rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL); + rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); @@ -958,7 +958,7 @@ dbuf_init(void) 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (int i = 0; i < hmsize; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL); dbuf_stats_init(h); From 975a13259b87572c39d8467f1f4a31869d0abc84 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 2 May 2024 19:28:10 +0000 Subject: [PATCH 11/43] Add support for parallel pool exports Changed spa_export_common() such that it no longer holds the spa_namespace_lock for the entire duration and instead sets spa_export_thread to indicate an import is in progress on the spa. This allows for an export to a diffent pool to proceed in parallel while an export is still processing potentially long operations like spa_unload_log_sm_flush_all(). Calls like spa_lookup() and spa_vdev_enter() that rely on the spa_namespace_lock to serialize them against a concurrent export, now wait for any in-progress export thread to complete before proceeding. The 'zpool import -a' sub-command also provides multi-threaded support, using a thread pool to submit the exports in parallel. Sponsored-By: Klara Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Don Brady Closes #16153 --- cmd/zpool/zpool_main.c | 88 +++++++++++- include/sys/spa_impl.h | 1 + module/zfs/arc.c | 4 +- module/zfs/spa.c | 36 ++++- module/zfs/spa_misc.c | 50 +++++-- module/zfs/vdev_initialize.c | 9 +- module/zfs/vdev_rebuild.c | 3 +- module/zfs/vdev_trim.c | 9 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 2 + .../zpool_export_parallel_admin.ksh | 72 ++++++++++ .../zpool_export_parallel_pos.ksh | 129 ++++++++++++++++++ 12 files changed, 373 insertions(+), 33 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 300b383af4f..400f4bf1a67 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2030,10 +2030,19 @@ zpool_do_destroy(int argc, char **argv) } typedef struct export_cbdata { + tpool_t *tpool; + pthread_mutex_t mnttab_lock; boolean_t force; boolean_t hardforce; + int retval; } export_cbdata_t; + +typedef struct { + char *aea_poolname; + export_cbdata_t *aea_cbdata; +} async_export_args_t; + /* * Export one pool */ @@ -2042,11 +2051,20 @@ zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; - if (zpool_disable_datasets(zhp, cb->force) != 0) - return (1); + /* + * zpool_disable_datasets() is not thread-safe for mnttab access. + * So we serialize access here for 'zpool export -a' parallel case. + */ + if (cb->tpool != NULL) + pthread_mutex_lock(&cb->mnttab_lock); - /* The history must be logged as part of the export */ - log_history = B_FALSE; + int retval = zpool_disable_datasets(zhp, cb->force); + + if (cb->tpool != NULL) + pthread_mutex_unlock(&cb->mnttab_lock); + + if (retval) + return (1); if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) @@ -2058,6 +2076,48 @@ zpool_export_one(zpool_handle_t *zhp, void *data) return (0); } +/* + * Asynchronous export request + */ +static void +zpool_export_task(void *arg) +{ + async_export_args_t *aea = arg; + + zpool_handle_t *zhp = zpool_open(g_zfs, aea->aea_poolname); + if (zhp != NULL) { + int ret = zpool_export_one(zhp, aea->aea_cbdata); + if (ret != 0) + aea->aea_cbdata->retval = ret; + zpool_close(zhp); + } else { + aea->aea_cbdata->retval = 1; + } + + free(aea->aea_poolname); + free(aea); +} + +/* + * Process an export request in parallel + */ +static int +zpool_export_one_async(zpool_handle_t *zhp, void *data) +{ + tpool_t *tpool = ((export_cbdata_t *)data)->tpool; + async_export_args_t *aea = safe_malloc(sizeof (async_export_args_t)); + + /* save pool name since zhp will go out of scope */ + aea->aea_poolname = strdup(zpool_get_name(zhp)); + aea->aea_cbdata = data; + + /* ship off actual export to another thread */ + if (tpool_dispatch(tpool, zpool_export_task, (void *)aea) != 0) + return (errno); /* unlikely */ + else + return (0); +} + /* * zpool export [-f] ... * @@ -2098,17 +2158,33 @@ zpool_do_export(int argc, char **argv) cb.force = force; cb.hardforce = hardforce; + cb.tpool = NULL; + cb.retval = 0; argc -= optind; argv += optind; + /* The history will be logged as part of the export itself */ + log_history = B_FALSE; + if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, - ZFS_TYPE_POOL, B_FALSE, zpool_export_one, &cb)); + cb.tpool = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + pthread_mutex_init(&cb.mnttab_lock, NULL); + + /* Asynchronously call zpool_export_one using thread pool */ + ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, + B_FALSE, zpool_export_one_async, &cb); + + tpool_wait(cb.tpool); + tpool_destroy(cb.tpool); + (void) pthread_mutex_destroy(&cb.mnttab_lock); + + return (ret | cb.retval); } /* check arguments */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index a40914ec5fc..2004166781a 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -243,6 +243,7 @@ struct spa { dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ + kthread_t *spa_export_thread; /* valid during pool export */ kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 51039af9bcc..d1d60b84109 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8143,11 +8143,11 @@ l2arc_dev_get_next(void) ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all); + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all) + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting) next = NULL; l2arc_dev_last = next; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 560fd67087b..1f047dcd2a4 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -34,6 +34,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024, Klara Inc. */ /* @@ -1991,7 +1992,8 @@ spa_destroy_aux_threads(spa_t *spa) static void spa_unload(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); @@ -6955,7 +6957,7 @@ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { - int error; + int error = 0; spa_t *spa; hrtime_t export_start = gethrtime(); @@ -6979,8 +6981,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa->spa_is_exporting = B_TRUE; /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. + * Put a hold on the pool, drop the namespace lock, stop async tasks + * and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -6990,10 +6992,18 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); + spa->spa_export_thread = curthread; spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + + /* + * At this point we no longer hold the spa_namespace_lock and + * the spa_export_thread indicates that an export is in progress. + */ if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; + /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, @@ -7089,6 +7099,10 @@ export_spa: if (oldconfig && spa->spa_config) *oldconfig = fnvlist_dup(spa->spa_config); + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + + mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); @@ -7100,17 +7114,25 @@ export_spa: * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; } - if (new_state == POOL_STATE_EXPORTED) - zio_handle_export_delay(spa, gethrtime() - export_start); - + /* + * Wake up any waiters on spa_namespace_lock + * They need to re-attempt a spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); fail: + mutex_enter(&spa_namespace_lock); spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; spa_async_resume(spa); + + /* Wake up any waiters on spa_namespace_lock */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index e6d4a9bdb29..6d7667cf3e9 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -27,7 +27,7 @@ * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -82,8 +82,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/export - * - Held at the start and end of import + * - Held for the duration of create/destroy + * - Held at the start and end of import and export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -636,8 +636,14 @@ retry: if (spa == NULL) return (NULL); - if (spa->spa_load_thread != NULL && - spa->spa_load_thread != curthread) { + /* + * Avoid racing with import/export, which don't hold the namespace + * lock for their entire duration. + */ + if ((spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) || + (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread)) { cv_wait(&spa_namespace_cv, &spa_namespace_lock); goto retry; } @@ -950,14 +956,15 @@ spa_open_ref(spa_t *spa, const void *tag) /* * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. + * have the namespace lock held or be part of a pool import/export. */ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || - spa->spa_load_thread == curthread); + spa->spa_load_thread == curthread || + spa->spa_export_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } @@ -977,13 +984,15 @@ spa_async_close(spa_t *spa, const void *tag) /* * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against spa_minref, which is the - * number of references acquired when opening a pool + * spa_namespace_lock held or be the spa export thread. We really + * compare against spa_minref, which is the number of references + * acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } @@ -1231,6 +1240,21 @@ spa_vdev_enter(spa_t *spa) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + /* + * We have a reference on the spa and a spa export could be + * starting but no longer holding the spa_namespace_lock. So + * check if there is an export and if so wait. It will fail + * fast (EBUSY) since we are still holding a spa reference. + * + * Note that we can be woken by a different spa transitioning + * through an import/export, so we must wait for our condition + * to change before proceeding. + */ + while (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + } + vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); @@ -1248,6 +1272,12 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + /* See comment in spa_vdev_enter() */ + while (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + } + vdev_autotrim_stop_all(spa); if (guid != 0) { diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index c5e16af1669..0a7323f58df 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -682,7 +682,8 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_initialize_lock); @@ -724,7 +725,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, if (vd_list == NULL) { vdev_initialize_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -756,7 +758,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) spa_t *spa = vd->vdev_spa; list_t vd_list; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 00ebd4c9fca..8a8b02cab5c 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -1087,7 +1087,8 @@ vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 9753d5a1ea0..9cf10332e8b 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -1040,7 +1040,8 @@ vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_trim_lock); @@ -1079,7 +1080,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -1115,7 +1117,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) list_t vd_list; vdev_t *vd_l2cache; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 5e7fdf359a7..ac2c541a918 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -430,7 +430,8 @@ tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', - 'zpool_export_003_neg', 'zpool_export_004_pos'] + 'zpool_export_003_neg', 'zpool_export_004_pos', + 'zpool_export_parallel_pos', 'zpool_export_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index d625c040b81..44eedcf6fae 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1084,6 +1084,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ + functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \ + functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh new file mode 100755 index 00000000000..cab8fc2b423 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that admin commands cannot race a pool export +# +# STRATEGY: +# 1. Create a pool +# 2. Import the pool with an injected delay in the background +# 3. Execute some admin commands against the pool +# + +verify_runnable "global" + +DEVICE_DIR=$TEST_BASE_DIR/dev_export-test + +function cleanup +{ + zinject -c all + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + [[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR +} + +log_assert "admin commands cannot race a pool export" + +log_onexit cleanup + +[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR +log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1 + +log_must zpool create -f $TESTPOOL1 mirror ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1 + +log_must zinject -P export -s 10 $TESTPOOL1 + +log_must zpool export $TESTPOOL1 & + +zpool set comment=hello $TESTPOOL1 +zpool reguid $TESTPOOL1 & +zpool split $TESTPOOL1 & + +log_pass "admin commands cannot race a pool export" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh new file mode 100755 index 00000000000..037d17d082b --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh @@ -0,0 +1,129 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# test uses 8 vdevs +MAX_NUM=8 +DEVICE_DIR=$TEST_BASE_DIR/dev_import-test + + +# +# DESCRIPTION: +# Verify that pool exports can occur in parallel +# +# STRATEGY: +# 1. Create 8 pools +# 2. Inject an export delay using zinject +# 3. Export half of the pools synchronously to baseline sequential cost +# 4. Export the other half asynchronously to demonstrate parallel savings +# 6. Import 4 pools +# 7. Test zpool export -a +# + +verify_runnable "global" + +# +# override the minimum sized vdevs +# + +POOLNAME="test_pool" + +function cleanup +{ + zinject -c all + + for i in {0..$(($MAX_NUM - 1))}; do + poolexists $POOLNAME-$i && destroy_pool $POOLNAME-$i + done + + [[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR +} + +log_assert "Pool exports can occur in parallel" + +log_onexit cleanup + +[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR + +# +# Create some pools with export delay injectors +# +for i in {0..$(($MAX_NUM - 1))}; do + log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk$i + log_must zpool create $POOLNAME-$i $DEVICE_DIR/disk$i + log_must zinject -P export -s 8 $POOLNAME-$i +done + +# +# Export half of the pools synchronously +# +SECONDS=0 +for i in {0..3}; do + log_must zpool export $POOLNAME-$i +done +sequential_time=$SECONDS +log_note "sequentially exported 4 pools in $sequential_time seconds" + +# +# Export half of the pools in parallel +# +SECONDS=0 +for i in {4..7}; do + log_must zpool export $POOLNAME-$i & +done +wait +parallel_time=$SECONDS +log_note "asyncronously exported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +# +# import 4 pools with export delay injectors +# +for i in {4..7}; do + log_must zpool import -d $DEVICE_DIR/disk$i $POOLNAME-$i + log_must zinject -P export -s 8 $POOLNAME-$i +done + +# +# now test zpool export -a +# +SECONDS=0 +log_must zpool export -a +parallel_time=$SECONDS +log_note "asyncronously exported 4 pools, using '-a', in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +log_pass "Pool exports occur in parallel" From 89acef992bf328e0ffba63950b176a0d9572b792 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Sun, 5 May 2024 14:57:33 +0000 Subject: [PATCH 12/43] Simplified the scope of the namespace lock If we wait until after we check for no spa references to drop the namespace lock, then we know that spa consumers will need to call spa_lookup() and end up waiting on the spa_namespace_cv until we finish. This narrows the external checks to spa_lookup and we no longer need to worry about the spa_vdev_enter case. Sponsored-By: Klara Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Don Brady Closes #16153 --- module/zfs/spa.c | 32 ++++++++++++++++++++------------ module/zfs/spa_misc.c | 21 ++------------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1f047dcd2a4..9eb14b4f1d3 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -6994,15 +6994,11 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, mutex_enter(&spa_namespace_lock); spa->spa_export_thread = curthread; spa_close(spa, FTAG); - mutex_exit(&spa_namespace_lock); - /* - * At this point we no longer hold the spa_namespace_lock and - * the spa_export_thread indicates that an export is in progress. - */ - - if (spa->spa_state == POOL_STATE_UNINITIALIZED) + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + mutex_exit(&spa_namespace_lock); goto export_spa; + } /* * The pool will be in core if it's openable, in which case we can @@ -7024,6 +7020,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, goto fail; } + mutex_exit(&spa_namespace_lock); + /* + * At this point we no longer hold the spa_namespace_lock and + * there were no references on the spa. Future spa_lookups will + * notice the spa->spa_export_thread and wait until we signal + * that we are finshed. + */ + if (spa->spa_sync_on) { vdev_t *rvd = spa->spa_root_vdev; /* @@ -7035,6 +7039,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); + mutex_enter(&spa_namespace_lock); goto fail; } @@ -7102,6 +7107,9 @@ export_spa: if (new_state == POOL_STATE_EXPORTED) zio_handle_export_delay(spa, gethrtime() - export_start); + /* + * Take the namespace lock for the actual spa_t removal + */ mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) @@ -7118,20 +7126,20 @@ export_spa: } /* - * Wake up any waiters on spa_namespace_lock - * They need to re-attempt a spa_lookup() + * Wake up any waiters in spa_lookup() */ cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); fail: - mutex_enter(&spa_namespace_lock); spa->spa_is_exporting = B_FALSE; spa->spa_export_thread = NULL; - spa_async_resume(spa); - /* Wake up any waiters on spa_namespace_lock */ + spa_async_resume(spa); + /* + * Wake up any waiters in spa_lookup() + */ cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 6d7667cf3e9..d1d41bbe721 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1240,20 +1240,7 @@ spa_vdev_enter(spa_t *spa) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); - /* - * We have a reference on the spa and a spa export could be - * starting but no longer holding the spa_namespace_lock. So - * check if there is an export and if so wait. It will fail - * fast (EBUSY) since we are still holding a spa reference. - * - * Note that we can be woken by a different spa transitioning - * through an import/export, so we must wait for our condition - * to change before proceeding. - */ - while (spa->spa_export_thread != NULL && - spa->spa_export_thread != curthread) { - cv_wait(&spa_namespace_cv, &spa_namespace_lock); - } + ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); @@ -1272,11 +1259,7 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); - /* See comment in spa_vdev_enter() */ - while (spa->spa_export_thread != NULL && - spa->spa_export_thread != curthread) { - cv_wait(&spa_namespace_cv, &spa_namespace_lock); - } + ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); From f625d038d2ae59fa1ae81b76079da464ed6db61a Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 7 May 2024 13:53:38 -0600 Subject: [PATCH 13/43] tpool_dispatch: fail if it cannot start at least 1 worker. Sponsored by: Axcient Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alan Somers Closes #16178 --- lib/libtpool/thread_pool.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/lib/libtpool/thread_pool.c b/lib/libtpool/thread_pool.c index 7802f8c1750..9bf9cdf5dc8 100644 --- a/lib/libtpool/thread_pool.c +++ b/lib/libtpool/thread_pool.c @@ -423,6 +423,26 @@ tpool_dispatch(tpool_t *tpool, void (*func)(void *), void *arg) pthread_mutex_lock(&tpool->tp_mutex); + if (!(tpool->tp_flags & TP_SUSPEND)) { + if (tpool->tp_idle > 0) + (void) pthread_cond_signal(&tpool->tp_workcv); + else if (tpool->tp_current >= tpool->tp_maximum) { + /* At worker limit. Leave task on queue */ + } else { + if (create_worker(tpool) == 0) { + /* Started a new worker thread */ + tpool->tp_current++; + } else if (tpool->tp_current > 0) { + /* Leave task on queue */ + } else { + /* Cannot start a single worker! */ + pthread_mutex_unlock(&tpool->tp_mutex); + free(job); + return (-1); + } + } + } + if (tpool->tp_head == NULL) tpool->tp_head = job; else @@ -430,14 +450,6 @@ tpool_dispatch(tpool_t *tpool, void (*func)(void *), void *arg) tpool->tp_tail = job; tpool->tp_njobs++; - if (!(tpool->tp_flags & TP_SUSPEND)) { - if (tpool->tp_idle > 0) - (void) pthread_cond_signal(&tpool->tp_workcv); - else if (tpool->tp_current < tpool->tp_maximum && - create_worker(tpool) == 0) - tpool->tp_current++; - } - pthread_mutex_unlock(&tpool->tp_mutex); return (0); } From eced2e2f1e56b54753702da52a88fccbe73b3dcb Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 7 May 2024 14:21:31 -0600 Subject: [PATCH 14/43] libzfs: Fix mounting datasets under thread limit pressure During parallel zpool import, /sbin/zpool will create a separate thread pool for each pool, used to mount that pool's datasets. If the total thread count exceed's the system's limit on threads per process, then tpool_dispatch may fail. If it does, directly execute the mount operation instead. Sponsored by: Axcient Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alan Somers Closes #16178 Fixes #16172 --- lib/libzfs/libzfs_mount.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index ec6ebad2f18..3084e05e4d4 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -1098,7 +1098,10 @@ zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles, mnt_param->mnt_func = func; mnt_param->mnt_data = data; - (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param); + if (tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param)) { + /* Could not dispatch to thread pool; execute directly */ + zfs_mount_task((void*)mnt_param); + } } /* From b64afa41d56e98b5817aaf14c7deb0fa7e2142fb Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Wed, 8 May 2024 10:01:22 -0600 Subject: [PATCH 15/43] Better control the thread pool size when mounting datasets Ever since a10d50f999, ZFS has mounted file systems in parallel when importing a pool. It uses a fixed size of 512 for the thread pool. But since c183d164aa1, it has also imported pools in parallel. So the total number of threads at one time is 513 * npools + 1. That can easily exceed the system's limit on the number of threads per process, which will cause one or more pools to be unable to allocate any worker threads, forcing them to fallback to slow serial mounting . To forestall that, manage the threadpool size in /sbin/zpool, not libzfs. Use the same size (512), but divided by the number of pools. This is a backwards-incompatible change to the libzfs abi. Sponsored by: Axcient Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alan Somers Closes #16178 --- cmd/zed/agents/zfs_mod.c | 2 +- cmd/zfs/zfs_main.c | 6 ++++-- cmd/zpool/zpool_main.c | 20 +++++++++++++++----- include/libzfs.h | 5 +++-- lib/libzfs/libzfs.abi | 3 ++- lib/libzfs/libzfs_mount.c | 25 +++++++++++++------------ 6 files changed, 38 insertions(+), 23 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 69163b80bd5..d0372608c72 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -702,7 +702,7 @@ zfs_enable_ds(void *arg) { unavailpool_t *pool = (unavailpool_t *)arg; - (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); + (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0, 512); zpool_close(pool->uap_zhp); free(pool); } diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 0bbdd5b18ed..75c0e40b610 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -7192,6 +7192,8 @@ share_mount(int op, int argc, char **argv) int c, ret = 0; char *options = NULL; int flags = 0; + const uint_t mount_nthr = 512; + uint_t nthr; /* check options */ while ((c = getopt(argc, argv, op == OP_MOUNT ? ":aRlvo:Of" : "al")) @@ -7310,9 +7312,9 @@ share_mount(int op, int argc, char **argv) * be serialized so that we can prompt the user for their keys * in a consistent manner. */ + nthr = op == OP_MOUNT && !(flags & MS_CRYPT) ? mount_nthr : 1; zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, - share_mount_one_cb, &share_mount_state, - op == OP_MOUNT && !(flags & MS_CRYPT)); + share_mount_one_cb, &share_mount_state, nthr); zfs_commit_shares(NULL); ret = share_mount_state.sm_status; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 400f4bf1a67..d47e1cda9c3 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -80,6 +80,8 @@ libzfs_handle_t *g_zfs; +static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ + static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); @@ -3418,7 +3420,7 @@ zfs_force_import_required(nvlist_t *config) */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, - nvlist_t *props, int flags) + nvlist_t *props, int flags, uint_t mntthreads) { int ret = 0; int ms_status = 0; @@ -3518,7 +3520,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY)) { - ms_status = zpool_enable_datasets(zhp, mntopts, 0); + ms_status = zpool_enable_datasets(zhp, mntopts, 0, mntthreads); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Import was " "successful, but unable to share some datasets\n")); @@ -3537,6 +3539,7 @@ typedef struct import_parameters { const char *ip_mntopts; nvlist_t *ip_props; int ip_flags; + uint_t ip_mntthreads; int *ip_err; } import_parameters_t; @@ -3545,7 +3548,7 @@ do_import_task(void *arg) { import_parameters_t *ip = arg; *ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts, - ip->ip_props, ip->ip_flags); + ip->ip_props, ip->ip_flags, ip->ip_mntthreads); free(ip); } @@ -3559,6 +3562,7 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, uint64_t pool_state; boolean_t pool_specified = (import->poolname != NULL || import->guid != 0); + uint_t npools = 0; tpool_t *tp = NULL; @@ -3576,6 +3580,10 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, int err = 0; nvpair_t *elem = NULL; boolean_t first = B_TRUE; + if (!pool_specified && import->do_all) { + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) + npools++; + } while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); @@ -3606,6 +3614,7 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, ip->ip_mntopts = mntopts; ip->ip_props = props; ip->ip_flags = flags; + ip->ip_mntthreads = mount_tp_nthr / npools; ip->ip_err = &err; (void) tpool_dispatch(tp, do_import_task, @@ -3673,7 +3682,7 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, err = B_TRUE; } else { err |= do_import(found_config, new_name, - mntopts, props, flags); + mntopts, props, flags, mount_tp_nthr); } } @@ -7217,7 +7226,8 @@ zpool_do_split(int argc, char **argv) } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { - ms_status = zpool_enable_datasets(zhp, mntopts, 0); + ms_status = zpool_enable_datasets(zhp, mntopts, 0, + mount_tp_nthr); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Split was successful, " "datasets are mounted but sharing of some datasets " diff --git a/include/libzfs.h b/include/libzfs.h index 2823b884582..7836c2325f4 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -716,7 +716,7 @@ typedef struct get_all_cb { } get_all_cb_t; _LIBZFS_H void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, - size_t, zfs_iter_f, void *, boolean_t); + size_t, zfs_iter_f, void *, uint_t); _LIBZFS_H void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); /* @@ -1004,7 +1004,8 @@ _LIBZFS_H int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ -_LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int); +_LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int, + uint_t); _LIBZFS_H int zpool_disable_datasets(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_datasets_os(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_volume_os(const char *); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 2bbaae6345a..c3efb298416 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5532,13 +5532,14 @@ - + + diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 3084e05e4d4..42988bf9cbb 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -83,8 +83,6 @@ #include #define MAXISALEN 257 /* based on sysinfo(2) man page */ -static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ - static void zfs_mount_task(void *); static const proto_table_t proto_table[SA_PROTOCOL_COUNT] = { @@ -1205,19 +1203,20 @@ out: * * Callbacks are issued in one of two ways: * - * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT + * 1. Sequentially: If the nthr argument is <= 1 or the ZFS_SERIAL_MOUNT * environment variable is set, then we issue callbacks sequentially. * - * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT + * 2. In parallel: If the nthr argument is > 1 and the ZFS_SERIAL_MOUNT * environment variable is not set, then we use a tpool to dispatch threads * to mount filesystems in parallel. This function dispatches tasks to mount * the filesystems at the top-level mountpoints, and these tasks in turn * are responsible for recursively mounting filesystems in their children - * mountpoints. + * mountpoints. The value of the nthr argument will be the number of worker + * threads for the thread pool. */ void zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, - size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel) + size_t num_handles, zfs_iter_f func, void *data, uint_t nthr) { zoneid_t zoneid = getzoneid(); @@ -1226,7 +1225,7 @@ zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, * variable that can be used as a convenience to do a/b comparison * of serial vs. parallel mounting. */ - boolean_t serial_mount = !parallel || + boolean_t serial_mount = nthr <= 1 || (getenv("ZFS_SERIAL_MOUNT") != NULL); /* @@ -1246,7 +1245,7 @@ zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, * Issue the callback function for each dataset using a parallel * algorithm that uses a thread pool to manage threads. */ - tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL); + tpool_t *tp = tpool_create(1, nthr, 0, NULL); /* * There may be multiple "top level" mountpoints outside of the pool's @@ -1273,10 +1272,12 @@ zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, /* * Mount and share all datasets within the given pool. This assumes that no - * datasets within the pool are currently mounted. + * datasets within the pool are currently mounted. nthr will be number of + * worker threads to use while mounting datasets. */ int -zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) +zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags, + uint_t nthr) { get_all_cb_t cb = { 0 }; mount_state_t ms = { 0 }; @@ -1302,7 +1303,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) ms.ms_mntopts = mntopts; ms.ms_mntflags = flags; zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, - zfs_mount_one, &ms, B_TRUE); + zfs_mount_one, &ms, nthr); if (ms.ms_mntstatus != 0) ret = EZFS_MOUNTFAILED; @@ -1313,7 +1314,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) */ ms.ms_mntstatus = 0; zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, - zfs_share_one, &ms, B_FALSE); + zfs_share_one, &ms, 1); if (ms.ms_mntstatus != 0) ret = EZFS_SHAREFAILED; else From cc38691534310ba22ddc80fedbc10a7ac55237fd Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 2 May 2024 11:57:23 +1000 Subject: [PATCH 16/43] zfs_ioc_send: use a dedicated taskq thread for send When stack space is tight, the stream is written to its target on a separate taskq thread to make sure there's enough stack space to complete it. This has always used an IO taskq, but that doesn't really make sense for it, and moving it onto a regular taskq lets us get rid of spa_taskq_dispatch_sync(), which is not used anywhere else. Stream writes may block for a long time depending on what the target is, and we have no way of discovering this, so we can't risk using the system taskq, as there may be many tens of sends in progress. Instead, we create a dedicated taskq thread for each send writer to run on, and clean it up when it's done. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16151 --- module/zfs/zfs_ioctl.c | 103 ++++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 908b9efc181..b720b4f222b 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -38,7 +38,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. - * Copyright (c) 2019, 2021, Klara Inc. + * Copyright (c) 2019, 2021, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright 2024 Oxide Computer Company */ @@ -5514,6 +5514,14 @@ out: return (error); } +/* + * When stack space is limited, we write replication stream data to the target + * on a separate taskq thread, to make sure there's enough stack space. + */ +#ifndef HAVE_LARGE_STACKS +#define USE_SEND_TASKQ 1 +#endif + typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; @@ -5534,31 +5542,65 @@ dump_bytes_cb(void *arg) dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } +typedef struct dump_bytes_arg { + zfs_file_t *dba_fp; +#ifdef USE_SEND_TASKQ + taskq_t *dba_tq; + taskq_ent_t dba_tqent; +#endif +} dump_bytes_arg_t; + static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { + dump_bytes_arg_t *dba = (dump_bytes_arg_t *)arg; dump_bytes_io_t dbi; - dbi.dbi_fp = arg; + dbi.dbi_fp = dba->dba_fp; dbi.dbi_buf = buf; dbi.dbi_len = len; -#if defined(HAVE_LARGE_STACKS) - dump_bytes_cb(&dbi); +#ifdef USE_SEND_TASKQ + taskq_dispatch_ent(dba->dba_tq, dump_bytes_cb, &dbi, TQ_SLEEP, + &dba->dba_tqent); + taskq_wait(dba->dba_tq); #else - /* - * The vn_rdwr() call is performed in a taskq to ensure that there is - * always enough stack space to write safely to the target filesystem. - * The ZIO_TYPE_FREE threads are used because there can be a lot of - * them and they are used in vdev_file.c for a similar purpose. - */ - spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, - ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); -#endif /* HAVE_LARGE_STACKS */ + dump_bytes_cb(&dbi); +#endif return (dbi.dbi_err); } +static int +dump_bytes_init(dump_bytes_arg_t *dba, int fd, dmu_send_outparams_t *out) +{ + zfs_file_t *fp = zfs_file_get(fd); + if (fp == NULL) + return (SET_ERROR(EBADF)); + + dba->dba_fp = fp; +#ifdef USE_SEND_TASKQ + dba->dba_tq = taskq_create("z_send", 1, defclsyspri, 0, 0, 0); + taskq_init_ent(&dba->dba_tqent); +#endif + + memset(out, 0, sizeof (dmu_send_outparams_t)); + out->dso_outfunc = dump_bytes; + out->dso_arg = dba; + out->dso_dryrun = B_FALSE; + + return (0); +} + +static void +dump_bytes_fini(dump_bytes_arg_t *dba) +{ + zfs_file_put(dba->dba_fp); +#ifdef USE_SEND_TASKQ + taskq_destroy(dba->dba_tq); +#endif +} + /* * inputs: * zc_name name of snapshot to send @@ -5643,21 +5685,18 @@ zfs_ioc_send(zfs_cmd_t *zc) dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { - zfs_file_t *fp; - dmu_send_outparams_t out = {0}; + dump_bytes_arg_t dba; + dmu_send_outparams_t out; + error = dump_bytes_init(&dba, zc->zc_cookie, &out); + if (error) + return (error); - if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) - return (SET_ERROR(EBADF)); - - off = zfs_file_off(fp); - out.dso_outfunc = dump_bytes; - out.dso_arg = fp; - out.dso_dryrun = B_FALSE; + off = zfs_file_off(dba.dba_fp); error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); - zfs_file_put(fp); + dump_bytes_fini(&dba); } return (error); } @@ -6604,7 +6643,6 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) offset_t off; const char *fromname = NULL; int fd; - zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; @@ -6629,20 +6667,19 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); - if ((fp = zfs_file_get(fd)) == NULL) - return (SET_ERROR(EBADF)); + dump_bytes_arg_t dba; + dmu_send_outparams_t out; + error = dump_bytes_init(&dba, fd, &out); + if (error) + return (error); - off = zfs_file_off(fp); - - dmu_send_outparams_t out = {0}; - out.dso_outfunc = dump_bytes; - out.dso_arg = fp; - out.dso_dryrun = B_FALSE; + off = zfs_file_off(dba.dba_fp); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); - zfs_file_put(fp); + dump_bytes_fini(&dba); + return (error); } From adda768e3eb931b82e8477eb9287f7ca9c881a98 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 2 May 2024 12:04:24 +1000 Subject: [PATCH 17/43] spa: remove spa_taskq_dispatch_sync() It has no callers anymore. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16151 --- include/sys/spa_impl.h | 2 -- module/zfs/spa.c | 13 ------------- 2 files changed, 15 deletions(-) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 2004166781a..366410acf28 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -481,8 +481,6 @@ extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); -extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9eb14b4f1d3..a1258546c8c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1520,19 +1520,6 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, taskq_dispatch_ent(tq, func, arg, flags, ent); } -/* - * Same as spa_taskq_dispatch_ent() but block on the task until completion. - */ -void -spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags) -{ - taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); - taskqid_t id = taskq_dispatch(tq, func, arg, flags); - if (id) - taskq_wait_id(tq, id); -} - static void spa_create_zio_taskqs(spa_t *spa) { From 515c4dd2130a2c986640522d298a224ddd7e6018 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 2 May 2024 12:06:58 +1000 Subject: [PATCH 18/43] spa: flatten spa_taskq_dispatch_ent() It is the only user of spa_taskq_dispatch_select(), so might as well just carry it directly. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16151 --- module/zfs/spa.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index a1258546c8c..930c527b63c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1490,8 +1490,9 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. */ -static taskq_t * -spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; @@ -1508,15 +1509,7 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } - return (tq); -} -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, - zio_t *zio) -{ - taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); taskq_dispatch_ent(tq, func, arg, flags, ent); } From 0a543db37111c28085043da89e452ea6b131c019 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 2 May 2024 12:13:38 +1000 Subject: [PATCH 19/43] spa_taskq_dispatch_ent: simplify arguments This renames it to spa_taskq_dispatch(), and reduces and simplifies its arguments based on these observations from its two call sites: - arg is always the zio, so it can be typed that way, and we don't need to provide it twice; - ent is always &zio->io_tqent, and zio is always provided, so we can use it directly; - the only flag used is TQ_FRONT, which can just be a bool; - zio != NULL was part of the "use allocator" test, but it never would have got that far, because that arg was only set to NULL in the reexecute path, which is forced to type CLAIM, so the condition would fail at t == WRITE anyway. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16151 --- include/sys/spa_impl.h | 4 ++-- module/zfs/spa.c | 18 +++++++++++++----- module/zfs/zio.c | 17 ++++------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 366410acf28..5605a35b864 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -479,8 +479,8 @@ struct spa { extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; -extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); +extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, zio_t *zio, boolean_t cutinline); extern void spa_load_spares(spa_t *spa); extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 930c527b63c..d762f21a376 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1491,9 +1491,8 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) * on the taskq itself. */ void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, - zio_t *zio) +spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, zio_t *zio, boolean_t cutinline) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; @@ -1501,16 +1500,25 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. It would be a grievous error + * to dispatch the zio to another taskq at the same time. + */ + ASSERT(zio); + ASSERT(taskq_empty_ent(&zio->io_tqent)); + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && - (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) { + ZIO_HAS_ALLOCATOR(zio)) { tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } - taskq_dispatch_ent(tq, func, arg, flags, ent); + taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, + &zio->io_tqent); } static void diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 65a0afaaa21..d68d5ababe7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2023,7 +2023,6 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; - int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and @@ -2047,19 +2046,12 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; else - flags |= TQ_FRONT; + cutinline = B_TRUE; } ASSERT3U(q, <, ZIO_TASKQ_TYPES); - /* - * NB: We are assuming that the zio can only be dispatched - * to a single taskq at a time. It would be a grievous error - * to dispatch the zio to another taskq at the same time. - */ - ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, - &zio->io_tqent, zio); + spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline); } static boolean_t @@ -5007,10 +4999,9 @@ zio_done(zio_t *zio) * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ - ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(zio->io_spa, + spa_taskq_dispatch(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - zio_reexecute, zio, 0, &zio->io_tqent, NULL); + zio_reexecute, zio, B_FALSE); } return (NULL); } From 91c46d4399e42b2b14ae65ae8637061b67adbd82 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 09:56:48 +1000 Subject: [PATCH 20/43] zdb: bring crash handling over from ztest ztest has a very nice ability to show a backtrace when there's an unexpected crash. zdb is used often enough on corrupted data and can blow up too, so nice output is useful there too. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 797ae34b6e5..f3274a65db2 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -85,6 +85,9 @@ #include #include #include +#if (__GLIBC__ && !__UCLIBC__) +#include /* for backtrace() */ +#endif #include #include @@ -828,11 +831,41 @@ usage(void) static void dump_debug_buffer(void) { - if (dump_opt['G']) { - (void) printf("\n"); - (void) fflush(stdout); - zfs_dbgmsg_print("zdb"); - } + ssize_t ret __attribute__((unused)); + + if (!dump_opt['G']) + return; + /* + * We use write() instead of printf() so that this function + * is safe to call from a signal handler. + */ + ret = write(STDOUT_FILENO, "\n", 1); + zfs_dbgmsg_print("zdb"); +} + +#define BACKTRACE_SZ 100 + +static void sig_handler(int signo) +{ + struct sigaction action; +#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ + int nptrs; + void *buffer[BACKTRACE_SZ]; + + nptrs = backtrace(buffer, BACKTRACE_SZ); + backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); +#endif + dump_debug_buffer(); + + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. + */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + raise(signo); } /* @@ -8899,9 +8932,27 @@ main(int argc, char **argv) char *spa_config_path_env, *objset_str; boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg = NULL; + struct sigaction action; dprintf_setup(&argc, argv); + /* + * Set up signal handlers, so if we crash due to bad on-disk data we + * can get more info. Unlike ztest, we don't bail out if we can't set + * up signal handlers, because zdb is very useful without them. + */ + action.sa_handler = sig_handler; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + if (sigaction(SIGSEGV, &action, NULL) < 0) { + (void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n", + strerror(errno)); + } + if (sigaction(SIGABRT, &action, NULL) < 0) { + (void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n", + strerror(errno)); + } + /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will From e7b451941b92e2bdbb9c08bb4283c9a39d5571c6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 10:19:48 +1000 Subject: [PATCH 21/43] zdb/ztest: use libspl backtrace for crashes We can show much nicer backtraces these days, lets use them. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 12 +----------- cmd/ztest.c | 12 +----------- lib/libspl/assert.c | 9 ++++++--- lib/libspl/include/assert.h | 2 ++ 4 files changed, 10 insertions(+), 25 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index f3274a65db2..908e4e0ab27 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -85,9 +85,6 @@ #include #include #include -#if (__GLIBC__ && !__UCLIBC__) -#include /* for backtrace() */ -#endif #include #include @@ -843,18 +840,11 @@ dump_debug_buffer(void) zfs_dbgmsg_print("zdb"); } -#define BACKTRACE_SZ 100 - static void sig_handler(int signo) { struct sigaction action; -#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ - int nptrs; - void *buffer[BACKTRACE_SZ]; - nptrs = backtrace(buffer, BACKTRACE_SZ); - backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); -#endif + libspl_dump_backtrace(); dump_debug_buffer(); /* diff --git a/cmd/ztest.c b/cmd/ztest.c index 56eb01618c2..ccfe71c2956 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -136,9 +136,6 @@ #include #include #include -#if (__GLIBC__ && !__UCLIBC__) -#include /* for backtrace() */ -#endif static int ztest_fd_data = -1; static int ztest_fd_rand = -1; @@ -621,18 +618,11 @@ dump_debug_buffer(void) zfs_dbgmsg_print("ztest"); } -#define BACKTRACE_SZ 100 - static void sig_handler(int signo) { struct sigaction action; -#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ - int nptrs; - void *buffer[BACKTRACE_SZ]; - nptrs = backtrace(buffer, BACKTRACE_SZ); - backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); -#endif + libspl_dump_backtrace(); dump_debug_buffer(); /* diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 5b12c14acd6..79b640d8952 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -55,7 +55,7 @@ #define UNW_LOCAL_ONLY #include -static inline void +void libspl_dump_backtrace(void) { unw_context_t uc; @@ -85,7 +85,7 @@ libspl_dump_backtrace(void) #elif defined(HAVE_BACKTRACE) #include -static inline void +void libspl_dump_backtrace(void) { void *btptrs[100]; @@ -97,7 +97,10 @@ libspl_dump_backtrace(void) free(bt); } #else -#define libspl_dump_backtrace() +void +libspl_dump_backtrace(void) +{ +} #endif #if defined(__APPLE__) diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index 155bbab3020..126f2db2417 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -60,6 +60,8 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) return (0); } +extern void libspl_dump_backtrace(void); + #ifdef verify #undef verify #endif From 3974ef045ef270e72be6ca1d20baf67bfbecfbe5 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 11:26:11 +1000 Subject: [PATCH 22/43] libspl: lift backtrace into a separate file If it's going to be used directly by zdb/ztest, then it sort of doesn't make sense to carry it with the assert code. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 3 +- cmd/ztest.c | 3 +- lib/libnvpair/libnvpair.abi | 111 +++++++++++++++++++++++++++-- lib/libspl/Makefile.am | 3 +- lib/libspl/assert.c | 55 +------------- lib/libspl/backtrace.c | 80 +++++++++++++++++++++ lib/libspl/include/Makefile.am | 1 + lib/libspl/include/assert.h | 2 - lib/libspl/include/sys/backtrace.h | 32 +++++++++ lib/libuutil/libuutil.abi | 85 ++++++++++++++++++++-- lib/libzfs/libzfs.abi | 41 ++++++++++- lib/libzfs_core/libzfs_core.abi | 89 ++++++++++++++++++++--- 12 files changed, 426 insertions(+), 79 deletions(-) create mode 100644 lib/libspl/backtrace.c create mode 100644 lib/libspl/include/sys/backtrace.h diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 908e4e0ab27..01d584844c0 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include @@ -844,7 +845,7 @@ static void sig_handler(int signo) { struct sigaction action; - libspl_dump_backtrace(); + libspl_backtrace(); dump_debug_buffer(); /* diff --git a/cmd/ztest.c b/cmd/ztest.c index ccfe71c2956..d6f22d04a6e 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -136,6 +136,7 @@ #include #include #include +#include static int ztest_fd_data = -1; static int ztest_fd_rand = -1; @@ -622,7 +623,7 @@ static void sig_handler(int signo) { struct sigaction action; - libspl_dump_backtrace(); + libspl_backtrace(); dump_debug_buffer(); /* diff --git a/lib/libnvpair/libnvpair.abi b/lib/libnvpair/libnvpair.abi index ef92f3e9bda..69009375e88 100644 --- a/lib/libnvpair/libnvpair.abi +++ b/lib/libnvpair/libnvpair.abi @@ -79,6 +79,7 @@ + @@ -1156,6 +1157,11 @@ + + + + + @@ -2041,9 +2047,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2051,11 +2125,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2536,11 +2642,6 @@ - - - - - diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 94be416d46a..f8943572bf2 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -7,7 +7,8 @@ noinst_LTLIBRARIES += libspl_assert.la libspl.la CPPCHECKTARGETS += libspl_assert.la libspl.la libspl_assert_la_SOURCES = \ - %D%/assert.c + %D%/assert.c \ + %D%/backtrace.c libspl_la_SOURCES = \ %D%/libspl_impl.h \ diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 79b640d8952..ff0d17ba2a5 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -28,6 +28,7 @@ #include #include +#include #if defined(__linux__) #include @@ -51,58 +52,6 @@ pthread_getname_np(pthread_self(), buf, len); #endif -#if defined(HAVE_LIBUNWIND) -#define UNW_LOCAL_ONLY -#include - -void -libspl_dump_backtrace(void) -{ - unw_context_t uc; - unw_cursor_t cp; - unw_word_t ip, off; - char funcname[128]; -#ifdef HAVE_LIBUNWIND_ELF - char objname[128]; - unw_word_t objoff; -#endif - - fprintf(stderr, "Call trace:\n"); - unw_getcontext(&uc); - unw_init_local(&cp, &uc); - while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &ip); - unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); -#ifdef HAVE_LIBUNWIND_ELF - unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); - fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", - ip, funcname, off, objname, objoff); -#else - fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); -#endif - } -} -#elif defined(HAVE_BACKTRACE) -#include - -void -libspl_dump_backtrace(void) -{ - void *btptrs[100]; - size_t nptrs = backtrace(btptrs, 100); - char **bt = backtrace_symbols(btptrs, nptrs); - fprintf(stderr, "Call trace:\n"); - for (size_t i = 0; i < nptrs; i++) - fprintf(stderr, " %s\n", bt[i]); - free(bt); -} -#else -void -libspl_dump_backtrace(void) -{ -} -#endif - #if defined(__APPLE__) static inline uint64_t libspl_gettid(void) @@ -154,7 +103,7 @@ libspl_assertf(const char *file, const char *func, int line, getpid(), libspl_getprogname(), libspl_gettid(), tname); - libspl_dump_backtrace(); + libspl_backtrace(); #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c new file mode 100644 index 00000000000..0e653cd9643 --- /dev/null +++ b/lib/libspl/backtrace.c @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024, Rob Norris + * Copyright (c) 2024, Klara Inc. + */ + +#include +#include + +#if defined(HAVE_LIBUNWIND) +#define UNW_LOCAL_ONLY +#include + +void +libspl_backtrace(void) +{ + unw_context_t uc; + unw_cursor_t cp; + unw_word_t ip, off; + char funcname[128]; +#ifdef HAVE_LIBUNWIND_ELF + char objname[128]; + unw_word_t objoff; +#endif + + fprintf(stderr, "Call trace:\n"); + unw_getcontext(&uc); + unw_init_local(&cp, &uc); + while (unw_step(&cp) > 0) { + unw_get_reg(&cp, UNW_REG_IP, &ip); + unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); +#ifdef HAVE_LIBUNWIND_ELF + unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); + fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", + ip, funcname, off, objname, objoff); +#else + fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); +#endif + } +} +#elif defined(HAVE_BACKTRACE) +#include + +void +libspl_backtrace(void) +{ + void *btptrs[100]; + size_t nptrs = backtrace(btptrs, 100); + char **bt = backtrace_symbols(btptrs, nptrs); + fprintf(stderr, "Call trace:\n"); + for (size_t i = 0; i < nptrs; i++) + fprintf(stderr, " %s\n", bt[i]); + free(bt); +} +#else +void +libspl_backtrace(void) +{ +} +#endif + diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am index 2c1d21edf19..4ad3b854cbe 100644 --- a/lib/libspl/include/Makefile.am +++ b/lib/libspl/include/Makefile.am @@ -27,6 +27,7 @@ libspl_sys_HEADERS = \ %D%/sys/acl.h \ %D%/sys/acl_impl.h \ %D%/sys/asm_linkage.h \ + %D%/sys/backtrace.h \ %D%/sys/callb.h \ %D%/sys/cmn_err.h \ %D%/sys/cred.h \ diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index 126f2db2417..155bbab3020 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -60,8 +60,6 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) return (0); } -extern void libspl_dump_backtrace(void); - #ifdef verify #undef verify #endif diff --git a/lib/libspl/include/sys/backtrace.h b/lib/libspl/include/sys/backtrace.h new file mode 100644 index 00000000000..97ee7740ce6 --- /dev/null +++ b/lib/libspl/include/sys/backtrace.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024, Rob Norris + * Copyright (c) 2024, Klara Inc. + */ + +#ifndef _LIBSPL_SYS_BACKTRACE_H +#define _LIBSPL_SYS_BACKTRACE_H + +void libspl_backtrace(void); + +#endif diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi index e942d24c653..2ed2fb2e41e 100644 --- a/lib/libuutil/libuutil.abi +++ b/lib/libuutil/libuutil.abi @@ -149,6 +149,7 @@ + @@ -242,6 +243,22 @@ + + + + + + + + + + + + + + + + @@ -576,6 +593,27 @@ + + + + + + + + + + + + + + + + + + + + + @@ -596,14 +634,11 @@ - + - - - - + @@ -800,9 +835,16 @@ + + + + + + + @@ -912,6 +954,25 @@ + + + + + + + + + + + + + + + + + + + @@ -920,12 +981,23 @@ + + + + + + + + + + + @@ -937,8 +1009,9 @@ - + + diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index c3efb298416..80f4b7439a5 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -191,6 +191,7 @@ + @@ -777,6 +778,18 @@ + + + + + + + + + + + + @@ -1092,6 +1105,19 @@ + + + + + + + + + + + + + @@ -6252,6 +6278,11 @@ + + + + + @@ -6363,7 +6394,7 @@ - + @@ -8672,7 +8703,7 @@ - + @@ -8697,6 +8728,12 @@ + + + + + + diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 5b95c8f779d..cf9d6bddc9f 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -132,6 +132,7 @@ + @@ -231,10 +232,18 @@ + + + + + + + + @@ -242,6 +251,14 @@ + + + + + + + + @@ -574,6 +591,27 @@ + + + + + + + + + + + + + + + + + + + + + @@ -594,14 +632,11 @@ - + - - - - + @@ -770,6 +805,13 @@ + + + + + + + @@ -873,12 +915,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -895,8 +967,9 @@ - + + @@ -1119,7 +1192,7 @@ - + @@ -1127,7 +1200,7 @@ - + From 1ea8c59441cd215d4f45cbe839cbfe51c6e32068 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 13:04:14 +1000 Subject: [PATCH 23/43] backtrace: rework for signal safety Mostly, try a lot harder to not allocate anything. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 2 +- cmd/ztest.c | 2 +- lib/libspl/assert.c | 2 +- lib/libspl/backtrace.c | 91 +++++++++++++++++++++--------- lib/libspl/include/sys/backtrace.h | 2 +- 5 files changed, 68 insertions(+), 31 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 01d584844c0..7c2819d3cf0 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -845,7 +845,7 @@ static void sig_handler(int signo) { struct sigaction action; - libspl_backtrace(); + libspl_backtrace(STDERR_FILENO); dump_debug_buffer(); /* diff --git a/cmd/ztest.c b/cmd/ztest.c index d6f22d04a6e..b4d63b02dda 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -623,7 +623,7 @@ static void sig_handler(int signo) { struct sigaction action; - libspl_backtrace(); + libspl_backtrace(STDERR_FILENO); dump_debug_buffer(); /* diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index ff0d17ba2a5..d11361b387e 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -103,7 +103,7 @@ libspl_assertf(const char *file, const char *func, int line, getpid(), libspl_getprogname(), libspl_gettid(), tname); - libspl_backtrace(); + libspl_backtrace(STDERR_FILENO); #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index 0e653cd9643..dd8cb025f4f 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -24,57 +24,94 @@ */ #include -#include +#include +#include + +/* + * libspl_backtrace() must be safe to call from inside a signal hander. This + * mostly means it must not allocate, and so we can't use things like printf. + */ #if defined(HAVE_LIBUNWIND) #define UNW_LOCAL_ONLY #include -void -libspl_backtrace(void) +static size_t +libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) { + static const char hexdigits[] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' + }; + + size_t pos = 0; + boolean_t want = (digits == 0); + for (int i = 15; i >= 0; i--) { + const uint64_t d = v >> (i * 4) & 0xf; + if (!want && (d != 0 || digits > i)) + want = B_TRUE; + if (want) { + buf[pos++] = hexdigits[d]; + if (pos == buflen) + break; + } + } + return (pos); +} + +void +libspl_backtrace(int fd) +{ + ssize_t ret __attribute__((unused)); unw_context_t uc; unw_cursor_t cp; - unw_word_t ip, off; - char funcname[128]; -#ifdef HAVE_LIBUNWIND_ELF - char objname[128]; - unw_word_t objoff; -#endif + unw_word_t loc; + char buf[128]; + size_t n; - fprintf(stderr, "Call trace:\n"); + ret = write(fd, "Call trace:\n", 12); unw_getcontext(&uc); unw_init_local(&cp, &uc); while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &ip); - unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); + unw_get_reg(&cp, UNW_REG_IP, &loc); + ret = write(fd, " [0x", 5); + n = libspl_u64_to_hex_str(loc, 10, buf, sizeof (buf)); + ret = write(fd, buf, n); + ret = write(fd, "] ", 2); + unw_get_proc_name(&cp, buf, sizeof (buf), &loc); + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + ret = write(fd, buf, n); + ret = write(fd, "+0x", 3); + n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); + ret = write(fd, buf, n); #ifdef HAVE_LIBUNWIND_ELF - unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); - fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", - ip, funcname, off, objname, objoff); -#else - fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); + ret = write(fd, " (in ", 5); + unw_get_elf_filename(&cp, buf, sizeof (buf), &loc); + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + ret = write(fd, buf, n); + ret = write(fd, " +0x", 4); + n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); + ret = write(fd, buf, n); + ret = write(fd, ")", 1); #endif + ret = write(fd, "\n", 1); } } #elif defined(HAVE_BACKTRACE) #include void -libspl_backtrace(void) +libspl_backtrace(int fd) { - void *btptrs[100]; - size_t nptrs = backtrace(btptrs, 100); - char **bt = backtrace_symbols(btptrs, nptrs); - fprintf(stderr, "Call trace:\n"); - for (size_t i = 0; i < nptrs; i++) - fprintf(stderr, " %s\n", bt[i]); - free(bt); + ssize_t ret __attribute__((unused)); + void *btptrs[64]; + size_t nptrs = backtrace(btptrs, 64); + ret = write(fd, "Call trace:\n", 12); + backtrace_symbols_fd(btptrs, nptrs, fd); } #else void -libspl_backtrace(void) +libspl_backtrace(int fd __maybe_unused) { } #endif - diff --git a/lib/libspl/include/sys/backtrace.h b/lib/libspl/include/sys/backtrace.h index 97ee7740ce6..f9869ffc9e1 100644 --- a/lib/libspl/include/sys/backtrace.h +++ b/lib/libspl/include/sys/backtrace.h @@ -27,6 +27,6 @@ #ifndef _LIBSPL_SYS_BACKTRACE_H #define _LIBSPL_SYS_BACKTRACE_H -void libspl_backtrace(void); +void libspl_backtrace(int fd); #endif From fa99d9cd9cbc6aca3245afcfe321b8226985597d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 13:54:08 +1000 Subject: [PATCH 24/43] zfs_dbgmsg_print: make FreeBSD and Linux consistent FreeBSD was using fprintf(), which might not be signal-safe. Meanwhile, Linux's locking did not cover the header output. This two quirks are unrelated, but both have the same response: be like the other one. So with this commit, both functions are the same except for the names of their lock and list variables. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- module/os/freebsd/zfs/zfs_debug.c | 24 ++++++++++++++++++++---- module/os/linux/zfs/zfs_debug.c | 3 ++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c index 78d50c6fd8b..3e832a9104f 100644 --- a/module/os/freebsd/zfs/zfs_debug.c +++ b/module/os/freebsd/zfs/zfs_debug.c @@ -234,13 +234,29 @@ __dprintf(boolean_t dprint, const char *file, const char *func, void zfs_dbgmsg_print(const char *tag) { - zfs_dbgmsg_t *zdm; + ssize_t ret __attribute__((unused)); - (void) printf("ZFS_DBGMSG(%s):\n", tag); mutex_enter(&zfs_dbgmsgs_lock); - for (zdm = list_head(&zfs_dbgmsgs); zdm; + + /* + * We use write() in this function instead of printf() + * so it is safe to call from a signal handler. + */ + ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); + ret = write(STDOUT_FILENO, tag, strlen(tag)); + ret = write(STDOUT_FILENO, ") START:\n", 9); + + for (zfs_dbgmsg_t zdm = list_head(&zfs_dbgmsgs); zdm != NULL; zdm = list_next(&zfs_dbgmsgs, zdm)) - (void) printf("%s\n", zdm->zdm_msg); + ret = write(STDOUT_FILENO, zdm->zdm_msg, + strlen(zdm->zdm_msg)); + ret = write(STDOUT_FILENO, "\n", 1); + } + + ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); + ret = write(STDOUT_FILENO, tag, strlen(tag)); + ret = write(STDOUT_FILENO, ") END\n", 6); + mutex_exit(&zfs_dbgmsgs_lock); } #endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index f707959c944..bc5c028dca0 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -225,6 +225,8 @@ zfs_dbgmsg_print(const char *tag) { ssize_t ret __attribute__((unused)); + mutex_enter(&zfs_dbgmsgs.pl_lock); + /* * We use write() in this function instead of printf() * so it is safe to call from a signal handler. @@ -233,7 +235,6 @@ zfs_dbgmsg_print(const char *tag) ret = write(STDOUT_FILENO, tag, strlen(tag)); ret = write(STDOUT_FILENO, ") START:\n", 9); - mutex_enter(&zfs_dbgmsgs.pl_lock); for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL; zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) { ret = write(STDOUT_FILENO, zdm->zdm_msg, From 3c941d18183455138f7c5dcc212177bd3cea8afc Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 13:58:26 +1000 Subject: [PATCH 25/43] zdb/ztest: send dbgmsg output to stderr And, make the output fd an arg to zfs_dbgmsg_print(). This is a change in behaviour, but keeps it consistent with where crash traces go, and it's easy to argue this is what we want anyway; this is information about the task, not the actual output of the task. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 4 ++-- cmd/ztest.c | 4 ++-- include/sys/zfs_debug.h | 2 +- module/os/freebsd/zfs/zfs_debug.c | 25 ++++++++++++------------- module/os/linux/zfs/zfs_debug.c | 19 +++++++++---------- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 7c2819d3cf0..704fcf4422d 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -837,8 +837,8 @@ dump_debug_buffer(void) * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ - ret = write(STDOUT_FILENO, "\n", 1); - zfs_dbgmsg_print("zdb"); + ret = write(STDERR_FILENO, "\n", 1); + zfs_dbgmsg_print(STDERR_FILENO, "zdb"); } static void sig_handler(int signo) diff --git a/cmd/ztest.c b/cmd/ztest.c index b4d63b02dda..f77a37c2154 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -615,8 +615,8 @@ dump_debug_buffer(void) * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ - ret = write(STDOUT_FILENO, "\n", 1); - zfs_dbgmsg_print("ztest"); + ret = write(STDERR_FILENO, "\n", 1); + zfs_dbgmsg_print(STDERR_FILENO, "ztest"); } static void sig_handler(int signo) diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 8d94557a588..e509c8b7c63 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -104,7 +104,7 @@ extern void zfs_dbgmsg_fini(void); #ifndef _KERNEL extern int dprintf_find_string(const char *string); -extern void zfs_dbgmsg_print(const char *tag); +extern void zfs_dbgmsg_print(int fd, const char *tag); #endif #ifdef __cplusplus diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c index 3e832a9104f..c4cebe10207 100644 --- a/module/os/freebsd/zfs/zfs_debug.c +++ b/module/os/freebsd/zfs/zfs_debug.c @@ -232,30 +232,29 @@ __dprintf(boolean_t dprint, const char *file, const char *func, #else void -zfs_dbgmsg_print(const char *tag) +zfs_dbgmsg_print(int fd, const char *tag) { ssize_t ret __attribute__((unused)); - mutex_enter(&zfs_dbgmsgs_lock); - /* * We use write() in this function instead of printf() * so it is safe to call from a signal handler. */ - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") START:\n", 9); + ret = write(fd, "ZFS_DBGMSG(", 11); + ret = write(fd, tag, strlen(tag)); + ret = write(fd, ") START:\n", 9); - for (zfs_dbgmsg_t zdm = list_head(&zfs_dbgmsgs); zdm != NULL; + mutex_enter(&zfs_dbgmsgs_lock); + + for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs); zdm != NULL; zdm = list_next(&zfs_dbgmsgs, zdm)) - ret = write(STDOUT_FILENO, zdm->zdm_msg, - strlen(zdm->zdm_msg)); - ret = write(STDOUT_FILENO, "\n", 1); + ret = write(fd, zdm->zdm_msg, strlen(zdm->zdm_msg)); + ret = write(fd, "\n", 1); } - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") END\n", 6); + ret = write(fd, "ZFS_DBGMSG(", 11); + ret = write(fd, tag, strlen(tag)); + ret = write(fd, ") END\n", 6); mutex_exit(&zfs_dbgmsgs_lock); } diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index bc5c028dca0..9ee40771fc1 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -221,7 +221,7 @@ __dprintf(boolean_t dprint, const char *file, const char *func, #else void -zfs_dbgmsg_print(const char *tag) +zfs_dbgmsg_print(int fd, const char *tag) { ssize_t ret __attribute__((unused)); @@ -231,20 +231,19 @@ zfs_dbgmsg_print(const char *tag) * We use write() in this function instead of printf() * so it is safe to call from a signal handler. */ - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") START:\n", 9); + ret = write(fd, "ZFS_DBGMSG(", 11); + ret = write(fd, tag, strlen(tag)); + ret = write(fd, ") START:\n", 9); for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL; zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) { - ret = write(STDOUT_FILENO, zdm->zdm_msg, - strlen(zdm->zdm_msg)); - ret = write(STDOUT_FILENO, "\n", 1); + ret = write(fd, zdm->zdm_msg, strlen(zdm->zdm_msg)); + ret = write(fd, "\n", 1); } - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") END\n", 6); + ret = write(fd, "ZFS_DBGMSG(", 11); + ret = write(fd, tag, strlen(tag)); + ret = write(fd, ") END\n", 6); mutex_exit(&zfs_dbgmsgs.pl_lock); } From e675852bc1d50404fcbe4fa0e1f57b6c318e6349 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 16 May 2024 06:03:41 +1000 Subject: [PATCH 26/43] dbuf: separate refcount calls for dbuf and dbuf_user In 92dc4ad83 I updated the dbuf_cache accounting to track the size of userdata associated with dbufs. This adds the size of the dbuf+userdata together in a single call to zfs_refcount_add_many(), but sometime removes them in separate calls to zfs_refcount_remove_many(), if dbuf and userdata are evicted separately. What I didn't realise is that when refcount tracking is on, zfs_refcount_add_many() and zfs_refcount_remove_many() are expected to be paired, with their second & third args (count & holder) the same on both sides. Splitting the remove part into two calls means the counts don't match up, tripping a panic. This commit fixes that, by always adding and removing the dbuf and userdata counts separately. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reported-by: Mark Johnston Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16191 --- module/zfs/dbuf.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 806ebcfc578..bce41948c40 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -578,7 +578,7 @@ dbuf_evict_user(dmu_buf_impl_t *db) */ uint64_t size = dbu->dbu_size; (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, size, db); + &dbuf_caches[db->db_caching_status].size, size, dbu); if (db->db_caching_status == DB_DBUF_CACHE) DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); } @@ -784,12 +784,15 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); - uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); + uint64_t size = db->db.db_size; + uint64_t usize = dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( &dbuf_caches[DB_DBUF_CACHE].size, size, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); @@ -3794,16 +3797,21 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); - uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); + uint64_t size = db->db.db_size; + uint64_t usize = dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, size, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, usize, + db->db_user); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + size + usize); } db->db_caching_status = DB_NO_CACHE; } @@ -4022,10 +4030,12 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) db->db_caching_status = dcs; multilist_insert(&dbuf_caches[dcs].cache, db); - uint64_t db_size = db->db.db_size + - dmu_buf_user_size(&db->db); - size = zfs_refcount_add_many( + uint64_t db_size = db->db.db_size; + uint64_t dbu_size = dmu_buf_user_size(&db->db); + (void) zfs_refcount_add_many( &dbuf_caches[dcs].size, db_size, db); + size = zfs_refcount_add_many( + &dbuf_caches[dcs].size, dbu_size, db->db_user); uint8_t db_level = db->db_level; mutex_exit(&db->db_mtx); @@ -4038,7 +4048,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) DBUF_STAT_MAX(cache_size_bytes_max, size); DBUF_STAT_BUMP(cache_levels[db_level]); DBUF_STAT_INCR(cache_levels_bytes[db_level], - db_size); + db_size + dbu_size); } if (dcs == DB_DBUF_CACHE && !evicting) From a043b60f1eabcdd72f30b692565a2c982b1a1e8a Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Thu, 16 May 2024 18:37:50 -0400 Subject: [PATCH 27/43] Correct level handling in zstream recompress. sscanf returns number of items parsed on success and EOF on failure. Reviewed-by: Adam Moss Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Rich Ercolani Closes #16198 --- cmd/zstream/zstream_recompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c index 8392ef3de72..f9e01d1aa4c 100644 --- a/cmd/zstream/zstream_recompress.c +++ b/cmd/zstream/zstream_recompress.c @@ -77,7 +77,7 @@ zstream_do_recompress(int argc, char *argv[]) while ((c = getopt(argc, argv, "l:")) != -1) { switch (c) { case 'l': - if (sscanf(optarg, "%d", &level) != 0) { + if (sscanf(optarg, "%d", &level) != 1) { fprintf(stderr, "failed to parse level '%s'\n", optarg); From d0d7c0d8f92d1a70f2b92e600f980f254b725668 Mon Sep 17 00:00:00 2001 From: omni Date: Sat, 4 May 2024 08:44:55 +0000 Subject: [PATCH 28/43] config/zfs-build.m4: sort vendors Reviewed-by: Brian Behlendorf Signed-off-by: omni Closes #16164 --- config/zfs-build.m4 | 96 +++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index bb5a85d815d..b33b9225bb9 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -512,32 +512,33 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ [with_vendor=$withval], [with_vendor=check]) AS_IF([test "x$with_vendor" = "xcheck"],[ - if test -f /etc/toss-release ; then - VENDOR=toss ; - elif test -f /etc/fedora-release ; then - VENDOR=fedora ; - elif test -f /etc/redhat-release ; then - VENDOR=redhat ; - elif test -f /etc/gentoo-release ; then - VENDOR=gentoo ; + if test -f /etc/alpine-release ; then + VENDOR=alpine ; elif test -f /etc/arch-release ; then VENDOR=arch ; + elif test -f /etc/fedora-release ; then + VENDOR=fedora ; + elif test -f /bin/freebsd-version ; then + VENDOR=freebsd ; + elif test -f /etc/gentoo-release ; then + VENDOR=gentoo ; + elif test -f /etc/lunar.release ; then + VENDOR=lunar ; + elif test -f /etc/openEuler-release ; then + VENDOR=openeuler ; elif test -f /etc/SuSE-release ; then VENDOR=sles ; elif test -f /etc/slackware-version ; then VENDOR=slackware ; - elif test -f /etc/lunar.release ; then - VENDOR=lunar ; + elif test -f /etc/toss-release ; then + VENDOR=toss ; elif test -f /etc/lsb-release ; then VENDOR=ubuntu ; + # put debian and redhat last as derivatives may have also their file elif test -f /etc/debian_version ; then VENDOR=debian ; - elif test -f /etc/alpine-release ; then - VENDOR=alpine ; - elif test -f /bin/freebsd-version ; then - VENDOR=freebsd ; - elif test -f /etc/openEuler-release ; then - VENDOR=openeuler ; + elif test -f /etc/redhat-release ; then + VENDOR=redhat ; else VENDOR= ; fi], @@ -550,20 +551,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default package type]) case "$VENDOR" in - toss) DEFAULT_PACKAGE=rpm ;; - redhat) DEFAULT_PACKAGE=rpm ;; - fedora) DEFAULT_PACKAGE=rpm ;; - gentoo) DEFAULT_PACKAGE=tgz ;; - alpine) DEFAULT_PACKAGE=tgz ;; - arch) DEFAULT_PACKAGE=tgz ;; - sles) DEFAULT_PACKAGE=rpm ;; - slackware) DEFAULT_PACKAGE=tgz ;; - lunar) DEFAULT_PACKAGE=tgz ;; - ubuntu) DEFAULT_PACKAGE=deb ;; - debian) DEFAULT_PACKAGE=deb ;; - freebsd) DEFAULT_PACKAGE=pkg ;; - openeuler) DEFAULT_PACKAGE=rpm ;; - *) DEFAULT_PACKAGE=rpm ;; + alpine|arch|gentoo|lunar|slackware) + DEFAULT_PACKAGE=tgz ;; + debian|ubuntu) + DEFAULT_PACKAGE=deb ;; + freebsd) + DEFAULT_PACKAGE=pkg ;; + *) + # fedora|openeuler|redhat|sles|toss + DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) @@ -578,7 +574,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default shell]) case "$VENDOR" in - gentoo|alpine) DEFAULT_INIT_SHELL=/sbin/openrc-run + alpine|gentoo) DEFAULT_INIT_SHELL=/sbin/openrc-run IS_SYSV_RC=false ;; *) DEFAULT_INIT_SHELL=/bin/sh IS_SYSV_RC=true ;; @@ -598,17 +594,19 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default init config directory]) case "$VENDOR" in - alpine) initconfdir=/etc/conf.d ;; - gentoo) initconfdir=/etc/conf.d ;; - toss) initconfdir=/etc/sysconfig ;; - redhat) initconfdir=/etc/sysconfig ;; - fedora) initconfdir=/etc/sysconfig ;; - sles) initconfdir=/etc/sysconfig ;; - openeuler) initconfdir=/etc/sysconfig ;; - ubuntu) initconfdir=/etc/default ;; - debian) initconfdir=/etc/default ;; - freebsd) initconfdir=$sysconfdir/rc.conf.d;; - *) initconfdir=/etc/default ;; + alpine|gentoo) + initconfdir=/etc/conf.d + ;; + fedora|openeuler|redhat|sles|toss) + initconfdir=/etc/sysconfig + ;; + freebsd) + initconfdir=$sysconfdir/rc.conf.d + ;; + *) + # debian|ubuntu + initconfdir=/etc/default + ;; esac AC_MSG_RESULT([$initconfdir]) AC_SUBST(initconfdir) @@ -625,11 +623,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default bash completion directory]) case "$VENDOR" in - ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; - debian) bashcompletiondir=/usr/share/bash-completion/completions ;; - freebsd) bashcompletiondir=$sysconfdir/bash_completion.d;; - gentoo) bashcompletiondir=/usr/share/bash-completion/completions ;; - *) bashcompletiondir=/etc/bash_completion.d ;; + debian|gentoo|ubuntu) + bashcompletiondir=/usr/share/bash-completion/completions + ;; + freebsd) + bashcompletiondir=$sysconfdir/bash_completion.d + ;; + *) + bashcompletiondir=/etc/bash_completion.d + ;; esac AC_MSG_RESULT([$bashcompletiondir]) AC_SUBST(bashcompletiondir) From fec16b93c46d80ae60a4f20632932601030b6fc0 Mon Sep 17 00:00:00 2001 From: omni Date: Sat, 4 May 2024 08:47:13 +0000 Subject: [PATCH 29/43] config/zfs-build.m4: add Alpine Linux bash-completion path Reviewed-by: Brian Behlendorf Signed-off-by: omni Closes #16164 --- config/zfs-build.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index b33b9225bb9..368684e1c51 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -623,7 +623,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default bash completion directory]) case "$VENDOR" in - debian|gentoo|ubuntu) + alpine|debian|gentoo|ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; freebsd) From efbef9e6cc1e14cc19a24b76175f7ec86610161a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 16 May 2024 20:56:55 -0400 Subject: [PATCH 30/43] FreeBSD: Add zfs_link_create() error handling Originally Solaris didn't expect errors there, but they may happen if we fail to add entry into ZAP. Linux fixed it in #7421, but it was never fully ported to FreeBSD. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #13215 Closes #16138 --- module/os/freebsd/zfs/zfs_dir.c | 1 + module/os/freebsd/zfs/zfs_vnops_os.c | 54 ++++++++++++++++++++------ module/os/freebsd/zfs/zfs_znode.c | 1 - tests/test-runner/bin/zts-report.py.in | 1 - 4 files changed, 43 insertions(+), 14 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c index 948df8e50de..3cdb94d6cd5 100644 --- a/module/os/freebsd/zfs/zfs_dir.c +++ b/module/os/freebsd/zfs/zfs_dir.c @@ -543,6 +543,7 @@ zfs_rmnode(znode_t *zp) dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); zfs_znode_delete(zp, tx); + zfs_znode_free(zp); dmu_tx_commit(tx); diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index d9a8c8a0d76..b9b332434bd 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1175,10 +1175,25 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + error = zfs_link_create(dzp, name, zp, tx, ZNEW); + if (error != 0) { + /* + * Since, we failed to add the directory entry for it, + * delete the newly created dnode. + */ + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + getnewvnode_drop_reserve(); + goto out; + } + if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); @@ -1526,13 +1541,19 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - /* * Now put new name in parent dir. */ - (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); + error = zfs_link_create(dzp, dirname, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); *zpp = zp; @@ -1540,6 +1561,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); +out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -1550,7 +1572,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); - return (0); + return (error); } #if __FreeBSD_version < 1300124 @@ -3586,10 +3608,14 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, /* * Insert the new object into the directory. */ - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - *zpp = zp; + error = zfs_link_create(dzp, name, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + } else { + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + } zfs_acl_ids_free(&acl_ids); @@ -3597,8 +3623,12 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, getnewvnode_drop_reserve(); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0) { + *zpp = zp; + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + } zfs_exit(zfsvfs, FTAG); return (error); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 0d4c94555c6..0eea2a84941 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -1234,7 +1234,6 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); - zfs_znode_free(zp); } void diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ecc50f48715..5ca13093136 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -182,7 +182,6 @@ if sys.platform.startswith('freebsd'): 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], - 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], 'rsend/send_raw_ashift': ['SKIP', 14961], }) From 08648cf0da381fb667fa413ba95407ae4c3f8a8f Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Fri, 24 May 2024 18:45:09 -0700 Subject: [PATCH 31/43] Allow block cloning to be interrupted by a signal. Even though block cloning is much faster than regular copying, it is not instantaneous - the file might be large and the recordsize small. It would be nice to be able to interrupt it with a signal (e.g., SIGINFO on FreeBSD to see the progress). Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #16208 --- module/zfs/zfs_vnops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index babb07ca25a..b222a6f88d2 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1412,6 +1412,11 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, outoff += size; len -= size; done += size; + + if (issig(JUSTLOOKING) && issig(FORREAL)) { + error = SET_ERROR(EINTR); + break; + } } vmem_free(bps, sizeof (bps[0]) * maxblocks); From 7572e8ca04adda7af207dd27d643d241351680e7 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 24 May 2024 18:45:58 -0700 Subject: [PATCH 32/43] Avoid a gcc -Wint-to-pointer-cast warning On 32-bit platforms long long is generally 64-bits. Sufficiently modern versions of gcc (13 in my testing) complains when casting a pointer to an integer of a different width so cast to uintptr_t first to avoid the warning. Fixes: c183d164aa Parallel pool import Reviewed-by: Brian Behlendorf Reviewed-by: Don Brady Signed-off-by: Brooks Davis Closes #16203 --- module/zfs/spa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d762f21a376..412f883e9c3 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -6827,7 +6827,7 @@ spa_tryimport(nvlist_t *tryconfig) */ char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", - TRYIMPORT_NAME, (u_longlong_t)curthread, poolname); + TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); mutex_enter(&spa_namespace_lock); spa = spa_add(name, tryconfig, NULL); From 708be0f415c83c941d3ed5153fafc5d3766e4cdd Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 May 2024 11:54:24 +1000 Subject: [PATCH 33/43] Linux 6.7 compat: detect if kernel defines intptr_t Since Linux 6.7 the kernel has defined intptr_t. Clang has -Wtypedef-redefinition by default, which causes the build to fail because we also have a typedef for intptr_t. Since its better to use the kernel's if it exists, detect it and skip our own. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16201 --- config/kernel-types.m4 | 40 ++++++++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/spl/sys/types.h | 2 ++ 3 files changed, 44 insertions(+) create mode 100644 config/kernel-types.m4 diff --git a/config/kernel-types.m4 b/config/kernel-types.m4 new file mode 100644 index 00000000000..ed76af28337 --- /dev/null +++ b/config/kernel-types.m4 @@ -0,0 +1,40 @@ +dnl # +dnl # check if kernel provides definitions for given types +dnl # + +dnl _ZFS_AC_KERNEL_SRC_TYPE(type) +AC_DEFUN([_ZFS_AC_KERNEL_SRC_TYPE], [ + ZFS_LINUX_TEST_SRC([type_$1], [ + #include + ],[ + const $1 __attribute__((unused)) x = ($1) 0; + ]) +]) + +dnl _ZFS_AC_KERNEL_TYPE(type) +AC_DEFUN([_ZFS_AC_KERNEL_TYPE], [ + AC_MSG_CHECKING([whether kernel defines $1]) + ZFS_LINUX_TEST_RESULT([type_$1], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_KERNEL_]m4_quote(m4_translit([$1], [a-z], [A-Z])), + 1, [kernel defines $1]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl ZFS_AC_KERNEL_TYPES([types...]) +AC_DEFUN([ZFS_AC_KERNEL_TYPES], [ + AC_DEFUN([ZFS_AC_KERNEL_SRC_TYPES], [ + m4_foreach_w([type], [$1], [ + _ZFS_AC_KERNEL_SRC_TYPE(type) + ]) + ]) + AC_DEFUN([ZFS_AC_KERNEL_TYPES], [ + m4_foreach_w([type], [$1], [ + _ZFS_AC_KERNEL_TYPE(type) + ]) + ]) +]) + +ZFS_AC_KERNEL_TYPES([intptr_t]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 548905ccd04..b51477b6a95 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -37,6 +37,7 @@ dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ + ZFS_AC_KERNEL_SRC_TYPES ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE @@ -187,6 +188,7 @@ dnl # dnl # Check results of kernel interface tests. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ + ZFS_AC_KERNEL_TYPES ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_OBJTOOL diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h index 20ba457f7ef..94ba7b6ad32 100644 --- a/include/os/linux/spl/sys/types.h +++ b/include/os/linux/spl/sys/types.h @@ -38,7 +38,9 @@ typedef unsigned long ulong_t; typedef unsigned long long u_longlong_t; typedef long long longlong_t; +#ifndef HAVE_KERNEL_INTPTR_T typedef long intptr_t; +#endif typedef unsigned long long rlim64_t; typedef struct task_struct kthread_t; From 34906f8bbee337ee5aa9b79c141517bff0a4e0ab Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 May 2024 11:55:47 +1000 Subject: [PATCH 34/43] zap: reuse zap_leaf_t on dbuf reuse after shrink If a shrink or truncate had recently freed a portion of the ZAP, the dbuf could still be sitting on the dbuf cache waiting for eviction. If it is then allocated for a new leaf before it can be evicted, the zap_leaf_t is still attached as userdata, tripping the VERIFY. Instead, just check for the userdata, and if we find it, reuse it. Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16157. Closes #16204 --- module/zfs/zap.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 81dab80daf8..03b76ea1b7b 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -425,20 +425,36 @@ zap_leaf_evict_sync(void *dbu) static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = zap_allocate_blocks(zap, 1); - l->l_dbuf = NULL; + uint64_t blkid = zap_allocate_blocks(zap, 1); + dmu_buf_t *db = NULL; VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, + blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, DMU_READ_NO_PREFETCH)); - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); + + /* + * Create the leaf structure and stash it on the dbuf. If zap was + * recent shrunk or truncated, the dbuf might have been sitting in the + * cache waiting to be evicted, and so still have the old leaf attached + * to it. If so, just reuse it. + */ + zap_leaf_t *l = dmu_buf_get_user(db); + if (l == NULL) { + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_blkid = blkid; + l->l_dbuf = db; + rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, + &l->l_dbuf); + dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + } else { + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + } + + rw_enter(&l->l_rwlock, RW_WRITER); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); From d0aa9dbccfb06778ca336732ee4e627f50475ad3 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 May 2024 12:00:29 +1000 Subject: [PATCH 35/43] Use memset to zero stack allocations containing unions C99 6.7.8.17 says that when an undesignated initialiser is used, only the first element of a union is initialised. If the first element is not the largest within the union, how the remaining space is initialised is up to the compiler. GCC extends the initialiser to the entire union, while Clang treats the remainder as padding, and so initialises according to whatever automatic/implicit initialisation rules are currently active. When Linux is compiled with CONFIG_INIT_STACK_ALL_PATTERN, -ftrivial-auto-var-init=pattern is added to the kernel CFLAGS. This flag sets the policy for automatic/implicit initialisation of variables on the stack. Taken together, this means that when compiling under CONFIG_INIT_STACK_ALL_PATTERN on Clang, the "zero" initialiser will only zero the first element in a union, and the rest will be filled with a pattern. This is significant for aes_ctx_t, which in aes_encrypt_atomic() and aes_decrypt_atomic() is initialised to zero, but then used as a gcm_ctx_t, which is the fifth element in the union, and thus gets pattern initialisation. Later, it's assumed to be zero, resulting in a hang. As confusing and undiscoverable as it is, by the spec, we are at fault when we initialise a structure containing a union with the zero initializer. As such, this commit replaces these uses with an explicit memset(0). Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16135 Closes #16206 --- cmd/zstream/zstream_redup.c | 4 +++- lib/libzfs/libzfs_sendrecv.c | 6 ++++-- module/icp/io/aes.c | 8 ++++++-- tests/zfs-tests/cmd/libzfs_input_check.c | 4 +++- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cmd/zstream/zstream_redup.c b/cmd/zstream/zstream_redup.c index c56a09cee75..6866639fe46 100644 --- a/cmd/zstream/zstream_redup.c +++ b/cmd/zstream/zstream_redup.c @@ -186,7 +186,7 @@ static void zfs_redup_stream(int infd, int outfd, boolean_t verbose) { int bufsz = SPA_MAXBLOCKSIZE; - dmu_replay_record_t thedrr = { 0 }; + dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; redup_table_t rdt; zio_cksum_t stream_cksum; @@ -194,6 +194,8 @@ zfs_redup_stream(int infd, int outfd, boolean_t verbose) uint64_t num_records = 0; uint64_t num_write_byref_records = 0; + memset(&thedrr, 0, sizeof (dmu_replay_record_t)); + #ifdef _ILP32 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; #else diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 526f57ea403..0370112c022 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -2170,7 +2170,8 @@ out: static int send_conclusion_record(int fd, zio_cksum_t *zc) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; + memset(&drr, 0, sizeof (dmu_replay_record_t)); drr.drr_type = DRR_END; if (zc != NULL) drr.drr_u.drr_end.drr_checksum = *zc; @@ -2272,7 +2273,8 @@ send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, } if (!dryrun) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; + memset(&drr, 0, sizeof (dmu_replay_record_t)); /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index d6f01304f56..522c436497b 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -832,12 +832,14 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext, crypto_spi_ctx_template_t template) { - aes_ctx_t aes_ctx = {{{{0}}}}; + aes_ctx_t aes_ctx; off_t saved_offset; size_t saved_length; size_t length_needed; int ret; + memset(&aes_ctx, 0, sizeof (aes_ctx_t)); + ASSERT(ciphertext != NULL); /* @@ -956,12 +958,14 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext, crypto_spi_ctx_template_t template) { - aes_ctx_t aes_ctx = {{{{0}}}}; + aes_ctx_t aes_ctx; off_t saved_offset; size_t saved_length; size_t length_needed; int ret; + memset(&aes_ctx, 0, sizeof (aes_ctx_t)); + ASSERT(plaintext != NULL); /* diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c index c661718a296..7d9ce4fada1 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check.c @@ -521,13 +521,15 @@ test_send_new(const char *snapshot, int fd) static void test_recv_new(const char *dataset, int fd) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); char snapshot[MAXNAMELEN + 32]; ssize_t count; + memset(&drr, 0, sizeof (dmu_replay_record_t)); + int cleanup_fd = open(ZFS_DEV, O_RDWR); if (cleanup_fd == -1) { (void) fprintf(stderr, "open(%s) failed: %s\n", ZFS_DEV, From 8865dfbcaad44a1056f35be60d3058dd3b1e9145 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Sat, 25 May 2024 04:02:58 +0200 Subject: [PATCH 36/43] Fix assertion in Persistent L2ARC At the end of l2arc_evict() fix an assertion in the case that l2ad_hand + distance == l2ad_end. Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #16202 Closes #16207 --- module/zfs/arc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index d1d60b84109..30d30b98a6c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8886,7 +8886,7 @@ out: * assertions may be violated without functional consequences * as the device is about to be removed. */ - ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } From 02c5aa9b092818785ed8db4e2246a828278138e3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 24 May 2024 22:11:18 -0400 Subject: [PATCH 37/43] Destroy ARC buffer in case of fill error In case of error dmu_buf_fill_done() returns the buffer back into DB_UNCACHED state. Since during transition from DB_UNCACHED into DB_FILL state dbuf_noread() allocates an ARC buffer, we must free it here, otherwise it will be leaked. Reviewed-by: Brian Behlendorf Reviewed-by: Jorgen Lundman Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15665 Closes #15802 Closes #16216 --- module/zfs/dbuf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index bce41948c40..56fe2c4dbe3 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2853,6 +2853,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) failed = B_FALSE; } else if (failed) { VERIFY(!dbuf_undirty(db, tx)); + arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; dbuf_clear_data(db); DTRACE_SET_STATE(db, "fill failed"); From 800d59d5771806459a23f10f3c9ee8f2d178b9ed Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 29 May 2024 11:53:31 -0400 Subject: [PATCH 38/43] Some improvements to metaslabs eviction - Add old eviction for special and dedup metaslab classes. Those vdevs may be potentially big and fragmented with large metaslabs, while their asynchronous write pattern is not really different from normal class. It seems an omission to not evict old metaslabs from them. - If we have metaslab preload enabled, which means we are not too low on memory, do not evict active metaslabs even if they are not used for some time. Eviction of active metaslabs means we won't be able to write anything until we load them, that may take some time, that is straight opposite to metaslab preload goals. For small systems the memory saving should be less important after recent reduction in number of allocators and so open metaslabs. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16214 --- module/zfs/metaslab.c | 7 +++++-- module/zfs/spa.c | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index cb004930d28..7170b5eefce 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -638,6 +638,7 @@ void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; + hrtime_t now = gethrtime(); for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); @@ -661,8 +662,10 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && - gethrtime() > msp->ms_selected_time + - (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { + now > msp->ms_selected_time + + MSEC2NSEC(metaslab_unload_delay_ms) && + (msp->ms_allocator == -1 || + !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 412f883e9c3..638572996c3 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -10168,6 +10168,9 @@ spa_sync(spa_t *spa, uint64_t txg) metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); + /* spa_embedded_log_class has only one metaslab per vdev. */ + metaslab_class_evict_old(spa->spa_special_class, txg); + metaslab_class_evict_old(spa->spa_dedup_class, txg); spa_sync_close_syncing_log_sm(spa); From 6b95031f5642f54bab063da84dd4009df2bc0b5e Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 29 May 2024 10:46:41 -0700 Subject: [PATCH 39/43] zed: Add deadman-slot_off.sh zedlet Optionally turn off disk's enclosure slot if an I/O is hung triggering the deadman. It's possible for outstanding I/O to a misbehaving SCSI disk to neither promptly complete or return an error. This can occur due to retry and recovery actions taken by the SCSI layer, driver, or disk. When it occurs the pool will be unresponsive even though there may be sufficient redundancy configured to proceeded without this single disk. When a hung I/O is detected by the kmods it will be posted as a deadman event. By default an I/O is considered to be hung after 5 minutes. This value can be changed with the zfs_deadman_ziotime_ms module parameter. If ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is set the disk's enclosure slot will be powered off causing the outstanding I/O to fail. The ZED will then handle this like a normal disk failure. By default ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is not set. As part of this change `zfs_deadman_events_per_second` is added to control the ratelimitting of deadman events independantly of delay events. In practice, a single deadman event is sufficient and more aren't particularly useful. Alphabetize the zfs_deadman_* entries in zfs.4. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #16226 --- cmd/zed/zed.d/Makefile.am | 2 + cmd/zed/zed.d/deadman-slot_off.sh | 71 +++++++++++++++++++ cmd/zed/zed.d/zed.rc | 7 ++ man/man4/zfs.4 | 21 +++--- module/zfs/vdev.c | 10 ++- tests/zfs-tests/include/tunables.cfg | 1 + .../functional/deadman/deadman_ratelimit.ksh | 8 +-- 7 files changed, 106 insertions(+), 14 deletions(-) create mode 100755 cmd/zed/zed.d/deadman-slot_off.sh diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am index 812558cf6d0..093a04c4636 100644 --- a/cmd/zed/zed.d/Makefile.am +++ b/cmd/zed/zed.d/Makefile.am @@ -9,6 +9,7 @@ dist_zedexec_SCRIPTS = \ %D%/all-debug.sh \ %D%/all-syslog.sh \ %D%/data-notify.sh \ + %D%/deadman-slot_off.sh \ %D%/generic-notify.sh \ %D%/pool_import-led.sh \ %D%/resilver_finish-notify.sh \ @@ -29,6 +30,7 @@ SUBSTFILES += $(nodist_zedexec_SCRIPTS) zedconfdefaults = \ all-syslog.sh \ data-notify.sh \ + deadman-slot_off.sh \ history_event-zfs-list-cacher.sh \ pool_import-led.sh \ resilver_finish-notify.sh \ diff --git a/cmd/zed/zed.d/deadman-slot_off.sh b/cmd/zed/zed.d/deadman-slot_off.sh new file mode 100755 index 00000000000..7b339b3add0 --- /dev/null +++ b/cmd/zed/zed.d/deadman-slot_off.sh @@ -0,0 +1,71 @@ +#!/bin/sh +# shellcheck disable=SC3014,SC2154,SC2086,SC2034 +# +# Turn off disk's enclosure slot if an I/O is hung triggering the deadman. +# +# It's possible for outstanding I/O to a misbehaving SCSI disk to neither +# promptly complete or return an error. This can occur due to retry and +# recovery actions taken by the SCSI layer, driver, or disk. When it occurs +# the pool will be unresponsive even though there may be sufficient redundancy +# configured to proceeded without this single disk. +# +# When a hung I/O is detected by the kmods it will be posted as a deadman +# event. By default an I/O is considered to be hung after 5 minutes. This +# value can be changed with the zfs_deadman_ziotime_ms module parameter. +# If ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is set the disk's enclosure +# slot will be powered off causing the outstanding I/O to fail. The ZED +# will then handle this like a normal disk failure and FAULT the vdev. +# +# We assume the user will be responsible for turning the slot back on +# after replacing the disk. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI Enclosure services (SES) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: slot successfully powered off +# 1: enclosure not available +# 2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN disabled +# 3: System not configured to wait on deadman +# 4: The enclosure sysfs path passed from ZFS does not exist +# 5: Enclosure slot didn't actually turn off after we told it to + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + # No JBOD enclosure or NVMe slots + exit 1 +fi + +if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN}" != "1" ] ; then + exit 2 +fi + +if [ "$ZEVENT_POOL_FAILMODE" != "wait" ] ; then + exit 3 +fi + +if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then + exit 4 +fi + +# Turn off the slot and wait for sysfs to report that the slot is off. +# It can take ~400ms on some enclosures and multiple retries may be needed. +for i in $(seq 1 20) ; do + echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + + for j in $(seq 1 5) ; do + if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then + break 2 + fi + sleep 0.1 + done +done + +if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then + exit 5 +fi + +zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH" diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index ec64ecfaa13..af56147a969 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -148,6 +148,13 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # supports slot power control via sysfs. #ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1 +## +# Power off the drive's slot in the enclosure if there is a hung I/O which +# exceeds the deadman timeout. This can help prevent a single misbehaving +# drive from rendering a redundant pool unavailable. This assumes your drive +# enclosure fully supports slot power control via sysfs. +#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN=1 + ## # Ntfy topic # This defines which topic will receive the ntfy notification. diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6895a2a6d79..f1d14b4d01a 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -906,6 +906,13 @@ Historically used for controlling what reporting was available under .Pa /proc/spl/kstat/zfs . No effect. . +.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 +Check time in milliseconds. +This defines the frequency at which we check for hung I/O requests +and potentially invoke the +.Sy zfs_deadman_failmode +behavior. +. .It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int When a pool sync operation takes longer than .Sy zfs_deadman_synctime_ms , @@ -921,6 +928,10 @@ By default, the deadman is enabled and set to which results in "hung" I/O operations only being logged. The deadman is automatically disabled when a pool gets suspended. . +.It Sy zfs_deadman_events_per_second Ns = Ns Sy 1 Ns /s Pq int +Rate limit deadman zevents (which report hung I/O operations) to this many per +second. +. .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp Controls the failure behavior when the deadman detects a "hung" I/O operation. Valid values are: @@ -938,13 +949,6 @@ This can be used to facilitate automatic fail-over to a properly configured fail-over partner. .El . -.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 -Check time in milliseconds. -This defines the frequency at which we check for hung I/O requests -and potentially invoke the -.Sy zfs_deadman_failmode -behavior. -. .It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". @@ -1002,8 +1006,7 @@ will result in objects waiting when there is not actually contention on the same object. . .It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int -Rate limit delay and deadman zevents (which report slow I/O operations) to this -many per +Rate limit delay zevents (which report slow I/O operations) to this many per second. . .It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 414bf84f6f7..c74f72159dc 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -112,6 +112,11 @@ int zfs_vdev_dtl_sm_blksz = (1 << 12); */ static unsigned int zfs_slow_io_events_per_second = 20; +/* + * Rate limit deadman "hung IO" events to this many per second. + */ +static unsigned int zfs_deadman_events_per_second = 1; + /* * Rate limit checksum events after this many checksum errors per second. */ @@ -666,7 +671,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); - zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, + zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); @@ -6476,6 +6481,9 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); +ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, + "Rate limit hung IO (deadman) events to this many per second"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index a619b846dd1..721cf27f48c 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -29,6 +29,7 @@ CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indir CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms +DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh index 4dd4c5b9a76..d851d03e1a8 100755 --- a/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh +++ b/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh @@ -28,7 +28,7 @@ # Verify spa deadman events are rate limited # # STRATEGY: -# 1. Reduce the zfs_slow_io_events_per_second to 1. +# 1. Reduce the zfs_deadman_events_per_second to 1. # 2. Reduce the zfs_deadman_ziotime_ms to 1ms. # 3. Write data to a pool and read it back. # 4. Verify deadman events have been produced at a reasonable rate. @@ -44,15 +44,15 @@ function cleanup zinject -c all default_cleanup_noexit - set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + set_tunable64 DEADMAN_EVENTS_PER_SECOND $OLD_DEADMAN_EVENTS set_tunable64 DEADMAN_ZIOTIME_MS $ZIOTIME_DEFAULT } log_assert "Verify spa deadman events are rate limited" log_onexit cleanup -OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) -log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1 +OLD_DEADMAN_EVENTS=$(get_tunable DEADMAN_EVENTS_PER_SECOND) +log_must set_tunable64 DEADMAN_EVENTS_PER_SECOND 1 log_must set_tunable64 DEADMAN_ZIOTIME_MS 1 # Create a new pool in order to use the updated deadman settings. From 01c8efdd59b540eb6ea21e339d2dbd0283095130 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 29 May 2024 10:49:11 -0700 Subject: [PATCH 40/43] Simplify issig(). We always call it twice with JUSTLOOKING and then FORREAL. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #16225 --- include/os/freebsd/spl/sys/sig.h | 8 +------- include/os/linux/spl/sys/signal.h | 5 +---- include/sys/zfs_context.h | 3 +-- module/os/linux/spl/spl-thread.c | 16 +++------------- module/zfs/dmu_diff.c | 2 +- module/zfs/dmu_objset.c | 2 +- module/zfs/dmu_recv.c | 2 +- module/zfs/dmu_redact.c | 2 +- module/zfs/dmu_send.c | 2 +- module/zfs/zcp.c | 3 +-- module/zfs/zfs_ioctl.c | 2 +- module/zfs/zfs_vnops.c | 2 +- 12 files changed, 14 insertions(+), 35 deletions(-) diff --git a/include/os/freebsd/spl/sys/sig.h b/include/os/freebsd/spl/sys/sig.h index a4d440d3832..17fc65cbe3e 100644 --- a/include/os/freebsd/spl/sys/sig.h +++ b/include/os/freebsd/spl/sys/sig.h @@ -39,20 +39,14 @@ #include #include -#define FORREAL 0 -#define JUSTLOOKING 1 - static __inline int -issig(int why) +issig(void) { struct thread *td = curthread; struct proc *p; int sig; - ASSERT(why == FORREAL || why == JUSTLOOKING); if (SIGPENDING(td)) { - if (why == JUSTLOOKING) - return (1); p = td->td_proc; PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); diff --git a/include/os/linux/spl/sys/signal.h b/include/os/linux/spl/sys/signal.h index 6b538c8966f..cb4b3326164 100644 --- a/include/os/linux/spl/sys/signal.h +++ b/include/os/linux/spl/sys/signal.h @@ -30,9 +30,6 @@ #include #endif -#define FORREAL 0 /* Usual side-effects */ -#define JUSTLOOKING 1 /* Don't stop the process */ - -extern int issig(int why); +extern int issig(void); #endif /* SPL_SIGNAL_H */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 8f264b50e99..e4711ce4194 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -249,8 +249,7 @@ extern struct proc p0; extern kthread_t *zk_thread_create(const char *name, void (*func)(void *), void *arg, size_t stksize, int state); -#define issig(why) (FALSE) -#define ISSIG(thr, why) (FALSE) +#define issig() (FALSE) #define KPREEMPT_SYNC (-1) diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index ee3eb4690c3..dbb8eefa7ec 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -152,26 +152,16 @@ spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) EXPORT_SYMBOL(spl_kthread_create); /* - * The "why" argument indicates the allowable side-effects of the call: - * - * FORREAL: Extract the next pending signal from p_sig into p_cursig; - * stop the process if a stop has been requested or if a traced signal - * is pending. - * - * JUSTLOOKING: Don't stop the process, just indicate whether or not - * a signal might be pending (FORREAL is needed to tell for sure). + * Extract the next pending signal from p_sig into p_cursig; stop the process + * if a stop has been requested or if a traced signal is pending. */ int -issig(int why) +issig(void) { - ASSERT(why == FORREAL || why == JUSTLOOKING); if (!signal_pending(current)) return (0); - if (why != FORREAL) - return (1); - struct task_struct *task = current; spl_kernel_siginfo_t __info; sigset_t set; diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index a2b1a27c88e..0def0956beb 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -116,7 +116,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dmu_diffarg_t *da = arg; int err = 0; - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) return (SET_ERROR(EINTR)); if (zb->zb_level == ZB_DNODE_LEVEL || diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f1818ae155b..8f4fefa4f4d 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -2437,7 +2437,7 @@ dmu_objset_space_upgrade(objset_t *os) if (err != 0) return (err); - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 680aed4513b..0119191d792 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -3389,7 +3389,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) * stream, then we free drc->drc_rrd and exit. */ while (rwa->err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { err = SET_ERROR(EINTR); break; } diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 5ac14edfca1..1feba0ba83d 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -912,7 +912,7 @@ perform_redaction(objset_t *os, redaction_list_t *rl, object = prev_obj; } while (err == 0 && object <= rec->end_object) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { err = EINTR; break; } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index b6cc2f0a5e9..cb2b62fed31 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -2552,7 +2552,7 @@ dmu_send_impl(struct dmu_send_params *dspp) while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) err = SET_ERROR(EINTR); } diff --git a/module/zfs/zcp.c b/module/zfs/zcp.c index 959404f665a..7c279162a9d 100644 --- a/module/zfs/zcp.c +++ b/module/zfs/zcp.c @@ -780,8 +780,7 @@ zcp_lua_counthook(lua_State *state, lua_Debug *ar) * Check if we were canceled while waiting for the * txg to sync or from our open context thread */ - if (ri->zri_canceled || - (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) { + if (ri->zri_canceled || (!ri->zri_sync && issig())) { ri->zri_canceled = B_TRUE; (void) lua_pushstring(state, "Channel program was canceled."); (void) lua_error(state); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b720b4f222b..7b527eb75e8 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -2336,7 +2336,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) } while (error == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { error = SET_ERROR(EINTR); break; } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index b222a6f88d2..f3db953eab4 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1413,7 +1413,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, len -= size; done += size; - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { error = SET_ERROR(EINTR); break; } From ae22044da998e27497c3ad6724a0c64c89cfd87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Wed, 29 May 2024 19:51:01 +0200 Subject: [PATCH 41/43] spl: fix compilation without HAVE_BACKTRACE The __maybe_unused macro is defined in spl/sys/debug.h Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Closes #16229 --- lib/libspl/backtrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index dd8cb025f4f..d26d742106e 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -110,6 +110,8 @@ libspl_backtrace(int fd) backtrace_symbols_fd(btptrs, nptrs, fd); } #else +#include + void libspl_backtrace(int fd __maybe_unused) { From 5137c132a5e82b2e799ad3ee5d82fb32b500e5a4 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 29 May 2024 13:34:59 -0700 Subject: [PATCH 42/43] zpool import output is not formated properly. The 'zpool status' output assumes that the longest prefix is six character long plus colon plus space, eg. 'status: ', 'action: ' or 'config: ' (so eight in total). This works well even when we have messages that requires more than one line, as '\t' is exactly eight characters, just like the longest prefix. The 'zpool import' output is a bit different, as it may display the comment pool property, then the longest prefix is 'comment: ', which is nine characters long, not eight. All the prefixes were given an extra space in front, but: - 'status: ' did not get an extra space. - Messages that require more than one line should use nine spaces of indentation, not eight. - The extra space in front looks redundant if there is no comment property set on the given pool. Fix it by adding an extra space to all prefixes, but only if the comment property is defined. Also, when we need to continue the message in a new line, use '\t ' for indentation. While here, apply small corrections to a couple messages. Before: pool: tank id: 7412636063178848859 state: ONLINE status: Some supported features are not enabled on the pool. (Note that they may be intentionally disabled if the 'compatibility' property is set.) action: The pool can be imported using its name or numeric identif[...] some features will not be available without an explicit 'zp[...] comment: Example comment. config: bclone ONLINE ada0 ONLINE After: pool: tank id: 10180960571062436759 state: ONLINE status: Some supported features are not enabled on the pool. (Note that they may be intentionally disabled if the 'compatibility' property is set.) action: The pool can be imported using its name or numeric identifi[...] some features will not be available without an explicit 'zp[...] config: tank ONLINE ada3 ONLINE pool: dozer id: 11028319538368222579 state: ONLINE status: Some supported features are not enabled on the pool. (Note that they may be intentionally disabled if the 'compatibility' property is set.) action: The pool can be imported using its name or numeric identif[...] some features will not be available without an explicit 'z[...] comment: Example comment. config: dozer ONLINE ada1 ONLINE Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Dawidek Closes #16128 --- cmd/zpool/zpool_main.c | 269 +++++++++++++++++++++-------------------- 1 file changed, 138 insertions(+), 131 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d47e1cda9c3..57170c8ae71 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3016,6 +3016,7 @@ show_import(nvlist_t *config, boolean_t report_error) const char *health; uint_t vsc; const char *comment; + const char *indent; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, @@ -3040,82 +3041,84 @@ show_import(nvlist_t *config, boolean_t report_error) if (reason != ZPOOL_STATUS_OK && !report_error) return (reason); - (void) printf(gettext(" pool: %s\n"), name); - (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); - (void) printf(gettext(" state: %s"), health); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) { + indent = " "; + } else { + comment = NULL; + indent = ""; + } + + (void) printf(gettext("%s pool: %s\n"), indent, name); + (void) printf(gettext("%s id: %llu\n"), indent, (u_longlong_t)guid); + (void) printf(gettext("%s state: %s"), indent, health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); + if (reason != ZPOOL_STATUS_OK) { + (void) printf("%s", indent); + printf_color(ANSI_BOLD, gettext("status: ")); + } switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: - printf_color(ANSI_BOLD, gettext("status: ")); - printf_color(ANSI_YELLOW, gettext("One or more devices contains" - " corrupted data.\n")); + printf_color(ANSI_YELLOW, gettext("One or more devices " + "contains corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: - (void) printf( - gettext(" status: The pool data is corrupted.\n")); + printf_color(ANSI_YELLOW, gettext("The pool data is " + "corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported " - "features are not enabled on the pool.\n\t" - "(Note that they may be intentionally disabled " - "if the\n\t'compatibility' property is set.)\n")); + "features are not enabled on the pool.\n" + "\t%s(Note that they may be intentionally disabled if the\n" + "\t%s'compatibility' property is set.)\n"), indent, indent); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Error reading or parsing " "the file(s) indicated by the 'compatibility'\n" - "property.\n")); + "\t%sproperty.\n"), indent); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n" - "requested by the 'compatibility' property.\n")); + "\t%srequested by the 'compatibility' property.\n"), + indent); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); @@ -3124,66 +3127,60 @@ show_import(nvlist_t *config, boolean_t report_error) break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " - "accessed in read-only mode on this system. It\n\tcannot be" - " accessed in read-write mode because it uses the " - "following\n\tfeature(s) not supported on this system:\n")); + "accessed in read-only mode on this system. It\n" + "\t%scannot be accessed in read-write mode because it uses " + "the following\n" + "\t%sfeature(s) not supported on this system:\n"), + indent, indent); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has the " - "multihost property on. It cannot\n\tbe safely imported " - "when the system hostid is not set.\n")); + "multihost property on. It cannot\n" + "\t%sbe safely imported when the system hostid is not " + "set.\n"), indent); break; case ZPOOL_STATUS_HOSTID_MISMATCH: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "configured to use a non-native block size.\n" - "\tExpect reduced performance.\n")); + "\t%sExpect reduced performance.\n"), indent); break; default: @@ -3196,114 +3193,121 @@ show_import(nvlist_t *config, boolean_t report_error) /* * Print out an action according to the overall state of the pool. */ + if (vs->vs_state != VDEV_STATE_HEALTHY || + reason != ZPOOL_STATUS_ERRATA || errata != ZPOOL_ERRATA_NONE) { + (void) printf("%s", indent); + (void) printf(gettext("action: ")); + } if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric identifier, " - "though\n\tsome features will not be available " - "without an explicit 'zpool upgrade'.\n")); + (void) printf(gettext("The pool can be imported using " + "its name or numeric identifier, though\n" + "\t%ssome features will not be available without " + "an explicit 'zpool upgrade'.\n"), indent); } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric\n\tidentifier, " - "though the file(s) indicated by its " - "'compatibility'\n\tproperty cannot be parsed at " - "this time.\n")); + (void) printf(gettext("The pool can be imported using " + "its name or numeric\n" + "\t%sidentifier, though the file(s) indicated by " + "its 'compatibility'\n" + "\t%sproperty cannot be parsed at this time.\n"), + indent, indent); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric " - "identifier and\n\tthe '-f' flag.\n")); + (void) printf(gettext("The pool can be imported using " + "its name or numeric identifier and\n" + "\t%sthe '-f' flag.\n"), indent); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { - case ZPOOL_ERRATA_NONE: - break; - case ZPOOL_ERRATA_ZOL_2094_SCRUB: - (void) printf(gettext(" action: The pool can " - "be imported using its name or numeric " - "identifier,\n\thowever there is a compat" - "ibility issue which should be corrected" - "\n\tby running 'zpool scrub'\n")); + (void) printf(gettext("The pool can be " + "imported using its name or numeric " + "identifier,\n" + "\t%showever there is a compatibility " + "issue which should be corrected\n" + "\t%sby running 'zpool scrub'\n"), + indent, indent); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: - (void) printf(gettext(" action: The pool can" - "not be imported with this version of ZFS " - "due to\n\tan active asynchronous destroy. " - "Revert to an earlier version\n\tand " - "allow the destroy to complete before " - "updating.\n")); + (void) printf(gettext("The pool cannot be " + "imported with this version of ZFS due to\n" + "\t%san active asynchronous destroy. " + "Revert to an earlier version\n" + "\t%sand allow the destroy to complete " + "before updating.\n"), indent, indent); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: - (void) printf(gettext(" action: Existing " - "encrypted datasets contain an on-disk " - "incompatibility, which\n\tneeds to be " - "corrected. Backup these datasets to new " - "encrypted datasets\n\tand destroy the " - "old ones.\n")); + (void) printf(gettext("Existing encrypted " + "datasets contain an on-disk " + "incompatibility, which\n" + "\t%sneeds to be corrected. Backup these " + "datasets to new encrypted datasets\n" + "\t%sand destroy the old ones.\n"), + indent, indent); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: - (void) printf(gettext(" action: Existing " - "encrypted snapshots and bookmarks contain " - "an on-disk\n\tincompatibility. This may " - "cause on-disk corruption if they are used" - "\n\twith 'zfs recv'. To correct the " - "issue, enable the bookmark_v2 feature.\n\t" - "No additional action is needed if there " - "are no encrypted snapshots or\n\t" - "bookmarks. If preserving the encrypted " - "snapshots and bookmarks is\n\trequired, " - "use a non-raw send to backup and restore " - "them. Alternately,\n\tthey may be removed" - " to resolve the incompatibility.\n")); + (void) printf(gettext("Existing encrypted " + "snapshots and bookmarks contain an " + "on-disk\n" + "\t%sincompatibility. This may cause " + "on-disk corruption if they are used\n" + "\t%swith 'zfs recv'. To correct the " + "issue, enable the bookmark_v2 feature.\n" + "\t%sNo additional action is needed if " + "there are no encrypted snapshots or\n" + "\t%sbookmarks. If preserving the " + "encrypted snapshots and bookmarks is\n" + "\t%srequired, use a non-raw send to " + "backup and restore them. Alternately,\n" + "\t%sthey may be removed to resolve the " + "incompatibility.\n"), indent, indent, + indent, indent, indent, indent); break; default: /* * All errata must contain an action message. */ - assert(0); + assert(errata == ZPOOL_ERRATA_NONE); } } else { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric " - "identifier.\n")); + (void) printf(gettext("The pool can be imported using " + "its name or numeric identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { - (void) printf(gettext(" action: The pool can be imported " - "despite missing or damaged devices. The\n\tfault " - "tolerance of the pool may be compromised if imported.\n")); + (void) printf(gettext("The pool can be imported despite " + "missing or damaged devices. The\n" + "\t%sfault tolerance of the pool may be compromised if " + "imported.\n"), indent); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext(" action: The pool cannot be " - "imported. Access the pool on a system running " - "newer\n\tsoftware, or recreate the pool from " - "backup.\n")); + (void) printf(gettext("The pool cannot be imported. " + "Access the pool on a system running newer\n" + "\t%ssoftware, or recreate the pool from " + "backup.\n"), indent); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: - printf_color(ANSI_BOLD, gettext("action: ")); - printf_color(ANSI_YELLOW, gettext("The pool cannot be " - "imported. Access the pool on a system that " - "supports\n\tthe required feature(s), or recreate " - "the pool from backup.\n")); + (void) printf(gettext("The pool cannot be imported. " + "Access the pool on a system that supports\n" + "\t%sthe required feature(s), or recreate the pool " + "from backup.\n"), indent); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - printf_color(ANSI_BOLD, gettext("action: ")); - printf_color(ANSI_YELLOW, gettext("The pool cannot be " - "imported in read-write mode. Import the pool " - "with\n" - "\t\"-o readonly=on\", access the pool on a system " - "that supports the\n\trequired feature(s), or " - "recreate the pool from backup.\n")); + (void) printf(gettext("The pool cannot be imported in " + "read-write mode. Import the pool with\n" + "\t%s'-o readonly=on', access the pool on a system " + "that supports the\n" + "\t%srequired feature(s), or recreate the pool " + "from backup.\n"), indent, indent); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext(" action: The pool cannot be " - "imported. Attach the missing\n\tdevices and try " - "again.\n")); + (void) printf(gettext("The pool cannot be imported. " + "Attach the missing\n" + "\t%sdevices and try again.\n"), indent); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, @@ -3317,47 +3321,49 @@ show_import(nvlist_t *config, boolean_t report_error) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); - (void) printf(gettext(" action: The pool must be " - "exported from %s (hostid=%"PRIx64")\n\tbefore it " - "can be safely imported.\n"), hostname, hostid); + (void) printf(gettext("The pool must be exported from " + "%s (hostid=%"PRIx64")\n" + "\t%sbefore it can be safely imported.\n"), + hostname, hostid, indent); break; case ZPOOL_STATUS_HOSTID_REQUIRED: - (void) printf(gettext(" action: Set a unique system " - "hostid with the zgenhostid(8) command.\n")); + (void) printf(gettext("Set a unique system hostid with " + "the zgenhostid(8) command.\n")); break; default: - (void) printf(gettext(" action: The pool cannot be " - "imported due to damaged devices or data.\n")); + (void) printf(gettext("The pool cannot be imported due " + "to damaged devices or data.\n")); } } /* Print the comment attached to the pool. */ - if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + if (comment != NULL) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ - if (((vs->vs_state == VDEV_STATE_CLOSED) || - (vs->vs_state == VDEV_STATE_CANT_OPEN)) && - (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { + if ((vs->vs_state == VDEV_STATE_CLOSED || + vs->vs_state == VDEV_STATE_CANT_OPEN) && + vs->vs_aux == VDEV_AUX_CORRUPT_DATA) { if (pool_state == POOL_STATE_DESTROYED) - (void) printf(gettext("\tThe pool was destroyed, " - "but can be imported using the '-Df' flags.\n")); + (void) printf(gettext("\t%sThe pool was destroyed, " + "but can be imported using the '-Df' flags.\n"), + indent); else if (pool_state != POOL_STATE_EXPORTED) - (void) printf(gettext("\tThe pool may be active on " - "another system, but can be imported using\n\t" - "the '-f' flag.\n")); + (void) printf(gettext("\t%sThe pool may be active on " + "another system, but can be imported using\n" + "\t%sthe '-f' flag.\n"), indent, indent); } if (msgid != NULL) { - (void) printf(gettext( - " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), - msgid); + (void) printf(gettext("%s see: " + "https://openzfs.github.io/openzfs-docs/msg/%s\n"), + indent, msgid); } - (void) printf(gettext(" config:\n\n")); + (void) printf(gettext("%sconfig:\n\n"), indent); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); @@ -3371,9 +3377,10 @@ show_import(nvlist_t *config, boolean_t report_error) print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { - (void) printf(gettext("\n\tAdditional devices are known to " - "be part of this pool, though their\n\texact " - "configuration cannot be determined.\n")); + (void) printf(gettext("\n\t%sAdditional devices are known to " + "be part of this pool, though their\n" + "\t%sexact configuration cannot be determined.\n"), + indent, indent); } return (0); } From e2357561b9499296bff758afe4868dbc39735675 Mon Sep 17 00:00:00 2001 From: Zhenlei Huang Date: Fri, 31 May 2024 00:58:20 +0800 Subject: [PATCH 43/43] FreeBSD: Add const qualifier to members of struct opensolaris_utsname These members have directly references to the global variables exposed by the kernel. They are not going to be changed by this kernel module. Reviewed-by: Brian Behlendorf Signed-off-by: Zhenlei Huang Closes #16210 --- include/os/freebsd/spl/sys/misc.h | 10 +++++----- module/os/freebsd/spl/spl_misc.c | 7 +++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/os/freebsd/spl/sys/misc.h b/include/os/freebsd/spl/sys/misc.h index 2e4efc60544..894ccd8bf9b 100644 --- a/include/os/freebsd/spl/sys/misc.h +++ b/include/os/freebsd/spl/sys/misc.h @@ -45,11 +45,11 @@ #define F_SEEK_HOLE FIOSEEKHOLE struct opensolaris_utsname { - char *sysname; - char *nodename; - char *release; - char version[32]; - char *machine; + const char *sysname; + const char *nodename; + const char *release; + char version[32]; + const char *machine; }; #define task_io_account_read(n) diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c index a5fc996b655..2d0821417ad 100644 --- a/module/os/freebsd/spl/spl_misc.c +++ b/module/os/freebsd/spl/spl_misc.c @@ -37,6 +37,9 @@ #include static struct opensolaris_utsname hw_utsname = { + .sysname = ostype, + .nodename = prison0.pr_hostname, + .release = osrelease, .machine = MACHINE }; @@ -49,10 +52,6 @@ utsname(void) static void opensolaris_utsname_init(void *arg) { - - hw_utsname.sysname = ostype; - hw_utsname.nodename = prison0.pr_hostname; - hw_utsname.release = osrelease; snprintf(hw_utsname.version, sizeof (hw_utsname.version), "%d", osreldate); }