diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index d6f144c0e209..14507a17d0be 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -6750,6 +6750,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; + spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), @@ -6887,7 +6888,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == - spa_embedded_log_class(spa)) ? + spa_embedded_log_class(spa) || + msp->ms_group->mg_class == + spa_special_embedded_log_class(spa)) ? vd->vdev_log_mg : vd->vdev_mg); /* @@ -7121,6 +7124,8 @@ dump_block_stats(spa_t *spa) zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_embedded_log_class(spa)); + zcb->zcb_totalasize += + metaslab_class_get_alloc(spa_special_embedded_log_class(spa)); zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); @@ -7169,6 +7174,7 @@ dump_block_stats(spa_t *spa) total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + + metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); @@ -7252,6 +7258,18 @@ dump_block_stats(spa_t *spa) 100.0 * alloc / space); } + if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor + != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_special_embedded_log_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_special_embedded_log_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Special embedded log", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb->zcb_embedded_blocks[i] == 0) continue; diff --git a/include/sys/spa.h b/include/sys/spa.h index a3e36c1f59ae..e5ec39b64dc7 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1065,6 +1065,7 @@ extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); +extern metaslab_class_t *spa_special_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio); extern boolean_t spa_special_has_ddt(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 8c52f751a819..a596235ce017 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -246,6 +246,7 @@ struct spa { metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ + metaslab_class_t *spa_special_embedded_log_class; /* log on special */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ diff --git a/include/sys/zil.h b/include/sys/zil.h index fa7945d8ab8b..9d1fb47e2dfc 
100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -635,6 +635,8 @@ extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern uint64_t zil_max_copied_data(zilog_t *zilog); extern uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize); +extern itx_wr_state_t zil_write_state(zilog_t *zilog, uint64_t size, + uint32_t blocksize, boolean_t o_direct, boolean_t commit); extern void zil_sums_init(zil_sums_t *zs); extern void zil_sums_fini(zil_sums_t *zs); @@ -642,6 +644,8 @@ extern void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums); extern int zil_replay_disable; +extern uint_t zfs_immediate_write_sz; +extern int zil_special_is_slog; #ifdef __cplusplus } diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 67b2cef46e80..fba91d1e28b0 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1713,10 +1713,18 @@ Similar to but for cleanup of old indirection records for removed vdevs. . .It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq s64 -Largest data block to write to the ZIL. -Larger blocks will be treated as if the dataset being written to had the -.Sy logbias Ns = Ns Sy throughput -property set. +Largest write size for which data is stored directly in the ZIL when +.Sy logbias Ns = Ns Sy latency . +Larger writes may be written indirectly, similar to +.Sy logbias Ns = Ns Sy throughput . +When a SLOG is present, this parameter is ignored as if set to infinity, +and all data is stored in the ZIL to avoid depending on normal vdev latency. +. +.It Sy zil_special_is_slog Ns = Ns Sy 1 Ns | Ns 0 Pq int +When enabled, and written blocks go to normal vdevs, treat any special vdevs +present as SLOGs, storing all synchronously written data directly in the ZIL. +Disabling this forces indirect writes, preserving special vdev write +throughput and endurance, likely at the cost of normal vdev latency. .
.It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64 Pattern written to vdev free space by diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 082d379cded5..89310655fbc9 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -750,7 +750,8 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) } IMPLY(mg == mg->mg_vd->vdev_log_mg, - mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || + mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; @@ -1288,7 +1289,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { IMPLY(mg == mg->mg_vd->vdev_log_mg, - mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || + mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += @@ -1316,7 +1318,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); IMPLY(mg == mg->mg_vd->vdev_log_mg, - mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || + mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6b52c6cb1f9e..46794cc62e31 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -417,11 +417,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv) alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); + alloc += metaslab_class_get_alloc( + spa_special_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); + size += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); @@ -1679,6 +1683,8 @@ spa_activate(spa_t *spa, spa_mode_t mode) "embedded_log", msp, B_TRUE); spa->spa_special_class = metaslab_class_create(spa, "special", msp, B_FALSE); + spa->spa_special_embedded_log_class = metaslab_class_create(spa, + "special_embedded_log", msp, B_TRUE); spa->spa_dedup_class = metaslab_class_create(spa, "dedup", msp, B_FALSE); @@ -1853,6 +1859,9 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; + metaslab_class_destroy(spa->spa_special_embedded_log_class); + spa->spa_special_embedded_log_class = NULL; + metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; @@ -9092,6 +9101,8 @@ spa_async_thread(void *arg) old_space += metaslab_class_get_space(spa_dedup_class(spa)); old_space += metaslab_class_get_space( spa_embedded_log_class(spa)); + old_space += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); @@ -9100,6 +9111,8 @@ spa_async_thread(void *arg) new_space += metaslab_class_get_space(spa_dedup_class(spa)); 
new_space += metaslab_class_get_space( spa_embedded_log_class(spa)); + new_space += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* @@ -10309,7 +10322,7 @@ spa_sync(spa_t *spa, uint64_t txg) metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); - /* spa_embedded_log_class has only one metaslab per vdev. */ + /* Embedded log classes have only one metaslab per vdev. */ metaslab_class_evict_old(spa->spa_special_class, txg); metaslab_class_evict_old(spa->spa_dedup_class, txg); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index f054e4290bbf..d2ba1f954e93 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1308,6 +1308,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, metaslab_class_validate(spa_log_class(spa)); metaslab_class_validate(spa_embedded_log_class(spa)); metaslab_class_validate(spa_special_class(spa)); + metaslab_class_validate(spa_special_embedded_log_class(spa)); metaslab_class_validate(spa_dedup_class(spa)); spa_config_exit(spa, SCL_ALL, spa); @@ -1896,6 +1897,8 @@ spa_get_slop_space(spa_t *spa) */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); + embedded_log += metaslab_class_get_dspace( + spa_special_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* @@ -2000,6 +2003,12 @@ spa_special_class(spa_t *spa) return (spa->spa_special_class); } +metaslab_class_t * +spa_special_embedded_log_class(spa_t *spa) +{ + return (spa->spa_special_embedded_log_class); +} + metaslab_class_t * spa_dedup_class(spa_t *spa) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 01758b0c54c0..aa4038a7526f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -282,12 +282,15 @@ vdev_getops(const char *type) * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a - * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. + * separate log group. For embedded slogs, vdev_log_mg will be non-NULL and + * will point to a metaslab group of either embedded_log_class (for normal + * vdevs) or special_embedded_log_class (for special vdevs). */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { - if (mc == spa_embedded_log_class(vd->vdev_spa) && + if ((mc == spa_embedded_log_class(vd->vdev_spa) || + mc == spa_special_embedded_log_class(vd->vdev_spa)) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else @@ -1508,8 +1511,13 @@ vdev_metaslab_group_create(vdev_t *vd) vd->vdev_mg = metaslab_group_create(mc, vd); if (!vd->vdev_islog) { - vd->vdev_log_mg = metaslab_group_create( - spa_embedded_log_class(spa), vd); + if (mc == spa_special_class(spa)) { + vd->vdev_log_mg = metaslab_group_create( + spa_special_embedded_log_class(spa), vd); + } else { + vd->vdev_log_mg = metaslab_group_create( + spa_embedded_log_class(spa), vd); + } } /* @@ -1624,9 +1632,10 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab - * group. + * group. This works for normal and special vdevs. 
*/ - if (vd->vdev_mg->mg_class == spa_normal_class(spa) && + if ((vd->vdev_mg->mg_class == spa_normal_class(spa) || + vd->vdev_mg->mg_class == spa_special_class(spa)) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 2ce25b72b288..2f61ecfd9b3b 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -607,8 +607,6 @@ zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, * called as soon as the write is on stable storage (be it via a DMU sync or a * ZIL commit). */ -static uint_t zfs_immediate_write_sz = 32768; - void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, @@ -626,15 +624,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, return; } - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - resid >= zfs_immediate_write_sz) - write_state = WR_INDIRECT; - else if (commit) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; + write_state = zil_write_state(zilog, resid, blocksize, o_direct, + commit); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen, sizeof (gen)); @@ -938,6 +929,3 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, len -= partlen; } } - -ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW, - "Largest data block to write to zil"); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 00059b2c6de0..1a66d32c3515 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -2095,6 +2095,19 @@ zil_max_waste_space(zilog_t *zilog) */ static uint_t zil_maxcopied = 7680; +/* + * Largest write size to store the data directly into ZIL. + */ +uint_t zfs_immediate_write_sz = 32768; + +/* + * When enabled and blocks go to normal vdev, treat special vdevs as SLOG, + * writing data to ZIL (WR_COPIED/WR_NEED_COPY). Disabling this forces the + * indirect writes (WR_INDIRECT) to preserve special vdev throughput and + * endurance, likely at the cost of normal vdev latency. + */ +int zil_special_is_slog = 1; + uint64_t zil_max_copied_data(zilog_t *zilog) { @@ -2102,6 +2115,46 @@ zil_max_copied_data(zilog_t *zilog) return (MIN(max_data, zil_maxcopied)); } +/* + * Determine the appropriate write state for ZIL transactions based on + * pool configuration, data placement, write size, and logbias settings. + */ +itx_wr_state_t +zil_write_state(zilog_t *zilog, uint64_t size, uint32_t blocksize, + boolean_t o_direct, boolean_t commit) +{ + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) + return (WR_INDIRECT); + + /* + * Don't use indirect for too small writes to reduce overhead. + * Don't use indirect if written less than a half of a block if + * we are going to commit it immediately, since next write might + * rewrite the same block again, causing inflation. If commit + * is not planned, then next writes might coalesce, and so the + * indirect may be perfect. 
+ */ + boolean_t indirect = (size >= zfs_immediate_write_sz && + (size >= blocksize / 2 || !commit)); + + if (spa_has_slogs(zilog->zl_spa)) { + /* Dedicated slogs: never use indirect */ + indirect = B_FALSE; + } else if (spa_has_special(zilog->zl_spa)) { + /* Special vdevs: only when beneficial */ + boolean_t on_special = (blocksize <= + zilog->zl_os->os_zpl_special_smallblock); + indirect &= (on_special || !zil_special_is_slog); + } + + if (indirect) + return (WR_INDIRECT); + else if (commit) + return (WR_COPIED); + else + return (WR_NEED_COPY); +} + static uint64_t zil_itx_record_size(itx_t *itx) { @@ -4418,3 +4471,9 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, "Limit in bytes WR_COPIED size"); + +ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW, + "Largest write size to store data into ZIL"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, special_is_slog, INT, ZMOD_RW, + "Treat special vdevs as SLOG"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 67ee3d5ba2e1..c3d96c049d36 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4433,14 +4433,34 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; ZIOSTAT_BUMP(ziostat_total_allocations); + + /* Try log class (dedicated slog devices) first */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); *slog = (error == 0); + + /* Try special_embedded_log class (reserved on special vdevs) */ + if (error != 0) { + error = metaslab_alloc(spa, spa_special_embedded_log_class(spa), + size, new_bp, 1, txg, NULL, flags, &io_alloc_list, + allocator, NULL); + } + + /* Try special class (general special vdev allocation) */ + if (error != 0) { + error = metaslab_alloc(spa, spa_special_class(spa), size, + new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, + NULL); + } + + /* Try embedded_log class (reserved on normal vdevs) */ if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); } + + /* Finally fall back to normal class */ if (error != 0) { ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); error = metaslab_alloc(spa, spa_normal_class(spa), size, diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 3568d4f43fcb..4116e16133bc 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -859,13 +859,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { }; /* - * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. - * - * We store data in the log buffers if it's small enough. - * Otherwise we will later flush the data out via dmu_sync(). + * zvol_log_write() handles TX_WRITE transactions. */ -static const ssize_t zvol_immediate_write_sz = 32768; - void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit) @@ -878,15 +873,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, if (zil_replaying(zilog, tx)) return; - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - size >= blocksize && blocksize > zvol_immediate_write_sz) - write_state = WR_INDIRECT; - else if (commit) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; + write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit); while (size) { itx_t *itx;
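The hunks above replace the open-coded write-state decisions in zfs_log_write() and zvol_log_write() with the shared zil_write_state() helper. Below is a minimal standalone model of that decision tree; it is not ZFS code. example_write_state() and its boolean parameters are local to this sketch and stand in for zilog/spa state such as zl_logbias, spa_has_slogs(), spa_has_special(), and os_zpl_special_smallblock.

/*
 * Standalone sketch of the policy implemented by the new zil_write_state().
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef enum { WR_INDIRECT, WR_COPIED, WR_NEED_COPY } itx_wr_state_t;

static uint64_t zfs_immediate_write_sz = 32768;	/* module parameter default */
static bool zil_special_is_slog = true;		/* module parameter default */

static itx_wr_state_t
example_write_state(uint64_t size, uint32_t blocksize, bool throughput_bias,
    bool o_direct, bool commit, bool has_slog, bool has_special,
    uint32_t special_smallblock)
{
	if (throughput_bias || o_direct)
		return (WR_INDIRECT);

	/*
	 * Prefer indirect only for large writes, and avoid it for writes
	 * smaller than half a block when an immediate commit is expected,
	 * since the same block is likely to be rewritten soon.
	 */
	bool indirect = (size >= zfs_immediate_write_sz &&
	    (size >= blocksize / 2 || !commit));

	if (has_slog) {
		/* Dedicated slog: always log the data itself. */
		indirect = false;
	} else if (has_special) {
		/*
		 * With special vdevs, keep indirect only if the data would
		 * land on the special class anyway, or if the tunable says
		 * not to treat special vdevs as slog.
		 */
		bool on_special = (blocksize <= special_smallblock);
		indirect &= (on_special || !zil_special_is_slog);
	}

	if (indirect)
		return (WR_INDIRECT);
	return (commit ? WR_COPIED : WR_NEED_COPY);
}

int
main(void)
{
	/* 128 KiB sync write, no slog, special vdevs present, smallblock=0. */
	printf("%d\n", example_write_state(131072, 131072, false, false,
	    true, false, true, 0));	/* prints 1 (WR_COPIED) */
	return (0);
}

With the defaults above, a 128 KiB synchronous write that previously would have been forced to WR_INDIRECT on a pool without a slog is now logged (WR_COPIED) whenever special vdevs are present, which is the behavior the new zil_special_is_slog man page entry describes.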
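The placement of the ZIL blocks themselves changes as well: zio_alloc_zil() now tries the metaslab classes in a fixed priority order, and only an allocation from the dedicated log class marks the resulting block as slog-resident (*slog). The toy illustration below walks that fallback order; try_alloc() is a hypothetical stand-in for metaslab_alloc(), and the class names match those created in spa_activate().

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Order in which the patched zio_alloc_zil() tries metaslab classes. */
static const char *const zil_alloc_order[] = {
	"log",			/* dedicated slog devices */
	"special_embedded_log",	/* metaslab reserved on special vdevs */
	"special",		/* remaining special vdev space */
	"embedded_log",		/* metaslab reserved on normal vdevs */
	"normal",		/* final fallback */
};

/* Hypothetical stand-in for metaslab_alloc(): succeeds only for one class. */
static bool
try_alloc(const char *class, const char *first_class_with_space)
{
	return (strcmp(class, first_class_with_space) == 0);
}

int
main(void)
{
	/* e.g. a pool with no slog but with special vdevs. */
	const char *available = "special_embedded_log";
	size_t n = sizeof (zil_alloc_order) / sizeof (zil_alloc_order[0]);

	for (size_t i = 0; i < n; i++) {
		if (try_alloc(zil_alloc_order[i], available)) {
			bool slog = (i == 0);	/* only the log class is slog */
			printf("allocated from %s class (slog=%d)\n",
			    zil_alloc_order[i], slog);
			return (0);
		}
	}
	return (1);
}

Trying special_embedded_log ahead of the general special class presumably keeps ZIL churn confined to the single reserved metaslab per special vdev before it competes with small-block and metadata allocations, mirroring how embedded_log already works on normal vdevs.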