Skip to content

Commit 6a5e50b

Browse files
committed
Allow and prefer special vdevs as ZIL
Before this change, ZIL blocks were allocated only from normal or SLOG vdevs. In the typical situation where special vdevs are SSDs and normal vdevs are HDDs, this could cause weird inversions, with data blocks written to SSDs but the ZIL blocks referencing them written to HDDs. This change assumes that special vdevs typically have much better (or at least not worse) latency than normal vdevs, and so in the absence of SLOGs they should store ZIL blocks. This means introducing a special embedded log allocation class, similar to the one for normal vdevs, and updating the allocation fallback order to: SLOG -> special embedded log -> special -> normal embedded log -> normal. The code tries to guess whether a data block is going to be written to a normal or a special vdev (this cannot be done precisely before compression) and prefers indirect writes for blocks written to a special vdev to avoid a double write. For blocks that are going to be written to a normal vdev, the special vdev by default acts as a SLOG, reducing write latency at the cost of higher special vdev wear, but this is tunable via a module parameter. This should allow HDD pools with a decent SSD as a special vdev to work under synchronous workloads without requiring an additional SLOG SSD, which is impractical in many scenarios. Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc.
1 parent eacf618 commit 6a5e50b

File tree

13 files changed

+163
-43
lines changed

13 files changed

+163
-43
lines changed

cmd/zdb/zdb.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6750,6 +6750,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
67506750
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
67516751
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
67526752
spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;
6753+
spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops;
67536754

67546755
zcb->zcb_vd_obsolete_counts =
67556756
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
@@ -6887,7 +6888,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
68876888
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
68886889
metaslab_t *msp = vd->vdev_ms[m];
68896890
ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
6890-
spa_embedded_log_class(spa)) ?
6891+
spa_embedded_log_class(spa) ||
6892+
msp->ms_group->mg_class ==
6893+
spa_special_embedded_log_class(spa)) ?
68916894
vd->vdev_log_mg : vd->vdev_mg);
68926895

68936896
/*
@@ -7121,6 +7124,8 @@ dump_block_stats(spa_t *spa)
71217124
zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
71227125
zcb->zcb_totalasize +=
71237126
metaslab_class_get_alloc(spa_embedded_log_class(spa));
7127+
zcb->zcb_totalasize +=
7128+
metaslab_class_get_alloc(spa_special_embedded_log_class(spa));
71247129
zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
71257130
err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);
71267131

@@ -7169,6 +7174,7 @@ dump_block_stats(spa_t *spa)
71697174
total_alloc = norm_alloc +
71707175
metaslab_class_get_alloc(spa_log_class(spa)) +
71717176
metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
7177+
metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) +
71727178
metaslab_class_get_alloc(spa_special_class(spa)) +
71737179
metaslab_class_get_alloc(spa_dedup_class(spa)) +
71747180
get_unflushed_alloc_space(spa);
@@ -7252,6 +7258,18 @@ dump_block_stats(spa_t *spa)
72527258
100.0 * alloc / space);
72537259
}
72547260

7261+
if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor
7262+
!= NULL) {
7263+
uint64_t alloc = metaslab_class_get_alloc(
7264+
spa_special_embedded_log_class(spa));
7265+
uint64_t space = metaslab_class_get_space(
7266+
spa_special_embedded_log_class(spa));
7267+
7268+
(void) printf("\t%-16s %14llu used: %5.2f%%\n",
7269+
"Special embedded log", (u_longlong_t)alloc,
7270+
100.0 * alloc / space);
7271+
}
7272+
72557273
for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
72567274
if (zcb->zcb_embedded_blocks[i] == 0)
72577275
continue;

include/sys/spa.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,6 +1065,7 @@ extern metaslab_class_t *spa_normal_class(spa_t *spa);
10651065
extern metaslab_class_t *spa_log_class(spa_t *spa);
10661066
extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
10671067
extern metaslab_class_t *spa_special_class(spa_t *spa);
1068+
extern metaslab_class_t *spa_special_embedded_log_class(spa_t *spa);
10681069
extern metaslab_class_t *spa_dedup_class(spa_t *spa);
10691070
extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio);
10701071
extern boolean_t spa_special_has_ddt(spa_t *spa);

include/sys/spa_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ struct spa {
246246
metaslab_class_t *spa_log_class; /* intent log data class */
247247
metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */
248248
metaslab_class_t *spa_special_class; /* special allocation class */
249+
metaslab_class_t *spa_special_embedded_log_class; /* log on special */
249250
metaslab_class_t *spa_dedup_class; /* dedup allocation class */
250251
uint64_t spa_first_txg; /* first txg after spa_open() */
251252
uint64_t spa_final_txg; /* txg of export/destroy */

include/sys/zil.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,13 +635,17 @@ extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
635635

636636
extern uint64_t zil_max_copied_data(zilog_t *zilog);
637637
extern uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize);
638+
extern itx_wr_state_t zil_write_state(zilog_t *zilog, uint64_t size,
639+
uint32_t blocksize, boolean_t o_direct, boolean_t commit);
638640

639641
extern void zil_sums_init(zil_sums_t *zs);
640642
extern void zil_sums_fini(zil_sums_t *zs);
641643
extern void zil_kstat_values_update(zil_kstat_values_t *zs,
642644
zil_sums_t *zil_sums);
643645

644646
extern int zil_replay_disable;
647+
extern uint_t zfs_immediate_write_sz;
648+
extern int zil_special_is_slog;
645649

646650
#ifdef __cplusplus
647651
}

man/man4/zfs.4

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1713,10 +1713,18 @@ Similar to
17131713
but for cleanup of old indirection records for removed vdevs.
17141714
.
17151715
.It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
1716-
Largest data block to write to the ZIL.
1717-
Larger blocks will be treated as if the dataset being written to had the
1718-
.Sy logbias Ns = Ns Sy throughput
1719-
property set.
1716+
Largest write size to store the data directly into the ZIL if
1717+
.Sy logbias Ns = Ns Sy latency .
1718+
Larger writes may be written indirectly similar to
1719+
.Sy logbias Ns = Ns Sy throughput .
1720+
In the presence of a SLOG this parameter is ignored, as if it were set to infinity,
1721+
storing all written data into ZIL to not depend on regular vdev latency.
1722+
.
1723+
.It Sy zil_special_is_slog Ns = Ns Sy 1 Ns | Ns 0 Pq int
1724+
When enabled, and written blocks go to normal vdevs, treat present special
1725+
vdevs as SLOGs, storing all synchronously written data into ZIL directly.
1726+
Disabling this forces the indirect writes to preserve special vdev write
1727+
throughput and endurance, likely at the cost of normal vdev latency.
17201728
.
17211729
.It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64
17221730
Pattern written to vdev free space by

module/zfs/metaslab.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -750,7 +750,8 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
750750
}
751751

752752
IMPLY(mg == mg->mg_vd->vdev_log_mg,
753-
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
753+
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
754+
mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
754755

755756
for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
756757
mc_hist[i] += mg->mg_histogram[i];
@@ -1288,7 +1289,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
12881289
mutex_enter(&mc->mc_lock);
12891290
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
12901291
IMPLY(mg == mg->mg_vd->vdev_log_mg,
1291-
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1292+
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
1293+
mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
12921294
mg->mg_histogram[i + ashift] +=
12931295
msp->ms_sm->sm_phys->smp_histogram[i];
12941296
mc->mc_histogram[i + ashift] +=
@@ -1316,7 +1318,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
13161318
ASSERT3U(mc->mc_histogram[i + ashift], >=,
13171319
msp->ms_sm->sm_phys->smp_histogram[i]);
13181320
IMPLY(mg == mg->mg_vd->vdev_log_mg,
1319-
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1321+
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
1322+
mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
13201323

13211324
mg->mg_histogram[i + ashift] -=
13221325
msp->ms_sm->sm_phys->smp_histogram[i];

module/zfs/spa.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,11 +417,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv)
417417
alloc += metaslab_class_get_alloc(spa_special_class(spa));
418418
alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
419419
alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
420+
alloc += metaslab_class_get_alloc(
421+
spa_special_embedded_log_class(spa));
420422

421423
size = metaslab_class_get_space(mc);
422424
size += metaslab_class_get_space(spa_special_class(spa));
423425
size += metaslab_class_get_space(spa_dedup_class(spa));
424426
size += metaslab_class_get_space(spa_embedded_log_class(spa));
427+
size += metaslab_class_get_space(
428+
spa_special_embedded_log_class(spa));
425429

426430
spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
427431
spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
@@ -1679,6 +1683,8 @@ spa_activate(spa_t *spa, spa_mode_t mode)
16791683
"embedded_log", msp, B_TRUE);
16801684
spa->spa_special_class = metaslab_class_create(spa, "special",
16811685
msp, B_FALSE);
1686+
spa->spa_special_embedded_log_class = metaslab_class_create(spa,
1687+
"special_embedded_log", msp, B_TRUE);
16821688
spa->spa_dedup_class = metaslab_class_create(spa, "dedup",
16831689
msp, B_FALSE);
16841690

@@ -1853,6 +1859,9 @@ spa_deactivate(spa_t *spa)
18531859
metaslab_class_destroy(spa->spa_special_class);
18541860
spa->spa_special_class = NULL;
18551861

1862+
metaslab_class_destroy(spa->spa_special_embedded_log_class);
1863+
spa->spa_special_embedded_log_class = NULL;
1864+
18561865
metaslab_class_destroy(spa->spa_dedup_class);
18571866
spa->spa_dedup_class = NULL;
18581867

@@ -9092,6 +9101,8 @@ spa_async_thread(void *arg)
90929101
old_space += metaslab_class_get_space(spa_dedup_class(spa));
90939102
old_space += metaslab_class_get_space(
90949103
spa_embedded_log_class(spa));
9104+
old_space += metaslab_class_get_space(
9105+
spa_special_embedded_log_class(spa));
90959106

90969107
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
90979108

@@ -9100,6 +9111,8 @@ spa_async_thread(void *arg)
91009111
new_space += metaslab_class_get_space(spa_dedup_class(spa));
91019112
new_space += metaslab_class_get_space(
91029113
spa_embedded_log_class(spa));
9114+
new_space += metaslab_class_get_space(
9115+
spa_special_embedded_log_class(spa));
91039116
mutex_exit(&spa_namespace_lock);
91049117

91059118
/*
@@ -10309,7 +10322,7 @@ spa_sync(spa_t *spa, uint64_t txg)
1030910322

1031010323
metaslab_class_evict_old(spa->spa_normal_class, txg);
1031110324
metaslab_class_evict_old(spa->spa_log_class, txg);
10312-
/* spa_embedded_log_class has only one metaslab per vdev. */
10325+
/* Embedded log classes have only one metaslab per vdev. */
1031310326
metaslab_class_evict_old(spa->spa_special_class, txg);
1031410327
metaslab_class_evict_old(spa->spa_dedup_class, txg);
1031510328

module/zfs/spa_misc.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,6 +1308,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error,
13081308
metaslab_class_validate(spa_log_class(spa));
13091309
metaslab_class_validate(spa_embedded_log_class(spa));
13101310
metaslab_class_validate(spa_special_class(spa));
1311+
metaslab_class_validate(spa_special_embedded_log_class(spa));
13111312
metaslab_class_validate(spa_dedup_class(spa));
13121313

13131314
spa_config_exit(spa, SCL_ALL, spa);
@@ -1896,6 +1897,8 @@ spa_get_slop_space(spa_t *spa)
18961897
*/
18971898
uint64_t embedded_log =
18981899
metaslab_class_get_dspace(spa_embedded_log_class(spa));
1900+
embedded_log += metaslab_class_get_dspace(
1901+
spa_special_embedded_log_class(spa));
18991902
slop -= MIN(embedded_log, slop >> 1);
19001903

19011904
/*
@@ -2000,6 +2003,12 @@ spa_special_class(spa_t *spa)
20002003
return (spa->spa_special_class);
20012004
}
20022005

2006+
metaslab_class_t *
2007+
spa_special_embedded_log_class(spa_t *spa)
2008+
{
2009+
return (spa->spa_special_embedded_log_class);
2010+
}
2011+
20032012
metaslab_class_t *
20042013
spa_dedup_class(spa_t *spa)
20052014
{

module/zfs/vdev.c

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -282,12 +282,15 @@ vdev_getops(const char *type)
282282
* Given a vdev and a metaslab class, find which metaslab group we're
283283
* interested in. All vdevs may belong to two different metaslab classes.
284284
* Dedicated slog devices use only the primary metaslab group, rather than a
285-
* separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
285+
* separate log group. For embedded slogs, vdev_log_mg will be non-NULL and
286+
* will point to either embedded_log_class (for normal vdevs) or
287+
* special_embedded_log_class (for special vdevs).
286288
*/
287289
metaslab_group_t *
288290
vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
289291
{
290-
if (mc == spa_embedded_log_class(vd->vdev_spa) &&
292+
if ((mc == spa_embedded_log_class(vd->vdev_spa) ||
293+
mc == spa_special_embedded_log_class(vd->vdev_spa)) &&
291294
vd->vdev_log_mg != NULL)
292295
return (vd->vdev_log_mg);
293296
else
@@ -1508,8 +1511,13 @@ vdev_metaslab_group_create(vdev_t *vd)
15081511
vd->vdev_mg = metaslab_group_create(mc, vd);
15091512

15101513
if (!vd->vdev_islog) {
1511-
vd->vdev_log_mg = metaslab_group_create(
1512-
spa_embedded_log_class(spa), vd);
1514+
if (mc == spa_special_class(spa)) {
1515+
vd->vdev_log_mg = metaslab_group_create(
1516+
spa_special_embedded_log_class(spa), vd);
1517+
} else {
1518+
vd->vdev_log_mg = metaslab_group_create(
1519+
spa_embedded_log_class(spa), vd);
1520+
}
15131521
}
15141522

15151523
/*
@@ -1624,9 +1632,10 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
16241632
/*
16251633
* Find the emptiest metaslab on the vdev and mark it for use for
16261634
* embedded slog by moving it from the regular to the log metaslab
1627-
* group.
1635+
* group. This works for normal and special vdevs.
16281636
*/
1629-
if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
1637+
if ((vd->vdev_mg->mg_class == spa_normal_class(spa) ||
1638+
vd->vdev_mg->mg_class == spa_special_class(spa)) &&
16301639
vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
16311640
avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
16321641
uint64_t slog_msid = 0;

module/zfs/zfs_log.c

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -607,8 +607,6 @@ zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
607607
* called as soon as the write is on stable storage (be it via a DMU sync or a
608608
* ZIL commit).
609609
*/
610-
static uint_t zfs_immediate_write_sz = 32768;
611-
612610
void
613611
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
614612
znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
@@ -626,15 +624,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
626624
return;
627625
}
628626

629-
if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
630-
write_state = WR_INDIRECT;
631-
else if (!spa_has_slogs(zilog->zl_spa) &&
632-
resid >= zfs_immediate_write_sz)
633-
write_state = WR_INDIRECT;
634-
else if (commit)
635-
write_state = WR_COPIED;
636-
else
637-
write_state = WR_NEED_COPY;
627+
write_state = zil_write_state(zilog, resid, blocksize, o_direct, commit);
638628

639629
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen,
640630
sizeof (gen));
@@ -939,5 +929,3 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
939929
}
940930
}
941931

942-
ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW,
943-
"Largest data block to write to zil");

0 commit comments

Comments
 (0)