diff --git a/include/sys/zil.h b/include/sys/zil.h
index 384678b223d5..0f0f9327af0f 100644
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -604,7 +604,7 @@
 extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
 extern int zil_suspend(const char *osname, void **cookiep);
 extern void zil_resume(void *cookie);
-extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
+extern void zil_lwb_add_flush(struct lwb *lwb, zio_t *zio);
 extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
 extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
 
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index 9a34bafc1c77..74c205a8f651 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -172,15 +172,6 @@ typedef struct itx_async_node {
 	avl_node_t	ia_node;	/* AVL tree linkage */
 } itx_async_node_t;
 
-/*
- * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
- * we've touched so we know which ones need a write cache flush at the end.
- */
-typedef struct zil_vdev_node {
-	uint64_t	zv_vdev;	/* vdev to be flushed */
-	avl_node_t	zv_node;	/* AVL tree linkage */
-} zil_vdev_node_t;
-
 #define	ZIL_BURSTS 8
 
 /*
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 6492d17360f6..1aa6bd533795 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -180,16 +180,17 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_FLAG_SCRUB		(1ULL << 4)
 #define	ZIO_FLAG_SCAN_THREAD	(1ULL << 5)
 #define	ZIO_FLAG_PHYSICAL	(1ULL << 6)
+#define	ZIO_FLAG_VDEV_TRACE	(1ULL << 7)
 
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
 /*
  * Flags inherited by ddt, gang, and vdev children.
  */
-#define	ZIO_FLAG_CANFAIL	(1ULL << 7)	/* must be first for INHERIT */
-#define	ZIO_FLAG_SPECULATIVE	(1ULL << 8)
-#define	ZIO_FLAG_CONFIG_WRITER	(1ULL << 9)
-#define	ZIO_FLAG_DONT_RETRY	(1ULL << 10)
+#define	ZIO_FLAG_CANFAIL	(1ULL << 8)	/* must be first for INHERIT */
+#define	ZIO_FLAG_SPECULATIVE	(1ULL << 9)
+#define	ZIO_FLAG_CONFIG_WRITER	(1ULL << 10)
+#define	ZIO_FLAG_DONT_RETRY	(1ULL << 11)
 #define	ZIO_FLAG_NODATA		(1ULL << 12)
 #define	ZIO_FLAG_INDUCE_DAMAGE	(1ULL << 13)
 #define	ZIO_FLAG_IO_ALLOCATING	(1ULL << 14)
@@ -445,6 +446,11 @@ enum zio_qstate {
 	ZIO_QS_ACTIVE,
 };
 
+typedef struct zio_vdev_trace {
+	uint64_t	zvt_guid;
+	avl_node_t	zvt_node;
+} zio_vdev_trace_t;
+
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t	io_bookmark;
@@ -513,6 +519,7 @@
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+	avl_tree_t	io_vdev_trace_tree;
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
@@ -636,6 +643,13 @@ extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
 extern void zio_vdev_io_redone(zio_t *zio);
 
+extern void zio_vdev_trace_init(avl_tree_t *t);
+extern void zio_vdev_trace_fini(avl_tree_t *t);
+extern void zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst);
+extern void zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst);
+extern void zio_vdev_trace_flush(zio_t *zio, avl_tree_t *t);
+extern void zio_vdev_trace_empty(avl_tree_t *t);
+
 extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
 
 extern void zio_checksum_verified(zio_t *zio);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3dcf49ceb64e..588606977613 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1800,12 +1800,11 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	/*
-	 * Record the vdev(s) backing this blkptr so they can be flushed after
-	 * the writes for the lwb have completed.
+	 * Capture the trace records for this zio so the vdevs can be flushed
+	 * after the writes for the lwb have completed.
 	 */
-	if (zio->io_error == 0) {
-		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
-	}
+	if (zio->io_error == 0)
+		zil_lwb_add_flush(zgd->zgd_lwb, zio);
 
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
@@ -1859,10 +1858,10 @@ dmu_sync_late_arrival_done(zio_t *zio)
 
 	if (zio->io_error == 0) {
 		/*
-		 * Record the vdev(s) backing this blkptr so they can be
+		 * Capture the trace records for this zio so the vdevs can be
 		 * flushed after the writes for the lwb have completed.
 		 */
-		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+		zil_lwb_add_flush(zgd->zgd_lwb, zio);
 
 		if (!BP_IS_HOLE(bp)) {
 			blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
@@ -1949,7 +1948,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
-	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+	    dsa, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, zb));
 
 	return (0);
 }
@@ -2116,7 +2116,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 	zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
 	    dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
 	    &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE,
+	    &zb));
 
 	return (0);
 }
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index f451c170fcec..ae1a4f7ec01d 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -798,15 +798,6 @@ zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
 	}
 }
 
-static int
-zil_lwb_vdev_compare(const void *x1, const void *x2)
-{
-	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
-	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
-
-	return (TREE_CMP(v1, v2));
-}
-
 /*
  * Allocate a new lwb. We may already have a block pointer for it, in which
  * case we get size and version from there. Or we may not yet, in which case
@@ -1369,14 +1360,8 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
 }
 
 void
-zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
+zil_lwb_add_flush(struct lwb *lwb, zio_t *zio)
 {
-	avl_tree_t *t = &lwb->lwb_vdev_tree;
-	avl_index_t where;
-	zil_vdev_node_t *zv, zvsearch;
-	int ndvas = BP_GET_NDVAS(bp);
-	int i;
-
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
@@ -1384,53 +1369,10 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 		return;
 
 	mutex_enter(&lwb->lwb_vdev_lock);
-	for (i = 0; i < ndvas; i++) {
-		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
-		if (avl_find(t, &zvsearch, &where) == NULL) {
-			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
-			zv->zv_vdev = zvsearch.zv_vdev;
-			avl_insert(t, zv, where);
-		}
-	}
+	zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree);
 	mutex_exit(&lwb->lwb_vdev_lock);
 }
 
-static void
-zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
-{
-	avl_tree_t *src = &lwb->lwb_vdev_tree;
-	avl_tree_t *dst = &nlwb->lwb_vdev_tree;
-	void *cookie = NULL;
-	zil_vdev_node_t *zv;
-
-	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
-	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
-	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
-
-	/*
-	 * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
-	 * not need the protection of lwb_vdev_lock (it will only be modified
-	 * while holding zilog->zl_lock) as its writes and those of its
-	 * children have all completed. The younger 'nlwb' may be waiting on
-	 * future writes to additional vdevs.
-	 */
-	mutex_enter(&nlwb->lwb_vdev_lock);
-	/*
-	 * Tear down the 'lwb' vdev tree, ensuring that entries which do not
-	 * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
-	 */
-	while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
-		avl_index_t where;
-
-		if (avl_find(dst, zv, &where) == NULL) {
-			avl_insert(dst, zv, where);
-		} else {
-			kmem_free(zv, sizeof (*zv));
-		}
-	}
-	mutex_exit(&nlwb->lwb_vdev_lock);
-}
-
 void
 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 {
@@ -1573,9 +1515,6 @@ zil_lwb_write_done(zio_t *zio)
 	lwb_t *lwb = zio->io_private;
 	spa_t *spa = zio->io_spa;
 	zilog_t *zilog = lwb->lwb_zilog;
-	avl_tree_t *t = &lwb->lwb_vdev_tree;
-	void *cookie = NULL;
-	zil_vdev_node_t *zv;
 	lwb_t *nlwb;
 
 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
@@ -1601,13 +1540,9 @@ zil_lwb_write_done(zio_t *zio)
 		nlwb = NULL;
 	mutex_exit(&zilog->zl_lock);
 
-	if (avl_numnodes(t) == 0)
-		return;
-
 	/*
-	 * If there was an IO error, we're not going to call zio_flush()
-	 * on these vdevs, so we simply empty the tree and free the
-	 * nodes. We avoid calling zio_flush() since there isn't any
+	 * If there was an IO error, there's no point continuing. We
+	 * avoid calling zio_flush() since there isn't any
 	 * good reason for doing so, after the lwb block failed to be
 	 * written out.
 	 *
@@ -1617,38 +1552,78 @@ zil_lwb_write_done(zio_t *zio)
 	 * we expect any error seen here, to have been propagated to
 	 * that function).
 	 */
-	if (zio->io_error != 0) {
-		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
-			kmem_free(zv, sizeof (*zv));
+	if (zio->io_error != 0 || zil_nocacheflush) {
+		/*
+		 * Trace records may have been passed on to us from a
+		 * previously-deferred lwb. Since we're not going to use them,
+		 * we clean them up now.
+		 */
+		zio_vdev_trace_empty(&lwb->lwb_vdev_tree);
 		return;
 	}
 
 	/*
-	 * If this lwb does not have any threads waiting for it to complete, we
-	 * want to defer issuing the flush command to the vdevs written to by
-	 * "this" lwb, and instead rely on the "next" lwb to handle the flush
-	 * command for those vdevs. Thus, we merge the vdev tree of "this" lwb
-	 * with the vdev tree of the "next" lwb in the list, and assume the
-	 * "next" lwb will handle flushing the vdevs (or deferring the flush(s)
-	 * again).
+	 * This zio was issued with ZIO_FLAG_VDEV_TRACE, and so its
+	 * io_vdev_trace_tree has the set of zio_vdev_trace_t records for vdevs
+	 * that were written to by "this" lwb.
 	 *
-	 * This is a useful performance optimization, especially for workloads
+	 * To complete the commit, we need to issue a flush to those vdevs. If
+	 * this lwb does not have any threads waiting for it to complete, we
+	 * want to defer issuing that flush, and instead rely on the "next" lwb
+	 * to handle the flush command for those vdevs.
+	 *
+	 * (This is a useful performance optimization, especially for workloads
 	 * with lots of async write activity and few sync write and/or fsync
 	 * activity, as it has the potential to coalesce multiple flush
-	 * commands to a vdev into one.
+	 * commands to a vdev into one).
+	 *
+	 * However, if this lwb does have threads waiting for it, we must issue
+	 * the flush now.
+	 *
+	 * Regardless of whether we issue the flush now or defer it to
+	 * the "next" lwb, our lwb_vdev_tree may also have entries on it that
+	 * were deferred to us.
+	 *
+	 * So, our job here is to assemble a single tree of vdev trace records
+	 * including all those recorded by this zio, plus any that
+	 * were deferred to us, and either issue the flush now, or pass them on
+	 * to the "next" lwb.
+	 *
+	 * Because the zio owns the entries on its trace tree, and it is no
+	 * longer available to us after this function returns, we make our
+	 * own copies instead of just stealing them.
 	 */
 	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
-		zil_lwb_flush_defer(lwb, nlwb);
-		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+		ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+		ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+		/*
+		 * 'lwb' is at a point in its lifetime where
+		 * lwb_vdev_tree does not need the protection of lwb_vdev_lock
+		 * (it will only be modified while holding zilog->zl_lock), as
+		 * its writes and those of its children have all completed.
+		 * The younger 'nlwb' may be waiting on future writes to
+		 * additional vdevs.
+		 */
+		mutex_enter(&nlwb->lwb_vdev_lock);
+		zio_vdev_trace_copy(&zio->io_vdev_trace_tree,
+		    &nlwb->lwb_vdev_tree);
+
+		/*
+		 * Also move any trace records from this lwb to the next.
+		 */
+		zio_vdev_trace_move(&lwb->lwb_vdev_tree, &nlwb->lwb_vdev_tree);
+		mutex_exit(&nlwb->lwb_vdev_lock);
 		return;
 	}
 
-	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
-		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
-		if (vd != NULL)
-			zio_flush(lwb->lwb_root_zio, vd, B_TRUE);
-		kmem_free(zv, sizeof (*zv));
-	}
+	/*
+	 * Fill out the trace tree with records, then issue the flush,
+	 * delivering errors to zil_lwb_flush_vdevs_done().
+	 */
+	zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree);
+	zio_vdev_trace_flush(lwb->lwb_root_zio, &lwb->lwb_vdev_tree);
+	zio_vdev_trace_empty(&lwb->lwb_vdev_tree);
 }
 
 /*
@@ -1931,8 +1906,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 	lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
 	    &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
-	    lwb, prio, ZIO_FLAG_CANFAIL, &zb);
-	zil_lwb_add_block(lwb, &lwb->lwb_blk);
+	    lwb, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, &zb);
 
 	if (lwb->lwb_slim) {
 		/* For Slim ZIL only write what is used. */
@@ -3773,8 +3747,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
 	list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
 	list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
-	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
-	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
+	zio_vdev_trace_init(&lwb->lwb_vdev_tree);
 	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
@@ -3785,7 +3758,7 @@ zil_lwb_dest(void *vbuf, void *unused)
 {
 	(void) unused;
 	lwb_t *lwb = vbuf;
 	mutex_destroy(&lwb->lwb_vdev_lock);
-	avl_destroy(&lwb->lwb_vdev_tree);
+	zio_vdev_trace_fini(&lwb->lwb_vdev_tree);
 	list_destroy(&lwb->lwb_waiters);
 	list_destroy(&lwb->lwb_itxs);
 }
@@ -4373,7 +4346,6 @@ EXPORT_SYMBOL(zil_sync);
 EXPORT_SYMBOL(zil_clean);
 EXPORT_SYMBOL(zil_suspend);
 EXPORT_SYMBOL(zil_resume);
-EXPORT_SYMBOL(zil_lwb_add_block);
 EXPORT_SYMBOL(zil_bp_tree_add);
 EXPORT_SYMBOL(zil_set_sync);
 EXPORT_SYMBOL(zil_set_logbias);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index db922e57ffaf..ff3a5b6176d3 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -76,6 +76,7 @@ static int zio_deadman_log_all = B_FALSE;
  */
 static kmem_cache_t *zio_cache;
 static kmem_cache_t *zio_link_cache;
+static kmem_cache_t *zio_vdev_trace_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
@@ -157,6 +158,8 @@ zio_init(void)
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	zio_vdev_trace_cache = kmem_cache_create("zio_vdev_trace_cache",
+	    sizeof (zio_vdev_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -285,6 +288,7 @@ zio_fini(void)
 		VERIFY3P(zio_data_buf_cache[i], ==, NULL);
 	}
 
+	kmem_cache_destroy(zio_vdev_trace_cache);
 	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
@@ -796,6 +800,58 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
 		pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
 
+	/*
+	 * If all of the following are true:
+	 *
+	 * - the parent has requested vdev tracing
+	 * - the child has just completed
+	 * - the child was for a real vdev
+	 * - the child is "interesting" for tracing purposes (see below)
+	 *
+	 * then we can stash some information about the vdev on the trace tree.
+	 *
+	 * "Interesting" means a vdev whose response was a direct contributor
+	 * to the success of the overall zio; that is, we only consider zios
+	 * that succeeded, and weren't something that was allowed or expected
+	 * to fail (eg an aggregation padding write).
+	 *
+	 * This is important for the initial use case (knowing which vdevs were
+	 * written to but not flushed), and is arguably correct for all cases
+	 * (a vdev that returned an error, by definition, did not participate
+	 * in completing the zio). It's necessary in practice because an
+	 * error from a leaf does not necessarily mean its parent will error
+	 * out too (eg raidz can sometimes compensate for failed writes). If
+	 * some future case requires more complex filtering we can look at
+	 * stashing more info into zio_vdev_trace_t.
+	 */
+	if (pio->io_flags & ZIO_FLAG_VDEV_TRACE &&
+	    wait == ZIO_WAIT_DONE && zio->io_vd != NULL &&
+	    ((zio->io_flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) == 0)) {
+		avl_tree_t *t = &pio->io_vdev_trace_tree;
+		zio_vdev_trace_t *zvt, zvt_search;
+		avl_index_t where;
+
+		if (zio->io_error == 0) {
+			zvt_search.zvt_guid = zio->io_vd->vdev_guid;
+			if (avl_find(t, &zvt_search, &where) == NULL) {
+				zvt = kmem_cache_alloc(
+				    zio_vdev_trace_cache, KM_SLEEP);
+				zvt->zvt_guid = zio->io_vd->vdev_guid;
+				avl_insert(t, zvt, where);
+			}
+		}
+
+		/*
+		 * If the child has itself collected trace records, copy them
+		 * to ours. Note that we can't steal them, as there may be
+		 * multiple parents.
+		 */
+		if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) {
+			zio_vdev_trace_copy(&zio->io_vdev_trace_tree,
+			    &pio->io_vdev_trace_tree);
+		}
+	}
+
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
@@ -882,6 +939,91 @@ zio_bookmark_compare(const void *x1, const void *x2)
 	return (0);
 }
 
+static int
+zio_vdev_trace_compare(const void *x1, const void *x2)
+{
+	const uint64_t v1 = ((zio_vdev_trace_t *)x1)->zvt_guid;
+	const uint64_t v2 = ((zio_vdev_trace_t *)x2)->zvt_guid;
+
+	return (TREE_CMP(v1, v2));
+}
+
+void
+zio_vdev_trace_init(avl_tree_t *t)
+{
+	avl_create(t, zio_vdev_trace_compare, sizeof (zio_vdev_trace_t),
+	    offsetof(zio_vdev_trace_t, zvt_node));
+}
+
+void
+zio_vdev_trace_fini(avl_tree_t *t)
+{
+	ASSERT(avl_is_empty(t));
+	avl_destroy(t);
+}
+
+/*
+ * Copy trace records from src and add them to dst, skipping any that are
+ * already on dst.
+ */
+void
+zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst)
+{
+	zio_vdev_trace_t *zvt, *nzvt;
+	avl_index_t where;
+
+	for (zvt = avl_first(src); zvt != NULL; zvt = AVL_NEXT(src, zvt)) {
+		if (avl_find(dst, zvt, &where) == NULL) {
+			nzvt = kmem_cache_alloc(zio_vdev_trace_cache, KM_SLEEP);
+			nzvt->zvt_guid = zvt->zvt_guid;
+			avl_insert(dst, nzvt, where);
+		}
+	}
+}
+
+/* Move trace records from src to dst. src will be empty upon return. */
+void
+zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst)
+{
+	zio_vdev_trace_t *zvt;
+	avl_index_t where;
+	void *cookie;
+
+	cookie = NULL;
+	while ((zvt = avl_destroy_nodes(src, &cookie)) != NULL) {
+		if (avl_find(dst, zvt, &where) == NULL)
+			avl_insert(dst, zvt, where);
+		else
+			kmem_cache_free(zio_vdev_trace_cache, zvt);
+	}
+
+	ASSERT(avl_is_empty(src));
+}
+
+void
+zio_vdev_trace_flush(zio_t *pio, avl_tree_t *t)
+{
+	spa_t *spa = pio->io_spa;
+	zio_vdev_trace_t *zvt;
+	vdev_t *vd;
+
+	for (zvt = avl_first(t); zvt != NULL; zvt = AVL_NEXT(t, zvt)) {
+		vd = vdev_lookup_by_guid(spa->spa_root_vdev, zvt->zvt_guid);
+		if (vd != NULL && vd->vdev_children == 0)
+			zio_flush(pio, vd, B_TRUE);
+	}
+}
+
+void
+zio_vdev_trace_empty(avl_tree_t *t)
+{
+	zio_vdev_trace_t *zvt;
+	void *cookie = NULL;
+	while ((zvt = avl_destroy_nodes(t, &cookie)) != NULL)
+		kmem_cache_free(zio_vdev_trace_cache, zvt);
+	ASSERT(avl_is_empty(t));
+}
+
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
@@ -919,6 +1062,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	    offsetof(zio_link_t, zl_child_node));
 	metaslab_trace_init(&zio->io_alloc_list);
 
+	zio_vdev_trace_init(&zio->io_vdev_trace_tree);
+
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
@@ -984,6 +1129,8 @@
 void
 zio_destroy(zio_t *zio)
 {
+	zio_vdev_trace_empty(&zio->io_vdev_trace_tree);
+	zio_vdev_trace_fini(&zio->io_vdev_trace_tree);
 	metaslab_trace_fini(&zio->io_alloc_list);
 	list_destroy(&zio->io_parent_list);
 	list_destroy(&zio->io_child_list);
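
Note for reviewers (not part of the patch): below is a minimal sketch of the intended consumer pattern for the new zio_vdev_trace_* API, modelled on the zil_lwb_write_done() changes above. The callback name (my_write_done) and the caller-owned tree passed through io_private are hypothetical; only ZIO_FLAG_VDEV_TRACE, io_vdev_trace_tree, and the zio_vdev_trace_* calls come from this change.

/*
 * Hypothetical sketch: done callback for a zio created with
 * ZIO_FLAG_VDEV_TRACE. By the time this runs, zio_notify_parent() has
 * recorded one zio_vdev_trace_t (keyed by vdev guid) for every leaf
 * vdev that contributed to the zio's success, so we can flush exactly
 * the vdevs we wrote to, and no others.
 */
static void
my_write_done(zio_t *zio)
{
	/* Caller-owned tree, set up earlier with zio_vdev_trace_init(). */
	avl_tree_t *t = zio->io_private;

	if (zio->io_error != 0) {
		/* Nothing worth flushing behind a failed write. */
		zio_vdev_trace_empty(t);
		return;
	}

	/*
	 * The zio owns its trace records and frees them in zio_destroy(),
	 * so copy them out rather than stealing them.
	 */
	zio_vdev_trace_copy(&zio->io_vdev_trace_tree, t);

	/*
	 * Issue the flushes as children of a parent zio that is still
	 * open (the lwb code uses lwb_root_zio for this), then release
	 * the records.
	 */
	zio_vdev_trace_flush(zio_unique_parent(zio), t);
	zio_vdev_trace_empty(t);
}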