-
Notifications
You must be signed in to change notification settings - Fork 1.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement parallel dbuf eviction #16487
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); | |
*/ | ||
static kmem_cache_t *dbuf_kmem_cache; | ||
static taskq_t *dbu_evict_taskq; | ||
static taskq_t *dbuf_evict_taskq; | ||
|
||
static kthread_t *dbuf_cache_evict_thread; | ||
static kmutex_t dbuf_evict_lock; | ||
|
@@ -237,6 +238,20 @@ static uint_t dbuf_metadata_cache_shift = 6; | |
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */ | ||
static uint_t dbuf_mutex_cache_shift = 0; | ||
|
||
/* | ||
* Number of dbuf_evict threads | ||
*/ | ||
static uint_t dbuf_evict_threads = 0; | ||
|
||
/* | ||
* The minimum number of bytes we can evict at once is a block size. | ||
* So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task. | ||
* We use this value to compute a scaling factor for the eviction tasks. | ||
*/ | ||
#define DBUF_MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT) | ||
|
||
static uint_t dbuf_evict_parallel = 0; | ||
|
||
static unsigned long dbuf_cache_target_bytes(void); | ||
static unsigned long dbuf_metadata_cache_target_bytes(void); | ||
|
||
|
@@ -762,26 +777,47 @@ dbuf_cache_above_lowater(void) | |
} | ||
|
||
/* | ||
* Evict the oldest eligible dbuf from the dbuf cache. | ||
* Evict the oldest eligible dbufs from the dbuf cache. | ||
* Use the multilist sublist (mls) with the provided index #idx. | ||
*/ | ||
static void | ||
dbuf_evict_one(void) | ||
dbuf_evict_many(uint64_t bytes, unsigned int idx) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would change the order of the arguments. |
||
{ | ||
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); | ||
int64_t evicted = 0; | ||
dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); | ||
marker->db_objset = NULL; | ||
|
||
ASSERT3U(idx, <, multilist_get_num_sublists( | ||
&dbuf_caches[DB_DBUF_CACHE].cache)); | ||
|
||
multilist_sublist_t *mls = multilist_sublist_lock_idx( | ||
&dbuf_caches[DB_DBUF_CACHE].cache, idx); | ||
|
||
ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); | ||
|
||
dmu_buf_impl_t *db = multilist_sublist_tail(mls); | ||
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { | ||
db = multilist_sublist_prev(mls, db); | ||
} | ||
multilist_sublist_insert_after(mls, db, marker); | ||
|
||
while (db != NULL && evicted < bytes) { | ||
int skip = 0; | ||
while (db != NULL && (db->db_objset == NULL || | ||
mutex_tryenter(&db->db_mtx) == 0)) { | ||
db = multilist_sublist_prev(mls, db); | ||
if (skip == 0) | ||
skip = 1; | ||
Comment on lines
+806
to
+807
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why check before assigning? |
||
} | ||
|
||
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, | ||
multilist_sublist_t *, mls); | ||
if (db == NULL) | ||
break; | ||
|
||
if (skip) { | ||
multilist_sublist_remove(mls, marker); | ||
multilist_sublist_insert_before(mls, db, marker); | ||
} | ||
|
||
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, | ||
multilist_sublist_t *, mls); | ||
|
||
if (db != NULL) { | ||
multilist_sublist_remove(mls, db); | ||
multilist_sublist_unlock(mls); | ||
uint64_t size = db->db.db_size; | ||
|
@@ -797,9 +833,121 @@ dbuf_evict_one(void) | |
db->db_caching_status = DB_NO_CACHE; | ||
dbuf_destroy(db); | ||
DBUF_STAT_BUMP(cache_total_evicts); | ||
} else { | ||
multilist_sublist_unlock(mls); | ||
evicted += size + usize; | ||
|
||
mls = multilist_sublist_lock_idx( | ||
&dbuf_caches[DB_DBUF_CACHE].cache, idx); | ||
db = multilist_sublist_prev(mls, marker); | ||
} | ||
|
||
multilist_sublist_remove(mls, marker); | ||
multilist_sublist_unlock(mls); | ||
kmem_cache_free(dbuf_kmem_cache, marker); | ||
} | ||
|
||
typedef struct evict_arg { | ||
taskq_ent_t tqe; | ||
unsigned idx; | ||
uint64_t bytes; | ||
} evict_arg_t; | ||
|
||
static void | ||
dbuf_evict_task(void *arg) | ||
{ | ||
evict_arg_t *eva = arg; | ||
dbuf_evict_many(eva->bytes, eva->idx); | ||
} | ||
|
||
static void | ||
dbuf_evict(void) | ||
{ | ||
int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) - | ||
dbuf_cache_lowater_bytes()); | ||
|
||
if (bytes <= 0) | ||
return; | ||
|
||
unsigned idx = multilist_get_random_index( | ||
&dbuf_caches[DB_DBUF_CACHE].cache); | ||
|
||
if (!dbuf_evict_parallel) | ||
return (dbuf_evict_many(bytes, idx)); | ||
|
||
/* | ||
* Go to the parallel eviction. | ||
*/ | ||
unsigned int num_sublists = multilist_get_num_sublists( | ||
&dbuf_caches[DB_DBUF_CACHE].cache); | ||
evict_arg_t *evarg = kmem_zalloc(sizeof (*evarg) * num_sublists, | ||
KM_SLEEP); | ||
/* | ||
* How we scale | ||
* | ||
* Example 1, # of chunks less than # of tasks. | ||
* We have: | ||
* - 4 tasks | ||
* - 3 chunks | ||
* - 3 full col | ||
* - 0 low cols. | ||
* | ||
* The first low col index is 3. | ||
* The tasks #0-#2 evict 1 chunk each. | ||
* | ||
* 0 | 1 | 2 | 3 | | ||
* +===+===+===+===+ | ||
* | x | x | x | | | ||
* +---+---+---+---+ | ||
* | ||
* Example 2, # of chunks more than # of tasks. | ||
* We have: | ||
* - 4 tasks | ||
* - 9 chunks | ||
* - 1 full col | ||
* - 3 low cols | ||
* | ||
* The first low col index is 1. | ||
* The task #0 evicts 3 chunks, the others evict 2 chunks each. | ||
* | ||
* 0 | 1 | 2 | 3 | | ||
* +===+===+===+===+ | ||
* | x | x | x | x | | ||
* +---+---+---+---+ | ||
* | x | x | x | x | | ||
* +---+---+---+---+ | ||
* | x | | | | | ||
* +---+---+---+---+ | ||
*/ | ||
|
||
/* | ||
* Compute number of tasks to run (n), first low col index (k), | ||
* normal and low bytes per task. | ||
*/ | ||
uint64_t nchunks = ((bytes - 1) >> DBUF_MIN_EVICT_PERTASK_SHIFT) + 1; | ||
unsigned n = nchunks < num_sublists ? nchunks : num_sublists; | ||
uint64_t fullrows = nchunks / n; | ||
unsigned lastrowcols = nchunks % n; | ||
unsigned k = (lastrowcols ? lastrowcols : n); | ||
|
||
uint64_t bytes_pertask_low = fullrows << DBUF_MIN_EVICT_PERTASK_SHIFT; | ||
uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ? | ||
(1 << DBUF_MIN_EVICT_PERTASK_SHIFT) : 0); | ||
Comment on lines
+925
to
+933
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as for ARC, this seems over-engineered. No need for tasks to be a multiple of |
||
|
||
for (unsigned i = 0; i < n; i++) { | ||
uint64_t evict = i < k ? bytes_pertask : bytes_pertask_low; | ||
|
||
evarg[i].idx = idx; | ||
evarg[i].bytes = evict; | ||
|
||
taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task, | ||
&evarg[i], 0, &evarg[i].tqe); | ||
Comment on lines
+938
to
+942
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Somewhere here should be |
||
|
||
/* wrap idx */ | ||
if (++idx >= num_sublists) | ||
idx = 0; | ||
} | ||
|
||
taskq_wait(dbuf_evict_taskq); | ||
kmem_free(evarg, sizeof (*evarg) * num_sublists); | ||
} | ||
|
||
/* | ||
|
@@ -833,7 +981,7 @@ dbuf_evict_thread(void *unused) | |
* minimize lock contention. | ||
*/ | ||
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { | ||
dbuf_evict_one(); | ||
dbuf_evict(); | ||
} | ||
|
||
mutex_enter(&dbuf_evict_lock); | ||
|
@@ -860,7 +1008,7 @@ dbuf_evict_notify(uint64_t size) | |
*/ | ||
if (size > dbuf_cache_target_bytes()) { | ||
if (size > dbuf_cache_hiwater_bytes()) | ||
dbuf_evict_one(); | ||
dbuf_evict(); | ||
Comment on lines
1009
to
+1011
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's assume we have 10 user threads calling this. I suppose each of them will try to create its own task set to evict the same full amount of extra dbuf cache using all the same CPUs. In the best case it may end up with an empty dbuf cache. I am not sure I greatly like the design of one main eviction thread calling a bunch of other taskqs, but each client thread doing that definitely looks weird. I think if user threads have to do evictions, they should do it directly, just doing more than one buffer at a time to be more efficient, as you have said. |
||
cv_signal(&dbuf_evict_cv); | ||
} | ||
} | ||
|
@@ -965,11 +1113,16 @@ dbuf_init(void) | |
|
||
dbuf_stats_init(h); | ||
|
||
if (dbuf_evict_threads == 0) | ||
dbuf_evict_threads = MAX(2, MIN(16, max_ncpus >> 3)); | ||
/* | ||
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc | ||
* configuration is not required. | ||
*/ | ||
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); | ||
dbuf_evict_taskq = taskq_create("dbuf_evict", | ||
MIN(dbuf_evict_threads, max_ncpus), defclsyspri, | ||
MIN(dbuf_evict_threads, max_ncpus), max_ncpus, TASKQ_PREPOPULATE); | ||
|
||
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { | ||
multilist_create(&dbuf_caches[dcs].cache, | ||
|
@@ -1035,6 +1188,8 @@ dbuf_fini(void) | |
|
||
kmem_cache_destroy(dbuf_kmem_cache); | ||
taskq_destroy(dbu_evict_taskq); | ||
taskq_wait(dbuf_evict_taskq); | ||
taskq_destroy(dbuf_evict_taskq); | ||
|
||
mutex_enter(&dbuf_evict_lock); | ||
dbuf_evict_thread_exit = B_TRUE; | ||
|
@@ -3963,7 +4118,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag) | |
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() | ||
* ^ | | ||
* | | | ||
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+ | ||
* +-----dbuf_destroy()<--dbuf_evict()<------------+ | ||
* | ||
*/ | ||
void | ||
|
@@ -5282,3 +5437,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW, | |
|
||
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD, | ||
"Set size of dbuf cache mutex array as log2 shift."); | ||
|
||
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_parallel, UINT, ZMOD_RW, | ||
"Evict from the dbuf cache in parallel using a taskq"); | ||
|
||
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_threads, UINT, ZMOD_RW, | ||
"Maximum number of dbuf_evict threads"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a time you would want both
dbuf_evict_threads >= 2
and dbuf_evict_parallel=0
? Just wondering if you can simplify this to dbuf_evict_threads
only, and imply "parallel" if it's set to 2 threads or more.