Skip to content

Commit

Permalink
Special failsafe feature
Browse files Browse the repository at this point in the history
Special failsafe is a feature that allows your special allocation
class vdevs ('special' and 'dedup') to fail without losing any data.  It
works by automatically backing up all special data to the pool.  This
has the added benefit that you can safely create pools with non-matching
alloc class redundancy (like a mirrored pool with a single special
device).

This behavior is controlled via two properties:

1. feature@special_failsafe - This feature flag enables the special
   failsafe subsystem.  It prevents the backed-up pool from being
   imported read/write on an older version of ZFS that does not
   support special failsafe.

2. special_failsafe - This pool property is the main on/off switch
   to control special failsafe.  If you want to use special failsafe
   simply turn it on either at creation time or with `zpool set` prior
   to adding a special alloc class device.  After special devices have
   been added, you can either leave the property on or turn it
   off, but once it's off you can't turn it back on again.

Note that special failsafe may create a performance penalty over pure
alloc class writes due to the extra backup copy write to the pool.
Alloc class reads should not be affected as they always read from DVA 0
first (the copy of the data on the special device).  It can also inflate
disk usage on dRAID pools.

Closes: openzfs#15118

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
  • Loading branch information
tonyhutter committed May 14, 2024
1 parent abec7dc commit 0b0ecbb
Show file tree
Hide file tree
Showing 54 changed files with 2,303 additions and 295 deletions.
34 changes: 34 additions & 0 deletions cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1162,6 +1162,23 @@ zpool_do_add(int argc, char **argv)
}
}

/*
* Special case:
*
* We need to know the special_failsafe pool property value to determine
* if the new vdev configuration has the correct redundancy requirements
* for special and dedup vdevs.
*
* Pass in the current value for special_failsafe to the proplist.
*/
char strval[ZFS_MAXPROPLEN];
if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval,
ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) {
verify(add_prop_list(
zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval,
&props, B_TRUE) == 0);
}

/* pass off to make_root_vdev for processing */
nvroot = make_root_vdev(zhp, props, !check_inuse,
check_replication, B_FALSE, dryrun, argc, argv);
Expand Down Expand Up @@ -6848,6 +6865,23 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
}
}

/*
* Special case:
*
* We need to know the special_failsafe pool property value to determine
* if the new vdev configuration has the correct redundancy requirements
* for special and dedup vdevs.
*
* Pass in the current value for special_failsafe to the proplist.
*/
char strval[ZFS_MAXPROPLEN];
if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval,
ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) {
verify(add_prop_list(
zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval,
&props, B_TRUE) == 0);
}

nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE,
argc, argv);
if (nvroot == NULL) {
Expand Down
94 changes: 82 additions & 12 deletions cmd/zpool/zpool_vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
*/
boolean_t error_seen;
boolean_t is_force;
boolean_t is_alloc_class;

void
vdev_error(const char *fmt, ...)
Expand All @@ -94,8 +95,15 @@ vdev_error(const char *fmt, ...)
if (!error_seen) {
(void) fprintf(stderr, gettext("invalid vdev specification\n"));
if (!is_force)
(void) fprintf(stderr, gettext("use '-f' to override "
"the following errors:\n"));
if (is_alloc_class) {
(void) fprintf(stderr, gettext("Turn on the "
"special_failsafe pool property or use '-f'"
" to override the following errors:\n"));
is_alloc_class = B_FALSE;
} else {
(void) fprintf(stderr, gettext("use '-f' to "
"override the following errors:\n"));
}
else
(void) fprintf(stderr, gettext("the following errors "
"must be manually repaired:\n"));
Expand Down Expand Up @@ -442,6 +450,7 @@ typedef struct replication_level {
const char *zprl_type;
uint64_t zprl_children;
uint64_t zprl_parity;
boolean_t zprl_is_alloc_class;
} replication_level_t;

#define ZPOOL_FUZZ (16 * 1024 * 1024)
Expand Down Expand Up @@ -480,13 +489,43 @@ is_raidz_draid(replication_level_t *a, replication_level_t *b)
return (B_FALSE);
}

/*
 * Decide from a property nvlist whether special failsafe is in effect.
 *
 * Returns B_TRUE only when 'props' carries special_failsafe=on and does
 * not carry feature@special_failsafe=disabled.  A missing
 * feature@special_failsafe entry is treated as "not disabled"; a missing
 * special_failsafe entry, or any value other than "on", yields B_FALSE.
 */
static boolean_t
is_special_failsafe_enabled_in_props(nvlist_t *props)
{
	const char *feature_val = NULL;
	const char *prop_val = NULL;

	/* An explicitly disabled feature flag overrides the property. */
	if (nvlist_lookup_string(props, "feature@special_failsafe",
	    &feature_val) == 0 && feature_val != NULL &&
	    strcmp(feature_val, "disabled") == 0)
		return (B_FALSE);

	/* Without the property in the list there is nothing to enable. */
	if (nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), &prop_val) != 0)
		return (B_FALSE);

	if (prop_val == NULL || strcmp(prop_val, "on") != 0)
		return (B_FALSE);

	return (B_TRUE);
}

/*
* Given a list of toplevel vdevs, return the current replication level. If
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
* an error message will be displayed for each self-inconsistent vdev.
*/
static replication_level_t *
get_replication(nvlist_t *nvroot, boolean_t fatal)
get_replication(nvlist_t *props, nvlist_t *nvroot, boolean_t fatal)
{
nvlist_t **top;
uint_t t, toplevels;
Expand All @@ -495,7 +534,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
nvlist_t *nv;
const char *type;
replication_level_t lastrep = {0};
replication_level_t rep;
replication_level_t rep = {0};
replication_level_t *ret;
replication_level_t *raidz, *mirror;
boolean_t dontreport;
Expand All @@ -507,6 +546,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)

for (t = 0; t < toplevels; t++) {
uint64_t is_log = B_FALSE;
const char *str = NULL;

nv = top[t];

Expand All @@ -528,12 +568,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
strcmp(type, VDEV_TYPE_INDIRECT) == 0)
continue;

rep.zprl_type = type;

/*
* If special_failsafe=on then we know the special allocation
* class devices have at least one copy of their data on the
* pool so we can ignore their replication level.
*/
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&str);
if (str &&
((strcmp(str, VDEV_ALLOC_BIAS_SPECIAL) == 0) ||
(strcmp(str, VDEV_ALLOC_BIAS_DEDUP) == 0))) {
rep.zprl_is_alloc_class = B_TRUE;
if (is_special_failsafe_enabled_in_props(props)) {
continue; /* We're backed up, skip redundancy */
}
}

if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0) {
/*
* This is a 'file' or 'disk' vdev.
*/
rep.zprl_type = type;
rep.zprl_children = 1;
rep.zprl_parity = 0;
} else {
Expand All @@ -548,7 +605,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* We also check that the size of each vdev (if it can
* be determined) is the same.
*/
rep.zprl_type = type;
rep.zprl_children = 0;

if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
Expand Down Expand Up @@ -808,7 +864,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* report any difference between the two.
*/
static int
check_replication(nvlist_t *config, nvlist_t *newroot)
check_replication(nvlist_t *props, nvlist_t *config, nvlist_t *newroot)
{
nvlist_t **child;
uint_t children;
Expand All @@ -825,7 +881,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)

verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
if ((current = get_replication(nvroot, B_FALSE)) == NULL)
if ((current = get_replication(props, nvroot, B_FALSE)) == NULL)
return (0);
}
/*
Expand All @@ -850,17 +906,31 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
* Get the replication level of the new vdev spec, reporting any
* inconsistencies found.
*/
if ((new = get_replication(newroot, B_TRUE)) == NULL) {
if ((new = get_replication(props, newroot, B_TRUE)) == NULL) {
free(current);
return (-1);
}

/*
* Check to see if the new vdev spec matches the replication level of
* the current pool.
*/
ret = 0;
if (current != NULL) {
if (current->zprl_is_alloc_class || new->zprl_is_alloc_class)
is_alloc_class = B_TRUE;
else
is_alloc_class = B_FALSE;

/*
* Special case:
* If there were any redundancy problems with alloc class vdevs
* BUT the pool had special_failsafe on, then we're fine since
* all the alloc class data has a copy in the main pool.
*/
if (is_special_failsafe_enabled_in_props(props) &&
is_alloc_class)
goto out;

if (is_raidz_mirror(current, new, &raidz, &mirror) ||
is_raidz_mirror(new, current, &raidz, &mirror)) {
if (raidz->zprl_parity != mirror->zprl_children - 1) {
Expand Down Expand Up @@ -899,7 +969,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
ret = -1;
}
}

out:
free(new);
if (current != NULL)
free(current);
Expand Down Expand Up @@ -1888,7 +1958,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
* found. We include the existing pool spec, if any, as we need to
* catch changes against the existing replication level.
*/
if (check_rep && check_replication(poolconfig, newroot) != 0) {
if (check_rep && check_replication(props, poolconfig, newroot) != 0) {
nvlist_free(newroot);
return (NULL);
}
Expand Down
2 changes: 2 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ typedef enum {
ZPOOL_PROP_BCLONEUSED,
ZPOOL_PROP_BCLONESAVED,
ZPOOL_PROP_BCLONERATIO,
ZPOOL_PROP_SPECIAL_FAILSAFE,
ZPOOL_NUM_PROPS
} zpool_prop_t;

Expand Down Expand Up @@ -1610,6 +1611,7 @@ typedef enum {
ZFS_ERR_CRYPTO_NOTSUP,
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE,
} zfs_errno_t;

/*
Expand Down
3 changes: 2 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -1117,7 +1117,8 @@ extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing,
uint64_t missing_special);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
Expand Down
10 changes: 10 additions & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,13 @@ struct spa {
uint64_t spa_missing_tvds; /* unopenable tvds on load */
uint64_t spa_missing_tvds_allowed; /* allow loading spa? */

/*
* Number of 'spa_missing_tvds' that are alloc class devices
* in the pool that has special_failsafe on, and are thus recoverable
* from errors.
*/
uint64_t spa_missing_recovered_tvds;

uint64_t spa_nonallocating_dspace;
spa_removing_phys_t spa_removing_phys;
spa_vdev_removal_t *spa_vdev_removal;
Expand Down Expand Up @@ -473,6 +480,9 @@ struct spa {
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
zfs_refcount_t spa_refcount; /* number of opens */

/* Backup special/dedup devices data to the pool */
boolean_t spa_special_failsafe;
};

extern char *spa_config_path;
Expand Down
5 changes: 5 additions & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,11 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
void vdev_metaslab_group_create(vdev_t *vd);
uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
extern boolean_t vdev_is_leaf(vdev_t *vd);
extern boolean_t vdev_is_special(vdev_t *vd);
extern boolean_t vdev_is_dedup(vdev_t *vd);
extern boolean_t vdev_is_alloc_class(vdev_t *vd);
extern boolean_t vdev_is_special_failsafe(vdev_t *vd);

/*
* Vdev ashift optimization tunables
Expand Down
1 change: 1 addition & 0 deletions include/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION,
SPA_FEATURE_SPECIAL_FAILSAFE,
SPA_FEATURES
} spa_feature_t;

Expand Down
10 changes: 5 additions & 5 deletions lib/libnvpair/libnvpair.abi
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,11 @@
<parameter type-id='80f4b756'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='strchr' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='95e97e5e'/>
<return type-id='26a90f95'/>
</function-decl>
<function-decl name='strcspn' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='80f4b756'/>
Expand Down Expand Up @@ -2536,11 +2541,6 @@
<parameter type-id='b59d7dce'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='strchr' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='95e97e5e'/>
<return type-id='26a90f95'/>
</function-decl>
<function-decl name='strlen' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<return type-id='b59d7dce'/>
Expand Down
Loading

0 comments on commit 0b0ecbb

Please sign in to comment.