Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove retry from backers on failed candidate validation #2182

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
093d537
remove retry from backers on failed candidate validation
jpserrat Nov 6, 2023
186b5d9
rename PvfExecTimeoutKind to PvfExecKind on executor params
jpserrat Nov 6, 2023
11ed725
".git/.scripts/commands/fmt/fmt.sh"
Nov 6, 2023
ab5eb3f
rename PvfPrepKind to PvfPrepKind, remove exec_kind from validate_can…
jpserrat Nov 7, 2023
7fa3947
Merge branch 'jpserrat/backers-dont-retry-on-fail' of github.com:Jpse…
jpserrat Nov 7, 2023
364de31
close delimiter
jpserrat Nov 7, 2023
056146a
refactor candidate-validation tests
jpserrat Nov 8, 2023
d91540b
Merge branch 'master' of github.com:paritytech/polkadot-sdk into jpse…
jpserrat Nov 8, 2023
b5bd481
fix fmt
jpserrat Nov 9, 2023
769cef6
minor fix
jpserrat Nov 10, 2023
5bc8183
add backer dont retry to documentation and refactor candidate-validat…
jpserrat Nov 14, 2023
4777046
Merge branch 'master' into jpserrat/backers-dont-retry-on-fail
mrcnski Nov 14, 2023
d69ef1a
Fix tests
mrcnski Nov 14, 2023
7025c7b
fix multiple ambiguous errors test
jpserrat Nov 15, 2023
c2b69d2
Merge branch 'jpserrat/backers-dont-retry-on-fail' of github.com:Jpse…
jpserrat Nov 16, 2023
8431212
Merge branch 'master' of github.com:paritytech/polkadot-sdk into jpse…
jpserrat Nov 16, 2023
79bf9f5
change PvfPrepKind::Lenient to PvfPrepKind::Prepare
jpserrat Nov 17, 2023
7f74004
minor fix
jpserrat Nov 18, 2023
2507cc1
change doc from PfvPrepKind and PvfExecKind to function
jpserrat Nov 19, 2023
7381ce1
Update polkadot/primitives/src/v6/mod.rs
mrcnski Nov 19, 2023
70c8cbc
Merge branch 'master' into jpserrat/backers-dont-retry-on-fail
mrcnski Nov 20, 2023
852cc85
Merge remote-tracking branch 'Jpserrat/jpserrat/backers-dont-retry-on…
mrcnski Nov 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions polkadot/node/core/approval-voting/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ use polkadot_node_subsystem_util::{
};
use polkadot_primitives::{
ApprovalVote, BlockNumber, CandidateHash, CandidateIndex, CandidateReceipt, DisputeStatement,
ExecutorParams, GroupIndex, Hash, PvfExecTimeoutKind, SessionIndex, SessionInfo,
ExecutorParams, GroupIndex, Hash, PvfExecKind, SessionIndex, SessionInfo,
ValidDisputeStatementKind, ValidatorId, ValidatorIndex, ValidatorPair, ValidatorSignature,
};
use sc_keystore::LocalKeystore;
Expand Down Expand Up @@ -2867,7 +2867,7 @@ async fn launch_approval<Context>(
candidate.clone(),
available_data.pov,
executor_params,
PvfExecTimeoutKind::Approval,
PvfExecKind::Approval,
val_tx,
))
.await;
Expand Down
2 changes: 1 addition & 1 deletion polkadot/node/core/approval-voting/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2704,7 +2704,7 @@ async fn handle_double_assignment_import(

assert_matches!(
overseer_recv(virtual_overseer).await,
AllMessages::CandidateValidation(CandidateValidationMessage::ValidateFromExhaustive(_, _, _, _, _, timeout, tx)) if timeout == PvfExecTimeoutKind::Approval => {
AllMessages::CandidateValidation(CandidateValidationMessage::ValidateFromExhaustive(_, _, _, _, _, timeout, tx)) if timeout == PvfExecKind::Approval => {
tx.send(Ok(ValidationResult::Valid(Default::default(), Default::default())))
.unwrap();
}
Expand Down
4 changes: 2 additions & 2 deletions polkadot/node/core/backing/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ use polkadot_node_subsystem_util::{
use polkadot_primitives::{
BackedCandidate, CandidateCommitments, CandidateHash, CandidateReceipt,
CommittedCandidateReceipt, CoreIndex, CoreState, ExecutorParams, Hash, Id as ParaId,
PersistedValidationData, PvfExecTimeoutKind, SigningContext, ValidationCode, ValidatorId,
PersistedValidationData, PvfExecKind, SigningContext, ValidationCode, ValidatorId,
ValidatorIndex, ValidatorSignature, ValidityAttestation,
};
use sp_keystore::KeystorePtr;
Expand Down Expand Up @@ -566,7 +566,7 @@ async fn request_candidate_validation(
candidate_receipt,
pov,
executor_params,
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
tx,
))
.await;
Expand Down
20 changes: 10 additions & 10 deletions polkadot/node/core/backing/src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use polkadot_node_subsystem::{
};
use polkadot_node_subsystem_test_helpers as test_helpers;
use polkadot_primitives::{
CandidateDescriptor, GroupRotationInfo, HeadData, PersistedValidationData, PvfExecTimeoutKind,
CandidateDescriptor, GroupRotationInfo, HeadData, PersistedValidationData, PvfExecKind,
ScheduledCore, SessionIndex, LEGACY_MIN_BACKING_VOTES,
};
use sp_application_crypto::AppCrypto;
Expand Down Expand Up @@ -351,7 +351,7 @@ async fn assert_validate_from_exhaustive(
) if _pvd == *pvd &&
_validation_code == *validation_code &&
*_pov == *pov && &candidate_receipt.descriptor == candidate.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate.commitments.hash() == candidate_receipt.commitments_hash =>
{
tx.send(Ok(ValidationResult::Valid(
Expand Down Expand Up @@ -557,7 +557,7 @@ fn backing_works() {
) if _pvd == pvd &&
_validation_code == validation_code &&
*_pov == pov && &candidate_receipt.descriptor == candidate_a.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate_a_commitments_hash == candidate_receipt.commitments_hash =>
{
tx.send(Ok(
Expand Down Expand Up @@ -736,7 +736,7 @@ fn backing_works_while_validation_ongoing() {
) if _pvd == pvd &&
_validation_code == validation_code &&
*_pov == pov && &candidate_receipt.descriptor == candidate_a.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate_a_commitments_hash == candidate_receipt.commitments_hash =>
{
// we never validate the candidate. our local node
Expand Down Expand Up @@ -897,7 +897,7 @@ fn backing_misbehavior_works() {
) if _pvd == pvd &&
_validation_code == validation_code &&
*_pov == pov && &candidate_receipt.descriptor == candidate_a.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate_a_commitments_hash == candidate_receipt.commitments_hash =>
{
tx.send(Ok(
Expand Down Expand Up @@ -1064,7 +1064,7 @@ fn backing_dont_second_invalid() {
) if _pvd == pvd_a &&
_validation_code == validation_code_a &&
*_pov == pov_block_a && &candidate_receipt.descriptor == candidate_a.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate_a.commitments.hash() == candidate_receipt.commitments_hash =>
{
tx.send(Ok(ValidationResult::Invalid(InvalidCandidate::BadReturn))).unwrap();
Expand Down Expand Up @@ -1104,7 +1104,7 @@ fn backing_dont_second_invalid() {
) if pvd == pvd_b &&
_validation_code == validation_code_b &&
*_pov == pov_block_b && &candidate_receipt.descriptor == candidate_b.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate_b.commitments.hash() == candidate_receipt.commitments_hash =>
{
tx.send(Ok(
Expand Down Expand Up @@ -1231,7 +1231,7 @@ fn backing_second_after_first_fails_works() {
) if _pvd == pvd &&
_validation_code == validation_code &&
*_pov == pov && &candidate_receipt.descriptor == candidate.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate.commitments.hash() == candidate_receipt.commitments_hash =>
{
tx.send(Ok(ValidationResult::Invalid(InvalidCandidate::BadReturn))).unwrap();
Expand Down Expand Up @@ -1375,7 +1375,7 @@ fn backing_works_after_failed_validation() {
) if _pvd == pvd &&
_validation_code == validation_code &&
*_pov == pov && &candidate_receipt.descriptor == candidate.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate.commitments.hash() == candidate_receipt.commitments_hash =>
{
tx.send(Err(ValidationFailed("Internal test error".into()))).unwrap();
Expand Down Expand Up @@ -1641,7 +1641,7 @@ fn retry_works() {
) if _pvd == pvd &&
_validation_code == validation_code &&
*_pov == pov && &candidate_receipt.descriptor == candidate.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate.commitments.hash() == candidate_receipt.commitments_hash
);
virtual_overseer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ async fn assert_validate_seconded_candidate(
&_validation_code == validation_code &&
&*_pov == pov &&
&candidate_receipt.descriptor == candidate.descriptor() &&
timeout == PvfExecTimeoutKind::Backing &&
timeout == PvfExecKind::Backing &&
candidate.commitments.hash() == candidate_receipt.commitments_hash =>
{
tx.send(Ok(ValidationResult::Valid(
Expand Down
67 changes: 39 additions & 28 deletions polkadot/node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ use polkadot_parachain_primitives::primitives::{
use polkadot_primitives::{
executor_params::{
DEFAULT_APPROVAL_EXECUTION_TIMEOUT, DEFAULT_BACKING_EXECUTION_TIMEOUT,
DEFAULT_LENIENT_PREPARATION_TIMEOUT, DEFAULT_PRECHECK_PREPARATION_TIMEOUT,
DEFAULT_LENIENT_PREPARATION_TIMEOUT, DEFAULT_PRECHECK_PREPARATION_TIMEOUT, self,
},
CandidateCommitments, CandidateDescriptor, CandidateReceipt, ExecutorParams, Hash,
OccupiedCoreAssumption, PersistedValidationData, PvfExecTimeoutKind, PvfPrepTimeoutKind,
OccupiedCoreAssumption, PersistedValidationData, PvfExecKind, PvfPrepKind,
ValidationCode, ValidationCodeHash,
};

Expand Down Expand Up @@ -354,7 +354,7 @@ where
return PreCheckOutcome::Invalid
};

let timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Precheck);
let timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Precheck);

let pvf = match sp_maybe_compressed_blob::decompress(
&validation_code.0,
Expand Down Expand Up @@ -498,7 +498,7 @@ async fn validate_from_chain_state<Sender>(
candidate_receipt: CandidateReceipt,
pov: Arc<PoV>,
executor_params: ExecutorParams,
exec_timeout_kind: PvfExecTimeoutKind,
exec_kind: PvfExecKind,
metrics: &Metrics,
) -> Result<ValidationResult, ValidationFailed>
where
Expand All @@ -518,7 +518,7 @@ where
candidate_receipt.clone(),
pov,
executor_params,
exec_timeout_kind,
exec_kind,
metrics,
)
.await;
Expand Down Expand Up @@ -554,7 +554,7 @@ async fn validate_candidate_exhaustive(
candidate_receipt: CandidateReceipt,
pov: Arc<PoV>,
executor_params: ExecutorParams,
exec_timeout_kind: PvfExecTimeoutKind,
exec_kind: PvfExecKind,
metrics: &Metrics,
) -> Result<ValidationResult, ValidationFailed> {
let _timer = metrics.time_validate_candidate_exhaustive();
Expand Down Expand Up @@ -613,15 +613,30 @@ async fn validate_candidate_exhaustive(
relay_parent_storage_root: persisted_validation_data.relay_parent_storage_root,
};

let result = validation_backend
.validate_candidate_with_retry(
raw_validation_code.to_vec(),
pvf_exec_timeout(&executor_params, exec_timeout_kind),
exec_timeout_kind,
params,
executor_params,
)
.await;
let result = match exec_kind {
PvfExecKind::Backing => {
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Lenient);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since it's not a timeout anymore, I think PvfPrepKind::Prepare makes more sense than ::Lenient.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to double-check, do you mean PvfPrepKind::Precheck, right?

Copy link
Contributor

@mrcnski mrcnski Nov 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, regular preparation is more "lenient" than prechecking, which is more strict to filter out problematic PVFs. But calling regular preparation lenient doesn't match how we refer to it in the rest of the codebase. Thanks for double-checking. 👍

let exec_timeout = pvf_exec_timeout(&executor_params, exec_kind);
let pvf = PvfPrepData::from_code(
raw_validation_code.to_vec(),
executor_params,
prep_timeout,
PrepareJobKind::Compilation,
);

validation_backend.validate_candidate(pvf, exec_timeout, params.encode()).await
},
PvfExecKind::Approval =>
validation_backend
.validate_candidate_with_retry(
raw_validation_code.to_vec(),
pvf_exec_timeout(&executor_params, exec_kind),
params,
executor_params,
)
.await
}
};

if let Err(ref error) = result {
gum::info!(target: LOG_TARGET, ?para_id, ?error, "Failed to validate candidate");
Expand Down Expand Up @@ -701,7 +716,7 @@ trait ValidationBackend {
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError>;

/// Tries executing a PVF. Will retry once if an error is encountered that may have been
/// Tries executing a PVF for the approval subsystem. Will retry once if an error is encountered that may have been
/// transient.
///
/// NOTE: Should retry only on errors that are a result of execution itself, and not of
Expand All @@ -710,11 +725,10 @@ trait ValidationBackend {
&mut self,
raw_validation_code: Vec<u8>,
exec_timeout: Duration,
exec_timeout_kind: PvfExecTimeoutKind,
params: ValidationParams,
executor_params: ExecutorParams,
) -> Result<WasmValidationResult, ValidationError> {
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Lenient);
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Lenient);
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let pvf = PvfPrepData::from_code(
raw_validation_code,
Expand All @@ -732,10 +746,7 @@ trait ValidationBackend {
return validation_result
}

let retry_delay = match exec_timeout_kind {
PvfExecTimeoutKind::Backing => PVF_BACKING_EXECUTION_RETRY_DELAY,
PvfExecTimeoutKind::Approval => PVF_APPROVAL_EXECUTION_RETRY_DELAY,
};
let retry_delay = PVF_APPROVAL_EXECUTION_RETRY_DELAY;
mrcnski marked this conversation as resolved.
Show resolved Hide resolved

// Allow limited retries for each kind of error.
let mut num_internal_retries_left = 1;
Expand Down Expand Up @@ -858,22 +869,22 @@ fn perform_basic_checks(
Ok(())
}

fn pvf_prep_timeout(executor_params: &ExecutorParams, kind: PvfPrepTimeoutKind) -> Duration {
fn pvf_prep_timeout(executor_params: &ExecutorParams, kind: PvfPrepKind) -> Duration {
if let Some(timeout) = executor_params.pvf_prep_timeout(kind) {
return timeout
}
match kind {
PvfPrepTimeoutKind::Precheck => DEFAULT_PRECHECK_PREPARATION_TIMEOUT,
PvfPrepTimeoutKind::Lenient => DEFAULT_LENIENT_PREPARATION_TIMEOUT,
PvfPrepKind::Precheck => DEFAULT_PRECHECK_PREPARATION_TIMEOUT,
PvfPrepKind::Lenient => DEFAULT_LENIENT_PREPARATION_TIMEOUT,
}
}

fn pvf_exec_timeout(executor_params: &ExecutorParams, kind: PvfExecTimeoutKind) -> Duration {
fn pvf_exec_timeout(executor_params: &ExecutorParams, kind: PvfExecKind) -> Duration {
if let Some(timeout) = executor_params.pvf_exec_timeout(kind) {
return timeout
}
match kind {
PvfExecTimeoutKind::Backing => DEFAULT_BACKING_EXECUTION_TIMEOUT,
PvfExecTimeoutKind::Approval => DEFAULT_APPROVAL_EXECUTION_TIMEOUT,
PvfExecKind::Backing => DEFAULT_BACKING_EXECUTION_TIMEOUT,
PvfExecKind::Approval => DEFAULT_APPROVAL_EXECUTION_TIMEOUT,
}
}
Loading
Loading