Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

approval-voting: Add no shows debug information #4726

Merged
merged 3 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 31 additions & 6 deletions polkadot/node/core/approval-voting/src/approval_checking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use polkadot_primitives::ValidatorIndex;
use crate::{
persisted_entries::{ApprovalEntry, CandidateEntry, TrancheEntry},
time::Tick,
MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE,
};

/// Result of counting the necessary tranches needed for approving a block.
Expand All @@ -32,6 +33,7 @@ pub struct TranchesToApproveResult {
pub required_tranches: RequiredTranches,
/// The total number of no_shows at the moment we are doing the counting.
pub total_observed_no_shows: usize,
pub no_show_validators: Vec<ValidatorIndex>,
}

/// The required tranches of assignments needed to determine whether a candidate is approved.
Expand Down Expand Up @@ -188,6 +190,8 @@ struct State {
/// The last tick at which a considered assignment was received.
last_assignment_tick: Option<Tick>,
total_observed_no_shows: usize,
// The validator's index that are no-shows.
no_show_validators: Vec<ValidatorIndex>,
}

impl State {
Expand All @@ -203,6 +207,7 @@ impl State {
return TranchesToApproveResult {
required_tranches: RequiredTranches::All,
total_observed_no_shows: self.total_observed_no_shows,
no_show_validators: self.no_show_validators.clone(),
}
}

Expand All @@ -217,6 +222,7 @@ impl State {
last_assignment_tick: self.last_assignment_tick,
},
total_observed_no_shows: self.total_observed_no_shows,
no_show_validators: self.no_show_validators.clone(),
}
}

Expand All @@ -234,6 +240,7 @@ impl State {
clock_drift,
},
total_observed_no_shows: self.total_observed_no_shows,
no_show_validators: self.no_show_validators.clone(),
}
} else {
TranchesToApproveResult {
Expand All @@ -244,6 +251,7 @@ impl State {
clock_drift,
},
total_observed_no_shows: self.total_observed_no_shows,
no_show_validators: self.no_show_validators.clone(),
}
}
}
Expand All @@ -253,11 +261,12 @@ impl State {
}

fn advance(
&self,
mut self,
new_assignments: usize,
new_no_shows: usize,
next_no_show: Option<Tick>,
last_assignment_tick: Option<Tick>,
no_show_validators: Vec<ValidatorIndex>,
) -> State {
let new_covered = if self.depth == 0 {
new_assignments
Expand Down Expand Up @@ -290,6 +299,17 @@ impl State {
(self.depth, covering, uncovered)
};

// Make sure we don't store too many no-show validators, since this metric
// is valuable if there are just a few of them to identify the problematic
// validators.
// If there are a lot then we've got bigger problems and no need to make this
// array unnecessarily large.
if self.no_show_validators.len() + no_show_validators.len() <
MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE
{
self.no_show_validators.extend(no_show_validators);
}

State {
assignments,
depth,
Expand All @@ -299,6 +319,7 @@ impl State {
next_no_show,
last_assignment_tick,
total_observed_no_shows: self.total_observed_no_shows + new_no_shows,
no_show_validators: self.no_show_validators,
}
}
}
Expand Down Expand Up @@ -354,8 +375,9 @@ fn count_no_shows(
block_tick: Tick,
no_show_duration: Tick,
drifted_tick_now: Tick,
) -> (usize, Option<u64>) {
) -> (usize, Option<u64>, Vec<ValidatorIndex>) {
let mut next_no_show = None;
let mut no_show_validators = Vec::new();
let no_shows = assignments
.iter()
.map(|(v_index, tick)| {
Expand All @@ -379,12 +401,14 @@ fn count_no_shows(
// the clock drift will be removed again to do the comparison above.
next_no_show = super::min_prefer_some(next_no_show, Some(no_show_at + clock_drift));
}

if is_no_show {
no_show_validators.push(*v_index);
}
is_no_show
})
.count();

(no_shows, next_no_show)
(no_shows, next_no_show, no_show_validators)
}

/// Determine the amount of tranches of assignments needed to determine approval of a candidate.
Expand All @@ -408,6 +432,7 @@ pub fn tranches_to_approve(
next_no_show: None,
last_assignment_tick: None,
total_observed_no_shows: 0,
no_show_validators: Vec::new(),
};

// The `ApprovalEntry` doesn't have any data for empty tranches. We still want to iterate over
Expand Down Expand Up @@ -446,7 +471,7 @@ pub fn tranches_to_approve(
//
// While we count the no-shows, we also determine the next possible no-show we might
// see within this tranche.
let (no_shows, next_no_show) = count_no_shows(
let (no_shows, next_no_show, no_show_validators) = count_no_shows(
assignments,
approvals,
clock_drift,
Expand All @@ -455,7 +480,7 @@ pub fn tranches_to_approve(
drifted_tick_now,
);

let s = s.advance(n_assignments, no_shows, next_no_show, last_assignment_tick);
let s = s.advance(n_assignments, no_shows, next_no_show, last_assignment_tick, no_show_validators);
let output = s.output(tranche, needed_approvals, n_validators, no_show_duration);

*state = match output.required_tranches {
Expand Down
96 changes: 85 additions & 11 deletions polkadot/node/core/approval-voting/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,7 @@ struct ApprovalStatus {
tranche_now: DelayTranche,
block_tick: Tick,
last_no_shows: usize,
no_show_validators: Vec<ValidatorIndex>,
}

#[derive(Copy, Clone)]
Expand Down Expand Up @@ -832,6 +833,49 @@ struct State {
// tranches we trigger and why.
per_block_assignments_gathering_times:
LruMap<BlockNumber, HashMap<(Hash, CandidateHash), AssignmentGatheringRecord>>,
no_show_stats: NoShowStats,
}

// Regularly dump the no-show stats at this block number frequency.
const NO_SHOW_DUMP_FREQUENCY: BlockNumber = 50;
// The maximum number of validators we record no-shows for, per candidate.
pub(crate) const MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE: usize = 20;

// No show stats per validator and per parachain.
// This is valuable information when we have to debug live network issue, because
// it gives information if things are going wrong only for some validators or just
// for some parachains.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct NoShowStats {
per_validator_no_show: HashMap<SessionIndex, HashMap<ValidatorIndex, usize>>,
per_parachain_no_show: HashMap<u32, usize>,
last_dumped_block_number: BlockNumber,
}

impl NoShowStats {
// Print the no-show stats if NO_SHOW_DUMP_FREQUENCY blocks have passed since the last
// print.
fn maybe_print(&mut self, current_block_number: BlockNumber) {
if self.last_dumped_block_number > current_block_number ||
current_block_number - self.last_dumped_block_number < NO_SHOW_DUMP_FREQUENCY
{
return
}
if self.per_parachain_no_show.is_empty() && self.per_validator_no_show.is_empty() {
return
}

self.last_dumped_block_number = current_block_number;

gum::debug!(
target: LOG_TARGET,
"Validators with no_show {:?} and parachains with no_shows {:?}",
ordian marked this conversation as resolved.
Show resolved Hide resolved
self.per_validator_no_show,
self.per_parachain_no_show,
);
self.per_validator_no_show.clear();
self.per_parachain_no_show.clear();
}
}

#[derive(Debug, Clone, PartialEq, Eq)]
Expand Down Expand Up @@ -887,21 +931,25 @@ impl State {
);

if let Some(approval_entry) = candidate_entry.approval_entry(&block_hash) {
let TranchesToApproveResult { required_tranches, total_observed_no_shows } =
approval_checking::tranches_to_approve(
approval_entry,
candidate_entry.approvals(),
tranche_now,
block_tick,
no_show_duration,
session_info.needed_approvals as _,
);
let TranchesToApproveResult {
required_tranches,
total_observed_no_shows,
no_show_validators,
} = approval_checking::tranches_to_approve(
approval_entry,
candidate_entry.approvals(),
tranche_now,
block_tick,
no_show_duration,
session_info.needed_approvals as _,
);

let status = ApprovalStatus {
required_tranches,
block_tick,
tranche_now,
last_no_shows: total_observed_no_shows,
no_show_validators,
};

Some((approval_entry, status))
Expand Down Expand Up @@ -1044,6 +1092,26 @@ impl State {
},
}
}

fn record_no_shows(
&mut self,
session_index: SessionIndex,
para_id: u32,
no_show_validators: &Vec<ValidatorIndex>,
) {
if !no_show_validators.is_empty() {
*self.no_show_stats.per_parachain_no_show.entry(para_id.into()).or_default() += 1;
}
for validator_index in no_show_validators {
*self
.no_show_stats
.per_validator_no_show
.entry(session_index)
.or_default()
.entry(*validator_index)
.or_default() += 1;
}
}
}

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -1096,6 +1164,7 @@ where
per_block_assignments_gathering_times: LruMap::new(ByLength::new(
MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
)),
no_show_stats: NoShowStats::default(),
};

// `None` on start-up. Gets initialized/updated on leaf update
Expand Down Expand Up @@ -1740,6 +1809,8 @@ async fn handle_from_overseer<Context>(
"Imported new block.",
);

state.no_show_stats.maybe_print(block_batch.block_number);

for (c_hash, c_entry) in block_batch.imported_candidates {
metrics.on_candidate_imported();

Expand Down Expand Up @@ -2904,7 +2975,8 @@ where
let mut actions = Vec::new();
let block_hash = block_entry.block_hash();
let block_number = block_entry.block_number();

let session_index = block_entry.session();
let para_id = candidate_entry.candidate_receipt().descriptor().para_id;
let tick_now = state.clock.tick_now();

let (is_approved, status) = if let Some((approval_entry, status)) = state
Expand Down Expand Up @@ -3000,7 +3072,9 @@ where
if is_approved {
approval_entry.mark_approved();
}

if newly_approved {
state.record_no_shows(session_index, para_id.into(), &status.no_show_validators);
}
actions.extend(schedule_wakeup_action(
&approval_entry,
block_hash,
Expand Down
Loading