Skip to content

Commit

Permalink
fix(dre): Fix for the HostOS rollout in groups (#432)
Browse files Browse the repository at this point in the history
The HostOS rollout did not have the local registry correctly populated so it couldn't calculate the rollout strategy.
  • Loading branch information
sasa-tomic authored May 28, 2024
1 parent e9d875b commit cd61ab0
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 28 deletions.
20 changes: 20 additions & 0 deletions rs/cli/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,16 @@ pub mod hostos {
All,
}

impl std::fmt::Display for NodeOwner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
NodeOwner::Dfinity => write!(f, "DFINITY"),
NodeOwner::Others => write!(f, "External"),
NodeOwner::All => write!(f, "DFINITY+External"),
}
}
}

#[derive(ValueEnum, Copy, Clone, Debug, Ord, Eq, PartialEq, PartialOrd, Default)]
pub enum NodeAssignment {
Unassigned,
Expand All @@ -340,6 +350,16 @@ pub mod hostos {
All,
}

impl std::fmt::Display for NodeAssignment {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
NodeAssignment::Unassigned => write!(f, "Unassigned"),
NodeAssignment::Assigned => write!(f, "In Subnet"),
NodeAssignment::All => write!(f, "In Subnet+Unassigned"),
}
}
}

use super::*;
use clap::ValueEnum;
use ic_base_types::PrincipalId;
Expand Down
74 changes: 50 additions & 24 deletions rs/cli/src/operations/hostos_rollout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pub struct HostosRolloutSubnetAffected {
pub subnet_size: usize,
}

#[derive(Eq, PartialEq)]
#[derive(Eq, PartialEq, Debug)]
pub enum HostosRolloutReason {
NoNodeHealthy,
NoNodeWithoutProposal,
Expand Down Expand Up @@ -275,23 +275,7 @@ impl HostosRollout {
info!("All valid nodes in the subnet: {} have been updated", subnet_id);
None
} else {
if nodes.len() < nodes_to_take {
debug!(
"Updating all valid nodes ({}) left in the subnet: {}\n\
{}% of all nodes in the subnet",
nodes.len(),
subnet_id,
actual_percent
);
} else {
debug!(
"Updating {} valid nodes in the subnet: {}\n\
{}% of all nodes in the subnet",
nodes.len(),
subnet_id,
actual_percent
);
}
info!("Updating {} nodes ({}%) in the subnet {}", nodes.len(), actual_percent, subnet_id,);
Some(nodes)
}
})
Expand Down Expand Up @@ -385,7 +369,22 @@ impl HostosRollout {
nodes_with_open_proposals: Vec<UpdateNodesHostosVersionsProposal>,
update_group: NodeGroupUpdate,
) -> anyhow::Result<HostosRolloutResponse> {
info!("CANDIDATES SELECTION FOR {:?}", &update_group);
info!(
"CANDIDATES SELECTION FROM {} HEALTHY NODES FOR {} {} {}",
nodes_health
.iter()
.filter_map(|(principal, status)| {
if *status == Status::Healthy {
Some(principal)
} else {
None
}
})
.count(),
update_group.maybe_number_nodes.unwrap_or_default(),
update_group.node_group.owner,
update_group.node_group.assignment
);

match update_group.node_group.assignment {
NodeAssignment::Unassigned => {
Expand All @@ -398,13 +397,18 @@ impl HostosRollout {
CandidatesSelection::Ok(candidates_unassigned) => {
let nodes_to_take = update_group.nodes_to_take(unassigned_nodes.len());
let nodes_to_update = candidates_unassigned.into_iter().take(nodes_to_take).collect::<Vec<_>>();
info!("{} candidate nodes selected for: {}", nodes_to_update.len(), update_group.node_group);
Ok(HostosRolloutResponse::Ok(nodes_to_update, None))
}
CandidatesSelection::None(reason) => Ok(HostosRolloutResponse::None(vec![(update_group.node_group, reason)])),
CandidatesSelection::None(reason) => {
info!("No candidate nodes selected for: {} ==> {:?}", update_group.node_group, reason);
Ok(HostosRolloutResponse::None(vec![(update_group.node_group, reason)]))
}
}
}
NodeAssignment::Assigned => {
let assigned_nodes = self.filter_nodes_in_group(update_group).await?;
info!("{} candidate nodes selected for: {}", assigned_nodes.len(), update_group.node_group);

match self
.candidates_selection(nodes_health, nodes_with_open_proposals, assigned_nodes.clone())
Expand All @@ -428,9 +432,13 @@ impl HostosRollout {
})
.collect::<Vec<HostosRolloutSubnetAffected>>();

info!("{} candidate nodes selected for: {}", nodes_to_update.len(), update_group.node_group);
Ok(HostosRolloutResponse::Ok(nodes_to_update, Some(subnets_affected)))
}
CandidatesSelection::None(reason) => Ok(HostosRolloutResponse::None(vec![(update_group.node_group, reason)])),
CandidatesSelection::None(reason) => {
info!("No candidate nodes selected for: {} ==> {:?}", update_group.node_group, reason);
Ok(HostosRolloutResponse::None(vec![(update_group.node_group, reason)]))
}
}
}
NodeAssignment::All => {
Expand All @@ -449,14 +457,32 @@ impl HostosRollout {
)
.await
.map(|response| match response {
(Ok(assigned_nodes, subnet_affected), None(_)) => Ok(assigned_nodes, subnet_affected),
(None(_), Ok(unassigned_nodes, _)) => Ok(unassigned_nodes, Option::None),
(Ok(assigned_nodes, subnet_affected), None(reason)) => {
info!("No unassigned nodes selected for: {:?} ==> {:?}", update_group.node_group, reason);
Ok(assigned_nodes, subnet_affected)
}
(None(reason), Ok(unassigned_nodes, _)) => {
info!("No assigned nodes selected for: {:?} ==> {:?}", update_group.node_group, reason);
Ok(unassigned_nodes, Option::None)
}

(Ok(assigned_nodes, subnet_affected), Ok(unassigned_nodes, _)) => {
info!(
"{} assigned nodes and {} unassigned nodes selected for: {}",
assigned_nodes.len(),
unassigned_nodes.len(),
update_group.node_group
);
Ok(assigned_nodes.into_iter().chain(unassigned_nodes).collect(), subnet_affected.clone())
}

(None(assigned_reason), None(unassigned_reason)) => None(assigned_reason.into_iter().chain(unassigned_reason).collect()),
(None(assigned_reason), None(unassigned_reason)) => {
info!(
"No candidate nodes selected for: {:?} ==> {:?} {:?}",
update_group.node_group, assigned_reason, unassigned_reason
);
None(assigned_reason.into_iter().chain(unassigned_reason).collect())
}
})
}
}
Expand Down
12 changes: 8 additions & 4 deletions rs/cli/src/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ impl Runner {
pub fn as_automation(self) -> Self {
Self {
ic_admin: self.ic_admin.as_automation(),
dashboard_backend_client: self.dashboard_backend_client,
registry: self.registry,
..self
}
}

Expand Down Expand Up @@ -182,12 +181,17 @@ impl Runner {
let backend_url = format!("http://localhost:{}/", backend_port);

let dashboard_backend_client = DashboardBackendClient::new_with_backend_url(backend_url);
let mut registry = registry::RegistryState::new(network, true).await;
let node_providers = query_ic_dashboard_list::<NodeProvidersResponse>("v3/node-providers")
.await?
.node_providers;
registry.update_node_details(&node_providers).await?;

Ok(Self {
ic_admin,
dashboard_backend_client,
// TODO: Remove once DREL-118 completed.
// Fake registry that is not used, but some methods still rely on backend.
registry: registry::RegistryState::new(network, true).await,
registry,
})
}

Expand Down

0 comments on commit cd61ab0

Please sign in to comment.