ucp/wireup: account for the remote latency overhead when scoring transports

What the patch does:
- ucp_address_pack_iface_attr() stores iface_attr->latency.overhead in the
  packed interface attributes, and ucp_address_unpack_iface_attr() restores it
  into ucp_address_iface_attr::lat_ovh.
- ucp_worker_init_device_atomics() sets lat_ovh to 0 in its dummy remote
  interface attributes.
- A new helper, ucp_wireup_tl_iface_latency(), returns the larger of the local
  and remote latency overheads plus the local latency growth multiplied by
  context->config.est_num_eps. The rma, amo, am, rndv and aux score functions
  call it in place of ucp_tl_iface_latency().
- The rndv score divides the test message size by the smaller of the local and
  remote bandwidths instead of the local bandwidth alone.
- The pack and unpack traces also print the latency overhead and the device
  priority.

Review changes relative to the submitted patch:
- ucp_address_packed_iface_attr_t::lat_ovh is a float, like the overhead and
  bandwidth fields next to it, so the packed attributes grow by sizeof(float)
  rather than sizeof(double). The unpacked ucp_address_iface_attr::lat_ovh
  stays a double.
- The pack trace prints "lat_ovh %e" without a colon, matching the other
  labels in the same message and the unpack trace.

NOTE(review): this text was rebuilt from a flattened copy of the patch. The
indentation is conventional rather than original, column alignment is not
restored, and any blank context lines are not shown, so the line counts in the
@@ headers can exceed the lines listed under them. The index hashes are those
of the submitted patch. Regenerate the diff from a source tree before applying.

diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c
index 95297d7064f..f8ff31cf73c 100644
--- a/src/ucp/core/ucp_worker.c
+++ b/src/ucp/core/ucp_worker.c
@@ -360,6 +360,7 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker)
     dummy_iface_attr.cap_flags = -1;
     dummy_iface_attr.overhead = 0;
     dummy_iface_attr.priority = 0;
+    dummy_iface_attr.lat_ovh = 0;
     supp_tls = 0;
     best_score = -1;
diff --git a/src/ucp/wireup/address.c b/src/ucp/wireup/address.c
index 308ad9f813b..d828a471af6 100644
--- a/src/ucp/wireup/address.c
+++ b/src/ucp/wireup/address.c
@@ -47,6 +47,7 @@ typedef struct {
 typedef struct {
     float overhead;
     float bandwidth;
+    float lat_ovh; /* latency overhead; float like overhead/bandwidth to keep the address small */
     uint32_t prio_cap_flags; /* 8 lsb: prio, 24 msb - cap flags */
 } ucp_address_packed_iface_attr_t;
@@ -234,6 +235,7 @@ static void ucp_address_pack_iface_attr(ucp_address_packed_iface_attr_t *packed,
     packed->prio_cap_flags = ((uint8_t)iface_attr->priority);
     packed->overhead = iface_attr->overhead;
     packed->bandwidth = iface_attr->bandwidth;
+    packed->lat_ovh = iface_attr->latency.overhead;
     /* Keep only the bits defined by UCP_ADDRESS_IFACE_FLAGS, to shrink address. */
     packed_flag = UCS_BIT(8);
@@ -260,6 +262,7 @@ ucp_address_unpack_iface_attr(ucp_address_iface_attr_t *iface_attr,
     iface_attr->priority = packed->prio_cap_flags & UCS_MASK(8);
     iface_attr->overhead = packed->overhead;
     iface_attr->bandwidth = packed->bandwidth;
+    iface_attr->lat_ovh = packed->lat_ovh;
     packed_flag = UCS_BIT(8);
     bit = 1;
@@ -387,12 +390,15 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
             ucs_trace("pack addr[%d] : "UCT_TL_RESOURCE_DESC_FMT
-                      " md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e ",
+                      " md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e "
+                      "lat_ovh %e dev_priority %d",
                       index,
                       UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[i].tl_rsc),
                       md_flags, worker->iface_attrs[i].cap.flags,
                       worker->iface_attrs[i].bandwidth,
-                      worker->iface_attrs[i].overhead);
+                      worker->iface_attrs[i].overhead,
+                      worker->iface_attrs[i].latency.overhead,
+                      worker->iface_attrs[i].priority);
             ++index;
         }
     }
@@ -568,10 +574,13 @@ ucs_status_t ucp_address_unpack(const void *buffer, uint64_t *remote_uuid_p,
         address->tl_addr = (tl_addr_len > 0) ? ptr : NULL;
         address->tl_addr_len = tl_addr_len;
-        ucs_trace("unpack addr[%d] : md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e ",
+        ucs_trace("unpack addr[%d] : md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e "
+                  "lat_ovh %e dev_priority %d",
                   (int)(address - address_list), address->md_flags,
                   address->iface_attr.cap_flags,
-                  address->iface_attr.bandwidth, address->iface_attr.overhead);
+                  address->iface_attr.bandwidth, address->iface_attr.overhead,
+                  address->iface_attr.lat_ovh,
+                  address->iface_attr.priority);
         ++address;
         ptr += tl_addr_len;
diff --git a/src/ucp/wireup/address.h b/src/ucp/wireup/address.h
index bcd24447cf2..b0aa9371171 100644
--- a/src/ucp/wireup/address.h
+++ b/src/ucp/wireup/address.h
@@ -42,6 +42,7 @@ struct ucp_address_iface_attr {
     double overhead; /* Interface performance - overhead */
     double bandwidth; /* Interface performance - bandwidth */
     int priority; /* Priority of device */
+    double lat_ovh; /* Latency overhead */
 };
diff --git a/src/ucp/wireup/select.c b/src/ucp/wireup/select.c
index eef9070fd9e..43098e623bf 100644
--- a/src/ucp/wireup/select.c
+++ b/src/ucp/wireup/select.c
@@ -291,6 +291,14 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list
     return UCS_OK;
 }
+static inline double ucp_wireup_tl_iface_latency(ucp_context_h context,
+                                                 const uct_iface_attr_t *iface_attr,
+                                                 const ucp_address_iface_attr_t *remote_iface_attr)
+{
+    return ucs_max(iface_attr->latency.overhead, remote_iface_attr->lat_ovh) +
+           (iface_attr->latency.growth * context->config.est_num_eps);
+}
+
 static UCS_F_NOINLINE void ucp_wireup_add_lane_desc(ucp_wireup_lane_desc_t *lane_descs,
                                                     ucp_lane_index_t *num_lanes_p,
                                                     ucp_rsc_index_t rsc_index,
@@ -452,7 +460,8 @@ static double ucp_wireup_rma_score_func(ucp_context_h context,
                                         const ucp_address_iface_attr_t *remote_iface_attr)
 {
     /* best for 4k messages */
-    return 1e-3 / (ucp_tl_iface_latency(context, iface_attr) + iface_attr->overhead +
+    return 1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
+                   iface_attr->overhead +
                    (4096.0 / ucs_min(iface_attr->bandwidth, remote_iface_attr->bandwidth)));
 }
@@ -488,7 +497,8 @@ double ucp_wireup_amo_score_func(ucp_context_h context,
                                  const ucp_address_iface_attr_t *remote_iface_attr)
 {
     /* best one-sided latency */
-    return 1e-3 / (ucp_tl_iface_latency(context, iface_attr) + iface_attr->overhead);
+    return 1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
+                   iface_attr->overhead);
 }
 static ucs_status_t ucp_wireup_add_amo_lanes(ucp_ep_h ep, unsigned address_count,
@@ -536,7 +546,7 @@ static double ucp_wireup_am_score_func(ucp_context_h context,
                                        const ucp_address_iface_attr_t *remote_iface_attr)
 {
     /* best end-to-end latency */
-    return 1e-3 / (ucp_tl_iface_latency(context, iface_attr) +
+    return 1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
                    iface_attr->overhead + remote_iface_attr->overhead);
 }
@@ -549,8 +559,9 @@ static double ucp_wireup_rndv_score_func(ucp_context_h context,
      * a size which is likely to be used with the Rendezvous protocol, for
      * how long it would take to transfer it with a certain transport.
      */
-    return 1 / ((UCP_WIREUP_RNDV_TEST_MSG_SIZE / iface_attr->bandwidth) +
-                ucp_tl_iface_latency(context, iface_attr) +
+    return 1 / ((UCP_WIREUP_RNDV_TEST_MSG_SIZE /
+                 ucs_min(iface_attr->bandwidth, remote_iface_attr->bandwidth)) +
+                ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
                 iface_attr->overhead + md_attr->reg_cost.overhead +
                 (UCP_WIREUP_RNDV_TEST_MSG_SIZE * md_attr->reg_cost.growth));
 }
@@ -801,8 +812,8 @@ static double ucp_wireup_aux_score_func(ucp_context_h context,
                                         const ucp_address_iface_attr_t *remote_iface_attr)
 {
     /* best end-to-end latency and larger bcopy size */
-    return (1e-3 / (ucp_tl_iface_latency(context, iface_attr) + iface_attr->overhead +
-                    remote_iface_attr->overhead)) +
+    return (1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
+                    iface_attr->overhead + remote_iface_attr->overhead)) +
            (1e3 * ucs_max(iface_attr->cap.am.max_bcopy, iface_attr->cap.am.max_short));
 }