Skip to content

Commit

Permalink
UCP: Add the latency.overhead to the passed address.
Browse files Browse the repository at this point in the history
- Add the latency.overhead to the passed address so that each rank can
  see the same values when selecting a lane - since this value may be
  different for different ranks.

- Consider the remote peer's bandwidth in the rndv score function - this
  will allow support for cases where different ranks have different
  speeds on their HCAs - heterogeneous fabric.

- Enhance the logging for pack/unpack address - include the priority of
  the device and the latency overhead.

fixes openucx#1534
  • Loading branch information
alinask committed May 29, 2017
1 parent b972dd3 commit eb7fd1b
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 12 deletions.
1 change: 1 addition & 0 deletions src/ucp/core/ucp_worker.c
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker)
dummy_iface_attr.cap_flags = -1;
dummy_iface_attr.overhead = 0;
dummy_iface_attr.priority = 0;
dummy_iface_attr.lat_ovh = 0;

supp_tls = 0;
best_score = -1;
Expand Down
2 changes: 1 addition & 1 deletion src/ucp/tag/tag_send.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ static ucs_status_t ucp_tag_req_start(ucp_request_t *req, size_t count,
flag_iov_single = (count <= config->tag.eager.max_iov);

if (!flag_iov_single && ucp_ep_is_tag_offload_enabled(config)) {
/* Make sure SW RNDV will be used, becasuse tag offload does
/* Make sure SW RNDV will be used, because tag offload does
* not support multi-packet eager protocols. */
force_sw_rndv = 1;
}
Expand Down
17 changes: 13 additions & 4 deletions src/ucp/wireup/address.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ typedef struct {
typedef struct {
float overhead;
float bandwidth;
double lat_ovh;
uint32_t prio_cap_flags; /* 8 lsb: prio, 24 msb - cap flags */
} ucp_address_packed_iface_attr_t;

Expand Down Expand Up @@ -234,6 +235,7 @@ static void ucp_address_pack_iface_attr(ucp_address_packed_iface_attr_t *packed,
packed->prio_cap_flags = ((uint8_t)iface_attr->priority);
packed->overhead = iface_attr->overhead;
packed->bandwidth = iface_attr->bandwidth;
packed->lat_ovh = iface_attr->latency.overhead;

/* Keep only the bits defined by UCP_ADDRESS_IFACE_FLAGS, to shrink address. */
packed_flag = UCS_BIT(8);
Expand All @@ -260,6 +262,7 @@ ucp_address_unpack_iface_attr(ucp_address_iface_attr_t *iface_attr,
iface_attr->priority = packed->prio_cap_flags & UCS_MASK(8);
iface_attr->overhead = packed->overhead;
iface_attr->bandwidth = packed->bandwidth;
iface_attr->lat_ovh = packed->lat_ovh;

packed_flag = UCS_BIT(8);
bit = 1;
Expand Down Expand Up @@ -387,12 +390,15 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,


ucs_trace("pack addr[%d] : "UCT_TL_RESOURCE_DESC_FMT
" md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e ",
" md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e "
"lat_ovh: %e dev_priority %d",
index,
UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[i].tl_rsc),
md_flags, worker->ifaces[i].attr.cap.flags,
worker->ifaces[i].attr.bandwidth,
worker->ifaces[i].attr.overhead);
worker->ifaces[i].attr.overhead,
worker->ifaces[i].attr.latency.overhead,
worker->ifaces[i].attr.priority);
++index;
}
}
Expand Down Expand Up @@ -568,10 +574,13 @@ ucs_status_t ucp_address_unpack(const void *buffer, uint64_t *remote_uuid_p,
address->tl_addr = (tl_addr_len > 0) ? ptr : NULL;
address->tl_addr_len = tl_addr_len;

ucs_trace("unpack addr[%d] : md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e ",
ucs_trace("unpack addr[%d] : md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e "
"lat_ovh %e dev_priority %d",
(int)(address - address_list),
address->md_flags, address->iface_attr.cap_flags,
address->iface_attr.bandwidth, address->iface_attr.overhead);
address->iface_attr.bandwidth, address->iface_attr.overhead,
address->iface_attr.lat_ovh,
address->iface_attr.priority);
++address;

ptr += tl_addr_len;
Expand Down
1 change: 1 addition & 0 deletions src/ucp/wireup/address.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ struct ucp_address_iface_attr {
double overhead; /* Interface performance - overhead */
double bandwidth; /* Interface performance - bandwidth */
int priority; /* Priority of device */
double lat_ovh; /* latency overhead */
};


Expand Down
25 changes: 18 additions & 7 deletions src/ucp/wireup/select.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,14 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list
return UCS_OK;
}

/* Estimate the effective latency of a transport between the local and the
 * remote peer. Takes the worst (max) of the local and the remote constant
 * latency overheads — presumably so both ranks compute the same value when
 * independently selecting a lane — plus the local latency growth term scaled
 * by the estimated number of endpoints (NOTE(review): growth/est_num_eps
 * semantics assumed from the expression; confirm against uct_iface_attr_t docs). */
static inline double ucp_wireup_tl_iface_latency(ucp_context_h context,
const uct_iface_attr_t *iface_attr,
const ucp_address_iface_attr_t *remote_iface_attr)
{
return ucs_max(iface_attr->latency.overhead, remote_iface_attr->lat_ovh) +
(iface_attr->latency.growth * context->config.est_num_eps);
}

static UCS_F_NOINLINE void
ucp_wireup_add_lane_desc(ucp_wireup_lane_desc_t *lane_descs,
ucp_lane_index_t *num_lanes_p, ucp_rsc_index_t rsc_index,
Expand Down Expand Up @@ -459,7 +467,8 @@ static double ucp_wireup_rma_score_func(ucp_context_h context,
const ucp_address_iface_attr_t *remote_iface_attr)
{
/* best for 4k messages */
return 1e-3 / (ucp_tl_iface_latency(context, iface_attr) + iface_attr->overhead +
return 1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
iface_attr->overhead +
(4096.0 / ucs_min(iface_attr->bandwidth, remote_iface_attr->bandwidth)));
}

Expand Down Expand Up @@ -495,7 +504,8 @@ double ucp_wireup_amo_score_func(ucp_context_h context,
const ucp_address_iface_attr_t *remote_iface_attr)
{
/* best one-sided latency */
return 1e-3 / (ucp_tl_iface_latency(context, iface_attr) + iface_attr->overhead);
return 1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
iface_attr->overhead);
}

static ucs_status_t ucp_wireup_add_amo_lanes(ucp_ep_h ep, unsigned address_count,
Expand Down Expand Up @@ -543,7 +553,7 @@ static double ucp_wireup_am_score_func(ucp_context_h context,
const ucp_address_iface_attr_t *remote_iface_attr)
{
/* best end-to-end latency */
return 1e-3 / (ucp_tl_iface_latency(context, iface_attr) +
return 1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
iface_attr->overhead + remote_iface_attr->overhead);
}

Expand All @@ -556,8 +566,9 @@ static double ucp_wireup_rndv_score_func(ucp_context_h context,
* a size which is likely to be used with the Rendezvous protocol, for
* how long it would take to transfer it with a certain transport. */

return 1 / ((UCP_WIREUP_RNDV_TEST_MSG_SIZE / iface_attr->bandwidth) +
ucp_tl_iface_latency(context, iface_attr) +
return 1 / ((UCP_WIREUP_RNDV_TEST_MSG_SIZE /
ucs_min(iface_attr->bandwidth, remote_iface_attr->bandwidth)) +
ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
iface_attr->overhead + md_attr->reg_cost.overhead +
(UCP_WIREUP_RNDV_TEST_MSG_SIZE * md_attr->reg_cost.growth));
}
Expand Down Expand Up @@ -861,8 +872,8 @@ static double ucp_wireup_aux_score_func(ucp_context_h context,
const ucp_address_iface_attr_t *remote_iface_attr)
{
/* best end-to-end latency and larger bcopy size */
return (1e-3 / (ucp_tl_iface_latency(context, iface_attr) + iface_attr->overhead +
remote_iface_attr->overhead)) +
return (1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
iface_attr->overhead + remote_iface_attr->overhead)) +
(1e3 * ucs_max(iface_attr->cap.am.max_bcopy, iface_attr->cap.am.max_short));
}

Expand Down

0 comments on commit eb7fd1b

Please sign in to comment.