diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index 427d99160f8..926389189fe 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -117,7 +117,8 @@ typedef struct uct_ib_address { * - uint16_t subnet16 * - uint64_t subnet64 * For RoCE: - * - uint8_t gid[16] + * - uint16_t udp_sport + * - uint8_t gid[16] */ } UCS_S_PACKED uct_ib_address_t; diff --git a/src/uct/ib/base/ib_iface.c b/src/uct/ib/base/ib_iface.c index d9bfcc8c522..8dc4356c407 100644 --- a/src/uct/ib/base/ib_iface.c +++ b/src/uct/ib/base/ib_iface.c @@ -256,8 +256,8 @@ size_t uct_ib_address_size(const union ibv_gid *gid, unsigned pack_flags) size_t size = sizeof(uct_ib_address_t); if (pack_flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) { - /* Ethernet: address contains only raw GID */ - return size + sizeof(union ibv_gid); + /* Ethernet: address contains only udp sport and raw GID */ + return size + sizeof(uint16_t) + sizeof(union ibv_gid); } /* InfiniBand: address always contains LID */ @@ -303,6 +303,10 @@ void uct_ib_address_pack(const union ibv_gid *gid, uint16_t lid, ib_addr->flags |= UCT_IB_ADDRESS_FLAG_ROCE_IPV6; } + /* uint16_t udp_sport (from lid); ptr may be unaligned, so memcpy */ + memcpy(ptr, &lid, sizeof(lid)); + ptr = UCS_PTR_BYTE_OFFSET(ptr, sizeof(uint16_t)); + /* uint8_t raw[16]; */ memcpy(ptr, gid->raw, sizeof(gid->raw) * sizeof(uint8_t)); return; @@ -371,6 +375,9 @@ void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, uint16_t *lid, *lid = 0; if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH) { + /* uint16_t udp_sport, returned via *lid; ptr may be unaligned */ + memcpy(lid, ptr, sizeof(*lid)); + ptr = UCS_PTR_BYTE_OFFSET(ptr, sizeof(uint16_t)); memcpy(gid->raw, ptr, sizeof(gid->raw) * sizeof(uint8_t)); /* uint8_t raw[16]; */ return; } @@ -550,8 +557,8 @@ void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, /* older drivers use dlid for udp.sport, new drivers use flow_label when its nonzero */ - ah_attr->dlid = UCT_IB_ROCE_UDP_SRC_PORT_BASE | udp_sport; - ah_attr->grh.flow_label = ~udp_sport; + ah_attr->dlid = lid; + 
ah_attr->grh.flow_label = 0; } else { /* TODO iface->path_bits should be removed and replaced by path_index */ path_bits = iface->path_bits[path_index % diff --git a/src/uct/ib/cm/cm_ep.c b/src/uct/ib/cm/cm_ep.c index a5ed56430d8..475dfd590e7 100644 --- a/src/uct/ib/cm/cm_ep.c +++ b/src/uct/ib/cm/cm_ep.c @@ -88,6 +88,8 @@ static void uct_cm_dump_path(struct ibv_sa_path_rec *path) uct_ib_gid_str(&path->dgid, dgid_buf, sizeof(dgid_buf)); uct_ib_gid_str(&path->sgid, sgid_buf, sizeof(sgid_buf)); + ucs_trace_data("flow label is %u", ntohl(path->flow_label)); + ucs_trace_data("slid %d sgid %s dlid %d dgid %s", ntohs(path->slid), sgid_buf, ntohs(path->dlid), dgid_buf); diff --git a/src/uct/ib/rdmacm/rdmacm_cm.c b/src/uct/ib/rdmacm/rdmacm_cm.c index 00603d1d411..5c98bf86c60 100644 --- a/src/uct/ib/rdmacm/rdmacm_cm.c +++ b/src/uct/ib/rdmacm/rdmacm_cm.c @@ -200,6 +200,8 @@ static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, unsigned address_pack_flags; union ibv_gid gid; int ret; + uint16_t udp_sport; + uint16_t lid; /* get the qp attributes in order to modify the qp state. 
* the ah_attr fields from them are required to extract the device address @@ -218,7 +220,8 @@ static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, ucs_error("ibv_query_port (%s) failed: %m", dev_name); return UCS_ERR_IO_ERROR; } - + ucs_debug("flow label %u", qp_attr.ah_attr.grh.flow_label); + /* Print diagnostic if gid does not match */ if (qp_attr.ah_attr.is_global && (memcmp(&cm_id->route.addr.addr.ibaddr.dgid, &qp_attr.ah_attr.grh.dgid, @@ -230,6 +233,7 @@ static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, qp_attr_gid_str, sizeof(qp_attr_gid_str))); } + udp_sport = 0; if (IBV_PORT_IS_LINK_LAYER_ETHERNET(&port_attr)) { /* Ethernet address */ ucs_assert(qp_attr.ah_attr.is_global); @@ -240,6 +244,9 @@ static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, * that the remote peer is reachable to the local one */ roce_info.ver = UCT_IB_DEVICE_ROCE_ANY; roce_info.addr_family = 0; + + udp_sport = ibv_flow_label_to_udp_sport(qp_attr.ah_attr.grh.flow_label); + ucs_debug("udp_sport %u", udp_sport); } else if (qp_attr.ah_attr.is_global) { gid = qp_attr.ah_attr.grh.dgid; address_pack_flags = UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX | @@ -266,7 +273,12 @@ static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, return UCS_ERR_NO_MEMORY; } - uct_ib_address_pack(&gid, qp_attr.ah_attr.dlid, address_pack_flags, + if (IBV_PORT_IS_LINK_LAYER_ETHERNET(&port_attr)) { + lid = udp_sport; + } else { + lid = qp_attr.ah_attr.dlid; + } + uct_ib_address_pack(&gid, lid, address_pack_flags, &roce_info, dev_addr); *dev_addr_p = (uct_device_addr_t *)dev_addr;