From 6ddb31890766c9f31a5a6dbe3123a3eaf9cdffde Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 2 May 2024 15:21:20 -0400 Subject: [PATCH 1/5] prov/cxi: Add FI_PEER capability bit Add the FI_PEER capability bit to the CXI provider fi_info Signed-off-by: Amir Shehata --- prov/cxi/include/cxip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index b959ff99eaa..6c0541f05b2 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -138,7 +138,7 @@ FI_REMOTE_COMM | FI_RMA_EVENT | FI_MULTI_RECV | FI_FENCE | FI_TRIGGER) #define CXIP_EP_CAPS (CXIP_EP_PRI_CAPS | CXIP_EP_SEC_CAPS) #define CXIP_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) -#define CXIP_CAPS (CXIP_DOM_CAPS | CXIP_EP_CAPS) +#define CXIP_CAPS (CXIP_DOM_CAPS | CXIP_EP_CAPS | FI_PEER) #define CXIP_MSG_ORDER (FI_ORDER_SAS | \ FI_ORDER_WAW | \ FI_ORDER_RMA_WAW | \ From 538e2fbda992925bfef60096897004df873ce342 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Fri, 8 Apr 2022 18:17:30 -0700 Subject: [PATCH 2/5] prov/cxi: Implement shared Completion Queues On cq_open, check the FI_PEER_IMPORT, if set, set all internal cq operation to be enosys, with the exception to the read callback. The read callback is overloaded to operate as a progress callback function. Invoking the read callback will progress the enpoints linked to this CQ. Keep track of the fid_peer_cq structure passed in. If the FI_PEER_IMPORT flag is set, then set the callbacks in cxip_cq structure which handle writing to the peer_cq, otherwise set them to the ones which write to the util_cq. A provider needs to call a different set of functions to insert completion events into an imported CQ vs an internal CQ. These set of callback definition standardize a way to assign a different function to a CQ object, which can then be called to insert into the CQ. For example: struct prov_cq { struct util_cq *util_cq; struct fid_peer_cq *peer_cq; ofi_peer_cq_cb cq_cb; }; When a provider opens a CQ it can: if (attr->flags & FI_PEER_IMPORT) { prov_cq->cq_cb.cq_comp = prov_peer_cq_comp; } else { prov_cq->cq_cb.cq_comp = prov_cq_comp; } Collect the peer CQ callbacks in one structure for use in CXI. Signed-off-by: Amir Shehata --- prov/cxi/include/cxip.h | 21 +++++++++++ prov/cxi/src/cxip_cq.c | 83 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 98 insertions(+), 6 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 6c0541f05b2..6642249e88e 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -1381,6 +1381,23 @@ struct cxip_evtq { struct dlist_entry req_list; }; +/* + * Peer CQ callbacks. + * These callback definitions can be used by providers to define generic + * callbacks which can be assigned different functions to handle completion + * for an imported cq vs an internal cq + */ +struct cxip_peer_cq_cb { + int (*cq_comp)(struct util_cq *cq, void *context, + uint64_t flags, size_t len, void *buf, uint64_t data, + uint64_t tag); + int (*cq_comp_src)(struct util_cq *cq, void *context, + uint64_t flags, size_t len, void *buf, uint64_t data, + uint64_t tag, fi_addr_t addr); + int (*cq_err)(struct util_cq *cq, + const struct fi_cq_err_entry *err_entry); +}; + /* * CXI Libfbric software completion queue */ @@ -1394,6 +1411,10 @@ struct cxip_cq { */ struct ofi_genlock ep_list_lock; + /* Peer CQ */ + struct fid_peer_cq *peer_cq; + struct cxip_peer_cq_cb cq_cb; + /* Internal CXI wait object allocated only if required. 
*/ struct cxil_wait_obj *priv_wait; diff --git a/prov/cxi/src/cxip_cq.c b/prov/cxi/src/cxip_cq.c index 675d91eeb56..f6e46f4d4dd 100644 --- a/prov/cxi/src/cxip_cq.c +++ b/prov/cxi/src/cxip_cq.c @@ -24,6 +24,46 @@ #define CXIP_DBG(...) _CXIP_DBG(FI_LOG_CQ, __VA_ARGS__) #define CXIP_WARN(...) _CXIP_WARN(FI_LOG_CQ, __VA_ARGS__) +static int cxip_peer_cq_comp(struct util_cq *cq, void *context, + uint64_t flags, size_t len, void *buf, uint64_t data, + uint64_t tag) +{ + struct cxip_cq *cxip_cq; + struct fid_peer_cq *peer_cq; + + cxip_cq = container_of(cq, struct cxip_cq, util_cq); + peer_cq = cxip_cq->peer_cq; + + return peer_cq->owner_ops->write(peer_cq, context, flags, len, + buf, data, tag, FI_ADDR_NOTAVAIL); +} + +static int cxip_peer_cq_comp_src(struct util_cq *cq, void *context, + uint64_t flags, size_t len, void *buf, uint64_t data, + uint64_t tag, fi_addr_t addr) +{ + struct cxip_cq *cxip_cq; + struct fid_peer_cq *peer_cq; + + cxip_cq = container_of(cq, struct cxip_cq, util_cq); + peer_cq = cxip_cq->peer_cq; + + return peer_cq->owner_ops->write(peer_cq, context, flags, len, + buf, data, tag, addr); +} + +static int cxip_peer_cq_err(struct util_cq *cq, + const struct fi_cq_err_entry *err_entry) +{ + struct cxip_cq *cxip_cq; + struct fid_peer_cq *peer_cq; + + cxip_cq = container_of(cq, struct cxip_cq, util_cq); + peer_cq = cxip_cq->peer_cq; + + return peer_cq->owner_ops->writeerr(peer_cq, err_entry); +} + /* * cxip_cq_req_complete() - Generate a completion event for the request. */ @@ -34,9 +74,9 @@ int cxip_cq_req_complete(struct cxip_req *req) return FI_SUCCESS; } - return ofi_cq_write(&req->cq->util_cq, (void *)req->context, - req->flags, req->data_len, (void *)req->buf, - req->data, req->tag); + return req->cq->cq_cb.cq_comp(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void *)req->buf, + req->data, req->tag); } /* @@ -50,7 +90,7 @@ int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src) return FI_SUCCESS; } - return ofi_cq_write_src(&req->cq->util_cq, (void *)req->context, + return req->cq->cq_cb.cq_comp_src(&req->cq->util_cq, (void *)req->context, req->flags, req->data_len, (void *)req->buf, req->data, req->tag, src); } @@ -94,7 +134,7 @@ int cxip_cq_req_error(struct cxip_req *req, size_t olen, err_entry.buf = (void *)(uintptr_t)req->buf; err_entry.src_addr = src_addr; - return ofi_cq_write_error(&req->cq->util_cq, &err_entry); + return req->cq->cq_cb.cq_err(&req->cq->util_cq, &err_entry); } /* @@ -316,6 +356,20 @@ static int cxip_cq_verify_attr(struct fi_cq_attr *attr) return FI_SUCCESS; } +ssize_t cxip_peer_cq_progress(struct fid_cq *cq, void *buf, size_t count) +{ + struct util_cq *util_cq; + + if (buf || count > 0) + return -FI_EINVAL; + + util_cq = container_of(cq, struct util_cq, cq_fid); + + cxip_util_cq_progress(util_cq); + + return 0; +} + /* * cxip_cq_alloc_priv_wait - Allocate an internal wait channel for the CQ. 
*/ @@ -400,7 +454,24 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, goto err_util_cq; } - cxi_cq->util_cq.cq_fid.ops->strerror = &cxip_cq_strerror; + if (attr->flags & FI_PEER) { + struct fi_peer_cq_context *cq_cntxt = context; + + if (!cq_cntxt) + return -FI_EINVAL; + + cxi_cq->peer_cq = cq_cntxt->cq; + + cxi_cq->cq_cb.cq_comp = cxip_peer_cq_comp; + cxi_cq->cq_cb.cq_comp_src = cxip_peer_cq_comp_src; + cxi_cq->cq_cb.cq_err = cxip_peer_cq_err; + } else { + cxi_cq->cq_cb.cq_comp = ofi_cq_write; + cxi_cq->cq_cb.cq_comp_src = ofi_cq_write_src; + cxi_cq->cq_cb.cq_err = ofi_cq_write_error; + cxi_cq->util_cq.cq_fid.ops->strerror = &cxip_cq_strerror; + } + cxi_cq->util_cq.cq_fid.fid.ops = &cxip_cq_fi_ops; cxi_cq->domain = cxi_dom; From ef4a04a84f9c4e9885909f7b90936f1b01955b11 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Tue, 7 May 2024 14:26:07 -0700 Subject: [PATCH 3/5] prov/cxi: Support shared receive queues Restructure the code to allow for posting on the owner provider's shared receive queues. Do not do a reverse lookup on the AV table to get the fi_addr_t, instead register an address matching callback with the owner. The owner can then call the address matching callback to match an fi_addr_t to the source address in the message received. This is more efficient as the peer lookup can be an O(1) operation; AV[fi_addr_t]. The peer's CXI address can be compared with the CXI address in the message received. Signed-off-by: Amir Shehata --- prov/cxi/include/cxip.h | 91 +++++++++ prov/cxi/src/cxip_dom.c | 31 ++- prov/cxi/src/cxip_ep.c | 3 + prov/cxi/src/cxip_info.c | 69 +------ prov/cxi/src/cxip_msg.c | 4 + prov/cxi/src/cxip_msg_hpc.c | 374 ++++++++++++++++++++++++++++-------- prov/cxi/src/cxip_req_buf.c | 18 +- prov/cxi/src/cxip_rxc.c | 5 + 8 files changed, 447 insertions(+), 148 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 6642249e88e..9b4b5a50fb4 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -853,6 +853,8 @@ struct cxip_domain { ofi_spin_t lock; ofi_atomic32_t ref; + struct fid_peer_srx *owner_srx; + uint32_t tclass; struct cxip_eq *eq; //unused @@ -1263,6 +1265,9 @@ struct cxip_req { uint64_t trig_thresh; struct cxip_cntr *trig_cntr; + /* pointer to the shared receive entry */ + struct fi_peer_rx_entry *rx_entry; + /* CQ event fields, set according to fi_cq.3 * - set by provider * - returned to user in completion event @@ -1457,6 +1462,8 @@ struct cxip_cntr { struct cxip_ux_send { struct dlist_entry rxc_entry; struct cxip_req *req; + struct cxip_rxc *rxc; + struct fi_peer_rx_entry *rx_entry; union c_event put_ev; bool claimed; /* Reserved with FI_PEEK | FI_CLAIM */ }; @@ -3197,6 +3204,11 @@ double cxip_rep_sum(size_t count, double *values); int cxip_check_auth_key_info(struct fi_info *info); int cxip_gen_auth_key(struct fi_info *info, struct cxi_auth_key *key); +static inline struct fid_peer_srx *cxip_get_owner_srx(struct cxip_rxc *rxc) +{ + return rxc->domain->owner_srx; +} + #define CXIP_FC_SOFTWARE_INITIATED -1 /* cxip_fc_reason() - Returns the event reason for portal state @@ -3241,6 +3253,15 @@ ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr); +static inline int cxip_discard(struct fi_peer_rx_entry *rx_entry) +{ + /* TODO: how do we discard a message properly? 
*/ + return -FI_ENOSYS; +} + +int cxip_unexp_start(struct fi_peer_rx_entry *entry); +int cxip_addr_match(fi_addr_t addr, struct fi_peer_match *match); + /* * Request variants: * CXIP_RQ_AMO @@ -3704,4 +3725,74 @@ int cxip_domain_dwq_emit_amo(struct cxip_domain *dom, uint16_t vni, struct c_dma_amo_cmd *amo, uint64_t flags, bool fetching, bool flush); +static inline void cxip_set_env_rx_match_mode(void) +{ + char *param_str = NULL; + + fi_param_get_str(&cxip_prov, "rx_match_mode", ¶m_str); + /* Parameters to tailor hybrid hardware to software transitions + * that are initiated by software. + */ + fi_param_define(&cxip_prov, "hybrid_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive UX transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_preemptive", + &cxip_env.hybrid_preemptive); + fi_param_define(&cxip_prov, "hybrid_recv_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive recv transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_recv_preemptive", + &cxip_env.hybrid_recv_preemptive); + fi_param_define(&cxip_prov, "hybrid_unexpected_msg_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of hardware unexpected messages exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_unexpected_msg_preemptive", + &cxip_env.hybrid_unexpected_msg_preemptive); + fi_param_define(&cxip_prov, "hybrid_posted_recv_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of posted receives exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_posted_recv_preemptive", + &cxip_env.hybrid_posted_recv_preemptive); + + if (param_str) { + if (!strcasecmp(param_str, "hardware")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } else if (!strcmp(param_str, "software")) { + cxip_env.rx_match_mode = CXIP_PTLTE_SOFTWARE_MODE; + cxip_env.msg_offload = false; + } else if (!strcmp(param_str, "hybrid")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HYBRID_MODE; + cxip_env.msg_offload = true; + } else { + _CXIP_WARN(FI_LOG_FABRIC, "Unrecognized rx_match_mode: %s\n", + param_str); + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_preemptive) { + cxip_env.hybrid_preemptive = false; + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignoring preemptive\n"); + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_recv_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignore LE recv preemptive\n"); + cxip_env.hybrid_recv_preemptive = 0; + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_posted_recv_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignore hybrid_posted_recv_preemptive\n"); + cxip_env.hybrid_posted_recv_preemptive = 0; + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_unexpected_msg_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignore hybrid_unexpected_msg_preemptive\n"); + cxip_env.hybrid_unexpected_msg_preemptive = 0; + } +} + #endif diff --git a/prov/cxi/src/cxip_dom.c b/prov/cxi/src/cxip_dom.c index 4a928018679..7b0e6730b77 100644 --- a/prov/cxi/src/cxip_dom.c +++ b/prov/cxi/src/cxip_dom.c @@ -1556,6 +1556,35 @@ static int cxip_query_atomic(struct fid_domain *domain, return FI_SUCCESS; } +struct fi_ops_srx_peer cxip_srx_peer_ops = { + .size = sizeof(struct fi_ops_srx_peer), + .start_msg = 
cxip_unexp_start, + .start_tag = cxip_unexp_start, + .discard_msg = cxip_discard, + .discard_tag = cxip_discard, + .addr_match = cxip_addr_match, +}; + +static int cxip_srx_context(struct fid_domain *fid, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + struct cxip_domain *dom; + + if (!context || ! attr || !fid) + return -FI_EINVAL; + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + if (attr->op_flags & FI_PEER) { + dom->owner_srx = ((struct fi_peer_srx_context *) context)->srx; + dom->owner_srx->peer_ops = &cxip_srx_peer_ops; + return 0; + } + + return -FI_ENOSYS; +} + static int cxip_query_collective(struct fid_domain *domain, enum fi_collective_op coll, struct fi_collective_attr *attr, @@ -1695,7 +1724,7 @@ static struct fi_ops_domain cxip_dom_ops = { .cntr_open = cxip_cntr_open, .poll_open = fi_no_poll_open, .stx_ctx = fi_no_stx_context, - .srx_ctx = fi_no_srx_context, + .srx_ctx = cxip_srx_context, .query_atomic = cxip_query_atomic, .query_collective = cxip_query_collective }; diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c index fabdea22be3..d4fd5ff3289 100644 --- a/prov/cxi/src/cxip_ep.c +++ b/prov/cxi/src/cxip_ep.c @@ -918,6 +918,9 @@ int cxip_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) break; + case FI_CLASS_SRX_CTX: + break; + default: return -FI_EINVAL; } diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 5c6e34ac1a1..37171f99ebf 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -828,27 +828,8 @@ static void cxip_env_init(void) fi_param_define(&cxip_prov, "rx_match_mode", FI_PARAM_STRING, "Sets RX message match mode (hardware | software | hybrid)."); - fi_param_get_str(&cxip_prov, "rx_match_mode", ¶m_str); - if (param_str) { - if (!strcasecmp(param_str, "hardware")) { - cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; - cxip_env.msg_offload = true; - } else if (!strcmp(param_str, "software")) { - cxip_env.rx_match_mode = CXIP_PTLTE_SOFTWARE_MODE; - cxip_env.msg_offload = false; - } else if (!strcmp(param_str, "hybrid")) { - cxip_env.rx_match_mode = CXIP_PTLTE_HYBRID_MODE; - cxip_env.msg_offload = true; - } else { - CXIP_WARN("Unrecognized rx_match_mode: %s\n", - param_str); - cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; - cxip_env.msg_offload = true; - } - - param_str = NULL; - } + cxip_set_env_rx_match_mode(); fi_param_define(&cxip_prov, "rdzv_threshold", FI_PARAM_SIZE_T, "Message size threshold for rendezvous protocol."); @@ -1036,54 +1017,6 @@ static void cxip_env_init(void) fi_param_get_size_t(&cxip_prov, "req_buf_max_cached", &cxip_env.req_buf_max_cached); - /* Parameters to tailor hybrid hardware to software transitions - * that are initiated by software. 
- */ - fi_param_define(&cxip_prov, "hybrid_preemptive", FI_PARAM_BOOL, - "Enable/Disable low LE preemptive UX transitions."); - fi_param_get_bool(&cxip_prov, "hybrid_preemptive", - &cxip_env.hybrid_preemptive); - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_preemptive) { - cxip_env.hybrid_preemptive = false; - CXIP_WARN("Not in hybrid mode, ignoring preemptive\n"); - } - - fi_param_define(&cxip_prov, "hybrid_recv_preemptive", FI_PARAM_BOOL, - "Enable/Disable low LE preemptive recv transitions."); - fi_param_get_bool(&cxip_prov, "hybrid_recv_preemptive", - &cxip_env.hybrid_recv_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_recv_preemptive) { - CXIP_WARN("Not in hybrid mode, ignore LE recv preemptive\n"); - cxip_env.hybrid_recv_preemptive = 0; - } - - fi_param_define(&cxip_prov, "hybrid_posted_recv_preemptive", - FI_PARAM_BOOL, - "Enable preemptive transition to software endpoint when number of posted receives exceeds RX attribute size"); - fi_param_get_bool(&cxip_prov, "hybrid_posted_recv_preemptive", - &cxip_env.hybrid_posted_recv_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_posted_recv_preemptive) { - CXIP_WARN("Not in hybrid mode, ignore hybrid_posted_recv_preemptive\n"); - cxip_env.hybrid_posted_recv_preemptive = 0; - } - - fi_param_define(&cxip_prov, "hybrid_unexpected_msg_preemptive", - FI_PARAM_BOOL, - "Enable preemptive transition to software endpoint when number of hardware unexpected messages exceeds RX attribute size"); - fi_param_get_bool(&cxip_prov, "hybrid_unexpected_msg_preemptive", - &cxip_env.hybrid_unexpected_msg_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_unexpected_msg_preemptive) { - CXIP_WARN("Not in hybrid mode, ignore hybrid_unexpected_msg_preemptive\n"); - cxip_env.hybrid_unexpected_msg_preemptive = 0; - } - if (cxip_software_pte_allowed()) { min_free = CXIP_REQ_BUF_HEADER_MAX_SIZE + cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c index 4d3830dc18f..3277ff8d95a 100644 --- a/prov/cxi/src/cxip_msg.c +++ b/prov/cxi/src/cxip_msg.c @@ -118,6 +118,7 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, void cxip_recv_req_free(struct cxip_req *req) { struct cxip_rxc *rxc = req->recv.rxc; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(rxc); assert(req->type == CXIP_REQ_RECV); assert(dlist_empty(&req->recv.children)); @@ -128,6 +129,9 @@ void cxip_recv_req_free(struct cxip_req *req) if (req->recv.recv_md && !req->recv.hybrid_md) cxip_unmap(req->recv.recv_md); + if (owner_srx && req->rx_entry) + owner_srx->owner_ops->free_entry(req->rx_entry); + cxip_evtq_req_free(req); } diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c index 5d68d40c51a..3f5e132d315 100644 --- a/prov/cxi/src/cxip_msg_hpc.c +++ b/prov/cxi/src/cxip_msg_hpc.c @@ -2030,6 +2030,25 @@ static void cxip_post_ux_onload_fc(struct cxip_rxc_hpc *rxc) } } +static int cxip_srx_add_ux(struct fid_peer_srx *owner_srx, + struct cxip_ux_send *ux_send) +{ + union cxip_match_bits ux_mb; + struct fi_peer_rx_entry *entry = calloc(sizeof(*entry), 1); + + if (!entry) + return -FI_ENOMEM; + + ux_mb.raw = ux_send->put_ev.tgt_long.match_bits; + entry->peer_context = ux_send; + if (ux_mb.tagged) + owner_srx->owner_ops->queue_tag(entry); + else + owner_srx->owner_ops->queue_msg(entry); + + return FI_SUCCESS; +} + /* * cxip_ux_onload_complete() - Unexpected list 
entry onload complete. * @@ -2038,6 +2057,7 @@ static void cxip_post_ux_onload_fc(struct cxip_rxc_hpc *rxc) static void cxip_ux_onload_complete(struct cxip_req *req) { struct cxip_rxc_hpc *rxc = req->search.rxc; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); assert(rxc->base.state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); @@ -2045,26 +2065,37 @@ static void cxip_ux_onload_complete(struct cxip_req *req) free(rxc->ule_offsets); rxc->ule_offsets = 0; - /* During a transition to software managed PtlTE, received - * request list entries resulting from hardware not matching - * the priority list on an incoming packet were added to a - * pending unexpected message list. We merge the two - * expected list here. - */ - RXC_DBG(rxc, "Req pending %d UX entries, SW list %d UX entries\n", - rxc->sw_pending_ux_list_len, rxc->sw_ux_list_len); + if (owner_srx) { + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + int ret; + + dlist_foreach_container_safe(&rxc->sw_pending_ux_list, + struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + ret = cxip_srx_add_ux(owner_srx, ux_send); + if (ret) + RXC_WARN(rxc, "Failed to add %p on owner srx %p\n", + ux_send, owner_srx); + } - dlist_splice_tail(&rxc->sw_ux_list, &rxc->sw_pending_ux_list); - rxc->sw_ux_list_len += rxc->sw_pending_ux_list_len; - rxc->sw_pending_ux_list_len = 0; + } else { + /* During a transition to software managed PtlTE, received + * request list entries resulting from hardware not matching + * the priority list on an incoming packet were added to a + * pending unexpected message list. We merge the two + * expected list here. + */ + RXC_DBG(rxc, "Req pending %d UX entries, SW list %d UX entries\n", + rxc->sw_pending_ux_list_len, rxc->sw_ux_list_len); - RXC_WARN(rxc, "Software UX list updated, %d SW UX entries\n", - rxc->sw_ux_list_len); + dlist_splice_tail(&rxc->sw_ux_list, &rxc->sw_pending_ux_list); + rxc->sw_ux_list_len += rxc->sw_pending_ux_list_len; + rxc->sw_pending_ux_list_len = 0; - if (rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) - cxip_post_ux_onload_sw(rxc); - else - cxip_post_ux_onload_fc(rxc); + RXC_WARN(rxc, "Software UX list updated, %d SW UX entries\n", + rxc->sw_ux_list_len); + } ofi_atomic_dec32(&rxc->base.orx_reqs); cxip_evtq_req_free(req); @@ -2126,6 +2157,7 @@ static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) struct cxip_deferred_event *def_ev; struct cxip_ux_send *ux_send; bool matched; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); assert(rxc->base.state == RXC_ONLOAD_FLOW_CONTROL || rxc->base.state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || @@ -2180,8 +2212,16 @@ static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) } rxc->cur_ule_offsets++; - dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); - rxc->sw_ux_list_len++; + /* AMIR: insert on the shared unexpected queue */ + if (owner_srx) { + int ret; + ret = cxip_srx_add_ux(owner_srx, ux_send); + if (ret) + return ret; + } else { + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + } RXC_DBG(rxc, "Onloaded Send: %p\n", ux_send); @@ -3314,6 +3354,236 @@ static int cxip_recv_sw_matcher(struct cxip_rxc_hpc *rxc, struct cxip_req *req, return ret; } +static uint32_t cxip_get_match_id(struct cxip_rxc *rxc, + fi_addr_t src_addr) +{ + int ret; + uint32_t match_id; + struct cxip_addr caddr; + + if (rxc->attr.caps & FI_DIRECTED_RECV && + src_addr != FI_ADDR_UNSPEC) { + if 
(rxc->ep_obj->av->symmetric) { + /* PID is not used for matching */ + match_id = CXI_MATCH_ID(rxc->pid_bits, C_PID_ANY, + src_addr); + } else { + ret = cxip_av_lookup_addr(rxc->ep_obj->av, src_addr, + &caddr); + if (ret != FI_SUCCESS) { + RXC_WARN(rxc, "Failed to look up FI addr: %d\n", + ret); + return -FI_EINVAL; + } + + match_id = CXI_MATCH_ID(rxc->pid_bits, caddr.pid, + caddr.nic); + } + } else { + match_id = CXI_MATCH_ID_ANY; + } + + return match_id; +} + +static int +cxip_recv_req_init(struct cxip_rxc *rxc, void *buf, size_t len, fi_addr_t addr, + uint64_t tag, uint64_t ignore, uint64_t flags, bool tagged, + void *context, struct cxip_cntr *comp_cntr, + struct cxip_req **req_out) +{ + struct cxip_req *req; + uint32_t match_id; + int ret; + uint16_t vni; + + ofi_genlock_unlock(&rxc->ep_obj->lock); + + if (len && !buf) { + ret = -FI_EINVAL; + goto lock_err; + } + + if (rxc->state == RXC_DISABLED) { + ret = -FI_EOPBADSTATE; + goto lock_err; + } + + /* HW to SW PtlTE transition, ensure progress is made */ + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(rxc->recv_cq); + ret = -FI_EAGAIN; + goto lock_err; + } + + if (tagged) { + if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { + RXC_WARN(rxc, + "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", + tag, ignore, CXIP_TAG_MASK); + ret = -FI_EINVAL; + goto lock_err; + } + flags &= ~FI_MULTI_RECV; + } + + ret = cxip_set_recv_match_id(rxc, addr, rxc->ep_obj->av_auth_key && + (flags & FI_AUTH_KEY), &match_id, &vni); + if (ret) { + RXC_WARN(rxc, "Error setting match_id: %d %s\n", + ret, fi_strerror(-ret)); + goto lock_err; + } + + ofi_genlock_lock(&rxc->ep_obj->lock); + ret = cxip_recv_req_alloc(rxc, buf, len, NULL, &req, cxip_recv_cb); + if (ret) + return ret; + + /* req->data_len, req->tag, req->data must be set later. req->buf may + * be overwritten later. + */ + req->context = (uint64_t)context; + + req->flags = FI_RECV | (flags & FI_COMPLETION); + if (tagged) + req->flags |= FI_TAGGED; + else + req->flags |= FI_MSG; + + req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; + req->recv.match_id = match_id; + req->recv.tag = tag; + req->recv.ignore = ignore; + req->recv.flags = flags; + req->recv.tagged = tagged; + req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); + + *req_out = req; + + return FI_SUCCESS; + +lock_err: + ofi_genlock_lock(&rxc->ep_obj->lock); + return ret; +} + +int cxip_addr_match(fi_addr_t addr, struct fi_peer_match *match) +{ + uint32_t ux_init; + uint32_t match_id; + struct cxip_ux_send *ux = match->context; + struct cxip_rxc *rxc = ux->rxc; + + /* TODO: this is sometimes called with the rxc_lock held in the case + * of cxip_process_srx_ux_matcher() and sometimes not if the owner is + * iterating through its unexpected queue. Is this going to be + * a problem? This function shouldn't be making any changes to the + * rxc. But do we need a read lock? 
+ */ + match_id = cxip_get_match_id(rxc, addr); + + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + + return init_match(rxc, ux_init, match_id); +} + +int cxip_unexp_start(struct fi_peer_rx_entry *rx_entry) +{ + int ret; + struct cxip_ux_send *ux; + union cxip_match_bits ux_mb; + struct cxip_req *req; + struct cxip_rxc *rxc; + + ux = rx_entry->peer_context; + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + rxc = ux->rxc; + + ret = cxip_recv_req_init(rxc, rx_entry->iov[0].iov_base, + rx_entry->iov[0].iov_len, rx_entry->addr, + rx_entry->tag, rx_entry->ignore, rx_entry->flags, + ux_mb.tagged, rx_entry->context, NULL, &req); + if (ret) + return ret; + + req->rx_entry = rx_entry; + + ret = cxip_recv_sw_matched(req, ux); + + if (ux->req && ux->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux); + else + free(ux); + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p (sw_ux_list_len: %u)\n", + req, ux, req->recv.rxc->sw_ux_list_len); + + return ret; +} + +static int cxip_process_srx_ux_matcher(struct cxip_rxc *rxc, + struct fid_peer_srx *owner_srx, struct cxip_ux_send *ux) +{ + int ret; + union cxip_match_bits ux_mb; + struct fi_peer_rx_entry *rx_entry = NULL; + struct cxip_req *req; + struct fi_peer_match match = {0}; + + /* stash the rxc because we're going to need it during address + * matching + */ + ux->rxc = rxc; + match.context = ux; + /* not being used */ + match.addr = FI_ADDR_UNSPEC; + match.size = 0; + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + + if (ux_mb.tagged) { + match.tag = ux_mb.tag; + ret = owner_srx->owner_ops->get_tag(owner_srx, &match, &rx_entry); + } else { + ret = owner_srx->owner_ops->get_msg(owner_srx, &match, &rx_entry); + } + + /* return it back to the caller */ + ux->rx_entry = rx_entry; + + if (ret == -FI_ENOENT) { + /* this is used when the owner calls start_msg */ + rx_entry->peer_context = ux; + return -FI_ENOMSG; + } else if (ret) { + return ret; + } + + ret = cxip_recv_req_init(rxc, rx_entry->iov[0].iov_base, + rx_entry->iov[0].iov_len, rx_entry->addr, + rx_entry->tag, rx_entry->ignore, rx_entry->flags, + ux_mb.tagged, rx_entry->context, NULL, &req); + if (ret) + return ret; + + req->rx_entry = rx_entry; + + ret = cxip_recv_sw_matched(req, ux); + + if (ux->req && ux->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux); + else + free(ux); + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p (sw_ux_list_len: %u)\n", + req, ux, req->recv.rxc->sw_ux_list_len); + + return ret; +} + /* * cxip_recv_ux_sw_matcher() - Attempt to match an unexpected message to a user * posted receive. 
@@ -3324,10 +3594,17 @@ int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux) { struct cxip_ptelist_buf *rbuf = ux->req->req_ctx; struct cxip_rxc_hpc *rxc = rbuf->rxc; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); struct cxip_req *req; struct dlist_entry *tmp; int ret; + if (owner_srx) { + /* we never add anything on the sw_ux_list */ + rxc->sw_ux_list_len--; + return cxip_process_srx_ux_matcher(&rxc->base, owner_srx, ux); + } + if (dlist_empty(&rxc->sw_recv_queue)) return -FI_ENOMSG; @@ -3985,71 +4262,16 @@ cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, void *desc, int ret; struct cxip_req *req; struct cxip_ux_send *ux_msg; - uint32_t match_id; - uint16_t vni; assert(rxc_hpc->base.protocol == FI_PROTO_CXI); - if (len && !buf) - return -FI_EINVAL; - - if (rxc->state == RXC_DISABLED) - return -FI_EOPBADSTATE; - - /* HW to SW PtlTE transition, ensure progress is made */ - if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { - cxip_cq_progress(rxc->recv_cq); - return -FI_EAGAIN; - } - - if (tagged) { - if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { - RXC_WARN(rxc, - "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", - tag, ignore, CXIP_TAG_MASK); - return -FI_EINVAL; - } - } - - ret = cxip_set_recv_match_id(rxc, src_addr, rxc->ep_obj->av_auth_key && - (flags & FI_AUTH_KEY), &match_id, &vni); - if (ret) { - RXC_WARN(rxc, "Error setting match_id: %d %s\n", - ret, fi_strerror(-ret)); - return ret; - } - ofi_genlock_lock(&rxc->ep_obj->lock); - ret = cxip_recv_req_alloc(rxc, buf, len, NULL, &req, cxip_recv_cb); + ret = cxip_recv_req_init(rxc, buf, len, src_addr, tag, ignore, flags, + tagged, context, comp_cntr, &req); if (ret) goto err; - /* req->data_len, req->tag, req->data must be set later. req->buf may - * be overwritten later. - */ - req->context = (uint64_t)context; - - req->flags = FI_RECV | (flags & FI_COMPLETION); - if (tagged) - req->flags |= FI_TAGGED; - else - req->flags |= FI_MSG; - - req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; - req->recv.match_id = match_id; - req->recv.tag = tag; - req->recv.ignore = ignore; - req->recv.flags = flags; - req->recv.tagged = tagged; - req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); - - if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { - ret = -FI_EAGAIN; - goto err_free_request; - } - if (!(req->recv.flags & (FI_PEEK | FI_CLAIM))) { - ret = cxip_recv_req_queue(req, false); /* Match made in software? 
*/ if (ret == -FI_EALREADY) { diff --git a/prov/cxi/src/cxip_req_buf.c b/prov/cxi/src/cxip_req_buf.c index 4a4624c59b7..09b1432b214 100644 --- a/prov/cxi/src/cxip_req_buf.c +++ b/prov/cxi/src/cxip_req_buf.c @@ -150,10 +150,22 @@ static int cxip_req_buf_process_ux(struct cxip_ptelist_buf *buf, "rbuf=%p ux=%p sw_pending_ux_list_len=%u\n", buf, ux, buf->rxc->sw_pending_ux_list_len); } else { - dlist_insert_tail(&ux->rxc_entry, &rxc->sw_ux_list); + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); - RXC_DBG(buf->rxc, "rbuf=%p ux=%p sw_ux_list_len=%u\n", - buf, ux, buf->rxc->sw_ux_list_len); + if (owner_srx) { + union cxip_match_bits ux_mb; + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + + if (ux_mb.tagged) + owner_srx->owner_ops->queue_tag(ux->rx_entry); + else + owner_srx->owner_ops->queue_msg(ux->rx_entry); + } else { + dlist_insert_tail(&ux->rxc_entry, &rxc->sw_ux_list); + RXC_DBG(buf->rxc, "rbuf=%p ux=%p sw_ux_list_len=%u\n", + buf, ux, buf->rxc->sw_ux_list_len); + } } break; diff --git a/prov/cxi/src/cxip_rxc.c b/prov/cxi/src/cxip_rxc.c index cdcaed39a59..f65f614314a 100644 --- a/prov/cxi/src/cxip_rxc.c +++ b/prov/cxi/src/cxip_rxc.c @@ -402,6 +402,11 @@ struct cxip_rxc *cxip_rxc_calloc(struct cxip_ep_obj *ep_obj, void *context) { struct cxip_rxc *rxc = NULL; + /* update the rx_match_mode in case it has changed since + * initialization + */ + cxip_set_env_rx_match_mode(); + switch (ep_obj->protocol) { case FI_PROTO_CXI: rxc = calloc(1, sizeof(struct cxip_rxc_hpc)); From e9db7c55e0f028c8a2f60e36962f0282beb5814a Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Wed, 2 Oct 2024 16:41:04 -0400 Subject: [PATCH 4/5] prov/cxi: handle late peer insertion Handle the case where a message from a peer arrives before the peer is inserted. Implement the callflow to support this scenario. 
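For example, the intended call flow can be sketched as follows. This is a
minimal, hedged illustration that uses toy stand-in types only; the real
counterparts in this patch are cxip_get_addr() and the owner SRX
foreach_unspec_addr() operation:

    /* Toy sketch of the late peer insertion flow; the toy_* names are
     * illustrative stand-ins, not the libfabric peer API.
     */
    #include <stddef.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define TOY_ADDR_UNSPEC UINT64_MAX

    struct toy_rx_entry {
        uint64_t addr;      /* resolved address, or TOY_ADDR_UNSPEC */
        void *peer_context; /* provider's unexpected-send descriptor */
    };

    /* Provider callback: recover the source address from the stored
     * unexpected event (cxip_get_addr() plays this role in the patch).
     */
    static uint64_t toy_get_addr(struct toy_rx_entry *entry)
    {
        (void)entry;
        return 42; /* real code decodes initiator/vni from the put event */
    }

    /* Owner SRX: after an AV insert, revisit entries that were queued with
     * an unspecified source (foreach_unspec_addr() plays this role).
     */
    static void toy_foreach_unspec_addr(struct toy_rx_entry *entries, size_t n,
                                        uint64_t (*get_addr)(struct toy_rx_entry *))
    {
        for (size_t i = 0; i < n; i++)
            if (entries[i].addr == TOY_ADDR_UNSPEC)
                entries[i].addr = get_addr(&entries[i]);
    }

    int main(void)
    {
        /* 1. Message arrived before the peer was inserted into the AV. */
        struct toy_rx_entry ux = { .addr = TOY_ADDR_UNSPEC };

        /* 2. fi_av_insert() completes; the owner walks unresolved entries. */
        toy_foreach_unspec_addr(&ux, 1, toy_get_addr);

        printf("resolved addr: %" PRIu64 "\n", ux.addr);
        return 0;
    }
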
Signed-off-by: Amir Shehata --- prov/cxi/include/cxip.h | 7 ++-- prov/cxi/src/cxip_av.c | 17 +++++++++ prov/cxi/src/cxip_dom.c | 1 - prov/cxi/src/cxip_msg.c | 20 +++++------ prov/cxi/src/cxip_msg_hpc.c | 72 +++++++------------------------------ 5 files changed, 43 insertions(+), 74 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 9b4b5a50fb4..b482130e3a5 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -177,7 +177,7 @@ #define CXIP_MINOR_VERSION 1 #define CXIP_PROV_VERSION FI_VERSION(CXIP_MAJOR_VERSION, \ CXIP_MINOR_VERSION) -#define CXIP_FI_VERSION FI_VERSION(1, 21) +#define CXIP_FI_VERSION FI_VERSION(2, 0) #define CXIP_WIRE_PROTO_VERSION 1 #define CXIP_COLL_MAX_CONCUR 8 @@ -3260,7 +3260,6 @@ static inline int cxip_discard(struct fi_peer_rx_entry *rx_entry) } int cxip_unexp_start(struct fi_peer_rx_entry *entry); -int cxip_addr_match(fi_addr_t addr, struct fi_peer_match *match); /* * Request variants: @@ -3673,7 +3672,9 @@ int cxip_set_recv_match_id(struct cxip_rxc *rxc, fi_addr_t src_addr, return FI_SUCCESS; } -fi_addr_t cxip_recv_req_src_addr(struct cxip_req *req); +fi_addr_t cxip_recv_req_src_addr(struct cxip_rxc *rxc, + uint32_t init, uint16_t vni, + bool force); int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, struct cxip_md *md, struct cxip_req **cxip_req, int (*recv_cb)(struct cxip_req *req, diff --git a/prov/cxi/src/cxip_av.c b/prov/cxi/src/cxip_av.c index 6dd4aa4e415..031bf0fb22e 100644 --- a/prov/cxi/src/cxip_av.c +++ b/prov/cxi/src/cxip_av.c @@ -229,6 +229,18 @@ struct cxip_addr *(*cxip_av_addr_in)(const void *addr) = insert_in; void (*cxip_av_addr_out)(struct cxip_addr *addr_out, struct cxip_addr *addr) = insert_out; +static fi_addr_t cxip_get_addr(struct fi_peer_rx_entry *entry) +{ + uint32_t ux_init; + uint16_t vni; + struct cxip_ux_send *ux = entry->peer_context; + + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + vni = ux->put_ev.tgt_long.vni; + + return cxip_recv_req_src_addr(ux->rxc, ux_init, vni, true); +} + static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { @@ -236,6 +248,7 @@ static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, size_t i; size_t success_cnt = 0; int ret; + struct fid_peer_srx *owner_srx; ret = cxip_av_insert_validate_args(fid, addr_in, count, fi_addr, flags, context); @@ -253,6 +266,10 @@ static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, cxip_av_unlock(av); + owner_srx = av->domain->owner_srx; + if (owner_srx) + owner_srx->owner_ops->foreach_unspec_addr(owner_srx, &cxip_get_addr); + return success_cnt; } diff --git a/prov/cxi/src/cxip_dom.c b/prov/cxi/src/cxip_dom.c index 7b0e6730b77..629c2e1d451 100644 --- a/prov/cxi/src/cxip_dom.c +++ b/prov/cxi/src/cxip_dom.c @@ -1562,7 +1562,6 @@ struct fi_ops_srx_peer cxip_srx_peer_ops = { .start_tag = cxip_unexp_start, .discard_msg = cxip_discard, .discard_tag = cxip_discard, - .addr_match = cxip_addr_match, }; static int cxip_srx_context(struct fid_domain *fid, struct fi_rx_attr *attr, diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c index 3277ff8d95a..b0227e37d1d 100644 --- a/prov/cxi/src/cxip_msg.c +++ b/prov/cxi/src/cxip_msg.c @@ -23,26 +23,25 @@ /* * cxip_recv_req_src_addr() - Translate request source address to FI address. 
*/ -fi_addr_t cxip_recv_req_src_addr(struct cxip_req *req) +fi_addr_t cxip_recv_req_src_addr(struct cxip_rxc *rxc, + uint32_t init, uint16_t vni, + bool force) { - struct cxip_rxc *rxc = req->recv.rxc; - /* If the FI_SOURCE capability is enabled, convert the initiator's * address to an FI address to be reported in a CQ event. If * application AVs are symmetric, the match_id in the EQ event is * logical and translation is not needed. Otherwise, translate the * physical address in the EQ event to logical FI address. */ - if (rxc->attr.caps & FI_SOURCE) { + if ((rxc->attr.caps & FI_SOURCE) || force) { struct cxip_addr addr = {}; if (rxc->ep_obj->av->symmetric) - return CXI_MATCH_ID_EP(rxc->pid_bits, - req->recv.initiator); + return CXI_MATCH_ID_EP(rxc->pid_bits, init); - addr.nic = CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator); - addr.pid = CXI_MATCH_ID_PID(rxc->pid_bits, req->recv.initiator); - addr.vni = req->recv.vni; + addr.nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); + addr.pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); + addr.vni = vni; return cxip_av_lookup_fi_addr(rxc->ep_obj->av, &addr); } @@ -154,7 +153,8 @@ static inline int recv_req_event_success(struct cxip_rxc *rxc, } if (req->recv.rxc->attr.caps & FI_SOURCE) { - src_addr = cxip_recv_req_src_addr(req); + src_addr = cxip_recv_req_src_addr(req->recv.rxc, req->recv.initiator, + req->recv.vni, false); if (src_addr != FI_ADDR_NOTAVAIL || !(rxc->attr.caps & FI_SOURCE_ERR)) return cxip_cq_req_complete_addr(req, src_addr); diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c index 3f5e132d315..8a7dc63ed99 100644 --- a/prov/cxi/src/cxip_msg_hpc.c +++ b/prov/cxi/src/cxip_msg_hpc.c @@ -3074,7 +3074,9 @@ static void cxip_set_ux_dump_entry(struct cxip_req *req, } if (src_addr && req->recv.rxc->attr.caps & FI_SOURCE) - *src_addr = cxip_recv_req_src_addr(req); + *src_addr = cxip_recv_req_src_addr(req->recv.rxc, + req->recv.initiator, + req->recv.vni, false); } } @@ -3354,38 +3356,6 @@ static int cxip_recv_sw_matcher(struct cxip_rxc_hpc *rxc, struct cxip_req *req, return ret; } -static uint32_t cxip_get_match_id(struct cxip_rxc *rxc, - fi_addr_t src_addr) -{ - int ret; - uint32_t match_id; - struct cxip_addr caddr; - - if (rxc->attr.caps & FI_DIRECTED_RECV && - src_addr != FI_ADDR_UNSPEC) { - if (rxc->ep_obj->av->symmetric) { - /* PID is not used for matching */ - match_id = CXI_MATCH_ID(rxc->pid_bits, C_PID_ANY, - src_addr); - } else { - ret = cxip_av_lookup_addr(rxc->ep_obj->av, src_addr, - &caddr); - if (ret != FI_SUCCESS) { - RXC_WARN(rxc, "Failed to look up FI addr: %d\n", - ret); - return -FI_EINVAL; - } - - match_id = CXI_MATCH_ID(rxc->pid_bits, caddr.pid, - caddr.nic); - } - } else { - match_id = CXI_MATCH_ID_ANY; - } - - return match_id; -} - static int cxip_recv_req_init(struct cxip_rxc *rxc, void *buf, size_t len, fi_addr_t addr, uint64_t tag, uint64_t ignore, uint64_t flags, bool tagged, @@ -3468,26 +3438,6 @@ cxip_recv_req_init(struct cxip_rxc *rxc, void *buf, size_t len, fi_addr_t addr, return ret; } -int cxip_addr_match(fi_addr_t addr, struct fi_peer_match *match) -{ - uint32_t ux_init; - uint32_t match_id; - struct cxip_ux_send *ux = match->context; - struct cxip_rxc *rxc = ux->rxc; - - /* TODO: this is sometimes called with the rxc_lock held in the case - * of cxip_process_srx_ux_matcher() and sometimes not if the owner is - * iterating through its unexpected queue. Is this going to be - * a problem? This function shouldn't be making any changes to the - * rxc. But do we need a read lock? 
- */ - match_id = cxip_get_match_id(rxc, addr); - - ux_init = ux->put_ev.tgt_long.initiator.initiator.process; - - return init_match(rxc, ux_init, match_id); -} - int cxip_unexp_start(struct fi_peer_rx_entry *rx_entry) { int ret; @@ -3527,19 +3477,21 @@ static int cxip_process_srx_ux_matcher(struct cxip_rxc *rxc, struct fid_peer_srx *owner_srx, struct cxip_ux_send *ux) { int ret; + uint32_t ux_init; union cxip_match_bits ux_mb; struct fi_peer_rx_entry *rx_entry = NULL; struct cxip_req *req; - struct fi_peer_match match = {0}; + uint16_t vni; + struct fi_peer_match_attr match = {0}; - /* stash the rxc because we're going to need it during address - * matching + /* stash the rxc because we're going to need it if the peer + * address isn't already inserted into the AV table. */ ux->rxc = rxc; - match.context = ux; - /* not being used */ - match.addr = FI_ADDR_UNSPEC; - match.size = 0; + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + vni = ux->put_ev.tgt_long.vni; + + match.addr = cxip_recv_req_src_addr(rxc, ux_init, vni, true); ux_mb.raw = ux->put_ev.tgt_long.match_bits; From eb137358dec3020ade6d9325c770ca3db4815120 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 2 May 2024 18:48:10 -0400 Subject: [PATCH 5/5] prov/lnx: LINKx provider The LINKx (lnx) provider offers a framework by which multiple providers can be linked together and presented as one provider to the application. This abstracts away the details of the traffic providers from the application. This iteration of the provider allows linking only two providers, shm and another provider, ex; CXI or RXM. The composite providers which are linked together need to support the peer infrastructure. In order to use the lnx provider the user needs to: export FI_LNX_PROV_LINKS="shm+" ex: export FI_LNX_PROV_LINKS="shm+cxi" or export FI_LNX_PROV_LINKS="shm+tcp;ofi_rxm" Signed-off-by: Amir Shehata --- Makefile.am | 2 + configure.ac | 1 + include/ofi.h | 1 + include/ofi_lnx.h | 59 ++ include/ofi_mr.h | 2 + include/ofi_prov.h | 11 + include/ofi_util.h | 15 +- include/rdma/fabric.h | 3 + include/rdma/fi_domain.h | 1 + include/rdma/fi_errno.h | 2 +- include/rdma/providers/fi_peer.h | 1 + include/rdma/providers/fi_prov.h | 1 - libfabric.map.in | 1 + man/fi_lnx.7.md | 156 ++++ man/man7/fi_lnx.7 | 173 +++++ prov/cxi/configure.m4 | 152 +--- prov/lnx/Makefile.include | 61 ++ prov/lnx/configure.m4 | 15 + prov/lnx/include/lnx.h | 584 +++++++++++++++ prov/lnx/src/lnx_cq.c | 241 ++++++ prov/lnx/src/lnx_domain.c | 584 +++++++++++++++ prov/lnx/src/lnx_ep.c | 1188 ++++++++++++++++++++++++++++++ prov/lnx/src/lnx_init.c | 924 +++++++++++++++++++++++ prov/lnx/src/lnx_ops.c | 993 +++++++++++++++++++++++++ prov/lnx/src/lnx_peers.c | 698 ++++++++++++++++++ prov/shm/src/smr_init.c | 3 + prov/util/src/util_attr.c | 14 +- prov/util/src/util_mr_cache.c | 7 +- src/fabric.c | 43 +- src/fi_tostr.c | 1 + src/hmem_rocr.c | 2 +- 31 files changed, 5802 insertions(+), 137 deletions(-) create mode 100644 include/ofi_lnx.h create mode 100644 man/fi_lnx.7.md create mode 100644 man/man7/fi_lnx.7 create mode 100644 prov/lnx/Makefile.include create mode 100644 prov/lnx/configure.m4 create mode 100644 prov/lnx/include/lnx.h create mode 100644 prov/lnx/src/lnx_cq.c create mode 100644 prov/lnx/src/lnx_domain.c create mode 100644 prov/lnx/src/lnx_ep.c create mode 100644 prov/lnx/src/lnx_init.c create mode 100644 prov/lnx/src/lnx_ops.c create mode 100644 prov/lnx/src/lnx_peers.c diff --git a/Makefile.am b/Makefile.am index 00242c7d65e..fc437ac0f74 100644 --- a/Makefile.am 
+++ b/Makefile.am @@ -205,6 +205,7 @@ src_libfabric_la_SOURCES = \ include/uthash.h \ include/ofi_prov.h \ include/ofi_profile.h \ + include/ofi_lnx.h \ include/rdma/providers/fi_log.h \ include/rdma/providers/fi_prov.h \ src/fabric.c \ @@ -484,6 +485,7 @@ include prov/sm2/Makefile.include include prov/tcp/Makefile.include include prov/ucx/Makefile.include include prov/lpp/Makefile.include +include prov/lnx/Makefile.include include prov/hook/Makefile.include include prov/hook/perf/Makefile.include include prov/hook/trace/Makefile.include diff --git a/configure.ac b/configure.ac index e56e370ee7a..d5967e8f9ba 100644 --- a/configure.ac +++ b/configure.ac @@ -1026,6 +1026,7 @@ FI_PROVIDER_SETUP([hook_debug]) FI_PROVIDER_SETUP([hook_hmem]) FI_PROVIDER_SETUP([dmabuf_peer_mem]) FI_PROVIDER_SETUP([opx]) +FI_PROVIDER_SETUP([lnx]) FI_PROVIDER_FINI dnl Configure the .pc file FI_PROVIDER_SETUP_PC diff --git a/include/ofi.h b/include/ofi.h index 7592281c766..9661a7553d9 100644 --- a/include/ofi.h +++ b/include/ofi.h @@ -297,6 +297,7 @@ enum ofi_prov_type { OFI_PROV_UTIL, OFI_PROV_HOOK, OFI_PROV_OFFLOAD, + OFI_PROV_LNX, }; /* Restrict to size of struct fi_provider::context (struct fi_context) */ diff --git a/include/ofi_lnx.h b/include/ofi_lnx.h new file mode 100644 index 00000000000..b0b04e5eebb --- /dev/null +++ b/include/ofi_lnx.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL); Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef OFI_LNX_H +#define OFI_LNX_H + +/* ofi_create_link() + * prov_list (IN): number of providers to link + * fabric (OUT): lnx fabric which abstracts the bond + * caps (IN): bond capabilities requested + * context (IN): user context to store. + * + * The LNX provider is not inserted directly on the list + * of core providers. In that sense, it's a special provider + * that only gets returned on a call of fi_link(), if that + * function determines that there are multiple providers to link. + * + * ofi_create_link() binds the core provider endpoints and returns + * the LNX fabric which abstracts away these provider endpoints. 
+ */ +int ofi_create_link(struct fi_info *prov_list, struct fid_fabric **fabric, + uint64_t caps, void *context); + +/* + * ofi_finish_link() + * Uninitialize and cleanup all the core providers + */ +void ofi_link_fini(void); + +#endif /* OFI_LNX_H */ diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 1ebb07a8e11..97e5f43ec26 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -255,6 +255,8 @@ int ofi_mr_map_init(const struct fi_provider *in_prov, int mode, struct ofi_mr_map *map); void ofi_mr_map_close(struct ofi_mr_map *map); +struct fi_mr_attr * +ofi_dup_mr_attr(const struct fi_mr_attr *attr, uint64_t flags); int ofi_mr_map_insert(struct ofi_mr_map *map, const struct fi_mr_attr *attr, uint64_t *key, void *context, diff --git a/include/ofi_prov.h b/include/ofi_prov.h index ccb3fbf616d..7ffcda76268 100644 --- a/include/ofi_prov.h +++ b/include/ofi_prov.h @@ -211,6 +211,17 @@ MRAIL_INI ; # define MRAIL_INIT NULL #endif +#if (HAVE_LNX) && (HAVE_LNX_DL) +# define LNX_INI FI_EXT_INI +# define LNX_INIT NULL +#elif (HAVE_LNX) +# define LNX_INI INI_SIG(fi_lnx_ini) +# define LNX_INIT fi_lnx_ini() +LNX_INI ; +#else +# define LNX_INIT NULL +#endif + #if (HAVE_PERF) && (HAVE_PERF_DL) # define HOOK_PERF_INI FI_EXT_INI # define HOOK_PERF_INIT NULL diff --git a/include/ofi_util.h b/include/ofi_util.h index 911a69893ba..dda5c903e6e 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -1172,9 +1172,11 @@ void ofi_fabric_remove(struct util_fabric *fabric); * Utility Providers */ -#define OFI_NAME_DELIM ';' +#define OFI_NAME_LNX_DELIM ':' +#define OFI_NAME_DELIM ';' #define OFI_UTIL_PREFIX "ofi_" #define OFI_OFFLOAD_PREFIX "off_" +#define OFI_LNX "lnx" static inline int ofi_has_util_prefix(const char *str) { @@ -1186,6 +1188,16 @@ static inline int ofi_has_offload_prefix(const char *str) return !strncasecmp(str, OFI_OFFLOAD_PREFIX, strlen(OFI_OFFLOAD_PREFIX)); } +static inline int ofi_is_lnx(const char *str) +{ + return !strncasecmp(str, OFI_LNX, strlen(OFI_LNX)); +} + +static inline int ofi_is_linked(const char *str) +{ + return (strcasestr(str, OFI_LNX)) ? 
1 : 0; +} + int ofi_get_core_info(uint32_t version, const char *node, const char *service, uint64_t flags, const struct util_prov *util_prov, const struct fi_info *util_hints, @@ -1201,6 +1213,7 @@ int ofi_get_core_info_fabric(const struct fi_provider *prov, struct fi_info **core_info); +char *ofi_strdup_link_append(const char *head, const char *tail); char *ofi_strdup_append(const char *head, const char *tail); // char *ofi_strdup_head(const char *str); // char *ofi_strdup_tail(const char *str); diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index 420d2eacc05..69ab4874805 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -339,6 +339,7 @@ enum { FI_PROTO_SM2, FI_PROTO_CXI_RNR, FI_PROTO_LPP, + FI_PROTO_LNX, }; enum { @@ -622,6 +623,8 @@ int fi_fabric2(struct fi_info *info, struct fid_fabric **fabric, uint64_t flags, void *context); int fi_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context); +int fi_link(struct fi_info *prov_list, struct fid_fabric **fabric, + uint64_t caps, void *context); int fi_open(uint32_t version, const char *name, void *attr, size_t attr_len, uint64_t flags, struct fid **fid, void *context); diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h index 548e4b6ad3e..61cdcf0ffab 100644 --- a/include/rdma/fi_domain.h +++ b/include/rdma/fi_domain.h @@ -167,6 +167,7 @@ struct fi_mr_attr { size_t auth_key_size; uint8_t *auth_key; enum fi_hmem_iface iface; + fi_addr_t addr; union { uint64_t reserved; int cuda; diff --git a/include/rdma/fi_errno.h b/include/rdma/fi_errno.h index f5af121ec79..b90dbd5f42d 100644 --- a/include/rdma/fi_errno.h +++ b/include/rdma/fi_errno.h @@ -114,7 +114,7 @@ extern "C" { //#define FI_EADV EADV /* Advertise error */ //#define FI_ESRMNT ESRMNT /* Srmount error */ //#define FI_ECOMM ECOMM /* Communication error on send */ -//#define FI_EPROTO EPROTO /* Protocol error */ +#define FI_EPROTO EPROTO /* Protocol error */ //#define FI_EMULTIHOP EMULTIHOP /* Multihop attempted */ //#define FI_EDOTDOT EDOTDOT /* RFS specific error */ //#define FI_EBADMSG EBADMSG /* Not a data message */ diff --git a/include/rdma/providers/fi_peer.h b/include/rdma/providers/fi_peer.h index 782a1496531..1cdddb8ee09 100644 --- a/include/rdma/providers/fi_peer.h +++ b/include/rdma/providers/fi_peer.h @@ -169,6 +169,7 @@ struct fi_peer_rx_entry { uint64_t tag; uint64_t cq_data; uint64_t flags; + uint64_t ignore; void *context; size_t count; void **desc; diff --git a/include/rdma/providers/fi_prov.h b/include/rdma/providers/fi_prov.h index ab8858d8f9d..8c0434761e1 100644 --- a/include/rdma/providers/fi_prov.h +++ b/include/rdma/providers/fi_prov.h @@ -73,7 +73,6 @@ struct fi_provider { void (*cleanup)(void); }; - /* * Defines a configuration parameter for use with libfabric. */ diff --git a/libfabric.map.in b/libfabric.map.in index 0659a1cac14..479ef5532c2 100644 --- a/libfabric.map.in +++ b/libfabric.map.in @@ -59,6 +59,7 @@ FABRIC_1.7 { fi_getinfo; fi_freeinfo; fi_dupinfo; + fi_link; } FABRIC_1.6; FABRIC_1.8 { diff --git a/man/fi_lnx.7.md b/man/fi_lnx.7.md new file mode 100644 index 00000000000..d0e8dda9bd8 --- /dev/null +++ b/man/fi_lnx.7.md @@ -0,0 +1,156 @@ +--- +layout: page +title: fi_lnx(7) +tagline: Libfabric Programmer's Manual +--- +{% include JB/setup %} + +# NAME + +fi_lnx \- The LINKx (LNX) Provider + +# OVERVIEW + +The LNX provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. 
This provider uses +the libfabric peer infrastructure to aid in the use of the underlying providers. +This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. Future releases of the provider will allow linking any +number of providers and provide the users with the ability to influence +the way the providers are utilized for traffic load. + +# SUPPORTED FEATURES + +This release contains an initial implementation of the LNX provider that +offers the following support: + +*Endpoint types* +: The provider supports only endpoint type *FI_EP_RDM*. + +*Endpoint capabilities* +: LNX is a passthrough layer on the send path. On the receive path LNX + utilizes the peer infrastructure to create shared receive queues (SRQ). + Receive requests are placed on the SRQ instead of on the core provider + receive queue. When the provider receives a message it queries the SRQ for + a match. If one is found the receive request is completed, otherwise the + message is placed on the LNX shared unexpected queue (SUQ). Further receive + requests query the SUQ for matches. + The first release of the provider only supports tagged and RMA operations. + Other message types will be supported in future releases. + +*Modes* +: The provider does not require the use of any mode bits. + +*Progress* +: LNX utilizes the peer infrastructure to provide a shared completion + queue. Each linked provider still needs to handle its own progress. + Completion events will however be placed on the shared completion queue, + which is passed to the application for access. + +*Address Format* +: LNX wraps the linked providers addresses in one common binary blob. + It does not alter or change the linked providers address format. It wraps + them into a LNX structure which is then flattened and returned to the + application. This is passed between different nodes. The LNX provider + is able to parse the flattened format and operate on the different links. + This assumes that nodes in the same group are all using the same version of + the provider with the exact same links. IE: you can't have one node linking + SHM+CXI while another linking SHM+RXM. + +*Message Operations* +: LNX is designed to intercept message operations such as fi_tsenddata + and based on specific criteria forward the operation to the appropriate + provider. For the first release, LNX will only support linking SHM + provider for intra-node traffic and another provider (ex: CXI) for inter + node traffic. LNX send operation looks at the destination and based on + whether the destination is local or remote it will select the provider to + forward the operation to. The receive case has been described earlier. + +*Using the Provider* +: In order to use the provider the user needs to set FI_LNX_PROV_LINKS + environment variable to the linked providers in the following format + shm+. This will allow LNX to report back to the application in the + fi_getinfo() call the different links which can be selected. Since there are + multiple domains per provider LNX reports a permutation of all the + possible links. For example if there are two CXI interfaces on the machine + LNX will report back shm+cxi0 and shm+cxi1. The application can then + select based on its own criteria the link it wishes to use. + The application typically uses the PCI information in the fi_info + structure to select the interface to use. 
A common selection criteria is + the interface nearest the core the process is bound to. In order to make + this determination, the application requires the PCI information about the + interface. For this reason LNX forwards the PCI information for the + inter-node provider in the link to the application. + +# LIMITATIONS AND FUTURE WORK + +*Hardware Support* +: LNX doesn't support hardware offload; ex hardware tag matching. This is + an inherit limitation when using the peer infrastructure. Due to the use + of a shared receive queue which linked providers need to query when + a message is received, any hardware offload which requires sending the + receive buffers to the hardware directly will not work with the shared + receive queue. The shared receive queue provides two advantages; 1) reduce + memory usage, 2) coordinate the receive operations. For #2 this is needed + when receiving from FI_ADDR_UNSPEC. In this case both providers which are + part of the link can race to gain access to the receive buffer. It is + a future effort to determine a way to use hardware tag matching and other + hardware offload capability with LNX + +*Limited Linking* +: This release of the provider supports linking SHM provider for intra-node + operations and another provider which supports the FI_PEER capability for + inter-node operations. It is a future effort to expand to link any + multiple sets of providers. + +*Memory Registration* +: As part of the memory registration operation, varying hardware can perform + hardware specific steps such as memory pinning. Due to the fact that + memory registration APIs do not specify the source or destination + addresses it is not possible for LNX to determine which provider to + forward the memory registration to. LNX, therefore, registers the memory + with all linked providers. This might not be efficient and might have + unforeseen side effects. A better method is needed to support memory + registration. + +*Operation Types* +: This release of LNX supports tagged and RMA operations only. Future + releases will expand the support to other operation types. + +*Multi-Rail* +: Future design effort is being planned to support utilizing multiple interfaces + for traffic simultaneously. This can be over homogeneous interfaces or over + heterogeneous interfaces. + +# RUNTIME PARAMETERS + +The *LNX* provider checks for the following environment variables: + +*FI_LNX_PROV_LINKS* +: This environment variable is used to specify which providers to link. This + must be set in order for the LNX provider to return a list of fi_info + blocks in the fi_getinfo() call. The format which must be used is: + ++... As mentioned earlier currently LNX supports linking + only two providers the first of which is SHM followed by one other + provider for inter-node operations + +*FI_LNX_DISABLE_SHM* +: By default this environment variable is set to 0. However, the user can + set it to one and then the SHM provider will not be used. This can be + useful for debugging and performance analysis. The SHM provider will + naturally be used for all intra-node operations. Therefore, to test SHM in + isolation with LNX, the processes can be limited to the same node only. + +*FI_LNX_SRQ_SUPPORT* +: Shared Receive Queues are integral part of the peer infrastructure, but + they have the limitation of not using hardware offload, such as tag + matching. SRQ is needed to support the FI_ADDR_UNSPEC case. 
If the application + is sure this will never be the case, then it can turn off SRQ support by + setting this environment variable to 0. It is 1 by default. + +# SEE ALSO + +[`fabric`(7)](fabric.7.html), +[`fi_provider`(7)](fi_provider.7.html), +[`fi_getinfo`(3)](fi_getinfo.3.html) diff --git a/man/man7/fi_lnx.7 b/man/man7/fi_lnx.7 new file mode 100644 index 00000000000..0992463bf69 --- /dev/null +++ b/man/man7/fi_lnx.7 @@ -0,0 +1,173 @@ +.\" Automatically generated by Pandoc 2.9.2.1 +.\" +.TH "fi_lnx" "7" "" "" "" +.hy +.PP +{% include JB/setup %} +.SH NAME +.PP +fi_lnx - The LINKx Provider +.SH OVERVIEW +.PP +The LINKx provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. +This provider uses the libfabric peer infrastructure to aid in the use +of the underlying providers. +This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. +Future releases of the provider will allow linking any number of +providers and provide the users with the ability to influence the way +the providers are utilized for traffic load. +.SH SUPPORTED FEATURES +.PP +This release contains an initial implementation of the LINKx provider +that offers the following support: +.TP +\f[I]Endpoint types\f[R] +The provider supports only endpoint type \f[I]FI_EP_RDM\f[R]. +.TP +\f[I]Endpoint capabilities\f[R] +LINKx is a passthrough layer on the send path. +On the receive path LINKx utilizes the peer infrastructure to create +shared receive queues (SRQ). +Receive requests are placed on the SRQ instead of on the core provider +receive queue. +When the provider receives a message it queries the SRQ for a match. +If one is found the receive request is completed, otherwise the message +is placed on the LINKx shared unexpected queue (SUQ). +Further receive requests query the SUQ for matches. +The first release of the provider only supports tagged and RMA +operations. +Other message types will be supported in future releases. +.TP +\f[I]Modes\f[R] +The provider does not require the use of any mode bits. +.TP +\f[I]Progress\f[R] +LINKx utilizes the peer infrastructure to provide a shared completion +queue. +Each linked provider still needs to handle its own progress. +Completion events will however be placed on the shared completion queue, +which is passed to the application for access. +.TP +\f[I]Address Format\f[R] +LINKx wraps the linked providers addresses in one common binary blob. +It does not alter or change the linked providers address format. +It wraps them into a LINKx structure which is then flattened and +returned to the application. +This is passed between different nodes. +The LINKx provider is able to parse the flattened format and operate on +the different links. +This assumes that nodes in the same group are all using the same version +of the provider with the exact same links. +IE: you can\[cq]t have one node linking SHM+CXI while another linking +SHM+RXM. +.TP +\f[I]Message Operations\f[R] +LINKx is designed to intercept message operations such as fi_tsenddata +and based on specific criteria forward the operation to the appropriate +provider. +For the first release, LINKx will only support linking SHM provider for +intra-node traffic and another provider (ex: CXI) for inter node +traffic. 
+LINKx send operation looks at the destination and based on whether the +destination is local or remote it will select the provider to forward +the operation to. +The receive case has been described earlier. +.TP +\f[I]Using the Provider\f[R] +In order to use the provider the user needs to set FI_LINKX_PROV_LINKS +environment variable to the linked providers in the following format +shm+. +This will allow LINKx to report back to the application in the +fi_getinfo() call the different links which can be selected. +Since there are multiple domains per provider LINKx reports a +permutation of all the possible links. +For example if there are two CXI interfaces on the machine LINKx will +report back shm+cxi0 and shm+cxi1. +The application can then select based on its own criteria the link it +wishes to use. +The application typically uses the PCI information in the fi_info +structure to select the interface to use. +A common selection criteria is the interface nearest the core the +process is bound to. +In order to make this determination, the application requires the PCI +information about the interface. +For this reason LINKx forwards the PCI information for the inter-node +provider in the link to the application. +.SH LIMITATIONS AND FUTURE WORK +.TP +\f[I]Hardware Support\f[R] +LINKx doesn\[cq]t support hardware offload; ex hardware tag matching. +This is an inherit limitation when using the peer infrastructure. +Due to the use of a shared receive queue which linked providers need to +query when a message is received, any hardware offload which requires +sending the receive buffers to the hardware directly will not work with +the shared receive queue. +The shared receive queue provides two advantages; 1) reduce memory +usage, 2) coordinate the receive operations. +For #2 this is needed when receiving from FI_ADDR_UNSPEC. +In this case both providers which are part of the link can race to gain +access to the receive buffer. +It is a future effort to determine a way to use hardware tag matching +and other hardware offload capability with LINKx +.TP +\f[I]Limited Linking\f[R] +This release of the provider supports linking SHM provider for +intra-node operations and another provider which supports the FI_PEER +capability for inter-node operations. +It is a future effort to expand to link any multiple sets of providers. +.TP +\f[I]Memory Registration\f[R] +As part of the memory registration operation, varying hardware can +perform hardware specific steps such as memory pinning. +Due to the fact that memory registration APIs do not specify the source +or destination addresses it is not possible for LINKx to determine which +provider to forward the memory registration to. +LINkx, therefore, registers the memory with all linked providers. +This might not be efficient and might have unforeseen side effects. +A better method is needed to support memory registration. +.TP +\f[I]Operation Types\f[R] +This release of LINKx supports tagged and RMA operations only. +Future releases will expand the support to other operation types. +.TP +\f[I]Multi-Rail\f[R] +Future design effort is being planned to support utilizing multiple +interfaces for traffic simultaneously. +This can be over homogeneous interfaces or over heterogeneous +interfaces. +.SH RUNTIME PARAMETERS +.PP +The \f[I]LINKx\f[R] provider checks for the following environment +variables: +.TP +\f[I]FI_LINKX_PROV_LINKS\f[R] +This environment variable is used to specify which providers to link. 
+This must be set in order for the LINKx provider to return a list of +fi_info blocks in the fi_getinfo() call. +The format which must be used is: ++\&... As mentioned earlier currently +LINKx supports linking only two providers the first of which is SHM +followed by one other provider for inter-node operations +.TP +\f[I]FI_LINKX_DISABLE_SHM\f[R] +By default this environment variable is set to 0. +However, the user can set it to one and then the SHM provider will not +be used. +This can be useful for debugging and performance analysis. +The SHM provider will naturally be used for all intra-node operations. +Therefore, to test SHM in isolation with LINKx, the processes can be +limited to the same node only. +.TP +\f[I]FI_LINKX_SRQ_SUPPORT\f[R] +Shared Receive Queues are integral part of the peer infrastructure, but +they have the limitation of not using hardware offload, such as tag +matching. +SRQ is needed to support the FI_ADDR_UNSPEC case. +If the application is sure this will never be the case, then it can turn +off SRQ support by setting this environment variable to 0. +It is 1 by default. +.SH SEE ALSO +.PP +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) diff --git a/prov/cxi/configure.m4 b/prov/cxi/configure.m4 index b8b53d9fdb3..22aab260a38 100644 --- a/prov/cxi/configure.m4 +++ b/prov/cxi/configure.m4 @@ -1,6 +1,6 @@ -dnl SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +dnl SPDX-License-Identifier: GPL-2.0 dnl -dnl Copyright 2018 Hewlett Packard Enterprise Development LP +dnl Copyright 2018 Cray Inc. All rights reserved. dnl CXI provider specific configuration @@ -12,142 +12,42 @@ dnl $1: action if configured successfully dnl $2: action if not configured successfully dnl +m4_include([config/fi_pkg.m4]) + AM_CONDITIONAL([HAVE_PMI], [test "x$have_pmi" = "xtrue"]) AM_CONDITIONAL([HAVE_ZE], [test "$have_ze" = "1" && test "$with_ze" != ""]) AM_CONDITIONAL([HAVE_CUDA], [test "$have_cuda" = "1" && test "$with_cuda" != ""]) AM_CONDITIONAL([HAVE_ROCR], [test "$have_rocr" = "1" && test "$with_rocr" != ""]) - AC_DEFUN([FI_CXI_CONFIGURE],[ - - cxi_happy=1 - - # Support non-standard install path for cassini headers. This is needed - # by libcxi. - AC_ARG_WITH([cassini-headers], - [AS_HELP_STRING([--with-cassin-headers=DIR], [Install directory for Cassini headers])], - [CPPFLAGS="-I$with_cassini_headers/include $CPPFLAGS"]) - - # Support non-standard install path for cxi kernel UAPI headers. This is - # needed by libcxi. - AC_ARG_WITH([cxi-uapi-headers], - [AS_HELP_STRING([--with-cxi-uapi-headers=DIR], [Install directory for kernel CXI UAPI headers])], - [CPPFLAGS="-I$with_cxi_uapi_headers/include $CPPFLAGS"]) - - # Support non-standard install path for curl. This is needed by CXI provider. - AC_ARG_WITH([curl], - [AS_HELP_STRING([--with-curl=DIR], [Install directory for curl])]) - - # Support non-standard install path for json-c. This is needed by CXI provider. 
- AC_ARG_WITH([json-c], - [AS_HELP_STRING([--with-json-c=DIR], [Install directory for json-c])]) + # Determine if we can support the cxi provider + cxi_happy=0 AS_IF([test x"$enable_cxi" != x"no"], - [ - AC_CHECK_HEADER(cxi_prov_hw.h, - [], - [cxi_happy=0]) - - AC_CHECK_HEADER(uapi/misc/cxi.h, - [], - [cxi_happy=0]) - - FI_CHECK_PACKAGE([libcxi], - [libcxi/libcxi.h], - [cxi], - [cxil_open_device], - [], - [$cxi_PREFIX], - [$cxi_LIBDIR], - [], - [cxi_happy=0]) - - cxi_CPPFLAGS=$libcxi_CPPFLAGS - cxi_LDFLAGS=$libcxi_LDFLAGS - cxi_LIBS=$libcxi_LIBS - - if test "$with_cassini_headers" != "" && test "$with_cassini_headers" != "no"; then - cxi_CPPFLAGS="$cxi_CPPFLAGS -I$with_cassini_headers/include" - fi + [FI_PKG_CHECK_MODULES([CXI], [libcxi], + [cxi_CPPFLAGS=$CXI_CFLAGS + cxi_LDFLAGS=$CXI_LIBS + cxi_happy=1], + [cxi_happy=0])]) + + AS_IF([test "$with_criterion" != ""], + [cxitest_CPPFLAGS="-I$with_criterion/include" + cxitest_LDFLAGS="-L$with_criterion/lib64 -Wl,-rpath=$(realpath $with_criterion/lib64)" + cxitest_LIBS="-lcriterion" + have_criterion=true]) - if test "$with_cxi_uapi_headers" != "" && test "$with_cxi_uapi_headers" != "no"; then - cxi_CPPFLAGS="$cxi_CPPFLAGS -I$with_cxi_uapi_headers/include" - fi - - # Add on curl if installed in non-default location. - if test "$with_curl" != "" && test "$with_curl" != "no"; then - FI_CHECK_PREFIX_DIR([$with_curl], [curl]) - else - curl_PREFIX="" - curl_LIBDIR="" - fi - - FI_CHECK_PACKAGE([libcurl], - [curl/curl.h], - [curl], - [curl_global_init], - [], - [$curl_PREFIX], - [$curl_LIBDIR], - [], - [cxi_happy=0]) - - cxi_CPPFLAGS="$cxi_CPPFLAGS $libcurl_CPPFLAGS" - cxi_LDFLAGS="$cxi_LDFLAGS $libcurl_LDFLAGS" - cxi_LIBS="$cxi_LIBS $libcurl_LIBS" - - # Add on json if installed in non-default location. - if test "$with_json" != "" && test "$with_json" != "no"; then - FI_CHECK_PREFIX_DIR([$with_json], [json]) - else - json_PREFIX="" - json_LIBDIR="" - fi - - FI_CHECK_PACKAGE([libjson], - [json-c/json.h], - [json-c], - [json_object_get_type], - [], - [$json_PREFIX], - [$json_LIBDIR], - [], - [cxi_happy=0]) - - cxi_CPPFLAGS="$cxi_CPPFLAGS $libjson_CPPFLAGS" - cxi_LDFLAGS="$cxi_LDFLAGS $libjson_LDFLAGS" - cxi_LIBS="$cxi_LIBS $libjson_LIBS" - - # Need to explicitly link to libmath - cxi_LIBS="$cxi_LIBS -lm" - - AC_SUBST(cxi_CPPFLAGS) - AC_SUBST(cxi_LDFLAGS) - AC_SUBST(cxi_LIBS) + AM_CONDITIONAL([HAVE_CRITERION], [test "x$have_criterion" = "xtrue"]) - # Checks to enable cxitest - AS_IF([test "$with_criterion" != ""], - [cxitest_CPPFLAGS="-I$with_criterion/include" - cxitest_LDFLAGS="-L$with_criterion/lib64 -Wl,-rpath=$(realpath $with_criterion/lib64)" - cxitest_LIBS="-lcriterion" - have_criterion=true]) + AS_IF([test "$with_pmi" != ""], + [have_pmi=true]) - AS_IF([test "$have_ze" = "1" && test "$with_ze" != "" && test x"$with_ze" != x"yes"], - [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_ze/include" - cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_ze/lib64"]) - AS_IF([test "$have_cuda" = "1" && test "$with_cuda" != "" && test x"$with_cuda" != x"yes"], - [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_cuda/include" - cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_cuda/lib64"]) - AS_IF([test "$have_rocr" = "1" && test "$with_rocr" != "" && test x"$with_rocr" != x"yes"], - [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_rocr/include" - cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_rocr/lib"]) + AM_CONDITIONAL([HAVE_PMI], [test "x$have_pmi" = "xtrue"]) - AC_SUBST(cxitest_CPPFLAGS) - AC_SUBST(cxitest_LDFLAGS) - AC_SUBST(cxitest_LIBS) - ], - [cxi_happy=0]) + AC_SUBST(cxi_CPPFLAGS) + 
AC_SUBST(cxi_LDFLAGS) + AC_SUBST(cxitest_CPPFLAGS) + AC_SUBST(cxitest_LDFLAGS) + AC_SUBST(cxitest_LIBS) - AM_CONDITIONAL([HAVE_CRITERION], [test "x$have_criterion" = "xtrue"]) AS_IF([test $cxi_happy -eq 1], [$1], [$2]) ]) diff --git a/prov/lnx/Makefile.include b/prov/lnx/Makefile.include new file mode 100644 index 00000000000..f2800cb34d3 --- /dev/null +++ b/prov/lnx/Makefile.include @@ -0,0 +1,61 @@ +# +# Copyright (c) 2022 ORNL. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + + +if HAVE_LNX +_lnx_files = \ + prov/lnx/src/lnx_cq.c \ + prov/lnx/src/lnx_domain.c \ + prov/lnx/src/lnx_ep.c \ + prov/lnx/src/lnx_init.c \ + prov/lnx/src/lnx_ops.c \ + prov/lnx/src/lnx_peers.c + +_lnx_headers = \ + prov/lnx/include/lnx.h + +if HAVE_LNX_DL +pkglib_LTLIBRARIES += liblnx-fi.la +liblnx_fi_la_SOURCES = $(_lnx_files) $(_lnx_headers) +liblnx_fi_la_LIBADD = $(linkback) $(lnx_LIBS) +liblnx_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic +liblnx_fi_la_DEPENDENCIES = $(linkback) +else +src_libfabric_la_SOURCES += $(_lnx_files) $(_lnx_headers) +src_libfabric_la_CPPFLAGS += -I$(top_srcdir)/prov/lnx/include +endif + +prov_install_man_pages += man/man7/fi_lnx.7 + +endif HAVE_LNX + +prov_dist_man_pages += man/man7/fi_lnx.7 diff --git a/prov/lnx/configure.m4 b/prov/lnx/configure.m4 new file mode 100644 index 00000000000..737b62bc46d --- /dev/null +++ b/prov/lnx/configure.m4 @@ -0,0 +1,15 @@ +dnl Configury specific to the libfabric lnx provider + +dnl Called to configure this provider +dnl +dnl Arguments: +dnl +dnl $1: action if configured successfully +dnl $2: action if not configured successfully +dnl +AC_DEFUN([FI_LNX_CONFIGURE],[ + # Determine if we can support the lnx provider + lnx_happy=0 + AS_IF([test x"$enable_lnx" != x"no"], [lnx_happy=1]) + AS_IF([test $lnx_happy -eq 1], [$1], [$2]) +]) diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h new file mode 100644 index 00000000000..341f5619876 --- /dev/null +++ b/prov/lnx/include/lnx.h @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LNX_H +#define LNX_H + +#define LNX_DEF_AV_SIZE 1024 +#define LNX_MAX_LOCAL_EPS 16 +#define LNX_IOV_LIMIT 5 + +#define lnx_ep_rx_flags(lnx_ep) ((lnx_ep)->le_ep.rx_op_flags) + +struct local_prov_ep; + +struct lnx_match_attr { + fi_addr_t lm_addr; + uint64_t lm_tag; + uint64_t lm_ignore; + struct lnx_peer *lm_peer; + struct local_prov_ep *lm_cep; +}; + +/* + * . Each endpoint LNX manages will have an instance of this structure. + * . The structure has a pointer to the general shared cq. + * . All events are written into that CQ. + * . The structure has a pointer to the core cq which the core provider + * returns + * . The structure has an instance of the peer_cq which is unique for + * handling communication with the constituent endpoint. + */ +struct lnx_peer_cq { + struct lnx_cq *lpc_shared_cq; + struct fid_peer_cq lpc_cq; + struct fid_cq *lpc_core_cq; +}; + +struct lnx_queue { + struct dlist_entry lq_queue; + dlist_func_t *lq_match_func; + ofi_spin_t lq_qlock; +}; + +struct lnx_qpair { + struct lnx_queue lqp_recvq; + struct lnx_queue lqp_unexq; +}; + +struct lnx_peer_srq { + struct lnx_qpair lps_trecv; + struct lnx_qpair lps_recv; +}; + +struct local_prov_ep { + struct dlist_entry entry; + bool lpe_local; + char lpe_fabric_name[FI_NAME_MAX]; + struct fid_fabric *lpe_fabric; + struct fid_domain *lpe_domain; + struct fid_ep *lpe_ep; + struct fid_ep **lpe_txc; + struct fid_ep **lpe_rxc; + struct fid_av *lpe_av; + struct lnx_peer_cq lpe_cq; + struct fi_info *lpe_fi_info; + struct fid_peer_srx lpe_srx; + struct ofi_bufpool *lpe_recv_bp; + ofi_spin_t lpe_bplock; + struct local_prov *lpe_parent; +}; + +/* The lnx_rx_entry is used for two things: + * 1. posting receive requests on the SRQ + * - In this case the peer can be known if the source address is + * specified in the receive API + * - or the peer is not known if the source address is FI_ADDR_UNSPEC. + * 2. posting messages received on the SUQ if they do not match an + * existing RR on the SRQ. + * - there are two cases to consider: + * 1. The core provider did a reverse AV lookup on the message received + * to find the fi_addr_t of the peer sending the message + * 2. 
The core provider registered an address matching function to + * be used by LNX to match the RR with the message + * + * Case 1: RR is posted before message is received + * - Application calls a LNX receive API + * - LNX creates a new rx_entry + * - caches the data passed in the API in the rx_entry + * - The addr is the LNX level fi_addr_t. It refers to a peer + * - lookup the peer if the addr != FI_ADDR_UNSPEC and cache that in the + * rx_entry + * - if addr == FI_ADDR_UNSPEC then set rx_entry peer to NULL + * - post the rx_entry on the SRQ + * - When a message is received by a core provider, it calls the + * get_tag() or get_msg() callbacks. + * - The CEP is known and from there we know the Local Provider it + * belongs to. + * - The SRQ is traversed to see if there is an RR that matches this + * message. + * - if an RR has no peer and the tag matches, then the message matches + * this RR and we return it to the core provider to complete message + * receive. + * - if an RR has a peer and the tag matches, then we need to also match + * the address. Since a lnx peer can abstract multiple different + * types of peer providers, we need to rely on the CEP to find the peer + * provider matching our local peer provider, then we need to go over + * all the fi_addr_t in that peer provider to find if it matches the + * source address of the message. This can be done in two ways: + * 1. If the core provider has done a reverse lookup and gave us the + * fi_addr_t of the peer which sent the message, then we can directly + * do the matching at the lnx level. + * 2. if the core provider didn't do a reverse lookup and instead + * provided us with a address matching callback, then we call that + * with every fi_addr_t for that peer, and the matching + * information given to us by the core provider and let the provider + * do the matching. + * - If the message matches, then return it to the core provider to + * complete the message receive. + * + * Case 2: message is received before RR is posted + * - Core provider calls into LNX with get_tag() or get_msg() + * callbacks + * - LNX will traverse the SRQ as in Case 1, but no match will be + * found + * - LNX will create an rx_entry and store the information passed on by + * the core provider + * - addr if core provider did a reverse lookup + * - tag if it's a tagged message + * - CEP is known and therefore will be cached + * - At a later time the application will issue an RR + * - if the application provided an address then a peer is looked up and + * cached. Otherwise the lnx peer will be set to NULL + * - LNX will traverse the SUQ + * - for each rx_entry on the SUQ, if the RR has no address and the tag + * matches in case of a tagged message, then return that rx_entry and + * tell the core provider to complete the message. + * - if the RR has an lnx peer and the tag matches, then we need to also + * match the address. Since a lnx peer can abstract multiple different + * types of peer providers, we need to rely on the CEP cached in the + * rx_entry to find the peer provider matching our local peer provider + * which received the message. Then we need to go over all the fi_addr_t + * in the peer provider to find if it matches the source address of the + * message. This can be done in two ways: + * 1. If the core provider has done a reverse lookup and gave us the + * fi_addr_t of the peer which sent the message, then we can directly + * do the matching at the lnx level. + * 2. 
if the core provider didn't do a reverse lookup and instead + * provided us with an address matching callback, then we call that + * with every fi_addr_t for that peer, and the matching + * information given to us by the core provider and let the provider + * do the matching. + * - If the message matches, then return it to the core provider to + * complete the message receive. + */ +struct lnx_rx_entry { + /* the entry which will be passed to the core provider */ + struct fi_peer_rx_entry rx_entry; + /* iovec to use to point to receive buffers */ + struct iovec rx_iov[LNX_IOV_LIMIT]; + /* desc array to be used to point to the descs passed by the user */ + void *rx_desc[LNX_IOV_LIMIT]; + /* peer we expect messages from. + * This is available if the receive request provided a source address. + * Otherwise it will be NULL + */ + struct lnx_peer *rx_peer; + /* local prov endpoint receiving the message if this entry is + * added to the SUQ + */ + struct local_prov_ep *rx_cep; + /* match information which will be given to us by the core provider */ + struct fi_peer_match_attr rx_match_info; + /* which pool this rx_entry came from. It's either from the global + * pool or some core provider pool + */ + bool rx_global; +}; + +OFI_DECLARE_FREESTACK(struct lnx_rx_entry, lnx_recv_fs); + +struct local_prov { + struct dlist_entry lpv_entry; + char lpv_prov_name[FI_NAME_MAX]; + int lpv_ep_count; + struct dlist_entry lpv_prov_eps; +}; + +struct lnx_address_prov { + char lap_prov[FI_NAME_MAX]; + /* an array of addresses of size count. */ + /* entry 0 is shm if available */ + /* array can't be larger than LNX_MAX_LOCAL_EPS */ + int lap_addr_count; + /* size as specified by the provider */ + int lap_addr_size; + /* payload */ + char lap_addrs[]; +}; + +struct lnx_addresses { + /* used to determine if the address is node local or node remote */ + char la_hostname[FI_NAME_MAX]; + /* number of providers <= LNX_MAX_LOCAL_EPS */ + int la_prov_count; + struct lnx_address_prov la_addr_prov[]; +}; + +struct lnx_local2peer_map { + struct dlist_entry entry; + struct local_prov_ep *local_ep; + int addr_count; + fi_addr_t peer_addrs[LNX_MAX_LOCAL_EPS]; +}; + +struct lnx_peer_prov { + struct dlist_entry entry; + + /* provider name */ + char lpp_prov_name[FI_NAME_MAX]; + + uint64_t lpp_flags; + + /* pointer to the local endpoint information to be used for + * communication with this peer. + * + * If the peer is on-node, then lp_endpoints[0] = shm + * + * if peer is off-node, then there could be up to LNX_MAX_LOCAL_EPS + * local endpoints we can use to reach that peer. + */ + struct local_prov *lpp_prov; + + /* each peer can be reached from any of the local provider endpoints + * on any of the addresses which are given to us. It's an N:N + * relationship + */ + struct dlist_entry lpp_map; +}; + +struct lnx_peer { + /* true if peer can be reached over shared memory, false otherwise */ + bool lp_local; + + /* Each provider that we can reach the peer on will have an entry + * below. Each entry will contain all the local provider endpoints we + * can reach the peer through, as well as all the peer addresses on that + * provider. + * + * We can potentially multi-rail between the interfaces on the same + * provider, both local and remote. + * + * Or we can multi-rail across different providers. Although this + * might be more complicated due to the differences in provider + * capabilities. 
+ */ + struct lnx_peer_prov *lp_shm_prov; + struct dlist_entry lp_provs; +}; + +struct lnx_peer_table { + struct util_av lpt_av; + int lpt_max_count; + int lpt_count; + struct lnx_domain *lpt_domain; + /* an array of peer entries */ + struct lnx_peer **lpt_entries; +}; + +struct lnx_ctx { + struct dlist_entry ctx_head; + int ctx_idx; + struct lnx_ep *ctx_parent; + struct fid_ep ctx_ep; +}; + +struct lnx_ep { + struct util_ep le_ep; + struct dlist_entry le_tx_ctx; + struct dlist_entry le_rx_ctx; + struct lnx_domain *le_domain; + size_t le_fclass; + struct lnx_peer_table *le_peer_tbl; + struct lnx_peer_srq le_srq; +}; + +struct lnx_srx_context { + struct lnx_ep *srx_lep; + struct local_prov_ep *srx_cep; +}; + +struct lnx_mem_desc_prov { + struct local_prov *prov; + struct fid_mr *core_mr; +}; + +struct lnx_mem_desc { + struct lnx_mem_desc_prov desc[LNX_MAX_LOCAL_EPS]; + int desc_count; +}; + +struct lnx_mr { + struct ofi_mr mr; + struct lnx_mem_desc desc; +}; + +struct lnx_domain { + struct util_domain ld_domain; + struct lnx_fabric *ld_fabric; + bool ld_srx_supported; + struct ofi_mr_cache ld_mr_cache; +}; + +struct lnx_cq { + struct util_cq util_cq; + struct lnx_domain *lnx_domain; +}; + +struct lnx_fabric { + struct util_fabric util_fabric; + /* providers linked by this fabric */ + struct dlist_entry local_prov_table; + /* memory registration buffer pool */ + struct ofi_bufpool *mem_reg_bp; + /* shared memory provider used in this link */ + struct local_prov *shm_prov; + /* peers associated with this link */ + struct lnx_peer_table *lnx_peer_tbl; +}; + +extern struct util_prov lnx_util_prov; +extern struct fi_provider lnx_prov; +extern struct ofi_bufpool *global_recv_bp; +extern ofi_spin_t global_bplock; + +struct fi_info *lnx_get_link_by_dom(char *domain_name); + +int lnx_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info); + +int lnx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context); +int lnx_setup_core_fabrics(char *name, struct lnx_fabric *lnx_fab, + void *context); + +void lnx_fini(void); + +int lnx_fabric_close(struct fid *fid); + +int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, void *context); + +int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context); + +int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context); + +int lnx_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); + +int lnx_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); + +int lnx_cq2ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags); + +int lnx_get_msg(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry); +int lnx_get_tag(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry); +int lnx_queue_msg(struct fi_peer_rx_entry *entry); +int lnx_queue_tag(struct fi_peer_rx_entry *entry); +void lnx_free_entry(struct fi_peer_rx_entry *entry); +void lnx_foreach_unspec_addr(struct fid_peer_srx *srx, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)); + +static inline struct lnx_peer * +lnx_get_peer(struct lnx_peer **peers, fi_addr_t addr) +{ + if (!peers || addr == FI_ADDR_UNSPEC) + return NULL; + + return peers[addr]; +} + +static inline +void lnx_get_core_desc(struct lnx_mem_desc *desc, void 
**mem_desc) +{ + if (desc && desc->desc[0].core_mr) { + if (mem_desc) + *mem_desc = desc->desc[0].core_mr->mem_desc; + return; + } + + *mem_desc = NULL; +} + +static inline +int lnx_create_mr(const struct iovec *iov, fi_addr_t addr, + struct lnx_domain *lnx_dom, struct fid_mr **fid_mr) +{ + struct ofi_mr *mr; + struct ofi_mr_entry *mre; + struct fi_mr_attr attr = {}; + struct fi_mr_attr cur_abi_attr; + struct ofi_mr_info info = {}; + uint64_t flags; + int rc; + + attr.addr = addr; + attr.iov_count = 1; + attr.mr_iov = iov; + mre = ofi_mr_cache_find(&lnx_dom->ld_mr_cache, &attr, 0); + if (mre) { + mr = (struct ofi_mr *)mre->data; + goto out; + } + + attr.iface = ofi_get_hmem_iface(iov->iov_base, + &attr.device.reserved, &flags); + info.iov = *iov; + info.iface = attr.iface; + rc = ofi_hmem_dev_register(attr.iface, iov->iov_base, iov->iov_len, + (uint64_t *) &attr.hmem_data); + if (rc) + return rc; + + rc = ofi_mr_cache_search(&lnx_dom->ld_mr_cache, &info, &mre); + if (rc) + return rc; + + mr = (struct ofi_mr *)mre->data; + ofi_mr_update_attr(lnx_dom->ld_domain.fabric->fabric_fid.api_version, + lnx_dom->ld_domain.info_domain_caps, &attr, &cur_abi_attr, 0); + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr.context; + mr->domain = &lnx_dom->ld_domain; + mr->flags = flags; + mr->iface = cur_abi_attr.iface; + mr->device = cur_abi_attr.device.reserved; + mr->hmem_data = cur_abi_attr.hmem_data; + mr->mr_fid.mem_desc = (void*) mr; + +out: + *fid_mr = &mr->mr_fid; + ofi_mr_cache_delete(&lnx_dom->ld_mr_cache, mre); + + return FI_SUCCESS; +} + +static inline +int lnx_select_send_pathway(struct lnx_peer *lp, struct lnx_domain *lnx_dom, + struct lnx_mem_desc *desc, struct local_prov_ep **cep, + fi_addr_t *addr, const struct iovec *iov, size_t iov_count, + void **mem_desc, uint64_t *rkey) +{ + int idx = 0; + int rc; + struct lnx_peer_prov *prov; + struct lnx_local2peer_map *lpm; + //struct fi_mr_attr core_attr; + //uint64_t flags; + struct fid_mr *mr = NULL; + + if (lp->lp_local) { + prov = lp->lp_shm_prov; + } else { + prov = dlist_first_entry_or_null( + &lp->lp_provs, struct lnx_peer_prov, entry); + idx = 1; + } + + /* TODO when we support multi-rail we can have multiple maps */ + lpm = dlist_first_entry_or_null(&prov->lpp_map, + struct lnx_local2peer_map, entry); + *addr = lpm->peer_addrs[0]; + + /* TODO this will need to be expanded to handle Multi-Rail. For now + * the assumption is that local peers can be reached on shm and remote + * peers have only one interface, hence indexing on 0 and 1 + * + * If we did memory registration, then we've already figured out the + * pathway + */ + if (desc && desc->desc[idx].core_mr) { + *cep = dlist_first_entry_or_null( + &desc->desc[idx].prov->lpv_prov_eps, + struct local_prov_ep, entry); + if (mem_desc) + *mem_desc = fi_mr_desc(desc->desc[idx].core_mr); + if (rkey) + *rkey = fi_mr_key(desc->desc[idx].core_mr); + return 0; + } + + *cep = lpm->local_ep; + if (mem_desc) + *mem_desc = NULL; + + if (!lp->lp_local || !mem_desc || (mem_desc && *mem_desc) || !iov || (iov && iov->iov_base == NULL)) + return 0; + + /* TODO: Look up the address in the cache: + * - if it's found then use the cached fid_mr + * - This will include the iface, which is really all we need + * - if it's not then lookup the iface, create the fid_mr and + * cache it. 
+ */ + rc = lnx_create_mr(iov, *addr, lnx_dom, &mr); + if (!rc && mr) + *mem_desc = mr->mem_desc; + /* SHM provider relies on the user to register the memory attribute in + * order for it to determine the type of the memory, host vs device. + * LNX will do that here if the application hasn't done it already + memset(&core_attr, 0, sizeof(core_attr)); + core_attr.iface = ofi_get_hmem_iface(iov->iov_base, + &core_attr.device.reserved, &flags); + if (core_attr.iface == FI_HMEM_SYSTEM) + return 0; + core_attr.addr = *addr; + core_attr.mr_iov = iov; + core_attr.iov_count = iov_count; + rc = fi_mr_regattr((*cep)->lpe_domain, &core_attr, flags, &mr); + if (!rc && mr) + *mem_desc = mr->mem_desc; + */ + + return rc; +} + +static inline +int lnx_select_recv_pathway(struct lnx_peer *lp, struct lnx_domain *lnx_dom, + struct lnx_mem_desc *desc, struct local_prov_ep **cep, + fi_addr_t *addr, const struct iovec *iov, size_t iov_count, + void **mem_desc) +{ + /* if the src address is FI_ADDR_UNSPEC, then we'll need to trigger + * all core providers to listen for a receive, since we don't know + * which one will endup getting the message. + * + * For each core provider we're tracking, trigger the recv operation + * on it. + * + * if the src address is specified then we just need to select and + * exact core endpoint to trigger the recv on. + */ + if (!lp) + return -FI_ENOSYS; + + return lnx_select_send_pathway(lp, lnx_dom, desc, cep, addr, iov, + iov_count, mem_desc, NULL); +} + +#endif /* LNX_H */ diff --git a/prov/lnx/src/lnx_cq.c b/prov/lnx/src/lnx_cq.c new file mode 100644 index 00000000000..397dabf9278 --- /dev/null +++ b/prov/lnx/src/lnx_cq.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +ssize_t lnx_peer_cq_write(struct fid_peer_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src) +{ + struct lnx_peer_cq *lnx_cq; + int rc; + + lnx_cq = container_of(cq, struct lnx_peer_cq, lpc_cq); + + rc = ofi_cq_write(&lnx_cq->lpc_shared_cq->util_cq, context, + flags, len, buf, data, tag); + + return rc; +} + +ssize_t lnx_peer_cq_writeerr(struct fid_peer_cq *cq, + const struct fi_cq_err_entry *err_entry) +{ + struct lnx_peer_cq *lnx_cq; + int rc; + + lnx_cq = container_of(cq, struct lnx_peer_cq, lpc_cq); + + rc = ofi_cq_write_error(&lnx_cq->lpc_shared_cq->util_cq, err_entry); + + return rc; +} + +static int lnx_cleanup_cqs(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_close(&ep->lpe_cq.lpc_core_cq->fid); + if (rc) + frc = rc; + ep->lpe_cq.lpc_core_cq = NULL; + } + + return frc; +} + +static int lnx_cq_close(struct fid *fid) +{ + int rc; + struct lnx_cq *lnx_cq; + struct local_prov *entry; + struct dlist_entry *prov_table; + + lnx_cq = container_of(fid, struct lnx_cq, util_cq.cq_fid); + prov_table = &lnx_cq->lnx_domain->ld_fabric->local_prov_table; + + /* close all the open core cqs */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_cqs(entry); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to close domain for %s\n", + entry->lpv_prov_name); + return rc; + } + } + + rc = ofi_cq_cleanup(&lnx_cq->util_cq); + if (rc) + return rc; + + free(lnx_cq); + return 0; +} + +struct fi_ops_cq_owner lnx_cq_write = { + .size = sizeof(lnx_cq_write), + .write = lnx_peer_cq_write, + .writeerr = lnx_peer_cq_writeerr, +}; + +static struct fi_ops lnx_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_cq_close, + .bind = fi_no_bind, + .control = ofi_cq_control, + .ops_open = fi_no_ops_open, +}; + +static void lnx_cq_progress(struct util_cq *cq) +{ + struct lnx_cq *lnx_cq; + struct local_prov_ep *ep; + struct local_prov *entry; + struct dlist_entry *prov_table; + + lnx_cq = container_of(cq, struct lnx_cq, util_cq); + prov_table = &lnx_cq->lnx_domain->ld_fabric->local_prov_table; +/* + entry = lnx_cq->lnx_domain->ld_fabric->shm_prov; + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + fi_cq_read(ep->lpe_cq.lpc_core_cq, NULL, 0); + + return; +*/ + /* Kick the core provider endpoints to progress */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + fi_cq_read(ep->lpe_cq.lpc_core_cq, NULL, 0); + } +} + +static int lnx_cq_open_core_prov(struct lnx_cq *cq, struct fi_cq_attr *attr) +{ + int rc; + struct local_prov_ep *ep; + struct local_prov *entry; + struct dlist_entry *prov_table = + &cq->lnx_domain->ld_fabric->local_prov_table; + + /* tell the core providers to import my CQ */ + attr->flags |= FI_PEER; + + /* create all the core provider endpoints */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + struct fid_cq *core_cq; + struct 
fi_peer_cq_context cq_ctxt; + + ep->lpe_cq.lpc_shared_cq = cq; + ep->lpe_cq.lpc_cq.owner_ops = &lnx_cq_write; + + cq_ctxt.size = sizeof(cq_ctxt); + cq_ctxt.cq = &ep->lpe_cq.lpc_cq; + + /* pass my CQ into the open and get back the core's cq */ + rc = fi_cq_open(ep->lpe_domain, attr, &core_cq, &cq_ctxt); + if (rc) + return rc; + + /* before the fi_cq_open() returns the core provider should + * have called fi_export_fid() and got a pointer to the peer + * CQ which we have allocated for this core provider + */ + + ep->lpe_cq.lpc_core_cq = core_cq; + } + } + + return 0; +} + +int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq_fid, void *context) +{ + struct lnx_cq *lnx_cq; + struct lnx_domain *lnx_dom; + int rc; + + lnx_cq = calloc(1, sizeof(*lnx_cq)); + if (!lnx_cq) + return -FI_ENOMEM; + + /* this is going to be a standard CQ from the read side. From the + * write side, it'll use the peer_cq callbacks to write + */ + rc = ofi_cq_init(&lnx_prov, domain, attr, &lnx_cq->util_cq, + &lnx_cq_progress, context); + if (rc) + goto free; + + lnx_dom = container_of(domain, struct lnx_domain, + ld_domain.domain_fid); + + lnx_cq->lnx_domain = lnx_dom; + lnx_cq->util_cq.cq_fid.fid.ops = &lnx_cq_fi_ops; + (*cq_fid) = &lnx_cq->util_cq.cq_fid; + + /* open core CQs and tell them to import my CQ */ + rc = lnx_cq_open_core_prov(lnx_cq, attr); + + return rc; + +free: + free(lnx_cq); + return rc; +} diff --git a/prov/lnx/src/lnx_domain.c b/prov/lnx/src/lnx_domain.c new file mode 100644 index 00000000000..8cb09d75eff --- /dev/null +++ b/prov/lnx/src/lnx_domain.c @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +static struct fi_ops_domain lnx_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .av_open = lnx_av_open, + .cq_open = lnx_cq_open, + .endpoint = lnx_endpoint, + .scalable_ep = lnx_scalable_ep, + .cntr_open = fi_no_cntr_open, + .poll_open = fi_no_poll_open, + .stx_ctx = fi_no_stx_context, + .srx_ctx = fi_no_srx_context, + .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, +}; + +static int lnx_cleanup_domains(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (!ep->lpe_domain) + continue; + rc = fi_close(&ep->lpe_domain->fid); + if (rc) + frc = rc; + } + + return frc; +} + +static int lnx_domain_close(fid_t fid) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_domain *domain; + + domain = container_of(fid, struct lnx_domain, ld_domain.domain_fid.fid); + + /* close all the open core domains */ + dlist_foreach_container(&domain->ld_fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_domains(entry); + if (rc) + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to close domain for %s\n", + entry->lpv_prov_name); + } + + ofi_mr_cache_cleanup(&domain->ld_mr_cache); + + rc = ofi_domain_close(&domain->ld_domain); + + free(domain); + + return rc; +} + +static int +lnx_mr_regattrs_all(struct local_prov *prov, const struct fi_mr_attr *attr, + uint64_t flags, struct lnx_mem_desc_prov *desc) +{ + int rc = 0; + struct local_prov_ep *ep; + + desc->prov = prov; + + /* TODO: This is another issue here because MR registration can happen + * quiet often + */ + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_mr_regattr(ep->lpe_domain, attr, + flags, &desc->core_mr); + + /* TODO: SHM provider returns FI_ENOKEY if requested_key is the + * same as the previous call. Application, like OMPI, might not + * specify the requested key in fi_mr_attr, so for now ignore that + * error. + * We need a better way of handling this. + * if (rc == -FI_ENOKEY) + * rc = 0; + * I made a change in SHM to support FI_MR_PROV_KEY if set by the + * application. 
This tells ofi to generate its own requested_key + * for each fi_mr_regattr call + */ + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s mr_regattr() failed: %d\n", + ep->lpe_fabric_name, rc); + return rc; + } + } + + return rc; +} + +static int +lnx_mr_close_all(struct lnx_mem_desc *mem_desc) +{ + int i, rc, frc = 0; + struct fid_mr *mr; + + for (i = 0; i < mem_desc->desc_count; i++) { + mr = mem_desc->desc[i].core_mr; + if (!mr) + continue; + rc = fi_close(&mr->fid); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s mr_close() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + + return frc; +} + +int lnx_mr_close(struct fid *fid) +{ + struct lnx_mr *lnx_mr; + struct ofi_mr *mr; + int rc, frc = 0; + + mr = container_of(fid, struct ofi_mr, mr_fid.fid); + lnx_mr = container_of(mr, struct lnx_mr, mr); + + rc = lnx_mr_close_all(mr->mr_fid.mem_desc); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to complete Memory Deregistration\n"); + frc = rc; + } + + ofi_atomic_dec32(&mr->domain->ref); + + ofi_buf_free(lnx_mr); + + return frc; +} + +static int lnx_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int i, rc, frc = 0; + struct local_prov_ep *ep; + struct fid_mr *mr, *cmr; + struct lnx_mem_desc *mem_desc; + struct lnx_mem_desc_prov *desc; + + mr = container_of(fid, struct fid_mr, fid); + + mem_desc = mr->mem_desc; + + /* TODO: This is another issue here because MR registration can happen + * quiet often + */ + for (i = 0; i < mem_desc->desc_count; i++) { + desc = &mem_desc->desc[i]; + cmr = desc->core_mr; + if (!cmr) + continue; + dlist_foreach_container(&desc->prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_mr_bind(cmr, &ep->lpe_ep->fid, flags); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s lnx_mr_bind() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + } + + return frc; +} + +static int lnx_mr_control(struct fid *fid, int command, void *arg) +{ + int i, rc, frc = 0; + struct fid_mr *mr, *cmr; + struct lnx_mem_desc *mem_desc; + struct lnx_mem_desc_prov *desc; + + if (command != FI_ENABLE) + return -FI_ENOSYS; + + mr = container_of(fid, struct fid_mr, fid); + + mem_desc = mr->mem_desc; + + /* TODO: This is another issue here because MR registration can happen + * quiet often + */ + for (i = 0; i < mem_desc->desc_count; i++) { + desc = &mem_desc->desc[i]; + cmr = desc->core_mr; + if (!cmr) + continue; + rc = fi_mr_enable(cmr); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s lnx_mr_control() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + + return frc; +} + +static struct fi_ops lnx_mr_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_mr_close, + .bind = lnx_mr_bind, + .control = lnx_mr_control, + .ops_open = fi_no_ops_open +}; + +static int +lnx_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr_fid) +{ + /* + * If the address is specified then use it to find out which + * domain to register the memory against. LNX can be managing + * multiple underlying core provider endpoints, I need to register the + * memory against the correct one. + * + * Once the domain is determined, I need to set the mr->mem_desc to + * point to a structure which contains my local endpoint I'll end up + * using (which is the same one that I registered the memory against) + * and the associate fid_mr which the core provider set for me. + * + * I return that to the application. 
+ * + * When the application calls back into the data operations API it'll + * pass the mr. I can then pull out a pointer to my local endpoint + * which I'll use in the data operation and pass it the correct mr. + * + * If the address is not provided, then I'll register the memory + * buffer against all my core domains, store those and return them to + * the user + */ + + struct lnx_domain *domain; + struct lnx_fabric *fabric; + struct lnx_mr *lnx_mr = NULL;; + struct ofi_mr *mr; + struct lnx_mem_desc *mem_desc; + struct local_prov *entry; + int rc = 0, i = 1; + bool shm = false; + + if (fid->fclass != FI_CLASS_DOMAIN || !attr || attr->iov_count <= 0) + return -FI_EINVAL; + + domain = container_of(fid, struct lnx_domain, ld_domain.domain_fid.fid); + fabric = domain->ld_fabric; + + lnx_mr = ofi_buf_alloc(fabric->mem_reg_bp); + if (!lnx_mr) { + rc = -FI_ENOMEM; + goto fail; + } + + mr = &lnx_mr->mr; + mem_desc = &lnx_mr->desc; + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr->context; + mr->mr_fid.fid.ops = &lnx_mr_fi_ops; + mr->mr_fid.mem_desc = mem_desc; + mr->domain = &domain->ld_domain; + mr->flags = flags; + + /* TODO: What's gonna happen if you try to register the same piece + * of memory via multiple providers? + * TODO 2: We need a better way to handle memory registration. + * This is simply not very good. We need to have a peer interface + * to memory registration + */ + /* register against all domains */ + dlist_foreach_container(&fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + if (!strcmp(entry->lpv_prov_name, "shm")) + shm = true; + else + shm = false; + if (i >= LNX_MAX_LOCAL_EPS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Exceeded number of allowed memory registrations %s\n", + entry->lpv_prov_name); + rc = -FI_ENOSPC; + goto fail; + } + if (shm) + flags |= FI_HMEM_DEVICE_ONLY; + rc = lnx_mr_regattrs_all(entry, attr, flags, + (shm) ? 
&mem_desc->desc[0] : + &mem_desc->desc[i]); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to complete Memory Registration %s\n", + entry->lpv_prov_name); + goto fail; + } + if (!shm) + i++; + } + + mem_desc->desc_count = i; + if (shm) + mr->mr_fid.key = mem_desc->desc[0].core_mr->key; + else + mr->mr_fid.key = mem_desc->desc[1].core_mr->key; + *mr_fid = &mr->mr_fid; + ofi_atomic_inc32(&domain->ld_domain.ref); + + return 0; + +fail: + if (lnx_mr) + ofi_buf_free(lnx_mr); + return rc; +} + +static struct fi_ops lnx_domain_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_domain_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_mr lnx_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = fi_no_mr_reg, + .regv = fi_no_mr_regv, + .regattr = lnx_mr_regattr, +}; + +static int lnx_setup_core_domain(struct local_prov_ep *ep, struct fi_info *info) +{ + struct fi_info *fi, *itr; + + fi = lnx_get_link_by_dom(info->domain_attr->name); + if (!fi) + return -FI_ENODATA; + + for (itr = fi; itr; itr = itr->next) { + if (!strcmp(itr->fabric_attr->name, ep->lpe_fabric_name)) { + ep->lpe_fi_info = fi_dupinfo(itr); + return FI_SUCCESS; + } + } + + ep->lpe_fi_info = NULL; + + return -FI_ENOENT; +} + +static struct fi_ops_srx_owner lnx_srx_ops = { + .size = sizeof(struct fi_ops_srx_owner), + .get_msg = lnx_get_msg, + .get_tag = lnx_get_tag, + .queue_msg = lnx_queue_msg, + .queue_tag = lnx_queue_tag, + .free_entry = lnx_free_entry, + .foreach_unspec_addr = lnx_foreach_unspec_addr, +}; + +static int lnx_open_core_domains(struct local_prov *prov, + void *context, struct lnx_domain *lnx_domain, + struct fi_info *info) +{ + int rc; + struct local_prov_ep *ep; + struct fi_rx_attr attr; + struct fi_peer_srx_context peer_srx; + struct dlist_entry *tmp; + int srq_support = 1; + + fi_param_get_bool(&lnx_prov, "srq_support", &srq_support); + + memset(&attr, 0, sizeof(attr)); + attr.op_flags = FI_PEER; + peer_srx.size = sizeof(peer_srx); + + if (srq_support) + lnx_domain->ld_srx_supported = true; + else + lnx_domain->ld_srx_supported = false; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + /* the fi_info we setup when we created the fabric might not + * necessarily be the correct one. It'll have the same fabric + * information, since the fabric information is common among all + * the domains the provider manages. However at this point we need + * to get the fi_info that the application is requesting */ + rc = lnx_setup_core_domain(ep, info); + if (rc) + return rc; + + if (srq_support) { + /* special case for CXI provider. We need to turn off tag + * matching HW offload if we're going to support shared + * receive queues. + */ + if (strstr(ep->lpe_fabric_name, "cxi")) + setenv("FI_CXI_RX_MATCH_MODE", "software", 1); + } + + rc = fi_domain(ep->lpe_fabric, ep->lpe_fi_info, + &ep->lpe_domain, context); + + if (!rc && srq_support) { + ep->lpe_srx.owner_ops = &lnx_srx_ops; + peer_srx.srx = &ep->lpe_srx; + rc = fi_srx_context(ep->lpe_domain, &attr, NULL, &peer_srx); + } + + /* if one of the constituent endpoints doesn't support shared + * receive context, then fail, as we can't continue with this + * inconsistency + */ + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s does not support shared" + " receive queues. 
Failing\n", ep->lpe_fabric_name); + return rc; + } + } + + return 0; +} + +static int lnx_addr_add_region_noop(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + return FI_SUCCESS; +} + +static void lnx_addr_del_region(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + struct ofi_mr *mr = (struct ofi_mr *)entry->data; + + ofi_hmem_dev_unregister(mr->iface, (uint64_t) mr->hmem_data); +} + +int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, void *context) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_domain *lnx_domain; + struct util_domain *lnx_domain_info; + struct lnx_fabric *lnx_fab = container_of(fabric, struct lnx_fabric, + util_fabric.fabric_fid); + struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { + [FI_HMEM_SYSTEM] = default_monitor, + [FI_HMEM_CUDA] = default_cuda_monitor, + [FI_HMEM_ROCR] = default_rocr_monitor, + [FI_HMEM_ZE] = default_ze_monitor, + }; + + /* + * provider: shm+cxi:lnx + * fabric: ofi_lnx_fabric + * domain: shm+cxi3:ofi_lnx_domain + * version: 120.0 + * type: FI_EP_RDM + * protocol: FI_PROTO_LNX + * + * Parse out the provider name. It should be shm+ + * + * Create a fabric for shm and one for the other provider. + * + * When fi_domain() is called, we get the fi_info for the + * second provider, which we should've returned as part of the + * fi_getinfo() call. + */ + /* create a new entry for shm. + * Create its fabric. + * insert fabric in the global table + */ + rc = lnx_setup_core_fabrics(info->domain_attr->name, lnx_fab, context); + if (rc) + goto fail; + + rc = -FI_ENOMEM; + lnx_domain = calloc(sizeof(*lnx_domain), 1); + if (!lnx_domain) + goto fail; + + lnx_domain_info = &lnx_domain->ld_domain; + lnx_domain->ld_fabric = lnx_fab; + + rc = ofi_domain_init(fabric, info, lnx_domain_info, context, + OFI_LOCK_SPINLOCK); + if (rc) + goto fail; + + dlist_foreach_container(&lnx_domain->ld_fabric->local_prov_table, + struct local_prov, entry, lpv_entry) { + rc = lnx_open_core_domains(entry, context, lnx_domain, info); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to initialize domain for %s\n", + entry->lpv_prov_name); + goto close_domain; + } + } + + lnx_domain_info->domain_fid.fid.ops = &lnx_domain_fi_ops; + lnx_domain_info->domain_fid.ops = &lnx_domain_ops; + lnx_domain_info->domain_fid.mr = &lnx_mr_ops; + + lnx_domain->ld_mr_cache.add_region = lnx_addr_add_region_noop; + lnx_domain->ld_mr_cache.delete_region = lnx_addr_del_region; + lnx_domain->ld_mr_cache.entry_data_size = sizeof(struct ofi_mr); + rc = ofi_mr_cache_init(&lnx_domain->ld_domain, memory_monitors, + &lnx_domain->ld_mr_cache); + if (rc) + goto close_domain; + + *domain = &lnx_domain_info->domain_fid; + + return 0; + +close_domain: + lnx_domain_close(&(lnx_domain_info->domain_fid.fid)); +fail: + return rc; +} + diff --git a/prov/lnx/src/lnx_ep.c b/prov/lnx/src/lnx_ep.c new file mode 100644 index 00000000000..69b763727c5 --- /dev/null +++ b/prov/lnx/src/lnx_ep.c @@ -0,0 +1,1188 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +extern struct fi_ops_cm lnx_cm_ops; +extern struct fi_ops_msg lnx_msg_ops; +extern struct fi_ops_tagged lnx_tagged_ops; +extern struct fi_ops_rma lnx_rma_ops; +extern struct fi_ops_atomic lnx_atomic_ops; + +static void lnx_init_ctx(struct fid_ep *ctx, size_t fclass); + +static int lnx_close_ceps(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + + if (ep->lpe_srx.ep_fid.fid.context) + free(ep->lpe_srx.ep_fid.fid.context); + + rc = fi_close(&ep->lpe_ep->fid); + if (rc) + frc = rc; + ofi_bufpool_destroy(ep->lpe_recv_bp); + } + + return frc; +} + +int lnx_ep_close(struct fid *fid) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_ep *ep; + struct lnx_fabric *fabric; + + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = ep->le_domain->ld_fabric; + + /* close all the open core endpoints */ + dlist_foreach_container(&fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + lnx_close_ceps(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close endpoint for %s\n", + entry->lpv_prov_name); + } + + ofi_endpoint_close(&ep->le_ep); + free(ep); + + return rc; +} + +static int lnx_enable_core_eps(struct lnx_ep *lep) +{ + int rc; + struct local_prov *entry; + struct local_prov_ep *ep; + int srq_support = 1; + struct lnx_fabric *fabric = lep->le_domain->ld_fabric; + + fi_param_get_bool(&lnx_prov, "srq_support", &srq_support); + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (srq_support) { + /* bind the shared receive context */ + rc = fi_ep_bind(ep->lpe_ep, + &ep->lpe_srx.ep_fid.fid, 0); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, + "%s doesn't support SRX (%d)\n", + ep->lpe_fabric_name, rc); + return rc; + } + } + + rc = fi_enable(ep->lpe_ep); + if (rc) + return rc; + } + } + + return 0; +} + +static int 
lnx_ep_control(struct fid *fid, int command, void *arg) +{ + struct lnx_ep *ep; + int rc; + + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + + switch (command) { + case FI_ENABLE: + if (ep->le_fclass == FI_CLASS_EP && + ((ofi_needs_rx(ep->le_ep.caps) && !ep->le_ep.rx_cq) || + (ofi_needs_tx(ep->le_ep.caps) && !ep->le_ep.tx_cq))) + return -FI_ENOCQ; + if (!ep->le_peer_tbl) + return -FI_ENOAV; + rc = lnx_enable_core_eps(ep); + break; + default: + return -FI_ENOSYS; + } + + return rc; +} + +int lnx_cq_bind_core_prov(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + /* LNX CQ */ + struct util_cq *cq; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + cq = container_of(bfid, struct util_cq, cq_fid.fid); + fabric = lep->le_domain->ld_fabric; + + rc = ofi_ep_bind_cq(&lep->le_ep, cq, flags); + if (rc) + return rc; + + /* bind the core providers to their respective CQs */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_ep_bind(ep->lpe_ep, + &ep->lpe_cq.lpc_core_cq->fid, flags); + if (rc) + return rc; + } + } + + return 0; +} + +static int lnx_ep_bind_core_prov(struct lnx_fabric *fabric, uint64_t flags) +{ + struct local_prov *entry; + struct local_prov_ep *ep; + int rc; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_ep_bind(ep->lpe_ep, &ep->lpe_av->fid, flags); + if (rc) + return rc; + } + } + + return rc; +} + +static int +lnx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc = 0; + struct lnx_ep *ep; + struct lnx_peer_table *peer_tbl; + + switch (fid->fclass) { + case FI_CLASS_EP: /* Standard EP */ + case FI_CLASS_SEP: /* Scalable EP */ + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + break; + + default: + return -FI_EINVAL; + } + + switch (bfid->fclass) { + case FI_CLASS_EQ: + /* TODO */ + break; + + case FI_CLASS_CQ: + rc = lnx_cq_bind_core_prov(fid, bfid, flags); + break; + + case FI_CLASS_CNTR: + /* TODO */ + break; + + case FI_CLASS_AV: + peer_tbl = container_of(bfid, struct lnx_peer_table, + lpt_av.av_fid.fid); + if (peer_tbl->lpt_domain != ep->le_domain) + return -FI_EINVAL; + ep->le_peer_tbl = peer_tbl; + /* forward the bind to the core provider endpoints */ + rc = lnx_ep_bind_core_prov(ep->le_domain->ld_fabric, flags); + break; + + case FI_CLASS_STX_CTX: /* shared TX context */ + return -FI_ENOSYS; + + case FI_CLASS_SRX_CTX: /* shared RX context */ + return -FI_ENOSYS; + + default: + return -FI_EINVAL; + } + + return rc; +} + +int lnx_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct local_prov *entry; + size_t size = sizeof(struct lnx_addresses); + /* initial location to put the address */ + char ep_addr[FI_NAME_MAX]; + char *tmp = NULL; + struct lnx_addresses *la; + struct lnx_address_prov *lap; + char hostname[FI_NAME_MAX]; + size_t prov_addrlen; + size_t addrlen_list[LNX_MAX_LOCAL_EPS]; + int rc, j = 0; + struct lnx_ep *lnx_ep; + struct lnx_fabric *fabric; + struct local_prov_ep *ep; + + lnx_ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lnx_ep->le_domain->ld_fabric; + + /* check the hostname and compare it to mine + * TODO: Is this good enough? 
or do we need a better way of + * determining if the address is local? + */ + rc = gethostname(hostname, FI_NAME_MAX); + if (rc == -1) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "failed to get hostname\n"); + return -FI_EPERM; + } + + addrlen_list[0] = 0; + + /* calculate the size of the address */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + size += sizeof(struct lnx_address_prov); + prov_addrlen = 0; + + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_getname(&ep->lpe_ep->fid, (void*)ep_addr, &prov_addrlen); + if (rc == -FI_ETOOSMALL) { + size += prov_addrlen * entry->lpv_ep_count; + addrlen_list[j] = prov_addrlen; + j++; + break; + } else { + /* this shouldn't have happened. */ + return -FI_EINVAL; + } + } + } + + if (!addr || *addrlen < size) { + *addrlen = size; + return -FI_ETOOSMALL; + } + + la = addr; + + lap = (struct lnx_address_prov *)((char*)la + sizeof(*la)); + + j = 0; + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + memcpy(lap->lap_prov, entry->lpv_prov_name, FI_NAME_MAX - 1); + lap->lap_addr_count = entry->lpv_ep_count; + lap->lap_addr_size = addrlen_list[j]; + + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + tmp = (char*)lap + sizeof(*lap); + + rc = fi_getname(&ep->lpe_ep->fid, (void*)tmp, &addrlen_list[j]); + if (rc) + return rc; + + if (lap->lap_addr_size != addrlen_list[j]) + return -FI_EINVAL; + + tmp += addrlen_list[j]; + } + + lap = (struct lnx_address_prov *)tmp; + j++; + } + + la->la_prov_count = j; + memcpy(la->la_hostname, hostname, FI_NAME_MAX - 1); + + return 0; +} + +static ssize_t lnx_ep_cancel(fid_t fid, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + switch (fid->fclass) { + case FI_CLASS_EP: + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + break; + case FI_CLASS_RX_CTX: + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + break; + case FI_CLASS_TX_CTX: + /* can't cancel a transmit */ + return -FI_ENOENT; + default: + return -FI_EINVAL; + } + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_cancel(&ep->lpe_ep->fid, context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. " + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + return rc; +} + +static int lnx_ep_setopt(fid_t fid, int level, int optname, const void *optval, + size_t optlen) +{ + int rc = 0; + struct lnx_ep *lep; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_setopt(&ep->lpe_ep->fid, level, optname, + optval, optlen); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + return rc; +} + + +static int lnx_ep_txc(struct fid_ep *fid, int index, struct fi_tx_attr *attr, + struct fid_ep **tx_ep, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + ctx = calloc(sizeof(*ctx), 1); + if (!ctx) + return -FI_ENOMEM; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (index >= ep->lpe_fi_info->ep_attr->tx_ctx_cnt) + continue; + + rc = fi_tx_context(ep->lpe_ep, index, attr, + &ep->lpe_txc[index], context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. " + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + dlist_init(&ctx->ctx_head); + ctx->ctx_idx = index; + ctx->ctx_parent = lep; + lnx_init_ctx(&ctx->ctx_ep, FI_CLASS_TX_CTX); + dlist_insert_tail(&ctx->ctx_head, &lep->le_tx_ctx); + /* set the callbacks for the transmit context */ + *tx_ep = &ctx->ctx_ep; + + return rc; +} + +static int lnx_ep_rxc(struct fid_ep *fid, int index, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + ctx = calloc(sizeof(*ctx), 1); + if (!ctx) + return -FI_ENOMEM; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (index >= ep->lpe_fi_info->ep_attr->rx_ctx_cnt) + continue; + + rc = fi_rx_context(ep->lpe_ep, index, attr, + &ep->lpe_rxc[index], context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + dlist_init(&ctx->ctx_head); + ctx->ctx_idx = index; + ctx->ctx_parent = lep; + lnx_init_ctx(&ctx->ctx_ep, FI_CLASS_RX_CTX); + dlist_insert_tail(&ctx->ctx_head, &lep->le_rx_ctx); + /* set the callbacks for the receive context */ + *rx_ep = &ctx->ctx_ep; + + return rc; +} + +struct fi_ops_ep lnx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = lnx_ep_cancel, + /* can't get opt, because there is no way to report multiple + * options for the different links */ + .getopt = fi_no_getopt, + .setopt = lnx_ep_setopt, + .tx_ctx = lnx_ep_txc, + .rx_ctx = lnx_ep_rxc, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +struct fi_ops lnx_ep_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_ep_close, + .bind = lnx_ep_bind, + .control = lnx_ep_control, + .ops_open = fi_no_ops_open, +}; + +struct fi_ops_cm lnx_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = lnx_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, +}; + +static int lnx_open_eps(struct local_prov *prov, struct fi_info *info, + void *context, size_t fclass, struct lnx_ep *lep) +{ + int rc = 0; + struct local_prov_ep *ep; + struct dlist_entry *tmp; + struct ofi_bufpool_attr bp_attrs = {}; + struct lnx_srx_context *ctxt; + + ctxt = calloc(1, sizeof(*ctxt)); + if (!ctxt) + return -FI_ENOMEM; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + if (fclass == FI_CLASS_EP) { + rc = fi_endpoint(ep->lpe_domain, ep->lpe_fi_info, + &ep->lpe_ep, context); + } else { + /* update endpoint attributes with whatever is being + * passed from the application + */ + if (ep->lpe_fi_info && info) { + ep->lpe_fi_info->ep_attr->tx_ctx_cnt = + info->ep_attr->tx_ctx_cnt; + ep->lpe_fi_info->ep_attr->rx_ctx_cnt = + info->ep_attr->rx_ctx_cnt; + } + + ep->lpe_txc = calloc(info->ep_attr->tx_ctx_cnt, + sizeof(*ep->lpe_txc)); + ep->lpe_rxc = calloc(info->ep_attr->rx_ctx_cnt, + sizeof(*ep->lpe_rxc)); + if (!ep->lpe_txc || !ep->lpe_rxc) + return -FI_ENOMEM; + + rc = fi_scalable_ep(ep->lpe_domain, ep->lpe_fi_info, + &ep->lpe_ep, context); + } + if (rc) + return rc; + + ctxt->srx_lep = lep; + ctxt->srx_cep = ep; + + ep->lpe_srx.ep_fid.fid.context = ctxt; + ep->lpe_srx.ep_fid.fid.fclass = FI_CLASS_SRX_CTX; + ofi_spin_init(&ep->lpe_bplock); + /* create a buffer pool for the receive requests */ + bp_attrs.size = sizeof(struct lnx_rx_entry); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + rc = ofi_bufpool_create_attr(&bp_attrs, &ep->lpe_recv_bp); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create receive buffer pool"); + return -FI_ENOMEM; + } + } + + return 0; +} + +static void +lnx_ep_nosys_progress(struct util_ep *util_ep) +{ + assert(0); +} + +static inline int +match_tag(uint64_t tag, uint64_t match_tag, uint64_t ignore) +{ + return ((tag | ignore) == (match_tag | ignore)); +} + +static inline bool +lnx_addr_match(fi_addr_t addr1, fi_addr_t addr2) +{ + return (addr1 == addr2); +} + +static inline bool +lnx_search_addr_match(fi_addr_t cep_addr, struct lnx_peer_prov *lpp) +{ + struct lnx_local2peer_map *lpm; + fi_addr_t peer_addr; + int i; + + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, 
+ lpm, entry) { + for (i = 0; i < LNX_MAX_LOCAL_EPS; i++) { + peer_addr = lpm->peer_addrs[i]; + if (peer_addr == FI_ADDR_NOTAVAIL) + break; + if (lnx_addr_match(peer_addr, cep_addr)) + return true; + } + } + + return false; +} + +static int lnx_match_common(uint64_t tag1, uint64_t tag2, uint64_t ignore, + fi_addr_t cep_addr, fi_addr_t lnx_addr, struct lnx_peer *peer, + struct local_prov_ep *cep) +{ + struct lnx_peer_prov *lpp; + struct local_prov *lp; + bool tmatch; + + /* if a request has no address specified it'll match against any + * rx_entry with a matching tag + * or + * if an rx_entry has no address specified, it'll match against any + * request with a matching tag + * + * for non-tagged messages tags will be set to TAG_ANY so they will + * always match and the decision will be made on address only. + */ + tmatch = match_tag(tag1, tag2, ignore); + if (!tmatch) + return tmatch; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "tag1=%lx tag2=%lx ignore=%lx cep_addr=%lx lnx_addr=%lx tmatch=%d\n", + tag1, tag2, ignore, cep_addr, lnx_addr, tmatch); + + /* if we're requested to receive from any peer, then tag matching is + * enough. Non-tagged messages will match regardless. + */ + if (lnx_addr == FI_ADDR_UNSPEC) + return tmatch; + + /* if the address is specified, then we should have a peer and + * a receiving core endpoint and a provider parent + */ + assert(peer && cep && cep->lpe_parent); + + lp = cep->lpe_parent; + + /* if this is a shm core provider, then only go through the lnx + * shm provider + */ + if (cep->lpe_local) + return lnx_search_addr_match(cep_addr, peer->lp_shm_prov); + + /* check if we already have a peer provider. + * A peer can receive messages from multiple providers, so we need to + * find the provider which maps to the provider we're currently + * checking. The map looked up can have multiple addresses which + * we can receive from, so we need to check which one of those is + * the correct match. + * + * Note: we're trying to make this loop as efficient as possible, + * because it's executed on the message matching path, which is + * heavily hit. + * + * The assumption is that in most use cases: + * - There will be only two providers to check + * - Each provider will have 1 endpoint, and therefore only one map + * - Each peer will only have 1 address. + * + */ + dlist_foreach_container(&peer->lp_provs, + struct lnx_peer_prov, lpp, entry) { + if (lpp->lpp_prov == lp) + return lnx_search_addr_match(cep_addr, lpp); + } + + return false; +} + +static int lnx_match_unexq(struct dlist_entry *item, const void *args) +{ + /* this entry is placed on the SUQ via the lnx_get_tag() path + * and examined in the lnx_process_tag() path */ + struct lnx_match_attr *match_attr = (struct lnx_match_attr *) args; + struct lnx_rx_entry *entry = (struct lnx_rx_entry *) item; + struct lnx_peer *peer = match_attr->lm_peer; + + /* entry refers to the unexpected message received + * entry->rx_entry.tag will be the tag of the message or TAG_UNSPEC + * otherwise + * + * entry->rx_entry.addr will be the address of the peer which sent the + * message or ADDR_UNSPEC if the core provider didn't do a reverse + * lookup. + * + * entry->rx_cep will be set to the core endpoint which received the + * message. + * + * match_attr is filled in by lnx_process_tag() and contains + * information passed to us by the application + * + * match_attr->lm_peer is the peer looked up via the addr passed by + * the application to LNX. It is NULL if the addr is ADDR_UNSPEC.
+ * + * match_attr->lm_tag, match_attr->lm_ignore are the tag and ignore + * bits passed by the application to LNX via the receive API. + * + * match_attr->lm_addr is the only significant if it's set to + * FI_ADDR_UNSPEC, otherwise it's not used in matching because it's + * the LNX level address and we need to compare the core level address. + */ + return lnx_match_common(entry->rx_entry.tag, match_attr->lm_tag, + match_attr->lm_ignore, entry->rx_entry.addr, + match_attr->lm_addr, peer, entry->rx_cep); +} + +static int lnx_match_recvq(struct dlist_entry *item, const void *args) +{ + struct lnx_match_attr *match_attr = (struct lnx_match_attr *) args; + /* this entry is placed on the recvq via the lnx_process_tag() path + * and examined in the lnx_get_tag() path */ + struct lnx_rx_entry *entry = (struct lnx_rx_entry *) item; + + /* entry refers to the receive request waiting for a message + * entry->rx_entry.tag is the tag passed in by the application. + * + * entry->rx_entry.addr is the address passed in by the application. + * This is the LNX level address. It's only significant if it's set + * to ADDR_UNSPEC. Otherwise, it has already been used to look up the + * peer. + * + * entry->rx_cep is always NULL in this case, as this will only be + * known when the message is received. + * + * entry->rx_peer is the LNX peer looked up if a valid address is + * given by the application, otherwise it's NULL. + * + * match_attr information is filled by the lnx_get_tag() callback and + * contains information passed to us by the core endpoint receiving + * the message. + * + * match_attr->rx_peer is not significant because at the lnx_get_tag() + * call there isn't enough information to find what the peer is. + * + * match_attr->lm_tag, match_attr->lm_ignore are the tag and ignore + * bits passed up by the core endpoint receiving the message. + * + * match_attr->lm_addr is the address of the peer which sent the + * message. Set if the core endpoint has done a reverse lookup, + * otherwise set to ADDR_UNSPEC. + * + * match_attr->lm_cep is the core endpoint which received the message. 
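+ * + * Added illustration (not part of the original comment): with a posted + * receive of tag=0x1000, ignore=0xff and src_addr=FI_ADDR_UNSPEC, an + * unexpected message tagged 0x10ab matches because match_tag() compares + * (0x10ab | 0xff) == (0x1000 | 0xff), and the unspecified address lets + * lnx_match_common() return before the per-provider walk in + * lnx_search_addr_match().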
+ */ + return lnx_match_common(entry->rx_entry.tag, match_attr->lm_tag, + entry->rx_entry.ignore, match_attr->lm_addr, + entry->rx_entry.addr, entry->rx_peer, match_attr->lm_cep); +} + +static inline int +lnx_init_queue(struct lnx_queue *q, dlist_func_t *match_func) +{ + int rc; + + rc = ofi_spin_init(&q->lq_qlock); + if (rc) + return rc; + + dlist_init(&q->lq_queue); + + q->lq_match_func = match_func; + + return 0; +} + +static inline int +lnx_init_qpair(struct lnx_qpair *qpair, dlist_func_t *recvq_match_func, + dlist_func_t *unexq_match_func) +{ + int rc = 0; + + rc = lnx_init_queue(&qpair->lqp_recvq, recvq_match_func); + if (rc) + goto out; + rc = lnx_init_queue(&qpair->lqp_unexq, unexq_match_func); + if (rc) + goto out; + +out: + return rc; +} + +static inline int +lnx_init_srq(struct lnx_peer_srq *srq) +{ + int rc; + + rc = lnx_init_qpair(&srq->lps_trecv, lnx_match_recvq, lnx_match_unexq); + if (rc) + return rc; + rc = lnx_init_qpair(&srq->lps_recv, lnx_match_recvq, lnx_match_unexq); + if (rc) + return rc; + + return rc; +} + +static int lnx_get_ctx(struct local_prov_ep *ep, size_t fclass, + struct fid_ep ***ep_ctx, size_t *size) +{ + switch (fclass) { + case FI_CLASS_RX_CTX: + *ep_ctx = ep->lpe_rxc; + *size = ep->lpe_fi_info->ep_attr->rx_ctx_cnt; + break; + case FI_CLASS_TX_CTX: + *ep_ctx = ep->lpe_txc; + *size = ep->lpe_fi_info->ep_attr->tx_ctx_cnt; + break; + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static void lnx_close_ep_ctx(struct local_prov_ep *ep, size_t fclass) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return; + + for (i = 0; i < size; i++) { + rc = fi_close(&ep_ctx[i]->fid); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close ep context %lu with %d\n", + fclass, rc); + } +} + +static int lnx_ctx_close(struct fid *fid) +{ + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + lnx_close_ep_ctx(ep, fid->fclass); + } + + return FI_SUCCESS; +} + +static int lnx_ctx_bind_cq(struct local_prov_ep *ep, size_t fclass, + struct fid *bfid, uint64_t flags) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return rc; + + for (i = 0; i < size; i++) { + rc = fi_ep_bind(ep_ctx[i], bfid, flags); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +static int +lnx_ctx_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (bfid->fclass == FI_CLASS_CQ) + /* bind the context to the shared cq */ 
+ rc = lnx_ctx_bind_cq(ep, fid->fclass, + &ep->lpe_cq.lpc_core_cq->fid, + flags); + else + return -FI_ENOSYS; + + if (rc) + return rc; + } + } + + return FI_SUCCESS; +} + +static int +lnx_enable_ctx_eps(struct local_prov_ep *ep, size_t fclass) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return rc; + + for (i = 0; i < size; i++) { + rc = fi_enable(ep_ctx[i]); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +static int +lnx_ctx_control(struct fid *fid, int command, void *arg) +{ + int rc; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + switch (command) { + case FI_ENABLE: + if (!lep->le_peer_tbl) + return -FI_ENOAV; + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = lnx_enable_ctx_eps(ep, fid->fclass); + if (rc) + return rc; + } + } + break; + default: + return -FI_ENOSYS; + } + + return rc; +} + +static struct fi_ops lnx_ctx_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_ctx_close, + .bind = lnx_ctx_bind, + .control = lnx_ctx_control, + .ops_open = fi_no_ops_open, +}; + +struct fi_ops_ep lnx_ctx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = lnx_ep_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +static void +lnx_init_ctx(struct fid_ep *ctx, size_t fclass) +{ + ctx->fid.fclass = fclass; + ctx->fid.ops = &lnx_ctx_ops; + ctx->ops = &lnx_ctx_ep_ops; + ctx->msg = &lnx_msg_ops; + ctx->tagged = &lnx_tagged_ops; + ctx->rma = &lnx_rma_ops; + ctx->atomic = &lnx_atomic_ops; +} + +static int +lnx_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, + struct lnx_ep **out_ep, void *context, size_t fclass) +{ + int rc; + struct lnx_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + uint64_t mr_mode; + + ep = calloc(1, sizeof(*ep)); + if (!ep) + return -FI_ENOMEM; + + ep->le_fclass = fclass; + ep->le_ep.ep_fid.fid.fclass = fclass; + + ep->le_ep.ep_fid.fid.ops = &lnx_ep_fi_ops; + ep->le_ep.ep_fid.ops = &lnx_ep_ops; + ep->le_ep.ep_fid.cm = &lnx_cm_ops; + ep->le_ep.ep_fid.msg = &lnx_msg_ops; + ep->le_ep.ep_fid.tagged = &lnx_tagged_ops; + ep->le_ep.ep_fid.rma = &lnx_rma_ops; + ep->le_ep.ep_fid.atomic = &lnx_atomic_ops; + ep->le_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid); + lnx_init_srq(&ep->le_srq); + + dlist_init(&ep->le_rx_ctx); + dlist_init(&ep->le_tx_ctx); + + fabric = ep->le_domain->ld_fabric; + + /* create all the core provider endpoints */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_open_eps(entry, info, context, fclass, ep); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to create ep for %s\n", + entry->lpv_prov_name); + goto fail; + } + } + + mr_mode = lnx_util_prov.info->domain_attr->mr_mode; + lnx_util_prov.info->domain_attr->mr_mode = 0; + rc = ofi_endpoint_init(domain, (const struct util_prov *)&lnx_util_prov, + (struct fi_info *)lnx_util_prov.info, &ep->le_ep, + context, 
lnx_ep_nosys_progress); + if (rc) + goto fail; + + lnx_util_prov.info->domain_attr->mr_mode = mr_mode; + *out_ep = ep; + + return 0; + +fail: + free(ep); + return rc; +} + +int lnx_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int rc; + struct lnx_ep *my_ep; + + rc = lnx_alloc_endpoint(domain, info, &my_ep, context, FI_CLASS_SEP); + if (rc) + return rc; + + *ep = &my_ep->le_ep.ep_fid; + + return 0; +} + +int lnx_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int rc; + struct lnx_ep *my_ep; + + rc = lnx_alloc_endpoint(domain, info, &my_ep, context, FI_CLASS_EP); + if (rc) + return rc; + + *ep = &my_ep->le_ep.ep_fid; + + return 0; +} + + diff --git a/prov/lnx/src/lnx_init.c b/prov/lnx/src/lnx_init.c new file mode 100644 index 00000000000..2cbb5b56320 --- /dev/null +++ b/prov/lnx/src/lnx_init.c @@ -0,0 +1,924 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +#define LNX_PASSTHRU_TX_OP_FLAGS (FI_INJECT_COMPLETE | \ + FI_TRANSMIT_COMPLETE | \ + FI_DELIVERY_COMPLETE) +#define LNX_PASSTHRU_RX_OP_FLAGS (0ULL) +#define LNX_TX_OP_FLAGS (FI_INJECT_COMPLETE | FI_COMPLETION | \ + FI_DELIVERY_COMPLETE | FI_TRANSMIT_COMPLETE) +#define LNX_RX_OP_FLAGS (FI_COMPLETION) + +ofi_spin_t global_bplock; +struct ofi_bufpool *global_recv_bp = NULL; + +struct util_fabric lnx_fabric_info; + +struct fi_tx_attr lnx_tx_attr = { + .caps = ~0x0ULL, + .op_flags = LNX_PASSTHRU_TX_OP_FLAGS | LNX_TX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = 0, + .inject_size = SIZE_MAX, + .size = SIZE_MAX, + .iov_limit = LNX_IOV_LIMIT, + .rma_iov_limit = LNX_IOV_LIMIT, +}; + +struct fi_rx_attr lnx_rx_attr = { + .caps = ~0x0ULL, + .op_flags = LNX_PASSTHRU_RX_OP_FLAGS | LNX_RX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = 0, + .total_buffered_recv = 0, + .size = 1024, + .iov_limit = LNX_IOV_LIMIT, +}; + +struct fi_ep_attr lnx_ep_attr = { + .type = FI_EP_UNSPEC, + .protocol = FI_PROTO_LNX, + .protocol_version = 1, + .max_msg_size = SIZE_MAX, + .msg_prefix_size = SIZE_MAX, + .max_order_raw_size = SIZE_MAX, + .max_order_war_size = SIZE_MAX, + .max_order_waw_size = SIZE_MAX, + .mem_tag_format = FI_TAG_GENERIC, + .tx_ctx_cnt = SIZE_MAX, + .rx_ctx_cnt = SIZE_MAX, + .auth_key = NULL, + .auth_key_size = 0, +}; + +struct fi_domain_attr lnx_domain_attr = { + .name = "ofi_lnx_domain", + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_AUTO, + .data_progress = FI_PROGRESS_AUTO, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_RAW, + .mr_key_size = SIZE_MAX, + .cq_data_size = SIZE_MAX, + .cq_cnt = SIZE_MAX, + .ep_cnt = SIZE_MAX, + .tx_ctx_cnt = SIZE_MAX, + .rx_ctx_cnt = SIZE_MAX, + .max_ep_tx_ctx = SIZE_MAX, + .max_ep_rx_ctx = SIZE_MAX, + .max_ep_stx_ctx = SIZE_MAX, + .max_ep_srx_ctx = SIZE_MAX, + .cntr_cnt = SIZE_MAX, + .mr_iov_limit = SIZE_MAX, + .caps = ~0x0ULL, + .auth_key_size = SIZE_MAX, + .max_err_data = SIZE_MAX, + .mr_cnt = SIZE_MAX, +}; + +struct fi_fabric_attr lnx_fabric_attr = { + .prov_version = OFI_VERSION_DEF_PROV, + .name = "ofi_lnx_fabric", +}; + +struct fi_info lnx_info = { + .caps = ~0x0ULL, + .tx_attr = &lnx_tx_attr, + .rx_attr = &lnx_rx_attr, + .ep_attr = &lnx_ep_attr, + .domain_attr = &lnx_domain_attr, + .fabric_attr = &lnx_fabric_attr +}; + +static struct fi_ops lnx_fabric_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_fabric_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_fabric lnx_fabric_ops = { + .size = sizeof(struct fi_ops_fabric), + .domain = lnx_domain_open, + .passive_ep = fi_no_passive_ep, + .eq_open = fi_no_eq_open, + .wait_open = fi_no_wait_open, + .trywait = fi_no_trywait +}; + +struct fi_provider lnx_prov = { + .name = OFI_LNX, + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, + .getinfo = lnx_getinfo, + .fabric = lnx_fabric, + .cleanup = lnx_fini +}; + +struct util_prov lnx_util_prov = { + .prov = &lnx_prov, + .info = &lnx_info, + .flags = 0 +}; + +/* + * For the fi_getinfo() -> fi_fabric() -> fi_domain() path, we need to + * keep track of the fi_info in case we need them later on when linking in + * the fi_fabric() function. 
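+ * + * Rough flow, added here for illustration: lnx_getinfo() caches each core + * provider's fi_info in lnx_fi_info_cache, gen_links() expands that cache + * into every candidate link in lnx_links, and lnx_form_info() records a + * meta entry tying each returned fi_info (lnx_rep) back to its link so + * lnx_get_link_by_dom() can recover the core fi_infos when the domain is + * opened.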
+ * + * This cache gets cleared after we use the ones we need, or when the + * library exists, if LNX is never used. + */ +struct dlist_entry lnx_fi_info_cache; +/* this is a list of all possible links */ +struct dlist_entry lnx_links; +struct dlist_entry lnx_links_meta; + +struct lnx_fi_cache_entry { + struct dlist_entry entry; + struct fi_info *fi; +}; + +struct lnx_fi_info_meta { + struct dlist_entry entry; + struct fi_info *lnx_rep; + struct fi_info *lnx_link; +}; + +static int lnx_get_cache_meta(struct dlist_entry *head, int *size) +{ + int num_prov = 0; + struct dlist_entry *e; + + dlist_foreach(head, e) + num_prov++; + + *size = num_prov; + + return FI_SUCCESS; +} + +static void lnx_free_meta(void) +{ + struct lnx_fi_info_meta *e; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(&lnx_links_meta, struct lnx_fi_info_meta, e, + entry, tmp) { + dlist_remove(&e->entry); + free(e); + } +} + +static void lnx_free_info_cache(struct dlist_entry *head, bool meta) +{ + struct lnx_fi_cache_entry *e; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(head, struct lnx_fi_cache_entry, e, + entry, tmp) { + fi_freeinfo(e->fi); + dlist_remove(&e->entry); + free(e); + } + + if (meta) + lnx_free_meta(); +} + +static int lnx_cache_info(struct dlist_entry *head, + struct fi_info *info) +{ + struct lnx_fi_cache_entry *e = calloc(1, sizeof(*e)); + if (!e) + return -FI_ENOMEM; + dlist_init(&e->entry); + e->fi = info; + + dlist_insert_tail(&e->entry, head); + + return 0; +} + +struct fi_info * +lnx_get_link_by_dom(char *domain_name) +{ + struct fi_info *info; + struct lnx_fi_info_meta *e; + + dlist_foreach_container(&lnx_links_meta, struct lnx_fi_info_meta, e, + entry) { + info = e->lnx_rep; + if (info && info->domain_attr) { + if (!strcmp(domain_name, + info->domain_attr->name)) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Found %s\n", + info->fabric_attr->prov_name); + return e->lnx_link; + } + } + } + + return NULL; +} + +static void fi_insert_tail(struct fi_info *head, struct fi_info *item) +{ + struct fi_info *itr = head; + while(itr->next) + itr = itr->next; + itr->next = item; +} + +static void fi_remove_tail(struct fi_info **head) +{ + struct fi_info *itr = *head, *prev = NULL; + while(itr->next) { + prev = itr; + itr = itr->next; + } + + if (prev) + prev->next = NULL; + else + *head = NULL; + free(itr); +} + +struct fi_info *duplicate(struct fi_info *l) +{ + struct fi_info *itr, *new, *prev = NULL, *head = NULL; + for (itr = l; itr; itr = itr->next) { + new = fi_dupinfo(itr); + if (!new) { + if (head) + fi_freeinfo(head); + return NULL; + } + + if (!head) + head = new; + + if (prev) { + prev->next = new; + prev = new; + } else { + prev = new; + } + } + + return head; +} + +static int gen_links_rec(struct dlist_entry *current, struct dlist_entry *head, + struct dlist_entry *result, struct fi_info *l, + int depth, int target_depth) +{ + int rc; + struct fi_info *itr; + struct fi_info *fi_copy, *dup; + struct lnx_fi_cache_entry *e, *new; + + while(current->next != head) { + e = container_of(current->next, struct lnx_fi_cache_entry, entry); + for (itr = e->fi; itr; itr = itr->next) { + fi_copy = fi_dupinfo(itr); + if (l) { + fi_insert_tail(l, fi_copy); + } else { + l = fi_copy; + } + if (current->next->next == head && + depth == target_depth) { + dup = duplicate(l); + if (!dup) + return -FI_ENOMEM; + new = calloc(1, sizeof(*new)); + if (!new) + return -FI_ENOMEM; + new->fi = dup; + dlist_init(&new->entry); + dlist_insert_tail(&new->entry, result); + } + rc = gen_links_rec(current->next, 
head, result, l, + depth+1, target_depth); + fi_remove_tail(&l); + if (rc) + return rc; + } + current = current->next; + } + + return FI_SUCCESS; +} + +static int gen_links(struct dlist_entry *head, struct dlist_entry *result, + int target_depth) +{ + return gen_links_rec(head, head, result, NULL, 1, target_depth); +} + +static int lnx_form_info(struct fi_info *fi, struct fi_info **out) +{ + int size_prov = 0, size_dom = 0, rc = FI_SUCCESS; + struct lnx_fi_info_meta *meta = NULL; + char *lnx_prov, *lnx_dom, *s; + struct fi_info *itr, *r = NULL; + bool copy = false; + uint64_t min_inject_size = SIZE_MAX; + + for (itr = fi; itr; itr = itr->next) { + size_prov += strlen(itr->fabric_attr->prov_name)+1; + size_dom += strlen(itr->domain_attr->name)+1; + if (itr->tx_attr && itr->tx_attr->inject_size < min_inject_size) + min_inject_size = itr->tx_attr->inject_size; + } + + lnx_dom = calloc(size_dom, sizeof(char)); + lnx_prov = calloc(size_prov, sizeof(char)); + if (!lnx_prov || !lnx_dom) + return -FI_ENOMEM; + + for (itr = fi; itr; itr = itr->next) { + strcat(lnx_prov, itr->fabric_attr->prov_name); + strcat(lnx_dom, itr->domain_attr->name); + if (itr->next) { + strcat(lnx_dom, "+"); + strcat(lnx_prov, "+"); + } + if (!strncmp(itr->fabric_attr->prov_name, "shm", 3)) + continue; + + if (!copy) { + meta = calloc(1, sizeof(*meta)); + r = fi_dupinfo(itr); + if (!r || !meta) { + rc = -FI_ENOMEM; + goto fail; + } + meta->lnx_rep = r; + meta->lnx_link = fi; + if (r->tx_attr) + r->tx_attr->inject_size = min_inject_size; + dlist_init(&meta->entry); + dlist_insert_tail(&meta->entry, &lnx_links_meta); + copy = true; + } + } + + if (!r) { + rc = -FI_ENODATA; + goto fail; + } + + free(r->fabric_attr->prov_name); + free(r->fabric_attr->name); + free(r->domain_attr->name); + + r->fabric_attr->name = NULL; + r->domain_attr->name = NULL; + r->fabric_attr->prov_name = lnx_prov; + + if (asprintf(&s, "%s", lnx_info.fabric_attr->name) < 0) + goto fail; + r->fabric_attr->name = s; + + if (asprintf(&s, "%s:%s", lnx_dom, lnx_info.domain_attr->name) < 0) + goto fail; + r->domain_attr->name = s; + free(lnx_dom); + + *out = r; + return FI_SUCCESS; + +fail: + if (meta) + free(meta); + if (r) + fi_freeinfo(r); + free(lnx_dom); + return rc; +} + +static int lnx_generate_info(struct fi_info **info) +{ + struct fi_info *fi = NULL, *head = NULL, *prev = NULL; + struct lnx_fi_cache_entry *e; + int rc, size; + + /* we need at least 2 providers to link */ + rc = lnx_get_cache_meta(&lnx_fi_info_cache, &size); + if (rc || size < 2) + return -FI_ENODATA; + + rc = gen_links(&lnx_fi_info_cache, &lnx_links, size); + if (rc) + return rc; + + /* + * 1. Iterate over the links and create a linked list of fi_infos + * each fi_info in the list represents one of the links + * 2. Have metadata associated with each fi_info to refer back to + * an entry in the lnx_links cache. + * 3. When the application selects one of these fi_infos, we can + * then find the appropriate link in the cache and be able to + * create the underlying core providers correctly. 
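+ * + * For example (added illustration; actual domain names vary by system): + * linking shm and cxi can produce an fi_info with + * fabric_attr->prov_name = "shm+cxi" and + * domain_attr->name = "shm+cxi3:ofi_lnx_domain", with + * tx_attr->inject_size set to the minimum inject_size across the linked + * providers (see lnx_form_info()).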
+ */ + dlist_foreach_container(&lnx_links, struct lnx_fi_cache_entry, e, + entry) { + rc = lnx_form_info(e->fi, &fi); + if (rc) + goto err; + + if (prev) { + prev->next = fi; + prev = fi; + } else { + prev = fi; + head = fi; + } + } + + *info = head; + + return FI_SUCCESS; + +err: + if (fi) + fi_freeinfo(fi); + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + + return -FI_ENODATA; +} + +int lnx_getinfo_helper(uint32_t version, char *prov, struct fi_info *lnx_hints) +{ + int rc; + char *orig_prov_name = NULL; + struct fi_info *core_info; + uint64_t caps, mr_mode; + bool shm = false; + + caps = lnx_hints->caps; + mr_mode = lnx_hints->domain_attr->mr_mode; + + if (lnx_hints->fabric_attr->prov_name) { + orig_prov_name = lnx_hints->fabric_attr->prov_name; + lnx_hints->fabric_attr->prov_name = NULL; + } + + lnx_hints->fabric_attr->prov_name = prov; + if (!strncmp(prov, "shm", 3)) { + shm = true; + /* make sure we get the correct shm provider */ + lnx_hints->caps &= ~(FI_REMOTE_COMM | FI_LOCAL_COMM); + lnx_hints->caps |= FI_HMEM; + lnx_hints->domain_attr->mr_mode |= (FI_MR_VIRT_ADDR | FI_MR_HMEM + | FI_MR_PROV_KEY); + } + rc = fi_getinfo(version, NULL, NULL, OFI_GETINFO_INTERNAL, + lnx_hints, &core_info); + + lnx_hints->fabric_attr->prov_name = orig_prov_name; + if (rc) + return rc; + + if (shm) { + lnx_hints->caps = caps; + lnx_hints->domain_attr->mr_mode = mr_mode; + } + + rc = lnx_cache_info(&lnx_fi_info_cache, core_info); + + return rc; +} + +int lnx_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info) +{ + int rc; + struct fi_info *lnx_hints; + char *linked_provs, *linked_provs_cp, *token, *exclude = NULL; + + rc = fi_param_get_str(&lnx_prov, "prov_links", + &linked_provs); + if (rc) + return rc; + + if (strstr(linked_provs, "lnx")) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Can't specify the lnx provider as part of the link: %s\n", + linked_provs); + return -FI_EINVAL; + } + + linked_provs_cp = strdup(linked_provs); + if (!linked_provs_cp) + return -FI_ENOMEM; + + /* The assumption is that the entire series of + * lnx_getinfo()->lnx_fabric()->lnx_domain()->lnx_endpoint() are + * going to be called before another lnx_getinfo() is called again. + * Based on this assumption, we will free the cache whenever + * lnx_getinfo() is called + */ + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + + /* If the hints are not provided then we endup with a new block */ + lnx_hints = fi_dupinfo(hints); + if (!lnx_hints) + return -FI_ENOMEM; + + rc = ofi_exclude_prov_name(&lnx_hints->fabric_attr->prov_name, lnx_prov.name); + if (rc) + return rc; + + /* get the providers which support peer functionality. These are + * the only ones we can link*/ + lnx_hints->caps |= FI_PEER; + + FI_INFO(&lnx_prov, FI_LOG_FABRIC, "LNX START -------------------\n"); + token = strtok(linked_provs_cp, "+"); + while (token) { + lnx_getinfo_helper(version, token, lnx_hints); + rc = ofi_exclude_prov_name(&lnx_hints->fabric_attr->prov_name, token); + if (rc) + goto free_hints; + token = strtok(NULL, "+"); + } + free(linked_provs_cp); + + /* Generate the lnx info which represents all possible combination + * of domains which are to be linked. 
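+ * + * Added example: with the "prov_links" parameter set to "shm+cxi" + * (typically supplied through the FI_LNX_PROV_LINKS environment + * variable), the strtok loop above calls lnx_getinfo_helper() once for + * shm and once for cxi, and lnx_generate_info() below then emits one + * linked fi_info per combination of their domains.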
+ */ + rc = lnx_generate_info(info); + + FI_INFO(&lnx_prov, FI_LOG_FABRIC, "LNX END -------------------\n"); + +free_hints: + free(exclude); + fi_freeinfo(lnx_hints); + return rc; +} + +static struct local_prov * +lnx_get_local_prov(struct dlist_entry *prov_table, char *prov_name) +{ + struct local_prov *entry; + + /* close all the open core fabrics */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + if (!strncasecmp(entry->lpv_prov_name, prov_name, FI_NAME_MAX)) + return entry; + } + + return NULL; +} + +static int +lnx_add_ep_to_prov(struct local_prov *prov, struct local_prov_ep *ep) +{ + dlist_insert_tail(&ep->entry, &prov->lpv_prov_eps); + ep->lpe_parent = prov; + prov->lpv_ep_count++; + + return FI_SUCCESS; +} + +static int +lnx_setup_core_prov(struct fi_info *info, struct dlist_entry *prov_table, + struct local_prov **shm_prov, void *context) +{ + int rc = -FI_EINVAL; + struct local_prov_ep *ep = NULL; + struct local_prov *lprov, *new_lprov = NULL; + + ep = calloc(sizeof(*ep), 1); + if (!ep) + return -FI_ENOMEM; + + new_lprov = calloc(sizeof(*new_lprov), 1); + if (!new_lprov) + goto free_entry; + + dlist_init(&new_lprov->lpv_prov_eps); + + rc = fi_fabric(info->fabric_attr, &ep->lpe_fabric, context); + if (rc) + return rc; + + ep->lpe_fi_info = info; + strncpy(ep->lpe_fabric_name, info->fabric_attr->name, + FI_NAME_MAX - 1); + + lprov = lnx_get_local_prov(prov_table, info->fabric_attr->prov_name); + if (!lprov) { + lprov = new_lprov; + new_lprov = NULL; + strncpy(lprov->lpv_prov_name, info->fabric_attr->prov_name, + FI_NAME_MAX - 1); + } else { + free(new_lprov); + } + + /* indicate that this fabric can be used for on-node communication */ + if (!strncasecmp(lprov->lpv_prov_name, "shm", 3)) { + *shm_prov = lprov; + ep->lpe_local = true; + } + + dlist_init(&ep->entry); + rc = lnx_add_ep_to_prov(lprov, ep); + if (rc) + goto free_all; + + dlist_insert_after(&lprov->lpv_entry, prov_table); + + return 0; + +free_all: + if (new_lprov) + free(new_lprov); +free_entry: + if (ep) + free(ep); + + return rc; +} + +int +lnx_setup_core_fabrics(char *name, struct lnx_fabric *lnx_fab, + void *context) +{ + int rc; + struct fi_info *link, *itr; + + link = lnx_get_link_by_dom(name); + if (!link) + return -FI_ENODATA; + + for (itr = link; itr; itr = itr->next) { + rc = lnx_setup_core_prov(itr, &lnx_fab->local_prov_table, + &lnx_fab->shm_prov, context); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +int lnx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context) +{ + struct ofi_bufpool_attr bp_attrs = {}; + struct lnx_fabric *lnx_fab; + int rc; + + lnx_fab = calloc(sizeof(*lnx_fab), 1); + if (!lnx_fab) + return -FI_ENOMEM; + + bp_attrs.size = sizeof(struct lnx_mr); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT32_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + rc = ofi_bufpool_create_attr(&bp_attrs, &lnx_fab->mem_reg_bp); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create memory registration buffer pool"); + free(lnx_fab); + return -FI_ENOMEM; + } + + /* initialize the provider table */ + dlist_init(&lnx_fab->local_prov_table); + + rc = ofi_fabric_init(&lnx_prov, lnx_info.fabric_attr, + lnx_info.fabric_attr, + &lnx_fab->util_fabric, context); + if (rc) + goto fail; + + lnx_fab->util_fabric.fabric_fid.fid.ops = &lnx_fabric_fi_ops; + lnx_fab->util_fabric.fabric_fid.ops = &lnx_fabric_ops; + *fabric = &lnx_fab->util_fabric.fabric_fid; + + return 0; + +fail: + return rc; +} + +void lnx_fini(void) 
+{ + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + ofi_bufpool_destroy(global_recv_bp); +} + +static int lnx_free_ep(struct local_prov *prov, struct local_prov_ep *ep) +{ + int rc; + + if (!prov || !ep) + return FI_SUCCESS; + + rc = fi_close(&ep->lpe_fabric->fid); + fi_freeinfo(ep->lpe_fi_info); + free(ep); + prov->lpv_ep_count--; + + if (prov->lpv_ep_count == 0) + dlist_remove(&prov->lpv_entry); + + return rc; +} + +static int lnx_free_eps(struct local_prov *prov) +{ + int rc, frc = 0; + struct dlist_entry *tmp; + struct local_prov_ep *ep; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + dlist_remove(&ep->entry); + rc = lnx_free_ep(prov, ep); + if (rc) + frc = rc; + } + + return frc; +} + +int ofi_create_link(struct fi_info *prov_list, + struct fid_fabric **fabric, + uint64_t caps, void *context) +{ + int rc; + struct fi_info *prov; + struct lnx_fabric *lnx_fab; + + lnx_fab = calloc(sizeof(*lnx_fab), 1); + if (!lnx_fab) + return -FI_ENOMEM; + + dlist_init(&lnx_fab->local_prov_table); + + /* create the fabric for the list of providers + * TODO: modify the code to work with the new data structures */ + for (prov = prov_list; prov; prov = prov->next) { + struct fi_info *info = fi_dupinfo(prov); + + if (!info) + return -FI_ENODATA; + + rc = lnx_setup_core_prov(prov, &lnx_fab->local_prov_table, + &lnx_fab->shm_prov, context); + if (rc) + return rc; + } + + rc = ofi_fabric_init(&lnx_prov, lnx_info.fabric_attr, + lnx_info.fabric_attr, + &lnx_fab->util_fabric, context); + if (rc) + return rc; + + lnx_fab->util_fabric.fabric_fid.fid.ops = &lnx_fabric_fi_ops; + lnx_fab->util_fabric.fabric_fid.ops = &lnx_fabric_ops; + *fabric = &lnx_fab->util_fabric.fabric_fid; + + return 0; +} + +int lnx_fabric_close(struct fid *fid) +{ + int rc = 0; + struct util_fabric *fabric; + struct lnx_fabric *lnx_fab; + struct local_prov *entry; + struct dlist_entry *tmp; + + fabric = container_of(fid, struct util_fabric, fabric_fid.fid); + lnx_fab = container_of(fabric, struct lnx_fabric, util_fabric); + + /* close all the open core fabrics */ + dlist_foreach_container_safe(&lnx_fab->local_prov_table, + struct local_prov, entry, lpv_entry, tmp) { + dlist_remove(&entry->lpv_entry); + rc = lnx_free_eps(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close provider %s\n", + entry->lpv_prov_name); + + free(entry); + } + + /* free mr registration pool */ + ofi_bufpool_destroy(lnx_fab->mem_reg_bp); + + rc = ofi_fabric_close(fabric); + + return rc; +} + +void ofi_link_fini(void) +{ + lnx_prov.cleanup(); +} + +LNX_INI +{ + struct ofi_bufpool_attr bp_attrs = {}; + int ret; + + fi_param_define(&lnx_prov, "prov_links", FI_PARAM_STRING, + "Specify which providers LNX will link together. Format: " + "++...+. EX: shm+cxi"); + + fi_param_define(&lnx_prov, "disable_shm", FI_PARAM_BOOL, + "Turn off SHM support. Defaults to 0"); + + fi_param_define(&lnx_prov, "srq_support", FI_PARAM_BOOL, + "Turns shared receive queue support on and off. By default it is on. " + "When SRQ is turned on some Hardware offload capability will not " + "work. 
EX: Hardware Tag matching"); + + dlist_init(&lnx_fi_info_cache); + dlist_init(&lnx_links); + dlist_init(&lnx_links_meta); + + if (!global_recv_bp) { + bp_attrs.size = sizeof(struct lnx_rx_entry); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + ret = ofi_bufpool_create_attr(&bp_attrs, &global_recv_bp); + if (ret) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create receive buffer pool"); + return NULL; + } + ofi_spin_init(&global_bplock); + } + + return &lnx_prov; +} diff --git a/prov/lnx/src/lnx_ops.c b/prov/lnx/src/lnx_ops.c new file mode 100644 index 00000000000..aae6bcf9abe --- /dev/null +++ b/prov/lnx/src/lnx_ops.c @@ -0,0 +1,993 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "ofi_lock.h" +#include "rdma/fi_ext.h" +#include "ofi_iov.h" +#include "lnx.h" + +int lnx_get_msg(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry) +{ + return -FI_ENOSYS; +} + +int lnx_queue_msg(struct fi_peer_rx_entry *entry) +{ + return -FI_ENOSYS; +} + +void lnx_free_entry(struct fi_peer_rx_entry *entry) +{ + struct lnx_rx_entry *rx_entry = (struct lnx_rx_entry *) entry; + ofi_spin_t *bplock; + + if (rx_entry->rx_global) + bplock = &global_bplock; + else + bplock = &rx_entry->rx_cep->lpe_bplock; + + ofi_spin_lock(bplock); + ofi_buf_free(rx_entry); + ofi_spin_unlock(bplock); +} + +static struct lnx_ep *lnx_get_lep(struct fid_ep *ep, struct lnx_ctx **ctx) +{ + struct lnx_ep *lep; + + if (ctx) + *ctx = NULL; + + switch (ep->fid.fclass) { + case FI_CLASS_RX_CTX: + case FI_CLASS_TX_CTX: + *ctx = container_of(ep, struct lnx_ctx, ctx_ep.fid); + lep = (*ctx)->ctx_parent; + break; + case FI_CLASS_EP: + case FI_CLASS_SEP: + lep = container_of(ep, struct lnx_ep, le_ep.ep_fid.fid); + break; + default: + lep = NULL; + } + + return lep; +} + +static struct fid_ep *lnx_get_core_ep(struct local_prov_ep *cep, int idx, + size_t fclass) +{ + switch (fclass) { + case FI_CLASS_RX_CTX: + return cep->lpe_rxc[idx]; + case FI_CLASS_TX_CTX: + return cep->lpe_txc[idx]; + case FI_CLASS_EP: + case FI_CLASS_SEP: + return cep->lpe_ep; + default: + return NULL; + } + + return NULL; +} + +static void +lnx_init_rx_entry(struct lnx_rx_entry *entry, struct iovec *iov, void **desc, + size_t count, fi_addr_t addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags) +{ + memcpy(&entry->rx_iov, iov, sizeof(*iov) * count); + if (desc) + memcpy(entry->rx_desc, desc, sizeof(*desc) * count); + + entry->rx_entry.iov = entry->rx_iov; + entry->rx_entry.desc = entry->rx_desc; + entry->rx_entry.count = count; + entry->rx_entry.addr = addr; + entry->rx_entry.context = context; + entry->rx_entry.tag = tag; + entry->rx_entry.ignore = ignore; + entry->rx_entry.flags = flags; +} + +static struct lnx_rx_entry * +get_rx_entry(struct local_prov_ep *cep, struct iovec *iov, void **desc, + size_t count, fi_addr_t addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags) +{ + struct lnx_rx_entry *rx_entry = NULL; + ofi_spin_t *bplock; + struct ofi_bufpool *bp; + + /* if lp is NULL, then we don't know where the message is going to + * come from, so allocate the rx_entry from a global pool + */ + if (!cep) { + bp = global_recv_bp; + bplock = &global_bplock; + } else { + bp = cep->lpe_recv_bp; + bplock = &cep->lpe_bplock; + } + + ofi_spin_lock(bplock); + rx_entry = (struct lnx_rx_entry *)ofi_buf_alloc(bp); + ofi_spin_unlock(bplock); + if (rx_entry) { + memset(rx_entry, 0, sizeof(*rx_entry)); + if (!cep) + rx_entry->rx_global = true; + rx_entry->rx_cep = cep; + lnx_init_rx_entry(rx_entry, iov, desc, count, addr, tag, + ignore, context, flags); + } + + return rx_entry; +} + +static inline struct lnx_rx_entry * +lnx_remove_first_match(struct lnx_queue *q, struct lnx_match_attr *match) +{ + struct lnx_rx_entry *rx_entry; + + ofi_spin_lock(&q->lq_qlock); + rx_entry = (struct lnx_rx_entry *) dlist_remove_first_match( + &q->lq_queue, q->lq_match_func, match); + ofi_spin_unlock(&q->lq_qlock); + + return rx_entry; +} + +static inline void 
+lnx_insert_rx_entry(struct lnx_queue *q, struct lnx_rx_entry *entry) +{ + ofi_spin_lock(&q->lq_qlock); + dlist_insert_tail((struct dlist_entry *)(&entry->rx_entry), + &q->lq_queue); + ofi_spin_unlock(&q->lq_qlock); +} + +int lnx_queue_tag(struct fi_peer_rx_entry *entry) +{ + struct lnx_rx_entry *rx_entry = (struct lnx_rx_entry *)entry; + struct lnx_peer_srq *lnx_srq = (struct lnx_peer_srq*)entry->owner_context; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 found\n", + entry->addr, entry->tag); + + lnx_insert_rx_entry(&lnx_srq->lps_trecv.lqp_unexq, rx_entry); + + return 0; +} + +int lnx_get_tag(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry) +{ + struct lnx_match_attr match_attr; + struct lnx_peer_srq *lnx_srq; + struct local_prov_ep *cep; + struct lnx_ep *lep; + struct lnx_rx_entry *rx_entry; + fi_addr_t addr = match->addr; + struct lnx_srx_context *srx_ctxt; + uint64_t tag = match->tag; + int rc = 0; + + /* get the endpoint */ + cep = container_of(srx, struct local_prov_ep, lpe_srx); + srx_ctxt = cep->lpe_srx.ep_fid.fid.context; + cep = srx_ctxt->srx_cep; + lep = srx_ctxt->srx_lep; + lnx_srq = &lep->le_srq; + + /* The fi_addr_t is a generic address returned by the provider. It's usually + * just an index or id in their AV table. When I get it here, I could have + * duplicates if multiple providers are using the same scheme to + * insert in the AV table. I need to be able to identify the provider + * in this function so I'm able to correctly match this message to + * a possible rx entry on my receive queue. That's why we need to make + * sure we use the core endpoint as part of the matching key. + */ + memset(&match_attr, 0, sizeof(match_attr)); + + match_attr.lm_addr = addr; + match_attr.lm_ignore = 0; + match_attr.lm_tag = tag; + match_attr.lm_cep = cep; + + /* 1. Find a matching request to the message received. + * 2. Return the receive request. + * 3. If there are no matching requests, then create a new one + * and return it to the core provider. The core provider will turn + * around and tell us to queue it. Return -FI_ENOENT. + */ + rx_entry = lnx_remove_first_match(&lnx_srq->lps_trecv.lqp_recvq, + &match_attr); + if (rx_entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 found\n", + addr, tag); + + goto assign; + } + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 not found\n", + addr, tag); + + rx_entry = get_rx_entry(cep, NULL, NULL, 0, addr, tag, 0, NULL, + lnx_ep_rx_flags(lep)); + if (!rx_entry) { + rc = -FI_ENOMEM; + goto out; + } + + rx_entry->rx_match_info = *match; + rx_entry->rx_entry.owner_context = lnx_srq; + rx_entry->rx_entry.msg_size = match->msg_size; + + rc = -FI_ENOENT; + +assign: + rx_entry->rx_entry.msg_size = MIN(rx_entry->rx_entry.msg_size, + match->msg_size); + *entry = &rx_entry->rx_entry; + +out: + return rc; +} + +/* + * if lp is NULL, then we're attempting to receive from any peer so + * matching the tag is the only thing that matters. + * + * if lp != NULL, then we're attempting to receive from a particular + * peer. This peer can have multiple endpoints serviced by different core + * providers. + * + * Therefore when we check the unexpected queue, we need to check + * if we received any messages from any of the peer's addresses. If we + * find one, then we kick the core provider associated with that + * address to receive the message. 
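+ *
+ * Rough flow for a tagged receive, as a sketch of the code below (not an
+ * exhaustive description): lnx_trecv()/lnx_trecvv()/lnx_trecvmsg() ->
+ * lnx_process_recv() -> lnx_remove_first_match() on lqp_unexq. On a match,
+ * the owning core endpoint's peer_ops->start_tag()/start_msg() is invoked;
+ * otherwise the request is queued on lqp_recvq for a later lnx_get_tag()
+ * to find.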
+ *
+ * If nothing is found on the unexpected messages, then add a receive
+ * request on the SRQ; this is done in lnx_process_recv().
+ */
+static int lnx_process_recv(struct lnx_ep *lep, struct iovec *iov, void **desc,
+		fi_addr_t addr, size_t count, struct lnx_peer *lp, uint64_t tag,
+		uint64_t ignore, void *context, uint64_t flags,
+		bool tagged)
+{
+	struct lnx_peer_srq *lnx_srq = &lep->le_srq;
+	struct local_prov_ep *cep;
+	struct lnx_rx_entry *rx_entry;
+	struct lnx_match_attr match_attr;
+	int rc = 0;
+
+	match_attr.lm_addr = addr;
+	match_attr.lm_ignore = ignore;
+	match_attr.lm_tag = tag;
+	match_attr.lm_cep = NULL;
+	match_attr.lm_peer = lp;
+
+	/* if SRQ support is turned off, don't go down the SRQ path */
+	if (!lep->le_domain->ld_srx_supported)
+		return -FI_ENOSYS;
+
+	rx_entry = lnx_remove_first_match(&lnx_srq->lps_trecv.lqp_unexq,
+					  &match_attr);
+	if (!rx_entry) {
+		FI_DBG(&lnx_prov, FI_LOG_CORE,
+		       "addr=%lx tag=%lx ignore=%lx buf=%p len=%lx not found\n",
+		       addr, tag, ignore, iov->iov_base, iov->iov_len);
+
+		goto nomatch;
+	}
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "addr=%lx tag=%lx ignore=%lx buf=%p len=%lx found\n",
+	       addr, tag, ignore, iov->iov_base, iov->iov_len);
+
+	cep = rx_entry->rx_cep;
+
+	/* a match was found on the unexpected queue. Call into the core
+	 * provider to complete this message
+	 */
+	lnx_init_rx_entry(rx_entry, iov, desc, count, addr, tag, ignore,
+			  context, lnx_ep_rx_flags(lep));
+	rx_entry->rx_entry.msg_size = MIN(ofi_total_iov_len(iov, count),
+					  rx_entry->rx_entry.msg_size);
+	if (tagged)
+		rc = cep->lpe_srx.peer_ops->start_tag(&rx_entry->rx_entry);
+	else
+		rc = cep->lpe_srx.peer_ops->start_msg(&rx_entry->rx_entry);
+
+	if (rc == -FI_EINPROGRESS) {
+		/* the core provider indicates that more messages can match
+		 * the same rx_entry, so keep it on the queue
+		 */
+		FI_DBG(&lnx_prov, FI_LOG_CORE,
+		       "addr = %lx tag = %lx ignore = %lx start_tag() in progress\n",
+		       addr, tag, ignore);
+
+		goto insert_recvq;
+	} else if (rc) {
+		FI_WARN(&lnx_prov, FI_LOG_CORE, "start tag/msg failed with %d\n", rc);
+	}
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "addr = %lx tag = %lx ignore = %lx start_tag() success\n",
+	       addr, tag, ignore);
+
+	return 0;
+
+nomatch:
+	/* nothing was found on the unexpected queue, so allocate a new
+	 * rx_entry and put it on the receive queue
+	 */
+	rx_entry = get_rx_entry(NULL, iov, desc, count, addr, tag, ignore,
+				context, lnx_ep_rx_flags(lep));
+	if (!rx_entry) {
+		rc = -FI_ENOMEM;
+		goto out;
+	}
+	rx_entry->rx_entry.msg_size = ofi_total_iov_len(iov, count);
+	rx_entry->rx_peer = lp;
+
+insert_recvq:
+	lnx_insert_rx_entry(&lnx_srq->lps_trecv.lqp_recvq, rx_entry);
+
+out:
+	return rc;
+}
+
+ssize_t lnx_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc,
+		fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct local_prov_ep *cep = NULL;
+	fi_addr_t core_addr = FI_ADDR_UNSPEC;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct iovec iov = {.iov_base = buf, .iov_len = len};
+	struct lnx_peer *lp;
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lnx_get_core_desc(desc, &mem_desc);
+
+	/* addr is an index into the peer table.
+	 * This gets us to a peer. Each peer can be reachable on
+	 * multiple endpoints. Each endpoint has its own fi_addr_t which is
+	 * core provider specific.
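+	 * For example (illustrative only): src_addr 3 indexes
+	 * lpt_entries[3]; that peer may be reachable over both shm and
+	 * cxi, each with its own core fi_addr_t. lnx_select_recv_pathway()
+	 * below picks the core endpoint (cep) and the translated core
+	 * address (core_addr) to use.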
+	 */
+	lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr);
+	if (lp) {
+		rc = lnx_select_recv_pathway(lp, lep->le_domain, desc, &cep,
+					     &core_addr, &iov, 1, &mem_desc);
+		if (rc)
+			return rc;
+	}
+
+	rc = lnx_process_recv(lep, &iov, &mem_desc, src_addr, 1, lp, tag, ignore,
+			      context, 0, true);
+	if (rc == -FI_ENOSYS)
+		goto do_recv;
+	else if (rc)
+		FI_WARN(&lnx_prov, FI_LOG_CORE, "lnx_process_recv failed with %d\n", rc);
+
+	return rc;
+
+do_recv:
+	if (lp)
+		rc = fi_trecv(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, ignore, context);
+
+	return rc;
+}
+
+ssize_t lnx_trecvv(struct fid_ep *ep, const struct iovec *iov, void **desc,
+		size_t count, fi_addr_t src_addr, uint64_t tag, uint64_t ignore,
+		void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct local_prov_ep *cep = NULL;
+	fi_addr_t core_addr = FI_ADDR_UNSPEC;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct lnx_peer *lp;
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+	lnx_get_core_desc(*desc, &mem_desc);
+
+	lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr);
+	if (lp) {
+		rc = lnx_select_recv_pathway(lp, lep->le_domain, *desc, &cep,
+					     &core_addr, iov, count, &mem_desc);
+		if (rc)
+			return rc;
+	}
+
+	rc = lnx_process_recv(lep, (struct iovec *)iov, &mem_desc, src_addr,
+			      count, lp, tag, ignore, context, 0, true);
+	if (rc == -FI_ENOSYS)
+		goto do_recv;
+
+	return rc;
+
+do_recv:
+	if (lp)
+		rc = fi_trecvv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, ignore, context);
+
+	return rc;
+}
+
+ssize_t lnx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg,
+		uint64_t flags)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct local_prov_ep *cep = NULL;
+	fi_addr_t core_addr = FI_ADDR_UNSPEC;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct lnx_peer *lp;
+	struct fi_msg_tagged core_msg;
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_get_peer(peer_tbl->lpt_entries, msg->addr);
+	if (lp) {
+		rc = lnx_select_recv_pathway(lp, lep->le_domain, *msg->desc,
+					     &cep, &core_addr, msg->msg_iov,
+					     msg->iov_count, &mem_desc);
+		if (rc)
+			return rc;
+	}
+	lnx_get_core_desc(*msg->desc, &mem_desc);
+
+	rc = lnx_process_recv(lep, (struct iovec *)msg->msg_iov, &mem_desc,
+			      msg->addr, msg->iov_count, lp, msg->tag, msg->ignore,
+			      msg->context, flags, true);
+	if (rc == -FI_ENOSYS)
+		goto do_recv;
+
+	return rc;
+
+do_recv:
+	if (lp) {
+		memcpy(&core_msg, msg, sizeof(*msg));
+
+		core_msg.desc = mem_desc;
+		core_msg.addr = core_addr;
+
+		rc = fi_trecvmsg(cep->lpe_ep, &core_msg, flags);
+	}
+
+	return rc;
+}
+
+ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc,
+		fi_addr_t dest_addr, uint64_t tag, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct local_prov_ep *cep;
+	fi_addr_t core_addr;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct iovec iov = {.iov_base = (void*) buf, .iov_len = len};
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr],
+				     lep->le_domain, desc, &cep,
+				     &core_addr, &iov, 1, &mem_desc, NULL);
+	if (rc)
+		return rc;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "sending to %lx tag %lx buf %p len %ld\n",
+	       core_addr, tag, buf, len);
+
+	rc = fi_tsend(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, context);
+
+	return rc;
+}
+
+ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc,
+		   size_t count, fi_addr_t 
dest_addr, uint64_t tag, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, (desc) ? *desc : NULL, &cep, + &core_addr, iov, count, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx\n", core_addr, tag); + + rc = fi_tsendv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, context); + + return rc; +} + +ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct fi_msg_tagged core_msg; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[msg->addr], + lep->le_domain, + (msg->desc) ? *msg->desc : NULL, &cep, + &core_addr, msg->msg_iov, + msg->iov_count, &mem_desc, NULL); + if (rc) + return rc; + + memcpy(&core_msg, msg, sizeof(*msg)); + + core_msg.desc = mem_desc; + core_msg.addr = core_addr; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx\n", core_msg.addr, core_msg.tag); + + rc = fi_tsendmsg(cep->lpe_ep, &core_msg, flags); + + return rc; +} + +ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, NULL, &cep, + &core_addr, NULL, 0, NULL, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tinject(cep->lpe_ep, buf, len, core_addr, tag); + + return rc; +} + +ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, + uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tsenddata(cep->lpe_ep, buf, len, mem_desc, + data, core_addr, tag, context); + + return rc; +} + +ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t tag) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, NULL, &cep, + &core_addr, NULL, 0, NULL, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx 
buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tinjectdata(cep->lpe_ep, buf, len, data, core_addr, tag); + + return rc; +} + +static inline ssize_t +lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[src_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "rma read from %lx key %lx buf %p len %ld\n", + core_addr, key, buf, len); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_read(core_ep, buf, len, mem_desc, + core_addr, addr, key, context); + +out: + return rc; +} + +static inline ssize_t +lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "rma write to %lx key %lx buf %p len %ld\n", + core_addr, key, buf, len); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_write(core_ep, buf, len, mem_desc, + core_addr, addr, key, context); + +out: + return rc; +} + +static inline ssize_t +lnx_atomic_write(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx\n", core_addr); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_atomic(core_ep, buf, count, mem_desc, + core_addr, addr, key, datatype, op, context); + +out: + return rc; +} + +static inline ssize_t +lnx_atomic_readwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t 
rkey; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, result_desc, &cep, &core_addr, &iov, 1, + &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx\n", core_addr); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_fetch_atomic(core_ep, buf, count, desc, + result, mem_desc, core_addr, addr, key, + datatype, op, context); + +out: + return rc; +} + +static inline ssize_t +lnx_atomic_compwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, result_desc, &cep, &core_addr, &iov, 1, + &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx\n", core_addr); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_compare_atomic(core_ep, buf, count, desc, + compare, compare_desc, result, mem_desc, + core_addr, addr, key, datatype, op, context); + +out: + return rc; +} + +struct fi_ops_tagged lnx_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = lnx_trecv, + .recvv = lnx_trecvv, + .recvmsg = lnx_trecvmsg, + .send = lnx_tsend, + .sendv = lnx_tsendv, + .sendmsg = lnx_tsendmsg, + .inject = lnx_tinject, + .senddata = lnx_tsenddata, + .injectdata = lnx_tinjectdata, +}; + +struct fi_ops_msg lnx_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +struct fi_ops_rma lnx_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = lnx_rma_read, + .readv = fi_no_rma_readv, + .readmsg = fi_no_rma_readmsg, + .write = lnx_rma_write, + .writev = fi_no_rma_writev, + .writemsg = fi_no_rma_writemsg, + .inject = fi_no_rma_inject, + .writedata = fi_no_rma_writedata, + .injectdata = fi_no_rma_injectdata, +}; + +struct fi_ops_atomic lnx_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = lnx_atomic_write, + .writev = fi_no_atomic_writev, + .writemsg = fi_no_atomic_writemsg, + .inject = fi_no_atomic_inject, + .readwrite = lnx_atomic_readwrite, + .readwritev = fi_no_atomic_readwritev, + .readwritemsg = fi_no_atomic_readwritemsg, + .compwrite = lnx_atomic_compwrite, + .compwritev = fi_no_atomic_compwritev, + .compwritemsg = fi_no_atomic_compwritemsg, + .writevalid = fi_no_atomic_writevalid, + .readwritevalid = fi_no_atomic_readwritevalid, + .compwritevalid = fi_no_atomic_compwritevalid, +}; + + diff --git a/prov/lnx/src/lnx_peers.c b/prov/lnx/src/lnx_peers.c new file mode 100644 index 00000000000..f2bbd24249f --- /dev/null +++ b/prov/lnx/src/lnx_peers.c 
@@ -0,0 +1,698 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +static void lnx_free_peer(struct lnx_peer *lp) +{ + struct lnx_peer_prov *lpp; + struct dlist_entry *tmp, *tmp2; + struct lnx_local2peer_map *lpm; + + dlist_foreach_container_safe(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry, tmp) { + dlist_foreach_container_safe(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry, tmp2) { + dlist_remove(&lpm->entry); + free(lpm); + } + dlist_remove(&lpp->entry); + free(lpp); + } + + free(lp); +} + +#if ENABLE_DEBUG +static void lnx_print_peer(int idx, struct lnx_peer *lp) +{ + int k; + struct lnx_peer_prov *lpp; + struct lnx_local2peer_map *lpm; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "%d: lnx_peer[%d] is %s\n", getpid(), idx, + (lp->lp_local) ? 
"local" : "remote"); + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + "%d: peer[%p] provider %s\n", getpid(), lpp, + lpp->lpp_prov_name); + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + " %d: peer has %d mapped addrs\n", + getpid(), lpm->addr_count); + for (k = 0; k < lpm->addr_count; k++) + FI_DBG(&lnx_prov, FI_LOG_CORE, + " %d: addr = %lu\n", + getpid(), lpm->peer_addrs[k]); + } + } +} +#endif /* ENABLE_DEBUG */ + +static int lnx_peer_insert(struct lnx_peer_table *tbl, + struct lnx_peer *lp) +{ + int i; + + if (tbl->lpt_max_count == 0 || + tbl->lpt_count >= tbl->lpt_max_count) + return -FI_ENOENT; + + for (i = 0; i < tbl->lpt_max_count; i++) { + if (!tbl->lpt_entries[i]) { + tbl->lpt_entries[i] = lp; +#if ENABLE_DEBUG + lnx_print_peer(i, lp); +#endif + tbl->lpt_count++; + return i; + } + } + + return -FI_ENOENT; +} + +static int lnx_peer_av_remove(struct lnx_peer *lp) +{ + int rc, frc = 0; + struct lnx_peer_prov *lpp; + struct lnx_local2peer_map *lpm; + + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry) { + /* if this is a remote peer then we didn't insert its shm address + * into our local shm endpoint, so no need to remove it + */ + if (!strncasecmp(lpp->lpp_prov_name, "shm", 3) && + !lp->lp_local) + continue; + + /* remove these address from all local providers */ + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry) { + if (lpm->addr_count > 0) { + rc = fi_av_remove(lpm->local_ep->lpe_av, lpm->peer_addrs, + lpm->addr_count, lpp->lpp_flags); + if (rc) + frc = rc; + } + } + } + + return frc; +} + +static int lnx_peer_remove(struct lnx_peer_table *tbl, int idx) +{ + struct lnx_peer *lp = tbl->lpt_entries[idx]; + int rc = 0; + + if (!lp) + return 0; + + rc = lnx_peer_av_remove(lp); + + tbl->lpt_entries[idx] = NULL; + tbl->lpt_count--; + + return rc; +} + +static int lnx_cleanup_avs(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_close(&ep->lpe_av->fid); + if (rc) + frc = rc; + } + + return frc; +} + +static inline void lnx_free_peer_tbl(struct lnx_peer_table *peer_tbl) +{ + free(peer_tbl->lpt_entries); + free(peer_tbl); +} + +int lnx_av_close(struct fid *fid) +{ + int rc; + struct local_prov *entry; + struct lnx_fabric *fabric; + struct lnx_peer_table *peer_tbl; + + peer_tbl = container_of(fid, struct lnx_peer_table, lpt_av.av_fid.fid); + fabric = peer_tbl->lpt_domain->ld_fabric; + + /* walk through the rest of the core providers and open their + * respective address vector tables + */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_avs(entry); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to close av for %s\n", + entry->lpv_prov_name); + } + } + + ofi_av_close_lightweight(&peer_tbl->lpt_av); + + free(peer_tbl); + + return 0; +} + +static struct fi_ops lnx_av_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_av_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static int lnx_get_or_create_peer_prov(struct dlist_entry *prov_table, + struct lnx_peer *lp, char *prov_name, + struct lnx_peer_prov **lpp) +{ + bool shm = false; + struct local_prov *entry; + struct lnx_peer_prov *peer_prov; + + if (!strcmp(prov_name, "shm")) { + if (lp->lp_shm_prov) + return 
-FI_ENOENT; + shm = true; + goto insert_prov; + } + + /* check if we already have a peer provider */ + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, peer_prov, entry) { + if (!strncasecmp(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX)) { + *lpp = peer_prov; + return 0; + } + } + +insert_prov: + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + if (!strncasecmp(entry->lpv_prov_name, prov_name, FI_NAME_MAX)) { + peer_prov = calloc(sizeof(*peer_prov), 1); + if (!peer_prov) + return -FI_ENOMEM; + + dlist_init(&peer_prov->entry); + dlist_init(&peer_prov->lpp_map); + + strncpy(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX); + + peer_prov->lpp_prov = entry; + + if (shm) + lp->lp_shm_prov = peer_prov; + else + dlist_insert_tail(&peer_prov->entry, &lp->lp_provs); + + *lpp = peer_prov; + return 0; + } + } + + return -FI_ENOENT; +} + +static inline struct lnx_address_prov * +next_prov(struct lnx_address_prov *prov) +{ + uint8_t *ptr; + + ptr = (uint8_t*) prov; + + ptr += (sizeof(*prov) + (prov->lap_addr_count * prov->lap_addr_size)); + + return (struct lnx_address_prov*)ptr; +} + +static inline size_t +get_lnx_addresses_size(struct lnx_addresses *addrs) +{ + int i; + size_t s = sizeof(*addrs); + struct lnx_address_prov *prov; + + prov = addrs->la_addr_prov; + for (i = 0; i < addrs->la_prov_count; i++) { + s += sizeof(*prov) + (prov->lap_addr_count * prov->lap_addr_size); + prov = next_prov(prov); + } + + return s; +} + +static inline struct lnx_addresses * +next_peer(struct lnx_addresses *addrs) +{ + uint8_t *ptr; + + ptr = (uint8_t*)addrs + get_lnx_addresses_size(addrs); + + return (struct lnx_addresses *)ptr; +} + +static struct lnx_address_prov * +lnx_get_peer_shm_addr(struct lnx_addresses *addrs) +{ + int i; + struct lnx_address_prov *prov; + + prov = addrs->la_addr_prov; + for (i = 0; i < addrs->la_prov_count; i++) { + if (!strcmp(prov->lap_prov, "shm")) + return prov; + prov = next_prov(prov); + } + + return NULL; +} + +static int is_local_addr(struct local_prov **shm_prov, struct lnx_addresses *la) +{ + int rc; + char hostname[FI_NAME_MAX]; + struct lnx_address_prov *lap_shm; + + /* check the hostname and compare it to mine + * TODO: Is this good enough? or do we need a better way of + * determining if the address is local? 
+ */ + rc = gethostname(hostname, FI_NAME_MAX); + if (rc == -1) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "failed to get hostname\n"); + return -FI_EPERM; + } + + lap_shm = lnx_get_peer_shm_addr(la); + if (!lap_shm) + return -FI_EOPNOTSUPP; + + /* Shared memory address not provided or not local*/ + if ((lap_shm->lap_addr_count == 0) || + strncasecmp(hostname, la->la_hostname, FI_NAME_MAX)) + return -FI_EOPNOTSUPP; + + /* badly formed address */ + if (*shm_prov && (lap_shm->lap_addr_count > 1 || + lap_shm->lap_addr_count < 0)) + return -FI_EPROTO; + + return 0; +} + +static void +lnx_update_msg_entries(struct lnx_qpair *qp, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)) +{ + struct lnx_queue *q = &qp->lqp_unexq; + struct lnx_rx_entry *rx_entry; + struct dlist_entry *item; + + ofi_spin_lock(&q->lq_qlock); + dlist_foreach(&q->lq_queue, item) { + rx_entry = (struct lnx_rx_entry *) item; + rx_entry->rx_entry.addr = get_addr(&rx_entry->rx_entry); + } + ofi_spin_unlock(&q->lq_qlock); +} + +void +lnx_foreach_unspec_addr(struct fid_peer_srx *srx, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)) +{ + struct lnx_srx_context *ctxt; + + ctxt = (struct lnx_srx_context *) srx->ep_fid.fid.context; + + lnx_update_msg_entries(&ctxt->srx_lep->le_srq.lps_trecv, get_addr); + lnx_update_msg_entries(&ctxt->srx_lep->le_srq.lps_recv, get_addr); +} + +static int lnx_peer_map_addrs(struct dlist_entry *prov_table, + struct lnx_peer *lp, struct lnx_addresses *la, + uint64_t flags, void *context) +{ + int i, j, rc; + struct lnx_peer_prov *lpp; + struct lnx_address_prov *lap; + struct local_prov_ep *lpe; + struct dlist_entry *eps; + + lap = &la->la_addr_prov[0]; + + for (i = 0; i < la->la_prov_count; i++) { + if (lap->lap_addr_count > LNX_MAX_LOCAL_EPS) + return -FI_EPROTO; + + rc = lnx_get_or_create_peer_prov(prov_table, lp, lap->lap_prov, + &lpp); + if (rc) + return rc; + + lpp->lpp_flags = flags; + + eps = &lpp->lpp_prov->lpv_prov_eps; + dlist_foreach_container(eps, struct local_prov_ep, lpe, + entry) { + struct lnx_local2peer_map *lpm; + + /* if this is a remote peer, don't insert the shm address + * since we will never talk to that peer over shm + */ + if (!strncasecmp(lpe->lpe_fabric_name, "shm", 3) && + !lp->lp_local) + continue; + + lpm = calloc(sizeof(*lpm), 1); + if (!lpm) + return -FI_ENOMEM; + + dlist_init(&lpm->entry); + dlist_insert_tail(&lpm->entry, &lpp->lpp_map); + + lpm->local_ep = lpe; + lpm->addr_count = lap->lap_addr_count; + for (j = 0; j < LNX_MAX_LOCAL_EPS; j++) + lpm->peer_addrs[j] = FI_ADDR_NOTAVAIL; + /* fi_av_insert returns the number of addresses inserted */ + rc = fi_av_insert(lpe->lpe_av, (void*)lap->lap_addrs, + lap->lap_addr_count, + lpm->peer_addrs, flags, context); + if (rc < 0) + return rc; + + /* should only insert the number of addresses indicated */ + assert(rc == lap->lap_addr_count); + } + + lap = next_prov(lap); + } + + return 0; +} + +/* + * count: number of LNX addresses + * addr: an array of addresses + * fi_addr: an out array of fi_addr)t + * + * Each LNX address can have multiple core provider addresses + * Check the hostname provided in each address to see if it's the same as + * me. If so, then we'll use the SHM address if available. + * + * ASSUMPTION: fi_av_insert() is called exactly once per peer. + * We're not handling multiple av_inserts on the same peer. If that + * happens then we will create multiple peers entries. 
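+ *
+ * Rough shape of one serialized entry, as implied by next_prov() and
+ * next_peer() above (field names from the code; layout shown only for
+ * illustration):
+ *   struct lnx_addresses:
+ *     la_hostname, la_prov_count,
+ *     la_addr_prov[la_prov_count]:
+ *       lap_prov, lap_addr_count, lap_addr_size,
+ *       lap_addrs[lap_addr_count * lap_addr_size]
+ * Consecutive peers are walked with next_peer(), and each provider's
+ * address block with next_prov().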
+ */
+int lnx_av_insert(struct fid_av *av, const void *addr, size_t count,
+		  fi_addr_t *fi_addr, uint64_t flags, void *context)
+{
+	int i, rc, idx;
+	int disable_shm = 0;
+	struct lnx_peer *lp;
+	struct dlist_entry *prov_table;
+	struct lnx_peer_table *peer_tbl;
+	struct lnx_addresses *addrs = (struct lnx_addresses *)addr;
+
+	fi_param_get_bool(&lnx_prov, "disable_shm", &disable_shm);
+
+	peer_tbl = container_of(av, struct lnx_peer_table, lpt_av.av_fid.fid);
+	prov_table = &peer_tbl->lpt_domain->ld_fabric->local_prov_table;
+
+	/* each entry represents a separate peer */
+	struct lnx_addresses *la = addrs;
+	for (i = 0; i < count; i++) {
+		/* can't have more providers than LNX_MAX_LOCAL_EPS */
+		if (la->la_prov_count >= LNX_MAX_LOCAL_EPS ||
+		    la->la_prov_count <= 0)
+			return -FI_EPROTO;
+
+		/* allocate a new peer entry; whether it's local is
+		 * determined below
+		 */
+		lp = calloc(sizeof(*lp), 1);
+		if (!lp)
+			return -FI_ENOMEM;
+
+		dlist_init(&lp->lp_provs);
+
+		rc = is_local_addr(&peer_tbl->lpt_domain->ld_fabric->shm_prov,
+				   la);
+		if (!rc) {
+			lp->lp_local = !disable_shm;
+		} else if (rc == -FI_EOPNOTSUPP) {
+			lp->lp_local = false;
+		} else if (rc) {
+			FI_INFO(&lnx_prov, FI_LOG_CORE, "failed to identify address\n");
+			free(lp);
+			return rc;
+		}
+
+		rc = lnx_peer_map_addrs(prov_table, lp, la, flags, context);
+		if (rc) {
+			free(lp);
+			return rc;
+		}
+
+		idx = lnx_peer_insert(peer_tbl, lp);
+		if (idx < 0) {
+			rc = lnx_peer_av_remove(lp);
+			lnx_free_peer(lp);
+			FI_INFO(&lnx_prov, FI_LOG_CORE,
+				"Peer table size exceeded. Removed = %d\n", rc);
+			return -FI_ENOENT;
+		}
+
+		fi_addr[i] = (fi_addr_t) idx;
+
+		la = next_peer(la);
+	}
+
+	return i;
+}
+
+int lnx_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count,
+		  uint64_t flags)
+{
+	struct lnx_peer_table *peer_tbl;
+	int frc = 0, rc, i;
+
+	peer_tbl = container_of(av, struct lnx_peer_table, lpt_av.av_fid.fid);
+
+	for (i = 0; i < count; i++) {
+		rc = lnx_peer_remove(peer_tbl, (int)fi_addr[i]);
+		if (rc)
+			frc = rc;
+	}
+
+	return frc;
+}
+
+static const char *
+lnx_av_straddr(struct fid_av *av, const void *addr,
+	       char *buf, size_t *len)
+{
+	/* TODO: implement */
+	return NULL;
+}
+
+static int
+lnx_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr,
+	      size_t *addrlen)
+{
+	/* TODO: implement */
+	return -FI_EOPNOTSUPP;
+}
+
+static struct fi_ops_av lnx_av_ops = {
+	.size = sizeof(struct fi_ops_av),
+	.insert = lnx_av_insert,
+	.remove = lnx_av_remove,
+	.insertsvc = fi_no_av_insertsvc,
+	.insertsym = fi_no_av_insertsym,
+	.lookup = lnx_av_lookup,
+	.straddr = lnx_av_straddr,
+};
+
+static void lnx_get_core_av_attr(struct local_prov_ep *ep,
+				 struct fi_av_attr *attr)
+{
+	memset(attr, 0, sizeof(*attr));
+	attr->type = ep->lpe_fi_info->domain_attr->av_type;
+}
+
+static int lnx_open_avs(struct local_prov *prov, struct fi_av_attr *attr,
+			void *context)
+{
+	int rc = 0;
+	struct local_prov_ep *ep;
+	struct fi_av_attr core_attr;
+
+	dlist_foreach_container(&prov->lpv_prov_eps,
+				struct local_prov_ep, ep, entry) {
+		lnx_get_core_av_attr(ep, &core_attr);
+		if (ep->lpe_local)
+			core_attr.count = ep->lpe_fi_info->domain_attr->ep_cnt;
+		else
+			core_attr.count = attr->count;
+		rc = fi_av_open(ep->lpe_domain, &core_attr,
+				&ep->lpe_av, context);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
+		struct fid_av **av, void *context)
+{
+	struct lnx_fabric *fabric;
+	struct lnx_domain *lnx_domain;
+	struct lnx_peer_table *peer_tbl;
+	struct local_prov *entry;
+	size_t table_sz = LNX_DEF_AV_SIZE;
+	int rc = 0;
+
+	if (!attr)
+		
return -FI_EINVAL; + + if (attr->name) + return -FI_ENOSYS; + + if (attr->type == FI_AV_UNSPEC) + attr->type = FI_AV_TABLE; + + peer_tbl = calloc(sizeof(*peer_tbl), 1); + if (!peer_tbl) + return -FI_ENOMEM; + + if (attr->count != 0) + table_sz = attr->count; + + peer_tbl->lpt_entries = + calloc(sizeof(struct lnx_peer *) * table_sz, 1); + if (!peer_tbl->lpt_entries) { + rc = -FI_ENOMEM; + goto failed; + } + + lnx_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid.fid); + fabric = lnx_domain->ld_fabric; + + rc = ofi_av_init_lightweight(&lnx_domain->ld_domain, attr, + &peer_tbl->lpt_av, context); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "failed to initialize AV: %d\n", rc); + goto failed; + } + + peer_tbl->lpt_max_count = table_sz; + peer_tbl->lpt_domain = lnx_domain; + peer_tbl->lpt_av.av_fid.fid.ops = &lnx_av_fi_ops; + peer_tbl->lpt_av.av_fid.ops = &lnx_av_ops; + + assert(fabric->lnx_peer_tbl == NULL); + + /* need this to handle memory registration vi fi_mr_regattr(). We need + * to be able to access the peer table to determine which endpoint + * we'll be using based on the source/destination address */ + fabric->lnx_peer_tbl = peer_tbl; + + /* walk through the rest of the core providers and open their + * respective address vector tables + */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_open_avs(entry, attr, context); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to initialize domain for %s\n", + entry->lpv_prov_name); + goto close; + } + } + + *av = &peer_tbl->lpt_av.av_fid; + + return 0; + +close: + ofi_av_close_lightweight(&peer_tbl->lpt_av); +failed: + lnx_free_peer_tbl(peer_tbl); + return rc; +} + + diff --git a/prov/shm/src/smr_init.c b/prov/shm/src/smr_init.c index 3f2c6656637..a220f14cd27 100644 --- a/prov/shm/src/smr_init.c +++ b/prov/shm/src/smr_init.c @@ -171,6 +171,9 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service, cur->ep_attr->max_order_waw_size = 0; cur->ep_attr->max_order_war_size = 0; } + + if (mr_mode & FI_MR_PROV_KEY) + cur->domain_attr->mr_mode |= FI_MR_PROV_KEY; } return 0; } diff --git a/prov/util/src/util_attr.c b/prov/util/src/util_attr.c index 634af1e5e82..c499d88568b 100644 --- a/prov/util/src/util_attr.c +++ b/prov/util/src/util_attr.c @@ -93,7 +93,7 @@ char *ofi_strdup_tail(const char *str) } */ -char *ofi_strdup_append(const char *head, const char *tail) +char *ofi_strdup_append_work(const char *head, const char *tail, char delim) { char *str; size_t len; @@ -101,10 +101,20 @@ char *ofi_strdup_append(const char *head, const char *tail) len = strlen(head) + strlen(tail) + 2; str = malloc(len); if (str) - sprintf(str, "%s%c%s", head, OFI_NAME_DELIM, tail); + sprintf(str, "%s%c%s", head, delim, tail); return str; } +char *ofi_strdup_link_append(const char *head, const char *tail) +{ + return ofi_strdup_append_work(head, tail, OFI_NAME_LNX_DELIM); +} + +char *ofi_strdup_append(const char *head, const char *tail) +{ + return ofi_strdup_append_work(head, tail, OFI_NAME_DELIM); +} + int ofi_exclude_prov_name(char **prov_name_list, const char *util_prov_name) { char *exclude, *name, *temp; diff --git a/prov/util/src/util_mr_cache.c b/prov/util/src/util_mr_cache.c index f2148e56267..c22953f497f 100644 --- a/prov/util/src/util_mr_cache.c +++ b/prov/util/src/util_mr_cache.c @@ -55,7 +55,12 @@ static int util_mr_find_within(struct ofi_rbmap *map, void *key, void *data) { struct ofi_mr_entry *entry = data; struct ofi_mr_info *info = key; - +/* + 
fprintf(stderr, "%d: Compare peer_id %ld-%ld addr %p:%ld-%p:%ld\n",
+		getpid(), info->peer_id, entry->info.peer_id,
+		info->iov.iov_base, info->iov.iov_len,
+		entry->info.iov.iov_base, entry->info.iov.iov_len);
+*/
 	if (info->peer_id < entry->info.peer_id)
 		return -1;
 	if (info->peer_id > entry->info.peer_id)
diff --git a/src/fabric.c b/src/fabric.c
index b1a735638bb..c876a68b929 100644
--- a/src/fabric.c
+++ b/src/fabric.c
@@ -47,6 +47,7 @@
 #include "ofi_util.h"
 #include "ofi.h"
 #include "ofi_str.h"
+#include "ofi_lnx.h"
 #include "ofi_prov.h"
 #include "ofi_perf.h"
 #include "ofi_hmem.h"
@@ -58,7 +59,6 @@
 #include 
 #endif
 
-
 struct ofi_prov {
 	struct ofi_prov *next;
 	char *prov_name;
@@ -262,6 +262,11 @@ static int ofi_is_hook_prov(const struct fi_provider *provider)
 	return ofi_prov_ctx(provider)->type == OFI_PROV_HOOK;
 }
 
+static int ofi_is_lnx_prov(const struct fi_provider *provider)
+{
+	return ofi_prov_ctx(provider)->type == OFI_PROV_LNX;
+}
+
 int ofi_apply_filter(struct ofi_filter *filter, const char *name)
 {
 	if (!filter->names)
@@ -500,6 +505,8 @@ static void ofi_set_prov_type(struct fi_provider *provider)
 		ofi_prov_ctx(provider)->type = OFI_PROV_UTIL;
 	else if (ofi_has_offload_prefix(provider->name))
 		ofi_prov_ctx(provider)->type = OFI_PROV_OFFLOAD;
+	else if (ofi_is_lnx(provider->name))
+		ofi_prov_ctx(provider)->type = OFI_PROV_LNX;
 	else
 		ofi_prov_ctx(provider)->type = OFI_PROV_CORE;
 }
@@ -988,6 +995,7 @@ void fi_ini(void)
 
 	ofi_register_provider(SOCKETS_INIT, NULL);
 	ofi_register_provider(TCP_INIT, NULL);
+	ofi_register_provider(LNX_INIT, NULL);
 
 	ofi_register_provider(HOOK_PERF_INIT, NULL);
 	ofi_register_provider(HOOK_TRACE_INIT, NULL);
 	ofi_register_provider(HOOK_PROFILE_INIT, NULL);
@@ -1207,8 +1215,12 @@ static void ofi_set_prov_attr(struct fi_fabric_attr *attr,
 
 	core_name = attr->prov_name;
 	if (core_name) {
-		assert(ofi_is_util_prov(prov));
-		attr->prov_name = ofi_strdup_append(core_name, prov->name);
+		if (ofi_is_util_prov(prov))
+			attr->prov_name = ofi_strdup_append(core_name, prov->name);
+		else if (ofi_is_lnx_prov(prov))
+			attr->prov_name = ofi_strdup_link_append(core_name, prov->name);
+		else
+			assert(0);
 		free(core_name);
 	} else {
 		attr->prov_name = strdup(prov->name);
@@ -1541,6 +1553,26 @@ struct fi_info *DEFAULT_SYMVER_PRE(fi_dupinfo)(const struct fi_info *info)
 }
 DEFAULT_SYMVER(fi_dupinfo_, fi_dupinfo, FABRIC_1.8);
 
+__attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
+int DEFAULT_SYMVER_PRE(fi_link)(struct fi_info *prov_list,
+		struct fid_fabric **fabric, uint64_t caps, void *context)
+{
+	/* count number of providers in the list */
+	int num_prov = 0;
+	struct fi_info *info;
+
+	for (info = prov_list; info; info = info->next)
+		num_prov++;
+
+	if (num_prov == 1) {
+		return fi_fabric(prov_list->fabric_attr, fabric, context);
+	}
+
+	/* create a link between providers in the list */
+	return ofi_create_link(prov_list, fabric, caps, context);
+}
+DEFAULT_SYMVER(fi_link_, fi_link, FABRIC_1.7);
+
 __attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
 int DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr,
 		struct fid_fabric **fabric, void *context)
@@ -1557,7 +1589,10 @@ int DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr,
 
 	fi_ini();
 
-	top_name = strrchr(attr->prov_name, OFI_NAME_DELIM);
+	ret = ofi_is_linked(attr->prov_name);
+	top_name = strrchr(attr->prov_name,
+			   ret ? 
+ OFI_NAME_LNX_DELIM : OFI_NAME_DELIM); if (top_name) top_name++; else diff --git a/src/fi_tostr.c b/src/fi_tostr.c index 910dfd1214b..420f0cca2f6 100644 --- a/src/fi_tostr.c +++ b/src/fi_tostr.c @@ -259,6 +259,7 @@ static void ofi_tostr_protocol(char *buf, size_t len, uint32_t protocol) CASEENUMSTRN(FI_PROTO_SM2, len); CASEENUMSTRN(FI_PROTO_CXI_RNR, len); CASEENUMSTRN(FI_PROTO_LPP, len); + CASEENUMSTRN(FI_PROTO_LNX, len); default: ofi_strncatf(buf, len, "Unknown"); break; diff --git a/src/hmem_rocr.c b/src/hmem_rocr.c index bba705ba8ef..0ad80271c7a 100644 --- a/src/hmem_rocr.c +++ b/src/hmem_rocr.c @@ -615,7 +615,7 @@ bool rocr_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags) if (hsa_dev_type == HSA_DEVICE_TYPE_GPU) { /* TODO get device pointer/id */ if (flags) - *flags = FI_HMEM_DEVICE_ONLY; + *flags = (FI_HMEM_DEVICE_ONLY | OFI_HMEM_DATA_DEV_REG_HANDLE); return true; } } else {