Skip to content

Commit

Permalink
Merge pull request #1367 from hjelmn/xrc_fixes
Browse files: browse the repository at this point in the history
Fix XRC support
  • Loading branch information
hjelmn committed Feb 17, 2016
2 parents e0de442 + bf83603 commit 2a728f3
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 68 deletions.
63 changes: 36 additions & 27 deletions opal/mca/btl/openib/btl_openib.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2015 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
Expand Down Expand Up @@ -403,14 +403,16 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
}
}

openib_btl->srqs_created = true;

return OPAL_SUCCESS;
}

static int openib_btl_prepare(struct mca_btl_openib_module_t* openib_btl)
{
int rc = OPAL_SUCCESS;
opal_mutex_lock(&openib_btl->ib_lock);
if (0 == openib_btl->num_peers &&
if (!openib_btl->srqs_created &&
(mca_btl_openib_component.num_srq_qps > 0 ||
mca_btl_openib_component.num_xrc_qps > 0)) {
rc = create_srq(openib_btl);
Expand All @@ -420,17 +422,12 @@ static int openib_btl_prepare(struct mca_btl_openib_module_t* openib_btl)
}


static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl, size_t nprocs)
static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl)
{
uint32_t send_cqes, recv_cqes;
int rc = OPAL_SUCCESS, qp;
mca_btl_openib_device_t *device = openib_btl->device;

if( 0 == nprocs){
/* nothing to do */
return OPAL_SUCCESS;
}

opal_mutex_lock(&openib_btl->ib_lock);
/* figure out reasonable sizes for completion queues */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
Expand All @@ -439,7 +436,7 @@ static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl, si
recv_cqes = mca_btl_openib_component.qp_infos[qp].rd_num;
} else {
send_cqes = (mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) * nprocs;
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) * openib_btl->num_peers;
recv_cqes = send_cqes;
}

Expand All @@ -459,7 +456,6 @@ static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl, si
goto out;
}

openib_btl->num_peers += nprocs;
out:
opal_mutex_unlock(&openib_btl->ib_lock);
return rc;
Expand Down Expand Up @@ -1032,10 +1028,14 @@ int mca_btl_openib_add_procs(
return rc;
}

rc = openib_btl_prepare(openib_btl);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("could not prepare openib btl structure for usel"));
return rc;
if (0 == openib_btl->num_peers) {
/* ensure completion queues are created before attempting to
* make a loop-back queue pair */
rc = openib_btl_size_queues(openib_btl);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error creating cqs"));
return rc;
}
}

/* prepare all proc's and account them properly */
Expand Down Expand Up @@ -1084,10 +1084,20 @@ int mca_btl_openib_add_procs(
}
}

/* account this procs if need */
rc = openib_btl_size_queues(openib_btl, nprocs_new);
if (nprocs_new) {
OPAL_THREAD_ADD32(&openib_btl->num_peers, nprocs_new);

/* adjust cq sizes given the new procs */
rc = openib_btl_size_queues (openib_btl);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error creating cqs"));
return rc;
}
}

rc = openib_btl_prepare (openib_btl);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error creating cqs"));
BTL_ERROR(("could not prepare openib btl module for use"));
return rc;
}

Expand Down Expand Up @@ -1160,23 +1170,15 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul
{
mca_btl_openib_module_t *openib_btl = (mca_btl_openib_module_t *) btl;
volatile mca_btl_base_endpoint_t *endpoint = NULL;
int local_port_cnt = 0, btl_rank, rc;
mca_btl_openib_proc_t *ib_proc;
int rc;
int local_port_cnt = 0, btl_rank;
size_t nprocs_new = 0;

rc = prepare_device_for_use (openib_btl->device);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("could not prepare openib device for use"));
return NULL;
}

rc = openib_btl_prepare(openib_btl);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("could not prepare openib btl structure for use"));
return NULL;
}

if (NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc))) {
/* if we don't have connection info for this process, it's
* okay because some other method might be able to reach it,
Expand All @@ -1193,7 +1195,8 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul

/* this is a new process to this openib btl
* account this procs if need */
rc = openib_btl_size_queues(openib_btl, nprocs_new);
OPAL_THREAD_ADD32(&openib_btl->num_peers, 1);
rc = openib_btl_size_queues(openib_btl);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error creating cqs"));
return NULL;
Expand All @@ -1218,6 +1221,12 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul
return NULL;
}

rc = openib_btl_prepare(openib_btl);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("could not prepare openib btl structure for use"));
goto exit;
}

for (size_t j = 0 ; j < ib_proc->proc_endpoint_count ; ++j) {
endpoint = ib_proc->proc_endpoints[j];
if (endpoint->endpoint_btl == openib_btl) {
Expand Down
3 changes: 2 additions & 1 deletion opal/mca/btl/openib/btl_openib.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
Expand Down Expand Up @@ -458,6 +458,7 @@ struct mca_btl_openib_module_t {
mca_btl_base_module_t super;

bool btl_inited;
bool srqs_created;

/** Common information about all ports */
mca_btl_openib_modex_message_t port_info;
Expand Down
6 changes: 3 additions & 3 deletions opal/mca/btl/openib/btl_openib_put.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
Expand Down Expand Up @@ -49,7 +49,7 @@ int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint
qp = mca_btl_openib_component.rdma_qp;
}

if (OPAL_UNLIKELY((ep->qps[qp].ib_inline_max < size && !local_handle) || !remote_handle ||
if (OPAL_UNLIKELY((btl->btl_put_local_registration_threshold < size && !local_handle) || !remote_handle ||
size > btl->btl_put_limit)) {
return OPAL_ERR_BAD_PARAM;
}
Expand Down Expand Up @@ -164,7 +164,7 @@ int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base

if (0 != (rc = ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr))) {
qp_put_wqe(ep, qp);
return OPAL_ERROR;;
return OPAL_ERROR;
}

return OPAL_SUCCESS;
Expand Down
69 changes: 32 additions & 37 deletions opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
Expand Down Expand Up @@ -240,6 +240,7 @@ typedef struct udcm_msg_hdr {
#if HAVE_XRC
/* UDCM_MESSAGE_XCONNECT, UDCM_MESSAGE_XCONNECT2 */
struct msg_xrc_connect {
opal_process_name_t rem_name;
int32_t rem_ep_index;
uint8_t rem_port_num;
uint32_t rem_qp_num;
Expand Down Expand Up @@ -343,11 +344,7 @@ static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc,
static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep);
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep);
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num);
#else
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep);
#endif
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
uint8_t msg_type);
Expand Down Expand Up @@ -529,42 +526,47 @@ static int udcm_component_finalize(void)
static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep)
{
udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
int32_t recv_qpn;
int rc;

opal_mutex_lock (&udep->udep_lock);
do {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
rc = udcm_xrc_recv_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num);
#else
lcl_ep->xrc_recv_qp_num = lcl_ep->qps[0].qp->lcl_qp->qp_num;
rc = udcm_xrc_recv_qp_connect (lcl_ep);
#endif
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("error connecting loopback XRC receive queue pair"));
if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) {
BTL_VERBOSE(("error initializing loopback endpoint cpc data"));
break;
}

rc = mca_btl_openib_endpoint_post_recvs (lcl_ep);
rc = udcm_xrc_send_qp_create (lcl_ep);
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("error posting receives for loopback queue pair"));
BTL_VERBOSE(("error creating send queue pair for loopback endpoint"));
break;
}

lcl_ep->rem_info.rem_index = lcl_ep->index;

rc = udcm_xrc_recv_qp_create (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
lcl_ep->qps[0].qp->lcl_psn);
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("error creating loopback XRC receive queue pair"));
break;
}

rc = udcm_xrc_send_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
lcl_ep->qps[0].qp->lcl_psn);
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
recv_qpn = lcl_ep->xrc_recv_qp->qp_num;
#else
recv_qpn = lcl_ep->xrc_recv_qp_num;
#endif

lcl_ep->rem_info.rem_qps[0].rem_psn = lcl_ep->xrc_recv_psn;
lcl_ep->rem_info.rem_qps[0].rem_qp_num = recv_qpn;

rc = udcm_xrc_send_qp_connect (lcl_ep, recv_qpn, lcl_ep->xrc_recv_psn);
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("error creating loopback XRC send queue pair"));
BTL_VERBOSE(("error connecting loopback XRC send queue pair"));
break;
}

lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
BTL_VERBOSE(("successfully created loopback queue pair"));

/* need to hold the endpoint lock before calling udcm_finish_connection */
OPAL_THREAD_LOCK(&lcl_ep->endpoint_lock);
Expand Down Expand Up @@ -606,8 +608,6 @@ static int udcm_endpoint_init_self (struct mca_btl_base_endpoint_t *lcl_ep)
break;
}

lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;

/* need to hold the endpoint lock before calling udcm_finish_connection */
OPAL_THREAD_LOCK(&lcl_ep->endpoint_lock);
rc = udcm_finish_connection (lcl_ep);
Expand Down Expand Up @@ -2609,11 +2609,7 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep)
/* mark: xrc receive qp */

/* Recv qp connect */
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num)
#else
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep)
#endif
{
mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl;

Expand All @@ -2627,23 +2623,26 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep)
BTL_VERBOSE(("Connecting Recv QP\n"));
lcl_ep->xrc_recv_qp = ibv_open_qp(openib_btl->device->ib_dev_context, &attr);
if (NULL == lcl_ep->xrc_recv_qp) { /* failed to regester the qp, so it is already die and we should create new one */
/* Return NOT READY !!!*/
BTL_ERROR(("Failed to register qp_num: %d, get error: %s (%d)\n. Replying with RNR",
qp_num, strerror(errno), errno));
/* Return NOT READY !!!*/
BTL_VERBOSE(("Failed to register qp_num: %d, get error: %s (%d)\n. Replying with RNR",
qp_num, strerror(errno), errno));
return OPAL_ERROR;
} else {
BTL_VERBOSE(("Connected to XRC Recv qp [%d]", lcl_ep->xrc_recv_qp->qp_num));
return OPAL_SUCCESS;
}
#else
int ret;
/* silence unused variable warning */
(void) qp_num;

BTL_VERBOSE(("Connecting receive qp: %d", lcl_ep->xrc_recv_qp_num));
ret = ibv_reg_xrc_rcv_qp(openib_btl->device->xrc_domain, lcl_ep->xrc_recv_qp_num);
if (ret) { /* failed to regester the qp, so it is already die and we should create new one */
/* Return NOT READY !!!*/
lcl_ep->xrc_recv_qp_num = 0;
BTL_ERROR(("Failed to register qp_num: %d , get error: %s (%d). Replying with RNR",
lcl_ep->xrc_recv_qp_num, strerror(ret), ret));
BTL_VERBOSE(("Failed to register qp_num: %d , get error: %s (%d). Replying with RNR",
lcl_ep->xrc_recv_qp_num, strerror(ret), ret));
return OPAL_ERROR;
}
#endif
Expand Down Expand Up @@ -2819,9 +2818,9 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_
return rc;
}

msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index);
msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num;
msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME;
msg->data->hdr.data.xreq.rem_ep_index = htonl(lcl_ep->index);
msg->data->hdr.data.xreq.rem_port_num = m->modex.mm_port_num;
msg->data->hdr.data.xreq.rem_name = OPAL_PROC_MY_NAME;

if (UDCM_MESSAGE_XCONNECT == msg_type) {
BTL_VERBOSE(("Sending XConnect with qp: %d, psn: %d", lcl_ep->qps[0].qp->lcl_qp->qp_num,
Expand Down Expand Up @@ -2925,11 +2924,7 @@ static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg

if (UDCM_MESSAGE_XCONNECT2 == msg_hdr->type) {
response_type = UDCM_MESSAGE_XRESPONSE2;
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
rc = udcm_xrc_recv_qp_connect (lcl_ep, msg_hdr->data.xreq.rem_qp_num);
#else
rc = udcm_xrc_recv_qp_connect (lcl_ep);
#endif
if (OPAL_SUCCESS != rc) {
/* return not ready. remote side will retry */
rej_reason = UDCM_REJ_NOT_READY;
Expand Down

0 comments on commit 2a728f3

Please sign in to comment.