Skip to content

Commit

Permalink
coll/libnbc: fix race condition with multi-threaded apps
Browse files Browse the repository at this point in the history
Protect the mca_coll_libnbc_component.active_requests list with
the new mca_coll_libnbc_component.lock mutex.

Thanks to Jie Hu for the report.

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>

(back-ported from commit 2c94a3a)
  • Loading branch information
ggouaillardet committed Nov 21, 2016
1 parent 800063e commit 53ce62f
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 0 deletions.
3 changes: 3 additions & 0 deletions ompi/mca/coll/libnbc/coll_libnbc.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -71,6 +73,7 @@ struct ompi_coll_libnbc_component_t {
opal_list_t active_requests;
int32_t active_comms;
opal_atomic_lock_t progress_lock;
opal_mutex_t lock;
};
typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t;

Expand Down
8 changes: 8 additions & 0 deletions ompi/mca/coll/libnbc/coll_libnbc_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ libnbc_open(void)

OBJ_CONSTRUCT(&mca_coll_libnbc_component.requests, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_coll_libnbc_component.active_requests, opal_list_t);
OBJ_CONSTRUCT(&mca_coll_libnbc_component.lock, opal_mutex_t);
ret = ompi_free_list_init(&mca_coll_libnbc_component.requests,
sizeof(ompi_coll_libnbc_request_t),
OBJ_CLASS(ompi_coll_libnbc_request_t),
Expand Down Expand Up @@ -116,6 +117,7 @@ libnbc_close(void)

OBJ_DESTRUCT(&mca_coll_libnbc_component.requests);
OBJ_DESTRUCT(&mca_coll_libnbc_component.active_requests);
OBJ_DESTRUCT(&mca_coll_libnbc_component.lock);

return OMPI_SUCCESS;
}
Expand Down Expand Up @@ -242,19 +244,25 @@ ompi_coll_libnbc_progress(void)

if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0;

OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests,
ompi_coll_libnbc_request_t) {
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
if (NBC_OK == NBC_Progress(request)) {
/* done, remove and complete */
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
opal_list_remove_item(&mca_coll_libnbc_component.active_requests,
&request->super.super.super);
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);

request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
OPAL_THREAD_LOCK(&ompi_request_lock);
ompi_request_complete(&request->super, true);
OPAL_THREAD_UNLOCK(&ompi_request_lock);
}
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
}
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);

opal_atomic_unlock(&mca_coll_libnbc_component.progress_lock);

Expand Down
2 changes: 2 additions & 0 deletions ompi/mca/coll/libnbc/nbc.c
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,9 @@ int NBC_Start(NBC_Handle *handle, NBC_Schedule *schedule) {
res = NBC_Start_round(handle);
if((NBC_OK != res)) { printf("Error in NBC_Start_round() (%i)\n", res); return res; }

OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
opal_list_append(&mca_coll_libnbc_component.active_requests, &(handle->super.super.super));
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);

return NBC_OK;
}
Expand Down

0 comments on commit 53ce62f

Please sign in to comment.