Skip to content

Commit

Permalink
UCT/UCP: Fixes for am_zcopy sends
Browse files Browse the repository at this point in the history
1. ucp_do_am_zcopy_multi() did not return UCS_ERR_NO_RESOURCE when
   uct_ep_am_zcopy() returned that status. As a result, the send
   request was removed from the pending queue and never progressed
   (the io_demo test got stuck).
2. rc_mlx5 am_zcopy checked the pending-queue assertion before checking
   send resources in uct_rc_mlx5_ep_zcopy_post(), which leads to a
   spurious assertion failure. Move the resource check from zcopy_post
   to its calling functions.
3. Invalid send flags were passed in zcopy_post: SOLICITED was not set
   when comp != NULL.
  • Loading branch information
yosefe committed Jan 18, 2021
1 parent b0092c6 commit 20aacb7
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 5 deletions.
2 changes: 2 additions & 0 deletions src/ucp/proto/proto_am.inl
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first,
}
ucs_assert(status == UCS_INPROGRESS);
return UCS_OK;
} else {
return UCS_ERR_NO_RESOURCE;
}
}

Expand Down
12 changes: 7 additions & 5 deletions src/uct/ib/rc/accel/rc_mlx5_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,13 @@ uct_rc_mlx5_ep_zcopy_post(uct_rc_mlx5_ep_t *ep, unsigned opcode,
/* SEND */ uint8_t am_id, const void *am_hdr, unsigned am_hdr_len,
/* RDMA */ uint64_t rdma_raddr, uct_rkey_t rdma_rkey,
/* TAG */ uct_tag_t tag, uint32_t app_ctx, uint32_t ib_imm_be,
int force_sig, uct_rc_send_handler_t handler,
int wqe_flags, uct_rc_send_handler_t handler,
uct_completion_t *comp)
{
uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ep->super.super.super.iface,
uct_rc_mlx5_iface_common_t);
uint16_t sn;

UCT_RC_CHECK_RES(&iface->super, &ep->super);

sn = ep->tx.wq.sw_pi;
uct_rc_mlx5_txqp_dptr_post_iov(iface, IBV_QPT_RC,
&ep->super.txqp, &ep->tx.wq,
Expand All @@ -77,7 +75,8 @@ uct_rc_mlx5_ep_zcopy_post(uct_rc_mlx5_ep_t *ep, unsigned opcode,
rdma_raddr, uct_ib_md_direct_rkey(rdma_rkey),
tag, app_ctx, ib_imm_be,
NULL, NULL, 0,
(comp == NULL) ? force_sig : MLX5_WQE_CTRL_CQ_UPDATE,
(comp == NULL) ? wqe_flags :
(wqe_flags | MLX5_WQE_CTRL_CQ_UPDATE),
UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super.super));

uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, handler, comp, sn,
Expand Down Expand Up @@ -221,6 +220,7 @@ ucs_status_t uct_rc_mlx5_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size
UCT_CHECK_LENGTH(uct_iov_total_length(iov, iovcnt), 0, UCT_IB_MAX_MESSAGE_SIZE,
"put_zcopy");
UCT_RC_CHECK_NUM_RDMA_READ(iface);
UCT_RC_CHECK_RES(iface, &ep->super);

status = uct_rc_mlx5_ep_zcopy_post(ep, MLX5_OPCODE_RDMA_WRITE, iov, iovcnt,
0ul, 0, NULL, 0, remote_addr, rkey, 0ul, 0, 0,
Expand Down Expand Up @@ -269,6 +269,7 @@ ucs_status_t uct_rc_mlx5_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size
iface->super.super.config.max_inl_resp + 1,
iface->super.config.max_get_zcopy, "get_zcopy");
UCT_RC_CHECK_NUM_RDMA_READ(&iface->super);
UCT_RC_CHECK_RES(&iface->super, &ep->super);

status = uct_rc_mlx5_ep_zcopy_post(ep, MLX5_OPCODE_RDMA_READ, iov, iovcnt,
total_length, 0, NULL, 0, remote_addr,
Expand Down Expand Up @@ -363,8 +364,8 @@ ucs_status_t uct_rc_mlx5_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *hea
"uct_rc_mlx5_ep_am_zcopy");
UCT_RC_MLX5_CHECK_AM_ZCOPY(id, header_length, uct_iov_total_length(iov, iovcnt),
iface->super.super.config.seg_size, 0);
UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super);
UCT_RC_CHECK_FC(&iface->super, &ep->super, id);
UCT_RC_CHECK_NUM_RDMA_READ(&iface->super);

uct_rc_iface_check_pending(&iface->super, &ep->super);

Expand Down Expand Up @@ -814,6 +815,7 @@ ucs_status_t uct_rc_mlx5_ep_tag_eager_zcopy(uct_ep_h tl_ep, uct_tag_t tag,
UCT_RC_CHECK_ZCOPY_DATA(sizeof(struct ibv_tmh),
uct_iov_total_length(iov, iovcnt),
iface->tm.max_zcopy);
UCT_RC_CHECK_RES(&iface->super, &ep->super);

UCT_RC_MLX5_FILL_TM_IMM(imm, app_ctx, ib_imm, opcode, MLX5_OPCODE_SEND,
_IMM);
Expand Down

0 comments on commit 20aacb7

Please sign in to comment.