1.1 --- a/ompi/mca/btl/openib/btl_openib.c Tue Dec 25 21:17:35 2012 +0000
1.2 +++ b/ompi/mca/btl/openib/btl_openib.c Wed Dec 26 10:19:12 2012 +0000
1.3 @@ -1544,6 +1544,7 @@
1.4 ompi_free_list_item_t* item = NULL;
1.5 mca_btl_openib_frag_t *frag;
1.6 mca_btl_openib_header_t *hdr;
1.7 + int send_signaled;
1.8
1.9 OPAL_THREAD_LOCK(&ep->endpoint_lock);
1.10
1.11 @@ -1644,12 +1645,22 @@
1.12 hdr->cm_seen = cm_return;
1.13 }
1.14
1.15 - ib_rc = post_send(ep, to_send_frag(item), do_rdma);
1.16 +#if BTL_OPENIB_FAILOVER_ENABLED
1.17 + send_signaled = 0;
1.18 +#else
1.19 + send_signaled = qp_need_signal(ep, qp, payload_size + header_size, do_rdma);
1.20 +#endif
1.21 + ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled);
1.22
1.23 if(!ib_rc) {
1.24 + if (0 == send_signaled) {
1.25 + MCA_BTL_IB_FRAG_RETURN(frag);
1.26 + }
1.27 #if BTL_OPENIB_FAILOVER_ENABLED
1.28 - /* Return up in case needed for failover */
1.29 - *descriptor = (struct mca_btl_base_descriptor_t *) frag;
1.30 + else {
1.31 + /* Return up in case needed for failover */
1.32 + *descriptor = (struct mca_btl_base_descriptor_t *) frag;
1.33 + }
1.34 #endif
1.35 OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
1.36 return OMPI_SUCCESS;
1.37 @@ -1784,7 +1795,11 @@
1.38 /* Setting opcode on a frag constructor isn't enough since prepare_src
1.39 * may return send_frag instead of put_frag */
1.40 frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
1.41 - frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]));
1.42 + frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]), 1);
1.43 +
1.44 + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
1.45 + qp_reset_signal_count(ep, qp);
1.46 +
1.47 if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
1.48 return OMPI_ERROR;
1.49
1.50 @@ -1863,6 +1878,10 @@
1.51 frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
1.52 #endif
1.53 descriptor->order = qp;
1.54 +
1.55 + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
1.56 + qp_reset_signal_count(ep, qp);
1.57 +
1.58 if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
1.59 return OMPI_ERROR;
1.60
2.1 --- a/ompi/mca/btl/openib/btl_openib_component.c Tue Dec 25 21:17:35 2012 +0000
2.2 +++ b/ompi/mca/btl/openib/btl_openib_component.c Wed Dec 26 10:19:12 2012 +0000
2.3 @@ -3280,6 +3280,7 @@
2.4 mca_btl_openib_module_t *openib_btl = NULL;
2.5 ompi_proc_t* remote_proc = NULL;
2.6 int qp, btl_ownership;
2.7 + int n;
2.8
2.9 des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id;
2.10 frag = to_com_frag(des);
2.11 @@ -3343,8 +3344,11 @@
2.12 /* return send wqe */
2.13 qp_put_wqe(endpoint, qp);
2.14
2.15 + /* return wqes that were sent before this frag */
2.16 + n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des));
2.17 +
2.18 if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) {
2.19 - OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
2.20 + OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n);
2.21
2.22 /* new SRQ credit available. Try to progress pending frags*/
2.23 progress_pending_frags_srq(openib_btl, qp);
3.1 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c Tue Dec 25 21:17:35 2012 +0000
3.2 +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c Wed Dec 26 10:19:12 2012 +0000
3.3 @@ -152,7 +152,8 @@
3.4 hdr->cm_seen = cm_return;
3.5 }
3.6
3.7 - ib_rc = post_send(endpoint, frag, do_rdma);
3.8 + qp_reset_signal_count(endpoint, qp);
3.9 + ib_rc = post_send(endpoint, frag, do_rdma, 1);
3.10
3.11 if(!ib_rc)
3.12 return OMPI_SUCCESS;
3.13 @@ -287,8 +288,11 @@
3.14 break;
3.15 default:
3.16 BTL_ERROR(("Wrong QP type"));
3.17 - break;
3.18 + return;
3.19 }
3.20 +
3.21 + ep_qp->qp->sd_wqe_inflight = 0;
3.22 + ep_qp->qp->wqe_count = QP_TX_BATCH_COUNT;
3.23 }
3.24
3.25 void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl,
3.26 @@ -815,7 +819,8 @@
3.27 if(endpoint->nbo)
3.28 BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);
3.29
3.30 - if((rc = post_send(endpoint, frag, do_rdma)) == 0)
3.31 + qp_reset_signal_count(endpoint, qp);
3.32 + if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
3.33 return;
3.34
3.35 if(endpoint->nbo) {
4.1 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h Tue Dec 25 21:17:35 2012 +0000
4.2 +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h Wed Dec 26 10:19:12 2012 +0000
4.3 @@ -37,6 +37,8 @@
4.4 #include "ompi/mca/btl/base/btl_base_error.h"
4.5 #include "connect/base.h"
4.6
4.7 +#define QP_TX_BATCH_COUNT 64 /* value a QP's wqe_count is (re)set to; qp_need_signal() counts it down and forces a signaled send when it runs out */
4.8 +
4.9 BEGIN_C_DECLS
4.10
4.11 struct mca_btl_openib_frag_t;
4.12 @@ -133,6 +135,8 @@
4.13 struct ibv_qp *lcl_qp;
4.14 uint32_t lcl_psn;
4.15 int32_t sd_wqe; /**< number of available send wqe entries */
4.16 + int32_t sd_wqe_inflight;
4.17 + int wqe_count;
4.18 int users;
4.19 opal_mutex_t lock;
4.20 } mca_btl_openib_qp_t;
4.21 @@ -270,6 +274,54 @@
4.22 return OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe, 1);
4.23 }
4.24
4.25 +/* Record one more unsignaled send on this QP; returns whatever OPAL_THREAD_ADD32 yields for sd_wqe_inflight. */
4.26 +static inline int32_t qp_inc_inflight_wqe(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag)
4.27 +{
4.28 + frag->n_wqes_inflight = 0; /* this frag itself carries no earlier sends */
4.29 + return OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe_inflight, 1);
4.30 +}
4.31 +
4.32 +static inline void qp_inflight_wqe_to_frag(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag)
4.33 +{
4.34 + /* Move the QP's count of pending unsignaled sends onto this frag, then restart the QP count at zero (plain assignments, not OPAL_THREAD_ADD32). */
4.35 + frag->n_wqes_inflight = ep->qps[qp].qp->sd_wqe_inflight;
4.36 + ep->qps[qp].qp->sd_wqe_inflight = 0;
4.37 +}
4.38 +
4.39 +static inline int qp_frag_to_wqe(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag)
4.40 +{
4.41 + int n; /* number of earlier unsignaled sends recorded on this frag */
4.42 + n = frag->n_wqes_inflight; /* n_wqes_inflight is declared uint32_t; converted to int here */
4.43 + OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe, n); /* give those send wqes back to the QP's available count */
4.44 + frag->n_wqes_inflight = 0;
4.45 + /* report how many wqes were returned so the caller can account for them */
4.46 + return n;
4.47 +}
4.48 +
4.49 +static inline int qp_need_signal(mca_btl_openib_endpoint_t *ep, const int qp, size_t size, int rdma)
4.50 +{
4.51 + /* Returns 1 when the next send on this QP must be signaled, 0 when it may be posted unsignaled. */
4.52 + /* Force a signal when no send wqe is left or the message will not fit inline; size is payload only, so the openib header and (for rdma) the footer are added before comparing with ib_inline_max. */
4.53 + if (ep->qps[qp].qp->sd_wqe <= 0 ||
4.54 + size + sizeof(mca_btl_openib_header_t) + (rdma ? sizeof(mca_btl_openib_footer_t) : 0) > ep->qps[qp].ib_inline_max) {
4.55 + ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT;
4.56 + return 1;
4.57 + }
4.58 + /* Otherwise count down the batch; stay unsignaled while the counter is still positive. */
4.59 + if (0 < --ep->qps[qp].qp->wqe_count) {
4.60 + return 0;
4.61 + }
4.62 + /* Batch exhausted: restart the countdown and signal this send. */
4.63 + ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT;
4.64 + return 1;
4.65 +}
4.66 +
4.67 +static inline void qp_reset_signal_count(mca_btl_openib_endpoint_t *ep, const int qp)
4.68 +{
4.69 + ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT; /* restart the unsignaled-send countdown used by qp_need_signal() */
4.70 +}
4.71 +
4.72 +
4.73 int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t*,
4.74 mca_btl_openib_send_frag_t*);
4.75 int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*,
4.76 @@ -457,10 +509,14 @@
4.77 }
4.78
4.79 static inline __opal_attribute_always_inline__ int
4.80 -ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp)
4.81 +ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp, int do_signal)
4.82 {
4.83 - return IBV_SEND_SIGNALED |
4.84 - ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0);
4.85 + if (do_signal) { /* signaled send: set IBV_SEND_SIGNALED, plus IBV_SEND_INLINE when size fits ib_inline_max */
4.86 + return IBV_SEND_SIGNALED |
4.87 + ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0);
4.88 + } else { /* unsignaled send: only IBV_SEND_INLINE, and only when size fits ib_inline_max */
4.89 + return ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0);
4.90 + }
4.91 }
4.92
4.93 static inline int
4.94 @@ -475,7 +531,7 @@
4.95 }
4.96
4.97 static inline int post_send(mca_btl_openib_endpoint_t *ep,
4.98 - mca_btl_openib_send_frag_t *frag, const bool rdma)
4.99 + mca_btl_openib_send_frag_t *frag, const bool rdma, int do_signal)
4.100 {
4.101 mca_btl_openib_module_t *openib_btl = ep->endpoint_btl;
4.102 mca_btl_openib_segment_t *seg = &to_base_frag(frag)->segment;
4.103 @@ -487,7 +543,7 @@
4.104 sg->length = seg->base.seg_len + sizeof(mca_btl_openib_header_t) +
4.105 (rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length;
4.106
4.107 - sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp]));
4.108 + sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp]), do_signal);
4.109
4.110 if(ep->nbo)
4.111 BTL_OPENIB_HEADER_HTON(*frag->hdr);
4.112 @@ -545,6 +601,12 @@
4.113 #endif
4.114 assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr);
4.115
4.116 + if (sr_desc->send_flags & IBV_SEND_SIGNALED) {
4.117 + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
4.118 + } else {
4.119 + qp_inc_inflight_wqe(ep, qp, to_com_frag(frag));
4.120 + }
4.121 +
4.122 return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
4.123 }
4.124
5.1 --- a/ompi/mca/btl/openib/btl_openib_frag.c Tue Dec 25 21:17:35 2012 +0000
5.2 +++ b/ompi/mca/btl/openib/btl_openib_frag.c Wed Dec 26 10:19:12 2012 +0000
5.3 @@ -61,6 +61,7 @@
5.4 frag->sg_entry.lkey = reg->mr->lkey;
5.5 base_frag->segment.key = reg->mr->lkey;
5.6 }
5.7 + frag->n_wqes_inflight = 0;
5.8 }
5.9
5.10 static void out_constructor(mca_btl_openib_out_frag_t *frag)
6.1 --- a/ompi/mca/btl/openib/btl_openib_frag.h Tue Dec 25 21:17:35 2012 +0000
6.2 +++ b/ompi/mca/btl/openib/btl_openib_frag.h Wed Dec 26 10:19:12 2012 +0000
6.3 @@ -306,6 +306,8 @@
6.4 struct ibv_sge sg_entry;
6.5 struct mca_btl_openib_reg_t *registration;
6.6 struct mca_btl_base_endpoint_t *endpoint;
6.7 + /* number of unsignaled frags sent before this frag. */
6.8 + uint32_t n_wqes_inflight;
6.9 } mca_btl_openib_com_frag_t;
6.10 OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t);
6.11