Performance optimizations by alexm:
author miked
Wed Dec 26 10:19:12 2012 +0000 (4 months ago)
changeset 21108 290e6e2f8fdc
parent 21107 55507abe10ef
child 21109 ae1bc9ece9dd
Performance optimizations by alexm:
* btl sendi(): if the message can be sent inline, try to avoid requesting a signal
* a signal is requested once per 64 sends, or when:
there are no send wqes available
the message cannot be sent inline
any btl method other than sendi() is used
ompi/mca/btl/openib/btl_openib.c
ompi/mca/btl/openib/btl_openib_component.c
ompi/mca/btl/openib/btl_openib_endpoint.c
ompi/mca/btl/openib/btl_openib_endpoint.h
ompi/mca/btl/openib/btl_openib_frag.c
ompi/mca/btl/openib/btl_openib_frag.h
     1.1 --- a/ompi/mca/btl/openib/btl_openib.c	Tue Dec 25 21:17:35 2012 +0000
     1.2 +++ b/ompi/mca/btl/openib/btl_openib.c	Wed Dec 26 10:19:12 2012 +0000
     1.3 @@ -1544,6 +1544,7 @@
     1.4      ompi_free_list_item_t* item = NULL;
     1.5      mca_btl_openib_frag_t *frag;
     1.6      mca_btl_openib_header_t *hdr;
     1.7 +    int send_signaled;
     1.8  
     1.9      OPAL_THREAD_LOCK(&ep->endpoint_lock);
    1.10  
    1.11 @@ -1644,12 +1645,22 @@
    1.12          hdr->cm_seen = cm_return;
    1.13      }
    1.14  
    1.15 -    ib_rc = post_send(ep, to_send_frag(item), do_rdma);
    1.16 +#if BTL_OPENIB_FAILOVER_ENABLED
    1.17 +    send_signaled = 0;
    1.18 +#else
    1.19 +    send_signaled = qp_need_signal(ep, qp, payload_size + header_size, do_rdma);
    1.20 +#endif
    1.21 +    ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled);
    1.22  
    1.23      if(!ib_rc) {
    1.24 +        if (0 == send_signaled) {
    1.25 +            MCA_BTL_IB_FRAG_RETURN(frag);
    1.26 +        }
    1.27  #if BTL_OPENIB_FAILOVER_ENABLED
    1.28 -        /* Return up in case needed for failover */
    1.29 -        *descriptor = (struct mca_btl_base_descriptor_t *) frag;
    1.30 +        else {
    1.31 +            /* Return up in case needed for failover */
    1.32 +            *descriptor = (struct mca_btl_base_descriptor_t *) frag;
    1.33 +        }
    1.34  #endif
    1.35          OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
    1.36          return OMPI_SUCCESS;
    1.37 @@ -1784,7 +1795,11 @@
    1.38      /* Setting opcode on a frag constructor isn't enough since prepare_src
    1.39       * may return send_frag instead of put_frag */
    1.40      frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
    1.41 -    frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]));
    1.42 +    frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]), 1);
    1.43 +    
    1.44 +    qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
    1.45 +    qp_reset_signal_count(ep, qp);
    1.46 +
    1.47      if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
    1.48          return OMPI_ERROR;
    1.49  
    1.50 @@ -1863,6 +1878,10 @@
    1.51          frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
    1.52  #endif
    1.53      descriptor->order = qp;
    1.54 +
    1.55 +    qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
    1.56 +    qp_reset_signal_count(ep, qp);
    1.57 +
    1.58      if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
    1.59          return OMPI_ERROR;
    1.60  
     2.1 --- a/ompi/mca/btl/openib/btl_openib_component.c	Tue Dec 25 21:17:35 2012 +0000
     2.2 +++ b/ompi/mca/btl/openib/btl_openib_component.c	Wed Dec 26 10:19:12 2012 +0000
     2.3 @@ -3280,6 +3280,7 @@
     2.4      mca_btl_openib_module_t *openib_btl = NULL;
     2.5      ompi_proc_t* remote_proc = NULL;
     2.6      int qp, btl_ownership;
     2.7 +    int n;
     2.8  
     2.9      des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id;
    2.10      frag = to_com_frag(des);
    2.11 @@ -3343,8 +3344,11 @@
    2.12              /* return send wqe */
    2.13              qp_put_wqe(endpoint, qp);
    2.14  
    2.15 +            /* return wqes that were sent before this frag */
    2.16 +            n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des));
    2.17 +
    2.18              if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) {
    2.19 -                OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
    2.20 +                OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n);
    2.21  
    2.22                  /* new SRQ credit available. Try to progress pending frags*/
    2.23                  progress_pending_frags_srq(openib_btl, qp);
     3.1 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c	Tue Dec 25 21:17:35 2012 +0000
     3.2 +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c	Wed Dec 26 10:19:12 2012 +0000
     3.3 @@ -152,7 +152,8 @@
     3.4          hdr->cm_seen = cm_return;
     3.5      }
     3.6  
     3.7 -    ib_rc = post_send(endpoint, frag, do_rdma);
     3.8 +    qp_reset_signal_count(endpoint, qp);
     3.9 +    ib_rc = post_send(endpoint, frag, do_rdma, 1);
    3.10  
    3.11      if(!ib_rc)
    3.12          return OMPI_SUCCESS;
    3.13 @@ -287,8 +288,11 @@
    3.14              break;
    3.15          default:
    3.16              BTL_ERROR(("Wrong QP type"));
    3.17 -            break;
    3.18 +            return;
    3.19      }
    3.20 +
    3.21 +    ep_qp->qp->sd_wqe_inflight = 0;
    3.22 +    ep_qp->qp->wqe_count = QP_TX_BATCH_COUNT;
    3.23  }
    3.24  
    3.25  void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl,
    3.26 @@ -815,7 +819,8 @@
    3.27      if(endpoint->nbo)
    3.28           BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);
    3.29  
    3.30 -    if((rc = post_send(endpoint, frag, do_rdma)) == 0)
    3.31 +    qp_reset_signal_count(endpoint, qp);
    3.32 +    if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
    3.33          return;
    3.34  
    3.35      if(endpoint->nbo) {
     4.1 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h	Tue Dec 25 21:17:35 2012 +0000
     4.2 +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h	Wed Dec 26 10:19:12 2012 +0000
     4.3 @@ -37,6 +37,8 @@
     4.4  #include "ompi/mca/btl/base/btl_base_error.h"
     4.5  #include "connect/base.h"
     4.6  
     4.7 +#define QP_TX_BATCH_COUNT 64
     4.8 +
     4.9  BEGIN_C_DECLS
    4.10  
    4.11  struct mca_btl_openib_frag_t;
    4.12 @@ -133,6 +135,8 @@
    4.13      struct ibv_qp *lcl_qp;
    4.14      uint32_t lcl_psn;
    4.15      int32_t  sd_wqe;      /**< number of available send wqe entries */
    4.16 +    int32_t  sd_wqe_inflight;
    4.17 +    int wqe_count;
    4.18      int users;
    4.19      opal_mutex_t lock;
    4.20  } mca_btl_openib_qp_t;
    4.21 @@ -270,6 +274,54 @@
    4.22      return OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe, 1);
    4.23  }
    4.24  
    4.25 +
    4.26 +static inline int32_t qp_inc_inflight_wqe(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag)
    4.27 +{
    4.28 +    frag->n_wqes_inflight = 0;
    4.29 +    return OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe_inflight, 1);
    4.30 +}
    4.31 +
    4.32 +static inline void qp_inflight_wqe_to_frag(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag)
    4.33 +{
    4.34 +
    4.35 +    frag->n_wqes_inflight = ep->qps[qp].qp->sd_wqe_inflight;
    4.36 +    ep->qps[qp].qp->sd_wqe_inflight = 0;
    4.37 +}
    4.38 +
    4.39 +static inline int qp_frag_to_wqe(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag)
    4.40 +{
    4.41 +    int n;
    4.42 +    n = frag->n_wqes_inflight;
    4.43 +    OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe, n);
    4.44 +    frag->n_wqes_inflight = 0;
    4.45 +
    4.46 +    return n;
    4.47 +}
    4.48 +
    4.49 +static inline int qp_need_signal(mca_btl_openib_endpoint_t *ep, const int qp, size_t size, int rdma)
    4.50 +{
    4.51 +
    4.52 +    /* note that size here is payload only */
    4.53 +    if (ep->qps[qp].qp->sd_wqe <= 0  || 
    4.54 +            size + sizeof(mca_btl_openib_header_t) + (rdma ? sizeof(mca_btl_openib_footer_t) : 0) > ep->qps[qp].ib_inline_max) {
    4.55 +        ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT;
    4.56 +        return 1;
    4.57 +    }
    4.58 +
    4.59 +    if (0 < --ep->qps[qp].qp->wqe_count) {
    4.60 +        return 0;
    4.61 +    }
    4.62 +
    4.63 +    ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT;
    4.64 +    return 1;
    4.65 +}
    4.66 +
    4.67 +static inline void qp_reset_signal_count(mca_btl_openib_endpoint_t *ep, const int qp)
    4.68 +{
    4.69 +    ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT;
    4.70 +}
    4.71 +
    4.72 +
    4.73  int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t*,
    4.74          mca_btl_openib_send_frag_t*);
    4.75  int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*,
    4.76 @@ -457,10 +509,14 @@
    4.77  }
    4.78  
    4.79  static inline __opal_attribute_always_inline__ int
    4.80 -ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp)
    4.81 +ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp, int do_signal)
    4.82  {
    4.83 -    return IBV_SEND_SIGNALED |
    4.84 -        ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0);
    4.85 +    if (do_signal) {
    4.86 +        return IBV_SEND_SIGNALED |
    4.87 +            ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0);
    4.88 +    } else {
    4.89 +        return   ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0);
    4.90 +    }
    4.91  }
    4.92  
    4.93  static inline int
    4.94 @@ -475,7 +531,7 @@
    4.95  }
    4.96  
    4.97  static inline int post_send(mca_btl_openib_endpoint_t *ep,
    4.98 -        mca_btl_openib_send_frag_t *frag, const bool rdma)
    4.99 +        mca_btl_openib_send_frag_t *frag, const bool rdma, int do_signal)
   4.100  {
   4.101      mca_btl_openib_module_t *openib_btl = ep->endpoint_btl;
   4.102      mca_btl_openib_segment_t *seg = &to_base_frag(frag)->segment;
   4.103 @@ -487,7 +543,7 @@
   4.104      sg->length = seg->base.seg_len + sizeof(mca_btl_openib_header_t) +
   4.105          (rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length;
   4.106  
   4.107 -    sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp]));
   4.108 +    sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp]), do_signal);
   4.109  
   4.110      if(ep->nbo)
   4.111          BTL_OPENIB_HEADER_HTON(*frag->hdr);
   4.112 @@ -545,6 +601,12 @@
   4.113  #endif
   4.114      assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr);
   4.115  
   4.116 +    if (sr_desc->send_flags & IBV_SEND_SIGNALED) {
   4.117 +        qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
   4.118 +    } else {
   4.119 +        qp_inc_inflight_wqe(ep, qp, to_com_frag(frag));
   4.120 +    }
   4.121 +
   4.122      return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
   4.123  }
   4.124  
     5.1 --- a/ompi/mca/btl/openib/btl_openib_frag.c	Tue Dec 25 21:17:35 2012 +0000
     5.2 +++ b/ompi/mca/btl/openib/btl_openib_frag.c	Wed Dec 26 10:19:12 2012 +0000
     5.3 @@ -61,6 +61,7 @@
     5.4          frag->sg_entry.lkey = reg->mr->lkey;
     5.5          base_frag->segment.key = reg->mr->lkey;
     5.6      }
     5.7 +    frag->n_wqes_inflight = 0;
     5.8  }
     5.9  
    5.10  static void out_constructor(mca_btl_openib_out_frag_t *frag)
     6.1 --- a/ompi/mca/btl/openib/btl_openib_frag.h	Tue Dec 25 21:17:35 2012 +0000
     6.2 +++ b/ompi/mca/btl/openib/btl_openib_frag.h	Wed Dec 26 10:19:12 2012 +0000
     6.3 @@ -306,6 +306,8 @@
     6.4      struct ibv_sge sg_entry;
     6.5      struct mca_btl_openib_reg_t *registration;
     6.6      struct mca_btl_base_endpoint_t *endpoint;
     6.7 +    /* number of unsignaled frags sent before this frag. */
     6.8 +    uint32_t n_wqes_inflight;
     6.9  } mca_btl_openib_com_frag_t;
    6.10  OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t);
    6.11