Open MPI Development Mailing List Archives


Subject: Re: [OMPI devel] [OMPI svn-full] svn:open-mpi r26329
From: Nathan Hjelm (hjelmn_at_[hidden])
Date: 2012-04-24 16:47:24


This was RFC'd last month. No one objected :)
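
For anyone who didn't follow the RFC: the idea is that a BTL (ugni here) whose get path cannot handle a given transfer now returns OMPI_ERR_NOT_AVAILABLE instead of quietly reversing the operation inside the BTL, and ob1 reacts by asking the peer to do a put (or, once the retry limit is hit, by falling back to plain copy in/out sends). Below is a minimal, self-contained sketch of that flow; all names and constants are simplified stand-ins, not the real ob1/ugni symbols, and the actual code is in the diff further down.

/* Sketch of the get-fallback flow added in r26329.  Illustrative only:
 * btl_get(), recv_request_get_frag() and the error values are stand-ins
 * for the real ob1/ugni code shown in the diff below. */

#include <stdio.h>
#include <stddef.h>

enum {
    OK            = 0,   /* stands in for OMPI_SUCCESS             */
    NOT_AVAILABLE = -1,  /* stands in for OMPI_ERR_NOT_AVAILABLE   */
    OUT_OF_RES    = -2   /* stands in for OMPI_ERR_OUT_OF_RESOURCE */
};

/* BTL side: the get path now declines transfers it cannot handle
 * instead of silently turning them into a put inside the BTL. */
static int btl_get(size_t size, size_t get_limit)
{
    return (size > get_limit) ? NOT_AVAILABLE : OK;
}

/* PML (receive) side: react to a declined or failed get. */
static int recv_request_get_frag(size_t size, size_t get_limit,
                                 unsigned *retries, unsigned retries_limit)
{
    int rc = btl_get(size, get_limit);

    if (NOT_AVAILABLE == rc) {
        /* get isn't supported for this transfer: send the peer a PUT
         * control message so it initiates the transfer from its side */
        printf("get declined -> asking peer to put\n");
        return OK;
    }
    if (OUT_OF_RES == rc) {
        /* shown for completeness; btl_get() above never returns this */
        if (*retries < retries_limit) {
            ++(*retries);           /* queue on rdma_pending, retry later */
            return OUT_OF_RES;
        }
        /* retries exhausted: ACK with "no RDMA" so the sender falls
         * back to plain copy in/out sends */
        printf("retries exhausted -> falling back to send\n");
        return OK;
    }
    return rc;                      /* RDMA get was posted normally */
}

int main(void)
{
    unsigned retries = 0;
    recv_request_get_frag(1 << 20, 64 * 1024, &retries, 5); /* too big: put fallback */
    recv_request_get_frag(4096,    64 * 1024, &retries, 5); /* fits: normal get */
    return 0;
}

It compiles with any C compiler and just prints which path would be taken.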

-Nathan

On Tue, 24 Apr 2012, Jeffrey Squyres wrote:

> There are some pretty extensive ob1 changes in here.
>
> Can we get these reviewed? Brian / George?
>
>
> On Apr 24, 2012, at 4:18 PM, hjelmn_at_[hidden] wrote:
>
>> Author: hjelmn
>> Date: 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> New Revision: 26329
>> URL: https://svn.open-mpi.org/trac/ompi/changeset/26329
>>
>> Log:
>> ob1: add support for get fallback on put/send
>> Text files modified:
>> trunk/ompi/mca/btl/ugni/btl_ugni_get.c | 17 ----
>> trunk/ompi/mca/btl/ugni/btl_ugni_put.c | 48 --------------
>> trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h | 7 --
>> trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c | 5 -
>> trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h | 1
>> trunk/ompi/mca/pml/ob1/pml_ob1.c | 5 +
>> trunk/ompi/mca/pml/ob1/pml_ob1.h | 2
>> trunk/ompi/mca/pml/ob1/pml_ob1_component.c | 4
>> trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 15 +++-
>> trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c | 94 ++++++++++++++++++++++++++--
>> trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c | 131 ++++++++++++++++++++++-----------------
>> trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h | 2
>> 12 files changed, 182 insertions(+), 149 deletions(-)
>>
>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_get.c
>> ==============================================================================
>> --- trunk/ompi/mca/btl/ugni/btl_ugni_get.c (original)
>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_get.c 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -13,19 +13,6 @@
>> #include "btl_ugni_rdma.h"
>> #include "btl_ugni_smsg.h"
>>
>> -static int mca_btl_ugni_init_put (struct mca_btl_base_module_t *btl,
>> - mca_btl_ugni_base_frag_t *frag) {
>> - /* off alignment/off size. switch to put */
>> - frag->hdr.rdma.src_seg = frag->base.des_src[0];
>> - frag->hdr.rdma.dst_seg = frag->base.des_dst[0];
>> - frag->hdr.rdma.ctx = (void *) frag;
>> -
>> - /* send the fragment header using smsg. ignore local completion */
>> - return ompi_mca_btl_ugni_smsg_send (frag, true, &frag->hdr.rdma,
>> - sizeof (frag->hdr.rdma), NULL, 0,
>> - MCA_BTL_UGNI_TAG_PUT_INIT);
>> -}
>> -
>> /**
>> * Initiate a get operation.
>> *
>> @@ -54,7 +41,7 @@
>>
>> if (OPAL_UNLIKELY(check || size > mca_btl_ugni_component.ugni_get_limit)) {
>> /* switch to put */
>> - return mca_btl_ugni_init_put (btl, frag);
>> + return OMPI_ERR_NOT_AVAILABLE;
>> }
>>
>> if (NULL != frag->base.des_cbfunc) {
>> @@ -68,7 +55,7 @@
>> return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_GET, des->des_dst, des->des_src);
>> }
>>
>> -void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc)
>> +static void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc)
>> {
>> BTL_VERBOSE(("rdma operation for rem_ctx %p complete", frag->hdr.rdma.ctx));
>>
>>
>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_put.c
>> ==============================================================================
>> --- trunk/ompi/mca/btl/ugni/btl_ugni_put.c (original)
>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_put.c 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -46,51 +46,3 @@
>>
>> return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_PUT, des->des_src, des->des_dst);
>> }
>> -
>> -/* reversed get */
>> -static void mca_btl_ugni_callback_put_retry (mca_btl_ugni_base_frag_t *frag, int rc)
>> -{
>> - (void) mca_btl_ugni_start_put(frag->endpoint, frag->hdr.rdma, frag);
>> -}
>> -
>> -int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
>> - mca_btl_ugni_rdma_frag_hdr_t hdr,
>> - mca_btl_ugni_base_frag_t *frag)
>> -{
>> - int rc;
>> -
>> - BTL_VERBOSE(("starting reverse get (put) for remote ctx: %p", hdr.ctx));
>> -
>> - if (NULL == frag) {
>> - rc = MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag);
>> - if (OPAL_UNLIKELY(NULL == frag)) {
>> - BTL_ERROR(("error allocating rdma frag for reverse get. rc = %d. fl_num_allocated = %d", rc,
>> - ep->btl->rdma_int_frags.fl_num_allocated));
>> - return rc;
>> - }
>> - }
>> -
>> - frag->hdr.rdma = hdr;
>> -
>> - frag->base.des_cbfunc = NULL;
>> - frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
>> -
>> - frag->segments[0] = hdr.src_seg;
>> - frag->base.des_src = frag->segments;
>> - frag->base.des_src_cnt = 1;
>> -
>> - frag->segments[1] = hdr.dst_seg;
>> - frag->base.des_dst = frag->segments + 1;
>> - frag->base.des_dst_cnt = 1;
>> -
>> - rc = mca_btl_ugni_put (&ep->btl->super, ep, &frag->base);
>> - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>> - frag->cbfunc = mca_btl_ugni_callback_put_retry;
>> - opal_list_append (&ep->btl->failed_frags, (opal_list_item_t *) frag);
>> - return rc;
>> - }
>> -
>> - frag->cbfunc = mca_btl_ugni_callback_rdma_complete;
>> -
>> - return OMPI_SUCCESS;
>> -}
>>
>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h
>> ==============================================================================
>> --- trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h (original)
>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -16,17 +16,10 @@
>> #include "btl_ugni.h"
>> #include "btl_ugni_frag.h"
>>
>> -/* mca_btl_ugni_start_put: get operation could not be completed. start put instead */
>> -int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
>> - mca_btl_ugni_rdma_frag_hdr_t hdr,
>> - mca_btl_ugni_base_frag_t *frag);
>> -
>> int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
>> mca_btl_ugni_eager_ex_frag_hdr_t hdr,
>> mca_btl_ugni_base_frag_t *frag);
>>
>> -void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc);
>> -
>> static inline int init_gni_post_desc(mca_btl_ugni_base_frag_t *frag,
>> gni_post_type_t op_type,
>> uint64_t lcl_addr,
>>
>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c
>> ==============================================================================
>> --- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c (original)
>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -78,11 +78,6 @@
>> reg->cbfunc(&ep->btl->super, tag, &(frag.base), reg->cbdata);
>>
>> break;
>> - case MCA_BTL_UGNI_TAG_PUT_INIT:
>> - frag.hdr.rdma = ((mca_btl_ugni_rdma_frag_hdr_t *) data_ptr)[0];
>> -
>> - mca_btl_ugni_start_put (ep, frag.hdr.rdma, NULL);
>> - break;
>> case MCA_BTL_UGNI_TAG_GET_INIT:
>> frag.hdr.eager_ex = ((mca_btl_ugni_eager_ex_frag_hdr_t *) data_ptr)[0];
>>
>>
>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h
>> ==============================================================================
>> --- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h (original)
>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -21,7 +21,6 @@
>> typedef enum {
>> MCA_BTL_UGNI_TAG_SEND,
>> MCA_BTL_UGNI_TAG_DISCONNECT,
>> - MCA_BTL_UGNI_TAG_PUT_INIT,
>> MCA_BTL_UGNI_TAG_GET_INIT,
>> MCA_BTL_UGNI_TAG_RDMA_COMPLETE
>> } mca_btl_ugni_smsg_tag_t;
>>
>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1.c
>> ==============================================================================
>> --- trunk/ompi/mca/pml/ob1/pml_ob1.c (original)
>> +++ trunk/ompi/mca/pml/ob1/pml_ob1.c 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -147,6 +147,7 @@
>> OBJ_CONSTRUCT(&mca_pml_ob1.recv_pending, opal_list_t);
>> OBJ_CONSTRUCT(&mca_pml_ob1.pckt_pending, opal_list_t);
>> OBJ_CONSTRUCT(&mca_pml_ob1.rdma_pending, opal_list_t);
>> +
>> /* missing communicator pending list */
>> OBJ_CONSTRUCT(&mca_pml_ob1.non_existing_communicator_pending, opal_list_t);
>>
>> @@ -599,8 +600,10 @@
>> OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>> if(NULL == frag)
>> break;
>> +
>> + frag->retries++;
>> +
>> if(frag->rdma_state == MCA_PML_OB1_RDMA_PUT) {
>> - frag->retries++;
>> rc = mca_pml_ob1_send_request_put_frag(frag);
>> } else {
>> rc = mca_pml_ob1_recv_request_get_frag(frag);
>>
>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1.h
>> ==============================================================================
>> --- trunk/ompi/mca/pml/ob1/pml_ob1.h (original)
>> +++ trunk/ompi/mca/pml/ob1/pml_ob1.h 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -52,7 +52,7 @@
>> int free_list_inc; /* number of elements to grow free list */
>> size_t send_pipeline_depth;
>> size_t recv_pipeline_depth;
>> - size_t rdma_put_retries_limit;
>> + size_t rdma_retries_limit;
>> int max_rdma_per_request;
>> int max_send_per_range;
>> bool leave_pinned;
>>
>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_component.c
>> ==============================================================================
>> --- trunk/ompi/mca/pml/ob1/pml_ob1_component.c (original)
>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_component.c 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -112,8 +112,8 @@
>> mca_pml_ob1_param_register_int("send_pipeline_depth", 3);
>> mca_pml_ob1.recv_pipeline_depth =
>> mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
>> - mca_pml_ob1.rdma_put_retries_limit =
>> - mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
>> + mca_pml_ob1.rdma_retries_limit =
>> + mca_pml_ob1_param_register_int("rdma_retries_limit", 5);
>> mca_pml_ob1.max_rdma_per_request =
>> mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
>> mca_pml_ob1.max_send_per_range =
>>
>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
>> ==============================================================================
>> --- trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c (original)
>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -294,15 +294,22 @@
>> if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
>> return;
>> }
>> -
>> +
>> ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_ACK);
>> sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
>> sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
>> -
>> +
>> /* if the request should be delivered entirely by copy in/out
>> * then throttle sends */
>> - if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA)
>> + if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
>> + if (NULL != sendreq->src_des) {
>> + /* release registered memory */
>> + mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
>> + sendreq->src_des = NULL;
>> + }
>> +
>> sendreq->req_throttle_sends = true;
>> + }
>>
>> mca_pml_ob1_send_request_copy_in_out(sendreq,
>> hdr->hdr_ack.hdr_send_offset,
>> @@ -324,7 +331,7 @@
>>
>> if(send_request_pml_complete_check(sendreq) == false)
>> mca_pml_ob1_send_request_schedule(sendreq);
>> -
>> +
>> return;
>> }
>>
>>
>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c
>> ==============================================================================
>> --- trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c (original)
>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -352,6 +352,66 @@
>> }
>>
>>
>> +static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
>> + mca_btl_base_descriptor_t *dst) {
>> + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
>> + mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
>> + mca_btl_base_descriptor_t *ctl;
>> + mca_pml_ob1_rdma_hdr_t *hdr;
>> + size_t hdr_size;
>> + unsigned int i;
>> + int rc;
>> +
>> + /* prepare a descriptor for rdma control message */
>> + hdr_size = sizeof (mca_pml_ob1_rdma_hdr_t);
>> + if (dst->des_dst_cnt > 1) {
>> + hdr_size += (sizeof (mca_btl_base_segment_t) *
>> + (dst->des_dst_cnt-1));
>> + }
>> +
>> + mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, hdr_size,
>> + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>> + MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
>> + if (OPAL_UNLIKELY(NULL == ctl)) {
>> + return OMPI_ERR_OUT_OF_RESOURCE;
>> + }
>> + ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
>> +
>> + /* fill in rdma header */
>> + hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_src->seg_addr.pval;
>> + hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
>> + hdr->hdr_common.hdr_flags =
>> + (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
>> +
>> + hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
>> + hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
>> + hdr->hdr_des.pval = dst;
>> +
>> + hdr->hdr_seg_cnt = dst->des_dst_cnt;
>> +
>> + for (i = 0 ; i < dst->des_dst_cnt ; ++i) {
>> + hdr->hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(dst->des_dst[i].seg_addr.pval);
>> + hdr->hdr_segs[i].seg_len = dst->des_dst[i].seg_len;
>> + hdr->hdr_segs[i].seg_key.key64[0] = dst->des_dst[i].seg_key.key64[0];
>> + hdr->hdr_segs[i].seg_key.key64[1] = dst->des_dst[i].seg_key.key64[1];
>> + }
>> +
>> + dst->des_cbfunc = mca_pml_ob1_put_completion;
>> + dst->des_cbdata = recvreq;
>> +
>> + if (!recvreq->req_ack_sent)
>> + recvreq->req_ack_sent = true;
>> +
>> + /* send rdma request to peer */
>> + rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
>> + if (OPAL_UNLIKELY(rc < 0)) {
>> + mca_bml_base_free (bml_btl, ctl);
>> + return rc;
>> + }
>> +
>> + return OMPI_SUCCESS;
>> +}
>> +
>> /*
>> *
>> */
>> @@ -371,14 +431,25 @@
>> 0,
>> &frag->rdma_length,
>> MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
>> - MCA_BTL_DES_FLAGS_GET,
>> + MCA_BTL_DES_FLAGS_GET,
>> &descriptor );
>> if( OPAL_UNLIKELY(NULL == descriptor) ) {
>> - frag->rdma_length = save_size;
>> - OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>> - opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
>> - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>> - return OMPI_ERR_OUT_OF_RESOURCE;
>> + if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
>> + frag->rdma_length = save_size;
>> + OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>> + opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
>> + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>> + return OMPI_ERR_OUT_OF_RESOURCE;
>> + } else {
>> + ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
>> +
>> + /* tell peer to fall back on send */
>> + recvreq->req_send_offset = 0;
>> + rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
>> + recvreq, recvreq->req_send_offset, true);
>> + MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
>> + return rc;
>> + }
>> }
>>
>> descriptor->des_src = frag->rdma_segs;
>> @@ -393,6 +464,11 @@
>> /* queue up get request */
>> rc = mca_bml_base_get(bml_btl,descriptor);
>> if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
>> + if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
>> + /* get isn't supported for this transfer. tell peer to fallback on put */
>> + rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
>> + }
>> +
>> if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
>> mca_bml_base_free(bml_btl, descriptor);
>> OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>> @@ -400,7 +476,7 @@
>> (opal_list_item_t*)frag);
>> OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>> return OMPI_ERR_OUT_OF_RESOURCE;
>> - } else {
>> + } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>> ORTE_ERROR_LOG(rc);
>> orte_errmgr.abort(-1, NULL);
>> }
>> @@ -551,7 +627,9 @@
>> orte_errmgr.abort(-1, NULL);
>> }
>> #endif /* OMPI_CUDA_SUPPORT */
>> +
>> frag->rdma_hdr.hdr_rget = *hdr;
>> + frag->retries = 0;
>> frag->rdma_req = recvreq;
>> frag->rdma_ep = bml_endpoint;
>> frag->rdma_length = size;
>> @@ -792,7 +870,7 @@
>> mca_bml_base_prepare_dst(bml_btl, reg,
>> &recvreq->req_recv.req_base.req_convertor,
>> MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>> - MCA_BTL_DES_FLAGS_PUT, &dst);
>> + MCA_BTL_DES_FLAGS_PUT, &dst);
>> OPAL_THREAD_UNLOCK(&recvreq->lock);
>>
>> if(OPAL_UNLIKELY(dst == NULL)) {
>>
>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c
>> ==============================================================================
>> --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c (original)
>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -264,6 +264,7 @@
>> MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
>> 0, req_bytes_delivered );
>> OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
>> + sendreq->src_des = NULL;
>>
>> send_request_pml_complete_check(sendreq);
>> /* free the descriptor */
>> @@ -639,6 +640,8 @@
>> bool need_local_cb = false;
>> int rc;
>>
>> + sendreq->src_des = NULL;
>> +
>> bml_btl = sendreq->req_rdma[0].bml_btl;
>> if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
>> mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg;
>> @@ -657,10 +660,8 @@
>> mca_bml_base_prepare_src( bml_btl,
>> reg,
>> &sendreq->req_send.req_base.req_convertor,
>> - MCA_BTL_NO_ORDER,
>> - 0,
>> - &size,
>> - MCA_BTL_DES_FLAGS_GET,
>> + MCA_BTL_NO_ORDER, 0, &size,
>> + MCA_BTL_DES_FLAGS_GET | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
>> &src );
>> MEMCHECKER(
>> memchecker_call(&opal_memchecker_base_mem_noaccess,
>> @@ -676,6 +677,8 @@
>> src->des_cbfunc = mca_pml_ob1_rget_completion;
>> src->des_cbdata = sendreq;
>>
>> + sendreq->src_des = src;
>> +
>> /* allocate space for get hdr + segment list */
>> mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
>> sizeof(mca_pml_ob1_rget_hdr_t) +
>> @@ -782,8 +785,9 @@
>> return OMPI_SUCCESS;
>> }
>> mca_bml_base_free(bml_btl, des);
>> - if (NULL != src) {
>> - mca_bml_base_free (bml_btl, src);
>> + if (sendreq->src_des) {
>> + mca_bml_base_free (bml_btl, sendreq->src_des);
>> + sendreq->src_des = NULL;
>> }
>>
>> return rc;
>> @@ -1133,63 +1137,71 @@
>> MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
>> }
>>
>> -int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag )
>> -{
>> - mca_mpool_base_registration_t* reg = NULL;
>> - mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
>> - mca_btl_base_descriptor_t* des;
>> +int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
>> +{
>> + mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
>> + mca_mpool_base_registration_t *reg = NULL;
>> + mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
>> + mca_btl_base_descriptor_t *des;
>> size_t save_size = frag->rdma_length;
>> int rc;
>>
>> - /* setup descriptor */
>> - mca_bml_base_prepare_src( bml_btl,
>> - reg,
>> - &frag->convertor,
>> - MCA_BTL_NO_ORDER,
>> - 0,
>> - &frag->rdma_length,
>> - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>> - MCA_BTL_DES_FLAGS_PUT,
>> - &des );
>> + if (OPAL_LIKELY(NULL == sendreq->src_des)) {
>> + /* setup descriptor */
>> + mca_bml_base_prepare_src( bml_btl,
>> + reg,
>> + &frag->convertor,
>> + MCA_BTL_NO_ORDER,
>> + 0,
>> + &frag->rdma_length,
>> + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>> + MCA_BTL_DES_FLAGS_PUT,
>> + &des );
>>
>> - if( OPAL_UNLIKELY(NULL == des) ) {
>> - if(frag->retries < mca_pml_ob1.rdma_put_retries_limit) {
>> - size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
>> - frag->rdma_length = save_size;
>> - opal_convertor_set_position(&frag->convertor, &offset);
>> - OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>> - opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
>> - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>> - } else {
>> - mca_pml_ob1_send_request_t *sendreq =
>> - (mca_pml_ob1_send_request_t*)frag->rdma_req;
>> + if( OPAL_UNLIKELY(NULL == des) ) {
>> + if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
>> + size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
>> + frag->rdma_length = save_size;
>> + opal_convertor_set_position(&frag->convertor, &offset);
>> + OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>> + opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
>> + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>> + } else {
>> + mca_pml_ob1_send_request_t *sendreq =
>> + (mca_pml_ob1_send_request_t*)frag->rdma_req;
>> +
>> + /* tell receiver to unregister memory */
>> + mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
>> + bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
>> + MCA_BTL_NO_ORDER, 1);
>> +
>> + /* send fragment by copy in/out */
>> + mca_pml_ob1_send_request_copy_in_out(sendreq,
>> + frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
>> + /* if a pointer to a receive request is not set it means that
>> + * ACK was not yet received. Don't schedule sends before ACK */
>> + if(NULL != sendreq->req_recv.pval)
>> + mca_pml_ob1_send_request_schedule(sendreq);
>> + }
>>
>> - /* tell receiver to unregister memory */
>> - mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
>> - bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
>> - MCA_BTL_NO_ORDER, 1);
>> -
>> - /* send fragment by copy in/out */
>> - mca_pml_ob1_send_request_copy_in_out(sendreq,
>> - frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
>> - /* if a pointer to a receive request is not set it means that
>> - * ACK was not yet received. Don't schedule sends before ACK */
>> - if(NULL != sendreq->req_recv.pval)
>> - mca_pml_ob1_send_request_schedule(sendreq);
>> + return OMPI_ERR_OUT_OF_RESOURCE;
>> }
>> - return OMPI_ERR_OUT_OF_RESOURCE;
>> + } else {
>> + /* already have a source descriptor */
>> + des = sendreq->src_des;
>> + sendreq->src_des = NULL;
>> }
>> -
>> - des->des_dst = frag->rdma_segs;
>> +
>> + des->des_dst = frag->rdma_segs;
>> des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
>> - des->des_cbfunc = mca_pml_ob1_put_completion;
>> - des->des_cbdata = frag;
>> + des->des_cbfunc = mca_pml_ob1_put_completion;
>> + des->des_cbdata = frag;
>>
>> PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
>> &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );
>>
>> rc = mca_bml_base_put(bml_btl, des);
>> - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
>> + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>> mca_bml_base_free(bml_btl, des);
>> frag->rdma_length = save_size;
>> if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
>> @@ -1203,6 +1215,7 @@
>> orte_errmgr.abort(-1, NULL);
>> }
>> }
>> +
>> return OMPI_SUCCESS;
>> }
>>
>> @@ -1261,21 +1274,25 @@
>> frag->reg = NULL;
>> frag->retries = 0;
>>
>> + if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
>> + /* get fallback path */
>> + sendreq->req_state = 0;
>> + }
>> +
>> /* lookup the corresponding registration */
>> for(i=0; i<sendreq->req_rdma_cnt; i++) {
>> - if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
>> - frag->reg = sendreq->req_rdma[i].btl_reg;
>> - break;
>> - }
>> - }
>> + if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
>> + frag->reg = sendreq->req_rdma[i].btl_reg;
>> + break;
>> + }
>> + }
>>
>> /* RDMA writes may proceed in parallel to send and to each other, so
>> * create clone of the convertor for each RDMA fragment
>> */
>> size = hdr->hdr_rdma_offset;
>> opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
>> - &frag->convertor, 0, &size);
>> + &frag->convertor, 0, &size);
>>
>> mca_pml_ob1_send_request_put_frag(frag);
>> }
>> -
>>
>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h
>> ==============================================================================
>> --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h (original)
>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>> @@ -54,6 +54,7 @@
>> mca_pml_ob1_send_pending_t req_pending;
>> opal_mutex_t req_send_range_lock;
>> opal_list_t req_send_ranges;
>> + mca_btl_base_descriptor_t *src_des;
>> mca_pml_ob1_com_btl_t req_rdma[1];
>> };
>> typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
>> @@ -129,6 +130,7 @@
>> OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \
>> sendreq = (mca_pml_ob1_send_request_t*)item; \
>> sendreq->req_send.req_base.req_proc = proc; \
>> + sendreq->src_des = NULL; \
>> } \
>> }
>>
>> _______________________________________________
>> svn-full mailing list
>> svn-full_at_[hidden]
>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
>
>
> --
> Jeff Squyres
> jsquyres_at_[hidden]
> For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/
>
>
> _______________________________________________
> devel mailing list
> devel_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>