Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |  

This web mail archive is frozen.

This page is part of a frozen web archive of this mailing list.

You can still navigate around this archive, but note that no new messages have been added to it since July 2016.

Click here to be taken to the new web archives of this list; it includes all the mails that are in this frozen archive plus all new mails that have been sent to the list since it was migrated to the new archives.

Subject: Re: [OMPI devel] [EXTERNAL] Re: [OMPI svn-full] svn:open-mpi r26329
From: Barrett, Brian W (bwbarre_at_[hidden])
Date: 2012-04-24 16:51:49


And I think Jeff made me look at the code when you sent the RFC. Shame on
Jeff for making me review the same code twice ;).

Brian

On 4/24/12 2:47 PM, "Nathan Hjelm" <hjelmn_at_[hidden]> wrote:

>This was RFC'd last month. No one objected :)
>
>-Nathan
>
>On Tue, 24 Apr 2012, Jeffrey Squyres wrote:
>
>> There's some pretty extensive ob1 changes in here.
>>
>> Can we get these reviewed? Brian / George?
>>
>>
>> On Apr 24, 2012, at 4:18 PM, hjelmn_at_[hidden] wrote:
>>
>>> Author: hjelmn
>>> Date: 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> New Revision: 26329
>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/26329
>>>
>>> Log:
>>> ob1: add support for get fallback on put/send
>>> Text files modified:
>>> trunk/ompi/mca/btl/ugni/btl_ugni_get.c | 17 ----
>>> trunk/ompi/mca/btl/ugni/btl_ugni_put.c | 48 --------------
>>> trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h | 7 --
>>> trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c | 5 -
>>> trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h | 1
>>> trunk/ompi/mca/pml/ob1/pml_ob1.c | 5 +
>>> trunk/ompi/mca/pml/ob1/pml_ob1.h | 2
>>> trunk/ompi/mca/pml/ob1/pml_ob1_component.c | 4
>>> trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 15 +++-
>>> trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c | 94 ++++++++++++++++++++++++++--
>>> trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c | 131 ++++++++++++++++++++++-----------------
>>> trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h | 2
>>> 12 files changed, 182 insertions(+), 149 deletions(-)
>>>
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_get.c
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_get.c (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_get.c 2012-04-24 16:18:56 EDT
>>>(Tue, 24 Apr 2012)
>>> @@ -13,19 +13,6 @@
>>> #include "btl_ugni_rdma.h"
>>> #include "btl_ugni_smsg.h"
>>>
>>> -static int mca_btl_ugni_init_put (struct mca_btl_base_module_t *btl,
>>> - mca_btl_ugni_base_frag_t *frag) {
>>> - /* off alignment/off size. switch to put */
>>> - frag->hdr.rdma.src_seg = frag->base.des_src[0];
>>> - frag->hdr.rdma.dst_seg = frag->base.des_dst[0];
>>> - frag->hdr.rdma.ctx = (void *) frag;
>>> -
>>> - /* send the fragment header using smsg. ignore local completion */
>>> - return ompi_mca_btl_ugni_smsg_send (frag, true, &frag->hdr.rdma,
>>> - sizeof (frag->hdr.rdma),
>>>NULL, 0,
>>> - MCA_BTL_UGNI_TAG_PUT_INIT);
>>> -}
>>> -
>>> /**
>>> * Initiate a get operation.
>>> *
>>> @@ -54,7 +41,7 @@
>>>
>>> if (OPAL_UNLIKELY(check || size >
>>>mca_btl_ugni_component.ugni_get_limit)) {
>>> /* switch to put */
>>> - return mca_btl_ugni_init_put (btl, frag);
>>> + return OMPI_ERR_NOT_AVAILABLE;
>>> }
>>>
>>> if (NULL != frag->base.des_cbfunc) {
>>> @@ -68,7 +55,7 @@
>>> return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_GET,
>>>des->des_dst, des->des_src);
>>> }
>>>
>>> -void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t
>>>*frag, int rc)
>>> +static void mca_btl_ugni_callback_rdma_complete
>>>(mca_btl_ugni_base_frag_t *frag, int rc)
>>> {
>>> BTL_VERBOSE(("rdma operation for rem_ctx %p complete",
>>>frag->hdr.rdma.ctx));
>>>
>>>
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_put.c
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_put.c (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_put.c 2012-04-24 16:18:56 EDT
>>>(Tue, 24 Apr 2012)
>>> @@ -46,51 +46,3 @@
>>>
>>> return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_PUT,
>>>des->des_src, des->des_dst);
>>> }
>>> -
>>> -/* reversed get */
>>> -static void mca_btl_ugni_callback_put_retry (mca_btl_ugni_base_frag_t
>>>*frag, int rc)
>>> -{
>>> - (void) mca_btl_ugni_start_put(frag->endpoint, frag->hdr.rdma,
>>>frag);
>>> -}
>>> -
>>> -int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
>>> - mca_btl_ugni_rdma_frag_hdr_t hdr,
>>> - mca_btl_ugni_base_frag_t *frag)
>>> -{
>>> - int rc;
>>> -
>>> - BTL_VERBOSE(("starting reverse get (put) for remote ctx: %p",
>>>hdr.ctx));
>>> -
>>> - if (NULL == frag) {
>>> - rc = MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag);
>>> - if (OPAL_UNLIKELY(NULL == frag)) {
>>> - BTL_ERROR(("error allocating rdma frag for reverse get.
>>>rc = %d. fl_num_allocated = %d", rc,
>>> - ep->btl->rdma_int_frags.fl_num_allocated));
>>> - return rc;
>>> - }
>>> - }
>>> -
>>> - frag->hdr.rdma = hdr;
>>> -
>>> - frag->base.des_cbfunc = NULL;
>>> - frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
>>> -
>>> - frag->segments[0] = hdr.src_seg;
>>> - frag->base.des_src = frag->segments;
>>> - frag->base.des_src_cnt = 1;
>>> -
>>> - frag->segments[1] = hdr.dst_seg;
>>> - frag->base.des_dst = frag->segments + 1;
>>> - frag->base.des_dst_cnt = 1;
>>> -
>>> - rc = mca_btl_ugni_put (&ep->btl->super, ep, &frag->base);
>>> - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>> - frag->cbfunc = mca_btl_ugni_callback_put_retry;
>>> - opal_list_append (&ep->btl->failed_frags, (opal_list_item_t
>>>*) frag);
>>> - return rc;
>>> - }
>>> -
>>> - frag->cbfunc = mca_btl_ugni_callback_rdma_complete;
>>> -
>>> - return OMPI_SUCCESS;
>>> -}
>>>
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h 2012-04-24 16:18:56 EDT
>>>(Tue, 24 Apr 2012)
>>> @@ -16,17 +16,10 @@
>>> #include "btl_ugni.h"
>>> #include "btl_ugni_frag.h"
>>>
>>> -/* mca_btl_ugni_start_put: get operation could not be completed.
>>>start put instead */
>>> -int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
>>> - mca_btl_ugni_rdma_frag_hdr_t hdr,
>>> - mca_btl_ugni_base_frag_t *frag);
>>> -
>>> int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
>>> mca_btl_ugni_eager_ex_frag_hdr_t hdr,
>>> mca_btl_ugni_base_frag_t *frag);
>>>
>>> -void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t
>>>*frag, int rc);
>>> -
>>> static inline int init_gni_post_desc(mca_btl_ugni_base_frag_t *frag,
>>> gni_post_type_t op_type,
>>> uint64_t lcl_addr,
>>>
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c 2012-04-24 16:18:56 EDT
>>>(Tue, 24 Apr 2012)
>>> @@ -78,11 +78,6 @@
>>> reg->cbfunc(&ep->btl->super, tag, &(frag.base),
>>>reg->cbdata);
>>>
>>> break;
>>> - case MCA_BTL_UGNI_TAG_PUT_INIT:
>>> - frag.hdr.rdma = ((mca_btl_ugni_rdma_frag_hdr_t *)
>>>data_ptr)[0];
>>> -
>>> - mca_btl_ugni_start_put (ep, frag.hdr.rdma, NULL);
>>> - break;
>>> case MCA_BTL_UGNI_TAG_GET_INIT:
>>> frag.hdr.eager_ex = ((mca_btl_ugni_eager_ex_frag_hdr_t *)
>>>data_ptr)[0];
>>>
>>>
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h 2012-04-24 16:18:56 EDT
>>>(Tue, 24 Apr 2012)
>>> @@ -21,7 +21,6 @@
>>> typedef enum {
>>> MCA_BTL_UGNI_TAG_SEND,
>>> MCA_BTL_UGNI_TAG_DISCONNECT,
>>> - MCA_BTL_UGNI_TAG_PUT_INIT,
>>> MCA_BTL_UGNI_TAG_GET_INIT,
>>> MCA_BTL_UGNI_TAG_RDMA_COMPLETE
>>> } mca_btl_ugni_smsg_tag_t;
>>>
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1.c
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1.c (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1.c 2012-04-24 16:18:56 EDT (Tue,
>>>24 Apr 2012)
>>> @@ -147,6 +147,7 @@
>>> OBJ_CONSTRUCT(&mca_pml_ob1.recv_pending, opal_list_t);
>>> OBJ_CONSTRUCT(&mca_pml_ob1.pckt_pending, opal_list_t);
>>> OBJ_CONSTRUCT(&mca_pml_ob1.rdma_pending, opal_list_t);
>>> +
>>> /* missing communicator pending list */
>>> OBJ_CONSTRUCT(&mca_pml_ob1.non_existing_communicator_pending,
>>>opal_list_t);
>>>
>>> @@ -599,8 +600,10 @@
>>> OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> if(NULL == frag)
>>> break;
>>> +
>>> + frag->retries++;
>>> +
>>> if(frag->rdma_state == MCA_PML_OB1_RDMA_PUT) {
>>> - frag->retries++;
>>> rc = mca_pml_ob1_send_request_put_frag(frag);
>>> } else {
>>> rc = mca_pml_ob1_recv_request_get_frag(frag);
>>>
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1.h
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1.h (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1.h 2012-04-24 16:18:56 EDT (Tue,
>>>24 Apr 2012)
>>> @@ -52,7 +52,7 @@
>>> int free_list_inc; /* number of elements to grow free list */
>>> size_t send_pipeline_depth;
>>> size_t recv_pipeline_depth;
>>> - size_t rdma_put_retries_limit;
>>> + size_t rdma_retries_limit;
>>> int max_rdma_per_request;
>>> int max_send_per_range;
>>> bool leave_pinned;
>>>
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_component.c
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_component.c (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_component.c 2012-04-24 16:18:56
>>>EDT (Tue, 24 Apr 2012)
>>> @@ -112,8 +112,8 @@
>>> mca_pml_ob1_param_register_int("send_pipeline_depth", 3);
>>> mca_pml_ob1.recv_pipeline_depth =
>>> mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
>>> - mca_pml_ob1.rdma_put_retries_limit =
>>> - mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
>>> + mca_pml_ob1.rdma_retries_limit =
>>> + mca_pml_ob1_param_register_int("rdma_retries_limit", 5);
>>> mca_pml_ob1.max_rdma_per_request =
>>> mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
>>> mca_pml_ob1.max_send_per_range =
>>>
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c 2012-04-24 16:18:56
>>>EDT (Tue, 24 Apr 2012)
>>> @@ -294,15 +294,22 @@
>>> if( OPAL_UNLIKELY(segments->seg_len <
>>>sizeof(mca_pml_ob1_common_hdr_t)) ) {
>>> return;
>>> }
>>> -
>>> +
>>> ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_ACK);
>>> sendreq =
>>>(mca_pml_ob1_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
>>> sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
>>> -
>>> +
>>> /* if the request should be delivered entirely by copy in/out
>>> * then throttle sends */
>>> - if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA)
>>> + if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
>>> + if (NULL != sendreq->src_des) {
>>> + /* release registered memory */
>>> + mca_bml_base_free (sendreq->req_rdma[0].bml_btl,
>>>sendreq->src_des);
>>> + sendreq->src_des = NULL;
>>> + }
>>> +
>>> sendreq->req_throttle_sends = true;
>>> + }
>>>
>>> mca_pml_ob1_send_request_copy_in_out(sendreq,
>>> hdr->hdr_ack.hdr_send_offset,
>>> @@ -324,7 +331,7 @@
>>>
>>> if(send_request_pml_complete_check(sendreq) == false)
>>> mca_pml_ob1_send_request_schedule(sendreq);
>>> -
>>> +
>>> return;
>>> }
>>>
>>>
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c 2012-04-24 16:18:56
>>>EDT (Tue, 24 Apr 2012)
>>> @@ -352,6 +352,66 @@
>>> }
>>>
>>>
>>> +static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t
>>>*frag,
>>> + mca_btl_base_descriptor_t
>>>*dst) {
>>> + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t
>>>*) frag->rdma_req;
>>> + mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
>>> + mca_btl_base_descriptor_t *ctl;
>>> + mca_pml_ob1_rdma_hdr_t *hdr;
>>> + size_t hdr_size;
>>> + unsigned int i;
>>> + int rc;
>>> +
>>> + /* prepare a descriptor for rdma control message */
>>> + hdr_size = sizeof (mca_pml_ob1_rdma_hdr_t);
>>> + if (dst->des_dst_cnt > 1) {
>>> + hdr_size += (sizeof (mca_btl_base_segment_t) *
>>> + (dst->des_dst_cnt-1));
>>> + }
>>> +
>>> + mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, hdr_size,
>>> + MCA_BTL_DES_FLAGS_PRIORITY |
>>>MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> + MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
>>> + if (OPAL_UNLIKELY(NULL == ctl)) {
>>> + return OMPI_ERR_OUT_OF_RESOURCE;
>>> + }
>>> + ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
>>> +
>>> + /* fill in rdma header */
>>> + hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_src->seg_addr.pval;
>>> + hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
>>> + hdr->hdr_common.hdr_flags =
>>> + (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
>>> +
>>> + hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
>>> + hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
>>> + hdr->hdr_des.pval = dst;
>>> +
>>> + hdr->hdr_seg_cnt = dst->des_dst_cnt;
>>> +
>>> + for (i = 0 ; i < dst->des_dst_cnt ; ++i) {
>>> + hdr->hdr_segs[i].seg_addr.lval =
>>>ompi_ptr_ptol(dst->des_dst[i].seg_addr.pval);
>>> + hdr->hdr_segs[i].seg_len = dst->des_dst[i].seg_len;
>>> + hdr->hdr_segs[i].seg_key.key64[0] =
>>>dst->des_dst[i].seg_key.key64[0];
>>> + hdr->hdr_segs[i].seg_key.key64[1] =
>>>dst->des_dst[i].seg_key.key64[1];
>>> + }
>>> +
>>> + dst->des_cbfunc = mca_pml_ob1_put_completion;
>>> + dst->des_cbdata = recvreq;
>>> +
>>> + if (!recvreq->req_ack_sent)
>>> + recvreq->req_ack_sent = true;
>>> +
>>> + /* send rdma request to peer */
>>> + rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
>>> + if (OPAL_UNLIKELY(rc < 0)) {
>>> + mca_bml_base_free (bml_btl, ctl);
>>> + return rc;
>>> + }
>>> +
>>> + return OMPI_SUCCESS;
>>> +}
>>> +
>>> /*
>>> *
>>> */
>>> @@ -371,14 +431,25 @@
>>> 0,
>>> &frag->rdma_length,
>>> MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>>MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
>>> - MCA_BTL_DES_FLAGS_GET,
>>> + MCA_BTL_DES_FLAGS_GET,
>>> &descriptor );
>>> if( OPAL_UNLIKELY(NULL == descriptor) ) {
>>> - frag->rdma_length = save_size;
>>> - OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> - opal_list_append(&mca_pml_ob1.rdma_pending,
>>>(opal_list_item_t*)frag);
>>> - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> - return OMPI_ERR_OUT_OF_RESOURCE;
>>> + if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
>>> + frag->rdma_length = save_size;
>>> + OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> + opal_list_append(&mca_pml_ob1.rdma_pending,
>>>(opal_list_item_t*)frag);
>>> + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> + return OMPI_ERR_OUT_OF_RESOURCE;
>>> + } else {
>>> + ompi_proc_t *proc = (ompi_proc_t *)
>>>recvreq->req_recv.req_base.req_proc;
>>> +
>>> + /* tell peer to fall back on send */
>>> + recvreq->req_send_offset = 0;
>>> + rc = mca_pml_ob1_recv_request_ack_send(proc,
>>>frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
>>> + recvreq,
>>>recvreq->req_send_offset, true);
>>> + MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
>>> + return rc;
>>> + }
>>> }
>>>
>>> descriptor->des_src = frag->rdma_segs;
>>> @@ -393,6 +464,11 @@
>>> /* queue up get request */
>>> rc = mca_bml_base_get(bml_btl,descriptor);
>>> if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
>>> + if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
>>> + /* get isn't supported for this transfer. tell peer to
>>>fallback on put */
>>> + rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
>>> + }
>>> +
>>> if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
>>> mca_bml_base_free(bml_btl, descriptor);
>>> OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> @@ -400,7 +476,7 @@
>>> (opal_list_item_t*)frag);
>>> OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> return OMPI_ERR_OUT_OF_RESOURCE;
>>> - } else {
>>> + } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>> ORTE_ERROR_LOG(rc);
>>> orte_errmgr.abort(-1, NULL);
>>> }
>>> @@ -551,7 +627,9 @@
>>> orte_errmgr.abort(-1, NULL);
>>> }
>>> #endif /* OMPI_CUDA_SUPPORT */
>>> +
>>> frag->rdma_hdr.hdr_rget = *hdr;
>>> + frag->retries = 0;
>>> frag->rdma_req = recvreq;
>>> frag->rdma_ep = bml_endpoint;
>>> frag->rdma_length = size;
>>> @@ -792,7 +870,7 @@
>>> mca_bml_base_prepare_dst(bml_btl, reg,
>>>
>>>&recvreq->req_recv.req_base.req_convertor,
>>> MCA_BTL_NO_ORDER, 0, &size,
>>>MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> - MCA_BTL_DES_FLAGS_PUT, &dst);
>>> + MCA_BTL_DES_FLAGS_PUT, &dst);
>>> OPAL_THREAD_UNLOCK(&recvreq->lock);
>>>
>>> if(OPAL_UNLIKELY(dst == NULL)) {
>>>
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c 2012-04-24 16:18:56
>>>EDT (Tue, 24 Apr 2012)
>>> @@ -264,6 +264,7 @@
>>> MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
>>> 0, req_bytes_delivered );
>>> OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered,
>>>req_bytes_delivered);
>>> + sendreq->src_des = NULL;
>>>
>>> send_request_pml_complete_check(sendreq);
>>> /* free the descriptor */
>>> @@ -639,6 +640,8 @@
>>> bool need_local_cb = false;
>>> int rc;
>>>
>>> + sendreq->src_des = NULL;
>>> +
>>> bml_btl = sendreq->req_rdma[0].bml_btl;
>>> if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags &
>>>(MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
>>> mca_mpool_base_registration_t* reg =
>>>sendreq->req_rdma[0].btl_reg;
>>> @@ -657,10 +660,8 @@
>>> mca_bml_base_prepare_src( bml_btl,
>>> reg,
>>>
>>>&sendreq->req_send.req_base.req_convertor,
>>> - MCA_BTL_NO_ORDER,
>>> - 0,
>>> - &size,
>>> - MCA_BTL_DES_FLAGS_GET,
>>> + MCA_BTL_NO_ORDER, 0, &size,
>>> + MCA_BTL_DES_FLAGS_GET |
>>>MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
>>> &src );
>>> MEMCHECKER(
>>> memchecker_call(&opal_memchecker_base_mem_noaccess,
>>> @@ -676,6 +677,8 @@
>>> src->des_cbfunc = mca_pml_ob1_rget_completion;
>>> src->des_cbdata = sendreq;
>>>
>>> + sendreq->src_des = src;
>>> +
>>> /* allocate space for get hdr + segment list */
>>> mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
>>> sizeof(mca_pml_ob1_rget_hdr_t) +
>>> @@ -782,8 +785,9 @@
>>> return OMPI_SUCCESS;
>>> }
>>> mca_bml_base_free(bml_btl, des);
>>> - if (NULL != src) {
>>> - mca_bml_base_free (bml_btl, src);
>>> + if (sendreq->src_des) {
>>> + mca_bml_base_free (bml_btl, sendreq->src_des);
>>> + sendreq->src_des = NULL;
>>> }
>>>
>>> return rc;
>>> @@ -1133,63 +1137,71 @@
>>> MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
>>> }
>>>
>>> -int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag )
>>> -{
>>> - mca_mpool_base_registration_t* reg = NULL;
>>> - mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
>>> - mca_btl_base_descriptor_t* des;
>>> +int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
>>> +{
>>> + mca_pml_ob1_send_request_t* sendreq =
>>>(mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> + mca_mpool_base_registration_t *reg = NULL;
>>> + mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
>>> + mca_btl_base_descriptor_t *des;
>>> size_t save_size = frag->rdma_length;
>>> int rc;
>>>
>>> - /* setup descriptor */
>>> - mca_bml_base_prepare_src( bml_btl,
>>> - reg,
>>> - &frag->convertor,
>>> - MCA_BTL_NO_ORDER,
>>> - 0,
>>> - &frag->rdma_length,
>>> - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> - MCA_BTL_DES_FLAGS_PUT,
>>> - &des );
>>> + if (OPAL_LIKELY(NULL == sendreq->src_des)) {
>>> + /* setup descriptor */
>>> + mca_bml_base_prepare_src( bml_btl,
>>> + reg,
>>> + &frag->convertor,
>>> + MCA_BTL_NO_ORDER,
>>> + 0,
>>> + &frag->rdma_length,
>>> + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> + MCA_BTL_DES_FLAGS_PUT,
>>> + &des );
>>>
>>> - if( OPAL_UNLIKELY(NULL == des) ) {
>>> - if(frag->retries < mca_pml_ob1.rdma_put_retries_limit) {
>>> - size_t offset =
>>>(size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
>>> - frag->rdma_length = save_size;
>>> - opal_convertor_set_position(&frag->convertor, &offset);
>>> - OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> - opal_list_append(&mca_pml_ob1.rdma_pending,
>>>(opal_list_item_t*)frag);
>>> - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> - } else {
>>> - mca_pml_ob1_send_request_t *sendreq =
>>> - (mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> + if( OPAL_UNLIKELY(NULL == des) ) {
>>> + if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
>>> + size_t offset =
>>>(size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
>>> + frag->rdma_length = save_size;
>>> + opal_convertor_set_position(&frag->convertor,
>>>&offset);
>>> + OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> + opal_list_append(&mca_pml_ob1.rdma_pending,
>>>(opal_list_item_t*)frag);
>>> + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> + } else {
>>> + mca_pml_ob1_send_request_t *sendreq =
>>> + (mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> +
>>> + /* tell receiver to unregister memory */
>>> +
>>>mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
>>> + bml_btl,
>>>frag->rdma_hdr.hdr_rdma.hdr_des,
>>> + MCA_BTL_NO_ORDER, 1);
>>> +
>>> + /* send fragment by copy in/out */
>>> + mca_pml_ob1_send_request_copy_in_out(sendreq,
>>> +
>>>frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
>>> + /* if a pointer to a receive request is not set it
>>>means that
>>> + * ACK was not yet received. Don't schedule sends
>>>before ACK */
>>> + if(NULL != sendreq->req_recv.pval)
>>> + mca_pml_ob1_send_request_schedule(sendreq);
>>> + }
>>>
>>> - /* tell receiver to unregister memory */
>>> - mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
>>> - bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
>>> - MCA_BTL_NO_ORDER, 1);
>>> -
>>> - /* send fragment by copy in/out */
>>> - mca_pml_ob1_send_request_copy_in_out(sendreq,
>>> - frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
>>>frag->rdma_length);
>>> - /* if a pointer to a receive request is not set it means
>>>that
>>> - * ACK was not yet received. Don't schedule sends before
>>>ACK */
>>> - if(NULL != sendreq->req_recv.pval)
>>> - mca_pml_ob1_send_request_schedule(sendreq);
>>> + return OMPI_ERR_OUT_OF_RESOURCE;
>>> }
>>> - return OMPI_ERR_OUT_OF_RESOURCE;
>>> + } else {
>>> + /* already have a source descriptor */
>>> + des = sendreq->src_des;
>>> + sendreq->src_des = NULL;
>>> }
>>> -
>>> - des->des_dst = frag->rdma_segs;
>>> +
>>> + des->des_dst = frag->rdma_segs;
>>> des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
>>> - des->des_cbfunc = mca_pml_ob1_put_completion;
>>> - des->des_cbdata = frag;
>>> + des->des_cbfunc = mca_pml_ob1_put_completion;
>>> + des->des_cbdata = frag;
>>>
>>> PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
>>>
>>>&(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base),
>>>save_size, PERUSE_SEND );
>>>
>>> rc = mca_bml_base_put(bml_btl, des);
>>> - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
>>> + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>> mca_bml_base_free(bml_btl, des);
>>> frag->rdma_length = save_size;
>>> if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
>>> @@ -1203,6 +1215,7 @@
>>> orte_errmgr.abort(-1, NULL);
>>> }
>>> }
>>> +
>>> return OMPI_SUCCESS;
>>> }
>>>
>>> @@ -1261,21 +1274,25 @@
>>> frag->reg = NULL;
>>> frag->retries = 0;
>>>
>>> + if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
>>> + /* get fallback path */
>>> + sendreq->req_state = 0;
>>> + }
>>> +
>>> /* lookup the corresponding registration */
>>> for(i=0; i<sendreq->req_rdma_cnt; i++) {
>>> - if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
>>> - frag->reg = sendreq->req_rdma[i].btl_reg;
>>> - break;
>>> - }
>>> - }
>>> + if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
>>> + frag->reg = sendreq->req_rdma[i].btl_reg;
>>> + break;
>>> + }
>>> + }
>>>
>>> /* RDMA writes may proceed in parallel to send and to each other,
>>>so
>>> * create clone of the convertor for each RDMA fragment
>>> */
>>> size = hdr->hdr_rdma_offset;
>>>
>>>opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_conve
>>>rtor,
>>> - &frag->convertor, 0, &size);
>>> + &frag->convertor, 0, &size);
>>>
>>> mca_pml_ob1_send_request_put_frag(frag);
>>> }
>>> -
>>>
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h
>>>
>>>========================================================================
>>>======
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h 2012-04-24 16:18:56
>>>EDT (Tue, 24 Apr 2012)
>>> @@ -54,6 +54,7 @@
>>> mca_pml_ob1_send_pending_t req_pending;
>>> opal_mutex_t req_send_range_lock;
>>> opal_list_t req_send_ranges;
>>> + mca_btl_base_descriptor_t *src_des;
>>> mca_pml_ob1_com_btl_t req_rdma[1];
>>> };
>>> typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
>>> @@ -129,6 +130,7 @@
>>> OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item,
>>>rc); \
>>> sendreq = (mca_pml_ob1_send_request_t*)item;
>>> \
>>> sendreq->req_send.req_base.req_proc = proc;
>>> \
>>> + sendreq->src_des = NULL;
>>> \
>>> }
>>> \
>>> }
>>>
>>> _______________________________________________
>>> svn-full mailing list
>>> svn-full_at_[hidden]
>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
>>
>>
>> --
>> Jeff Squyres
>> jsquyres_at_[hidden]
>> For corporate legal information go to:
>>http://www.cisco.com/web/about/doing_business/legal/cri/
>>
>>
>> _______________________________________________
>> devel mailing list
>> devel_at_[hidden]
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>
>_______________________________________________
>devel mailing list
>devel_at_[hidden]
>http://www.open-mpi.org/mailman/listinfo.cgi/devel
>
>

-- 
  Brian W. Barrett
  Dept. 1423: Scalable System Software
  Sandia National Laboratories