diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 61f5db8..97f5c8a 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -157,6 +157,14 @@ static int mca_bml_r2_add_btls( void ) return OMPI_SUCCESS; } +static int btl_bandwidth_compare(const void *v1, const void *v2) +{ + mca_bml_base_btl_t *b1 = (mca_bml_base_btl_t*)v1, + *b2 = (mca_bml_base_btl_t*)v2; + + return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth; +} + /* * For each proc setup a datastructure that indicates the PTLs * that can be used to reach the destination. @@ -380,6 +388,11 @@ int mca_bml_r2_add_procs( * start using the weight to compute the correct amount. */ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + + /* sort BTLs in descending order according to bandwidth value */ + qsort(bml_endpoint->btl_send.bml_btls, n_size, + sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); + bml_endpoint->bml_max_send_length = 0; bml_endpoint->bml_max_rdma_length = 0; bml_endpoint->btl_rdma_index = 0; diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index fd528fa..8b06c99 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -536,3 +536,15 @@ int mca_pml_ob1_ft_event( int state ) return OMPI_SUCCESS; } + +int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2) +{ + const mca_pml_ob1_com_btl_t *b1 = v1, *b2 = v2; + + if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight) + return 1; + if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight) + return -1; + + return 0; +} diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index 50c8f31..e9c5403 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -55,6 +55,8 @@ struct mca_pml_ob1_t { size_t send_pipeline_depth; size_t recv_pipeline_depth; size_t rdma_put_retries_limit; + int max_rdma_per_request; + int max_send_per_range; bool leave_pinned; int leave_pinned_pipeline; @@ -241,8 +243,6 @@ struct mca_pml_ob1_rdma_reg_t { }; typedef struct mca_pml_ob1_rdma_reg_t mca_pml_ob1_rdma_reg_t; -#define MCA_PML_OB1_MAX_REGISTRATIONS 4 - struct mca_pml_ob1_pckt_pending_t { ompi_free_list_item_t super; ompi_proc_t* proc; @@ -326,4 +326,53 @@ do { \ length -= hdrlen; \ } while(0) +/* represent BTL chosen for sending request */ +struct mca_pml_ob1_com_btl_t { + mca_bml_base_btl_t *bml_btl; + struct mca_mpool_base_registration_t* btl_reg; + size_t length; +}; +typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t; + +int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2); + +/* Calculate what percentage of a message to send through each BTL according to + * relative weight */ +static inline void mca_pml_ob1_calc_weighted_length( + mca_pml_ob1_com_btl_t *btls, int num_btls, size_t size, + double weight_total) +{ + int i; + size_t length_left = size; + + /* shortcut for common case for only one BTL */ + if(num_btls == 1) { + btls[0].length = size; + return; + } + + /* sort BTLs according of their weights so BTLs with smaller weight will + * not hijack all of the traffic */ + qsort(btls, num_btls, sizeof(mca_pml_ob1_com_btl_t), + mca_pml_ob1_com_btl_comp); + + for(i = 0; i < num_btls; i++) { + mca_bml_base_btl_t* bml_btl = btls[i].bml_btl; + size_t length = 0; + if(length_left != 0) { + length = (length_left > bml_btl->btl_eager_limit)? + ((size_t)(size * (bml_btl->btl_weight / weight_total))) : + length_left; + + if(length > length_left) + length = length_left; + length_left -= length; + } + btls[i].length = length; + } + + /* account for rounding errors */ + btls[0].length += length_left; +} + #endif diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index d3237e5..b35f4ee 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -113,6 +113,10 @@ int mca_pml_ob1_component_open(void) mca_pml_ob1_param_register_int("recv_pipeline_depth", 4); mca_pml_ob1.rdma_put_retries_limit = mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5); + mca_pml_ob1.max_rdma_per_request = + mca_pml_ob1_param_register_int("max_rdma_per_request", 4); + mca_pml_ob1.max_send_per_range = + mca_pml_ob1_param_register_int("max_send_per_range", 4); mca_pml_ob1.unexpected_limit = mca_pml_ob1_param_register_int("unexpected_limit", 128); @@ -146,7 +150,8 @@ int mca_pml_ob1_component_open(void) OBJ_CONSTRUCT(&mca_pml_ob1.send_requests, ompi_free_list_t); ompi_free_list_init( &mca_pml_ob1.send_requests, - sizeof(mca_pml_ob1_send_request_t), + sizeof(mca_pml_ob1_send_request_t) + + (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_send_request_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, @@ -156,7 +161,8 @@ int mca_pml_ob1_component_open(void) OBJ_CONSTRUCT(&mca_pml_ob1.recv_requests, ompi_free_list_t); ompi_free_list_init( &mca_pml_ob1.recv_requests, - sizeof(mca_pml_ob1_recv_request_t), + sizeof(mca_pml_ob1_recv_request_t) + + (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_recv_request_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, @@ -185,8 +191,6 @@ int mca_pml_ob1_component_open(void) mca_pml_ob1.free_list_inc, NULL); - - OBJ_CONSTRUCT(&mca_pml_ob1.pending_pckts, ompi_free_list_t); ompi_free_list_init( &mca_pml_ob1.pending_pckts, @@ -202,7 +206,8 @@ int mca_pml_ob1_component_open(void) OBJ_CONSTRUCT(&mca_pml_ob1.send_ranges, ompi_free_list_t); ompi_free_list_init( &mca_pml_ob1.send_ranges, - sizeof(mca_pml_ob1_send_range_t), + sizeof(mca_pml_ob1_send_range_t) + + (mca_pml_ob1.max_send_per_range - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_send_range_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c index c8eec33..4e64a87 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c @@ -30,6 +30,11 @@ #include "pml_ob1.h" #include "pml_ob1_rdma.h" +/* Use this registration if no registration needed for a BTL instead of NULL. + * This will help other code to distinguish case when memory is not registered + * from case when registration is not needed */ +static mca_mpool_base_registration_t pml_ob1_dummy_reg; + /* * Check to see if memory is registered or can be registered. Build a * set of registrations on the request. @@ -39,11 +44,11 @@ size_t mca_pml_ob1_rdma_btls( mca_bml_base_endpoint_t* bml_endpoint, unsigned char* base, size_t size, - mca_pml_ob1_rdma_btl_t* rdma_btls) + mca_pml_ob1_com_btl_t* rdma_btls) { - size_t num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); - size_t num_btls_used = 0; - size_t n; + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); + double weight_total = 0; + int num_btls_used = 0, n; /* shortcut when there are no rdma capable btls */ if(num_btls == 0) { @@ -51,7 +56,7 @@ size_t mca_pml_ob1_rdma_btls( } /* check to see if memory is registered */ - for(n = 0; n < num_btls && num_btls_used < MCA_PML_OB1_MAX_RDMA_PER_REQUEST; + for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, @@ -59,10 +64,7 @@ size_t mca_pml_ob1_rdma_btls( mca_mpool_base_registration_t* reg = NULL; mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool; - /* btl is rdma capable and registration is not required */ - if(NULL == btl_mpool) { - reg = NULL; - } else { + if(NULL != btl_mpool) { if(!mca_pml_ob1.leave_pinned) { /* look through existing registrations */ btl_mpool->mpool_find(btl_mpool, base, size, ®); @@ -73,14 +75,52 @@ size_t mca_pml_ob1_rdma_btls( if(NULL == reg) bml_btl = NULL; /* skip it */ + } else { + /* if registration is not required use dummy registration */ + reg = &pml_ob1_dummy_reg; } if(bml_btl != NULL) { rdma_btls[num_btls_used].bml_btl = bml_btl; rdma_btls[num_btls_used].btl_reg = reg; + weight_total += bml_btl->btl_weight; num_btls_used++; } } + + /* if we don't use leave_pinned and all BTLs that already have this memory + * registered amount to less then half of available bandwidth - fall back to + * pipeline protocol */ + if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) + return 0; + + mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size, + weight_total); + bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls; return num_btls_used; } + +size_t mca_pml_ob1_rdma_pipeline_btls( + mca_bml_base_endpoint_t* bml_endpoint, + size_t size, + mca_pml_ob1_com_btl_t* rdma_btls) +{ + int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); + double weight_total = 0; + + for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) { + rdma_btls[i].bml_btl = + mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma); + if(rdma_btls[i].bml_btl->btl_mpool != NULL) + rdma_btls[i].btl_reg = NULL; + else + rdma_btls[i].btl_reg = &pml_ob1_dummy_reg; + + weight_total += rdma_btls[i].bml_btl->btl_weight; + } + + mca_pml_ob1_calc_weighted_length(rdma_btls, i, size, weight_total); + + return i; +} diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.h b/ompi/mca/pml/ob1/pml_ob1_rdma.h index d2c983c..3ed0655 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.h +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.h @@ -24,27 +24,18 @@ struct mca_bml_base_endpoint_t; -/** - * structure to associate RDMA capable BTL(s) with corresponding registration - */ - -struct mca_pml_ob1_rdma_btl_t { - struct mca_bml_base_btl_t* bml_btl; - struct mca_mpool_base_registration_t* btl_reg; -}; -typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t; - - -#define MCA_PML_OB1_MAX_RDMA_PER_REQUEST 4 - - /* * Of the set of available btls that support RDMA, * find those that already have registrations - or * register if required (for leave_pinned option) */ size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint, - unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls); + unsigned char* base, size_t size, struct mca_pml_ob1_com_btl_t* btls); +/* Choose RDMA BTLs to use for sending of a request by pipeline protocol. + * Calculate number of bytes to send through each BTL according to available + * bandwidth */ +size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint, + size_t size, mca_pml_ob1_com_btl_t* rdma_btls); #endif diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 33c7c3e..c47003c 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -240,13 +240,15 @@ static int mca_pml_ob1_recv_request_ack( /* by default copy everything */ recvreq->req_send_offset = bytes_received; if(hdr->hdr_msg_length > bytes_received) { + size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); /* * lookup request buffer to determine if memory is already * registered. */ if(ompi_convertor_need_buffers(&recvreq->req_recv.req_convertor) == 0 && - hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG) { + hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG && + rdma_num != 0) { unsigned char *base; ompi_convertor_get_current_pointer( &recvreq->req_recv.req_convertor, (void**)&(base) ); @@ -261,19 +263,23 @@ static int mca_pml_ob1_recv_request_ack( recvreq->req_send_offset = hdr->hdr_msg_length; /* are rdma devices available for long rdma protocol */ - } else if (bml_endpoint->btl_send_limit < hdr->hdr_msg_length && - bml_endpoint->btl_pipeline_send_length < - hdr->hdr_msg_length && - mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) { - + } else if(bml_endpoint->btl_send_limit < hdr->hdr_msg_length) { /* use convertor to figure out the rdma offset for this request */ recvreq->req_send_offset = hdr->hdr_msg_length - bml_endpoint->btl_pipeline_send_length; - if(recvreq->req_send_offset < bytes_received) { + + if(recvreq->req_send_offset < bytes_received) recvreq->req_send_offset = bytes_received; - } - ompi_convertor_set_position( &recvreq->req_recv.req_convertor, - &recvreq->req_send_offset ); + + /* use converter to figure out the rdma offset for this + * request */ + ompi_convertor_set_position(&recvreq->req_recv.req_convertor, + &recvreq->req_send_offset); + + recvreq->req_rdma_cnt = + mca_pml_ob1_rdma_pipeline_btls(bml_endpoint, + recvreq->req_send_offset - bytes_received, + recvreq->req_rdma); } } /* nothing to send by copy in/out - no need to ack */ @@ -592,14 +598,8 @@ void mca_pml_ob1_recv_request_matched_probe( int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* recvreq ) { ompi_proc_t* proc = recvreq->req_recv.req_base.req_proc; - mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; mca_bml_base_btl_t* bml_btl; - int num_btl_avail = - mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); - int num_tries = num_btl_avail; - - if(recvreq->req_rdma_cnt) - num_tries = recvreq->req_rdma_cnt; + int num_tries = recvreq->req_rdma_cnt; do { size_t bytes_remaining = recvreq->req_send_offset - @@ -615,8 +615,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec mca_btl_base_descriptor_t* dst; mca_btl_base_descriptor_t* ctl; mca_mpool_base_registration_t * reg = NULL; - int rc; - + int rc, rdma_idx; + if(prev_bytes_remaining == bytes_remaining) { if( ++num_fail == num_tries ) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); @@ -636,47 +636,18 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset); - if(recvreq->req_rdma_cnt) { - /* - * Select the next btl out of the list w/ preregistered - * memory. - */ - bml_btl = recvreq->req_rdma[recvreq->req_rdma_idx].bml_btl; - num_btl_avail = recvreq->req_rdma_cnt - recvreq->req_rdma_idx; - reg = recvreq->req_rdma[recvreq->req_rdma_idx].btl_reg; - - if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt) + do { + rdma_idx = recvreq->req_rdma_idx; + bml_btl = recvreq->req_rdma[rdma_idx].bml_btl; + reg = recvreq->req_rdma[rdma_idx].btl_reg; + size = recvreq->req_rdma[rdma_idx].length; + if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt) recvreq->req_rdma_idx = 0; - } else { - /* - * Otherwise, schedule round-robin across the - * available RDMA nics dynamically registering/deregister - * as required. - */ - bml_btl = - mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma); - } + } while(!size); - /* - * If more than one NIC is available - try to use both for - * anything larger than the eager limit - */ - if( num_btl_avail == 1 || - bytes_remaining < bml_btl->btl_eager_limit ) { - size = bytes_remaining; - } else { - /* - * otherwise attempt to give the BTL a percentage of - * the message based on a weighting factor. for - * simplicity calculate this as a percentage of the - * overall message length (regardless of amount - * previously assigned) - */ - size = (size_t)(bml_btl->btl_weight * bytes_remaining); - } /* makes sure that we don't exceed BTL max rdma size * if memory is not pinned already */ - if(0 == recvreq->req_rdma_cnt && + if(NULL == reg && bml_btl->btl_rdma_pipeline_frag_size != 0 && size > bml_btl->btl_rdma_pipeline_frag_size) { size = bml_btl->btl_rdma_pipeline_frag_size; @@ -684,8 +655,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec /* prepare a descriptor for RDMA */ mca_bml_base_prepare_dst(bml_btl, reg, - &recvreq->req_recv.req_convertor, MCA_BTL_NO_ORDER, - 0, &size, &dst); + &recvreq->req_recv.req_convertor, + MCA_BTL_NO_ORDER, 0, &size, &dst); if(dst == NULL) { continue; } @@ -751,6 +722,7 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec /* update request state */ recvreq->req_rdma_offset += size; OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,1); + recvreq->req_rdma[rdma_idx].length -= size; bytes_remaining -= size; } else { mca_bml_base_free(bml_btl,ctl); diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 0928e02..1c67d1d 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -48,11 +48,11 @@ struct mca_pml_ob1_recv_request_t { size_t req_bytes_delivered; size_t req_rdma_offset; size_t req_send_offset; - mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST]; uint32_t req_rdma_cnt; uint32_t req_rdma_idx; bool req_pending; bool req_ack_sent; /**< whether ack was sent to the sender */ + mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t; @@ -135,7 +135,7 @@ do { \ for( r = 0; r < recvreq->req_rdma_cnt; r++ ) { \ mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[r].btl_reg; \ - if( NULL != btl_reg ) { \ + if( NULL != btl_reg && btl_reg->mpool != NULL) { \ btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); \ } \ } \ diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 31549bc..94923f1 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -883,7 +883,10 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, { mca_pml_ob1_send_range_t *sr; ompi_free_list_item_t *i; - int rc = OMPI_SUCCESS; + mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + int rc = OMPI_SUCCESS, n; + double weight_total = 0; if(0 == send_length) return; @@ -894,6 +897,18 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, sr->range_send_offset = send_offset; sr->range_send_length = send_length; + sr->range_btl_idx = 0; + + for(n = 0; n < num_btls && n < mca_pml_ob1.max_send_per_range; n++) { + sr->range_btls[n].bml_btl = + mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send); + weight_total += sr->range_btls[n].bml_btl->btl_weight; + } + + sr->range_btl_cnt = n; + mca_pml_ob1_calc_weighted_length(sr->range_btls, n, send_length, + weight_total); + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr); OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); @@ -910,22 +925,19 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, int mca_pml_ob1_send_request_schedule_exclusive( mca_pml_ob1_send_request_t* sendreq) { - mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; - size_t num_btl_avail = - mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); do { - size_t prev_bytes_remaining = 0, num_fail = 0; + size_t prev_bytes_remaining = 0; mca_pml_ob1_send_range_t *range = NULL; + int num_fail = 0; while(true) { mca_pml_ob1_frag_hdr_t* hdr; mca_btl_base_descriptor_t* des; - int rc; + int rc, btl_idx; size_t size, offset; opal_list_item_t *item; - mca_bml_base_btl_t* bml_btl = - mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send); + mca_bml_base_btl_t* bml_btl; if(NULL == range || 0 == range->range_send_length) { OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); @@ -957,7 +969,7 @@ int mca_pml_ob1_send_request_schedule_exclusive( prev_bytes_remaining = range->range_send_length; - if (num_fail == num_btl_avail) { + if (num_fail == range->range_btl_cnt) { assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE); OPAL_THREAD_LOCK(&mca_pml_ob1.lock); sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_SCHEDULE; @@ -967,17 +979,13 @@ int mca_pml_ob1_send_request_schedule_exclusive( return OMPI_ERR_OUT_OF_RESOURCE; } - if(num_btl_avail == 1 || - range->range_send_length < bml_btl->btl_min_send_size) { - size = range->range_send_length; - } else { - /* otherwise attempt to give the BTL a percentage of the message - * based on a weighting factor. for simplicity calculate this as - * a percentage of the overall message length (regardless of - * amount previously assigned) - */ - size = (size_t)(bml_btl->btl_weight * range->range_send_length); - } + do { + btl_idx = range->range_btl_idx; + bml_btl = range->range_btls[btl_idx].bml_btl; + size = range->range_btls[btl_idx].length; + if(++range->range_btl_idx == range->range_btl_cnt) + range->range_btl_idx = 0; + } while(!size); /* makes sure that we don't exceed BTL max send size */ if (bml_btl->btl_max_send_size != 0 && @@ -1035,8 +1043,9 @@ int mca_pml_ob1_send_request_schedule_exclusive( rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); if(rc == OMPI_SUCCESS) { - range->range_send_length -= size; /* update state */ + range->range_btls[btl_idx].length -= size; + range->range_send_length -= size; range->range_send_offset += size; OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1); } else { diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index 30a7783..f1394c0 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -54,11 +54,11 @@ struct mca_pml_ob1_send_request_t { bool req_throttle_sends; size_t req_pipeline_depth; size_t req_bytes_delivered; - mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST]; uint32_t req_rdma_cnt; mca_pml_ob1_send_pending_t req_pending; opal_mutex_t req_send_range_lock; opal_list_t req_send_ranges; + mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; @@ -68,6 +68,9 @@ struct mca_pml_ob1_send_range_t { ompi_free_list_item_t base; uint64_t range_send_offset; uint64_t range_send_length; + int range_btl_idx; + int range_btl_cnt; + mca_pml_ob1_com_btl_t range_btls[1]; }; typedef struct mca_pml_ob1_send_range_t mca_pml_ob1_send_range_t; OBJ_CLASS_DECLARATION(mca_pml_ob1_send_range_t); @@ -122,7 +125,7 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s /* return mpool resources */ for(r = 0; r < sendreq->req_rdma_cnt; r++) { mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; - if( NULL != reg ) { + if( NULL != reg && reg->mpool != NULL ) { reg->mpool->mpool_deregister(reg->mpool, reg); } }