Index: ompi/mca/btl/openib/btl_openib_component.c =================================================================== --- ompi/mca/btl/openib/btl_openib_component.c (revision 18437) +++ ompi/mca/btl/openib/btl_openib_component.c (working copy) @@ -42,6 +42,7 @@ #include "opal/mca/carto/carto.h" #include "opal/mca/carto/base/base.h" #include "opal/mca/paffinity/base/base.h" +#include "opal/mca/installdirs/installdirs.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/proc_info.h" @@ -80,6 +81,11 @@ static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool); static int btl_openib_component_progress(void); +/* + * Local variables + */ +static mca_btl_openib_hca_t *receive_queues_hca = NULL; + mca_btl_openib_component_t mca_btl_openib_component = { { /* First, the mca_base_component_t struct containing meta information @@ -459,8 +465,6 @@ union ibv_gid gid; uint64_t subnet_id; - ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid); - /* If we have struct ibv_device.transport_type, then we're >= OFED v1.2, and the transport could be iWarp or IB. If we don't have that member, then we're < OFED v1.2, and it can only be IB. */ @@ -468,9 +472,11 @@ if (IBV_TRANSPORT_IWARP == hca->ib_dev->transport_type) { subnet_id = get_iwarp_subnet_id(hca->ib_dev); } else { + ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid); subnet_id = ntoh64(gid.global.subnet_prefix); } #else + ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid); subnet_id = ntoh64(gid.global.subnet_prefix); #endif @@ -664,8 +670,6 @@ static void hca_construct(mca_btl_openib_hca_t *hca) { - int i; - hca->ib_dev = NULL; hca->ib_dev_context = NULL; hca->ib_pd = NULL; @@ -687,13 +691,8 @@ #if HAVE_XRC hca->xrc_fd = -1; #endif - hca->qps = (mca_btl_openib_hca_qp_t*)calloc(mca_btl_openib_component.num_qps, - sizeof(mca_btl_openib_hca_qp_t)); + hca->qps = NULL; OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t); - for(i = 0; i < mca_btl_openib_component.num_qps; i++) { - OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t); - OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t); - } OBJ_CONSTRUCT(&hca->send_free_control, ompi_free_list_t); } @@ -709,13 +708,14 @@ free(hca->eager_rdma_buffers); } OBJ_DESTRUCT(&hca->hca_lock); - for(i = 0; i < mca_btl_openib_component.num_qps; i++) { - OBJ_DESTRUCT(&hca->qps[i].send_free); - OBJ_DESTRUCT(&hca->qps[i].recv_free); - } OBJ_DESTRUCT(&hca->send_free_control); - if(hca->qps) + if (hca->qps) { + for (i = 0; i < mca_btl_openib_component.num_qps; i++) { + OBJ_DESTRUCT(&hca->qps[i].send_free); + OBJ_DESTRUCT(&hca->qps[i].recv_free); + } free(hca->qps); + } } OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct, @@ -947,6 +947,9 @@ return num_ports; } +/* + * Prefer values that are already in the target + */ static void merge_values(ompi_btl_openib_ini_values_t *target, ompi_btl_openib_ini_values_t *src) { @@ -959,6 +962,12 @@ target->use_eager_rdma = src->use_eager_rdma; target->use_eager_rdma_set = true; } + + if (!target->receive_queues_set && src->receive_queues_set) { + free(target->receive_queues); + target->receive_queues = strdup(src->receive_queues); + target->receive_queues_set = true; + } } static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag) @@ -969,6 +978,14 @@ (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type); } +static int32_t atoi_param(char *param, int32_t dflt) +{ + if(NULL == param || '\0' == param[0]) + return dflt ? dflt : 1; + + return atoi(param); +} + static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid) { int index; @@ -985,6 +1002,203 @@ } } +static int setup_qps(void) +{ + char **queues, **params = NULL; + int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0; + uint32_t max_qp_size, max_size_needed; + int32_t min_freelist_size = 0; + int smallest_pp_qp = 0, ret = OMPI_ERROR; + + queues = opal_argv_split(mca_btl_openib_component.receive_queues, ':'); + + if (0 == opal_argv_count(queues)) { + orte_show_help("help-mpi-btl-openib.txt", + "no qps in receive_queues", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + return OMPI_ERROR; + } + + while (queues[qp] != NULL) { + if (0 == strncmp("P,", queues[qp], 2)) { + num_pp_qps++; + if(smallest_pp_qp > qp) + smallest_pp_qp = qp; + } else if (0 == strncmp("S,", queues[qp], 2)) { + num_srq_qps++; + } else if (0 == strncmp("X,", queues[qp], 2)) { +#if HAVE_XRC + num_xrc_qps++; +#else + orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + goto error; +#endif + } else { + orte_show_help("help-mpi-btl-openib.txt", + "invalid qp type in receive_queues", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues, + queues[qp]); + goto error; + } + qp++; + } + /* Current XRC implementation can't used with other QP types - PP + and SRQ */ + if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) { + orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + goto error; + } + + /* Current XRC implementation can't used with btls_per_lid > 1 */ + if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) { + orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", + true, orte_process_info.nodename, + mca_btl_openib_component.receive_queues, num_xrc_qps); + goto error; + } + mca_btl_openib_component.num_pp_qps = num_pp_qps; + mca_btl_openib_component.num_srq_qps = num_srq_qps; + mca_btl_openib_component.num_xrc_qps = num_xrc_qps; + mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps; + + mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*) + malloc(sizeof(mca_btl_openib_qp_info_t) * + mca_btl_openib_component.num_qps); + + qp = 0; +#define P(N) (((N) > count)?NULL:params[(N)]) + while(queues[qp] != NULL) { + int i = 0, count; + int32_t rd_low, rd_num; + params = opal_argv_split_with_empty(queues[qp], ','); + count = opal_argv_count(params); + + if ('P' == params[0][0]) { + int32_t rd_win, rd_rsv; + if (count < 3 || count > 6) { + orte_show_help("help-mpi-btl-openib.txt", + "invalid pp qp specification", true, + orte_process_info.nodename, queues[qp]); + goto error; + } + mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP; + mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); + rd_num = atoi_param(P(2), 256); + /* by default set rd_low to be 3/4 of rd_num */ + rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); + rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); + rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); + + BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d", + rd_num, rd_low, rd_win, rd_rsv)); + + /* Calculate the smallest freelist size that can be allowed */ + if (rd_num + rd_rsv > min_freelist_size) + min_freelist_size = rd_num + rd_rsv; + + mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; + mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; + if((rd_num - rd_low) > rd_win) + orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", + true, rd_win, rd_num - rd_low); + } else { + int32_t sd_max; + if(count < 3 || count > 5) { + orte_show_help("help-mpi-btl-openib.txt", + "invalid srq specification", true, + orte_process_info.nodename, queues[qp]); + goto error; + } + mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ? + MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP; + mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); + rd_num = atoi_param(P(2), 256); + /* by default set rd_low to be 3/4 of rd_num */ + rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); + sd_max = atoi_param(P(4), rd_low / 4); + BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d", + rd_num, rd_low, sd_max)); + + /* Calculate the smallest freelist size that can be allowed */ + if (rd_num > min_freelist_size) + min_freelist_size = rd_num; + + mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; + } + + if (rd_num <= rd_low) { + orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", + true, orte_process_info.nodename, queues[qp]); + goto error; + } + mca_btl_openib_component.qp_infos[qp].rd_num = rd_num; + mca_btl_openib_component.qp_infos[qp].rd_low = rd_low; + while (NULL != params[i]) { + free(params[i++]); + } + free(params); + qp++; + } + params = NULL; + + /* Sanity check some sizes */ + + max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size; + max_size_needed = (mca_btl_openib_module.super.btl_eager_limit > + mca_btl_openib_module.super.btl_max_send_size) ? + mca_btl_openib_module.super.btl_eager_limit : + mca_btl_openib_module.super.btl_max_send_size; + if (max_qp_size < max_size_needed) { + orte_show_help("help-mpi-btl-openib.txt", + "biggest qp size is too small", true, + orte_process_info.nodename, max_qp_size, + max_size_needed); + ret = OMPI_ERROR; + goto error; + } else if (max_qp_size > max_size_needed) { + orte_show_help("help-mpi-btl-openib.txt", + "biggest qp size is too big", true, + orte_process_info.nodename, max_qp_size, + max_size_needed); + } + + if (mca_btl_openib_component.ib_free_list_max > 0 && + min_freelist_size > mca_btl_openib_component.ib_free_list_max) { + orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true, + orte_process_info.nodename, + mca_btl_openib_component.ib_free_list_max, + min_freelist_size); + goto error; + } + + mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1; + mca_btl_openib_component.credits_qp = smallest_pp_qp; + + ret = OMPI_SUCCESS; +error: + if(params) { + qp = 0; + while(params[qp] != NULL) + free(params[qp++]); + free(params); + } + + if(queues) { + qp = 0; + while(queues[qp] != NULL) + free(queues[qp++]); + free(queues); + } + + return ret; +} + static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) { struct mca_mpool_base_resources_t mpool_resources; @@ -1024,24 +1238,11 @@ allowed_ports = (int*)malloc(hca->ib_dev_attr.phys_port_cnt * sizeof(int)); port_cnt = get_port_list(hca, allowed_ports); if(0 == port_cnt) { + free(allowed_ports); ret = OMPI_SUCCESS; goto error; } -#if HAVE_XRC - /* if user configured to run with XRC qp and the device don't support it - - * we should ignore this hca. Maybe we have other one that have XRC support - */ - if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) && - mca_btl_openib_component.num_xrc_qps > 0) { - orte_show_help("help-mpi-btl-openib.txt", - "XRC on device without XRC support", true, - mca_btl_openib_component.num_xrc_qps, - ibv_get_device_name(ib_dev), - orte_process_info.nodename); - ret = OMPI_SUCCESS; - goto error; - } -#endif + /* Load in vendor/part-specific HCA parameters. Note that even if we don't find values for this vendor/part, "values" will be set indicating that it does not have good values */ @@ -1101,16 +1302,72 @@ hca->mtu = mca_btl_openib_component.ib_mtu; } + /* If the user specified btl_openib_receive_queues MCA param, it + overrides all HCA INI params */ + if (BTL_OPENIB_RQ_SOURCE_MCA != + mca_btl_openib_component.receive_queues_source && + values.receive_queues_set) { + /* If a prior HCA's INI values set a different value for + receive_queues, this is unsupported (see + https://svn.open-mpi.org/trac/ompi/ticket/1285) */ + if (BTL_OPENIB_RQ_SOURCE_HCA_INI == + mca_btl_openib_component.receive_queues_source) { + if (0 != strcmp(values.receive_queues, + mca_btl_openib_component.receive_queues)) { + orte_show_help("help-mpi-btl-openib.txt", + "conflicting receive_queues", true, + orte_process_info.nodename, + ibv_get_device_name(hca->ib_dev), + hca->ib_dev_attr.vendor_id, + hca->ib_dev_attr.vendor_part_id, + values.receive_queues, + ibv_get_device_name(receive_queues_hca->ib_dev), + receive_queues_hca->ib_dev_attr.vendor_id, + receive_queues_hca->ib_dev_attr.vendor_part_id, + mca_btl_openib_component.receive_queues, + opal_install_dirs.pkgdatadir); + ret = OMPI_ERR_RESOURCE_BUSY; + goto error; + } + } else { + if (NULL != mca_btl_openib_component.receive_queues) { + free(mca_btl_openib_component.receive_queues); + } + receive_queues_hca = hca; + mca_btl_openib_component.receive_queues = + strdup(values.receive_queues); + mca_btl_openib_component.receive_queues_source = + BTL_OPENIB_RQ_SOURCE_HCA_INI; + } + } + /* If "use eager rdma" was set, then enable it on this HCA */ if (values.use_eager_rdma_set) { hca->use_eager_rdma = values.use_eager_rdma; } +#if HAVE_XRC + /* if user configured to run with XRC qp and the device doesn't + * support it - we should ignore this hca. Maybe we have another + * one that has XRC support + */ + if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) && + mca_btl_openib_component.num_xrc_qps > 0) { + orte_show_help("help-mpi-btl-openib.txt", + "XRC on device without XRC support", true, + mca_btl_openib_component.num_xrc_qps, + ibv_get_device_name(hca->ib_dev), + orte_process_info.nodename); + ret = OMPI_SUCCESS; + goto error; + } +#endif + /* Allocate the protection domain for the HCA */ hca->ib_pd = ibv_alloc_pd(hca->ib_dev_context); if(NULL == hca->ib_pd){ BTL_ERROR(("error allocating protection domain for %s errno says %s", - ibv_get_device_name(ib_dev), strerror(errno))); + ibv_get_device_name(hca->ib_dev), strerror(errno))); goto error; } @@ -1132,7 +1389,7 @@ hca, &mpool_resources); if(NULL == hca->mpool){ BTL_ERROR(("error creating IB memory pool for %s errno says %s", - ibv_get_device_name(ib_dev), strerror(errno))); + ibv_get_device_name(hca->ib_dev), strerror(errno))); goto error; } @@ -1155,7 +1412,7 @@ if(ibv_query_port(hca->ib_dev_context, i, &ib_port_attr)){ BTL_ERROR(("error getting port attributes for device %s " "port number %d errno says %s", - ibv_get_device_name(ib_dev), i, strerror(errno))); + ibv_get_device_name(hca->ib_dev), i, strerror(errno))); break; } if(IBV_PORT_ACTIVE == ib_port_attr.state) { @@ -1180,25 +1437,25 @@ if (OMPI_SUCCESS != ret) { /* Out of bounds error indicates that we hit max btl number * don't propagate the error to the caller */ - if(OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { ret = OMPI_SUCCESS; + } break; } } } + free(allowed_ports); - /* If we made a BTL, we're done. Otherwise, fall through and - destroy everything */ + /* If we made a BTL, check APM status and return. Otherwise, fall + through and destroy everything */ if (hca->btls > 0) { /* if apm was enabled it should be > 1 */ if (1 == mca_btl_openib_component.apm_ports) { - orte_show_help("help-mpi-btl-openib.txt", "apm not enough ports", true); + orte_show_help("help-mpi-btl-openib.txt", + "apm not enough ports", true); mca_btl_openib_component.apm_ports = 0; } - ret = prepare_hca_for_use(hca); - if(OMPI_SUCCESS == ret) { - return OMPI_SUCCESS; - } + return OMPI_SUCCESS; } error: @@ -1556,31 +1813,30 @@ dev_sorted = sort_devs_by_distance(ib_devs, num_devs); - /* We must loop through all the hca id's, get their handles and - for each hca we query the number of ports on the hca and set up - a distinct btl module for each hca port */ - OBJ_CONSTRUCT(&btl_list, opal_list_t); OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t); #if OMPI_HAVE_THREADS mca_btl_openib_component.async_thread = 0; #endif - for(i = 0; i < num_devs && (-1 == mca_btl_openib_component.ib_max_btls || + for (i = 0; i < num_devs && (-1 == mca_btl_openib_component.ib_max_btls || mca_btl_openib_component.ib_num_btls < mca_btl_openib_component.ib_max_btls); i++) { - if(0 == mca_btl_openib_component.ib_num_btls) + if (0 == mca_btl_openib_component.ib_num_btls) { distance = dev_sorted[i].distance; - else if(distance != dev_sorted[i].distance) + } else if (distance != dev_sorted[i].distance) { break; + } - if(OMPI_SUCCESS != - (ret = init_one_hca(&btl_list, dev_sorted[i].ib_dev))) + if (OMPI_SUCCESS != + (ret = init_one_hca(&btl_list, dev_sorted[i].ib_dev))) { break; + } } - if(ret != OMPI_SUCCESS) { + if (ret != OMPI_SUCCESS) { orte_show_help("help-mpi-btl-openib.txt", "error in hca init", true, orte_process_info.nodename); + return NULL; } free(dev_sorted); @@ -1606,6 +1862,42 @@ return NULL; } + /* Setup the BSRQ QP's based on the final value of + mca_btl_openib_component.receive_queues. */ + setup_qps(); + + /* Loop through all the btl modules that we made and find every + base HCA that doesn't have hca->qps setup on it yet (remember + that some modules may share the same HCA, so when going through + to loop, we may hit an HCA that was already setup earlier in + the loop). */ + for (item = opal_list_get_first(&btl_list); + opal_list_get_end(&btl_list) != item; + item = opal_list_get_next(item)) { + mca_btl_base_selected_module_t *m = + (mca_btl_base_selected_module_t*) item; + mca_btl_openib_hca_t *hca = + ((mca_btl_openib_module_t*) m->btl_module)->hca; + if (NULL == hca->qps) { + + /* Setup the HCA qps info */ + hca->qps = (mca_btl_openib_hca_qp_t*) + calloc(mca_btl_openib_component.num_qps, + sizeof(mca_btl_openib_hca_qp_t)); + for (i = 0; i < mca_btl_openib_component.num_qps; i++) { + OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t); + OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t); + } + + /* Do finial init on HCA */ + ret = prepare_hca_for_use(hca); + if (OMPI_SUCCESS != ret) { + /* JMS */ + return NULL; + } + } + } + /* Allocate space for btl modules */ mca_btl_openib_component.openib_btls = malloc(sizeof(mca_btl_openib_module_t*) * Index: ompi/mca/btl/openib/help-mpi-btl-openib.txt =================================================================== --- ompi/mca/btl/openib/help-mpi-btl-openib.txt (revision 18437) +++ ompi/mca/btl/openib/help-mpi-btl-openib.txt (working copy) @@ -448,3 +448,20 @@ [apm not enough ports] WARNING: For APM over ports ompi require at least 2 active ports and only single active port was found. Disabling APM over ports +# +[conflicting receive_queues] +Open MPI detected two different sets of OpenFabrics receives queues on +the same host (in the openib BTL). Open MPI currently only supports +one set of OF receive queues in an MPI job, even if you have different +types of OpenFabrics adapters on the same host. + +Host: %s +Adapter 1: %s (vendor 0x%x, part ID %d) +Queues: %s +Adapter 2: %s (vendor 0x%x, part ID %d) +Queues: %s + +Note that these receive queues values may have come from the Open MPI +adapter default settings file: + + %s/mca-btl-openib-hca-params.ini Index: ompi/mca/btl/openib/btl_openib_ini.c =================================================================== --- ompi/mca/btl/openib/btl_openib_ini.c (revision 18437) +++ ompi/mca/btl/openib/btl_openib_ini.c (working copy) @@ -388,6 +388,11 @@ sv->values.use_eager_rdma_set = true; } + else if (0 == strcasecmp(key_buffer, "receive_queues")) { + sv->values.receive_queues = strdup(value); + sv->values.receive_queues_set = true; + } + else { /* Have no idea what this parameter is. Not an error -- just ignore it */ @@ -469,6 +474,9 @@ v->use_eager_rdma = 0; v->use_eager_rdma_set = false; + + v->receive_queues = NULL; + v->receive_queues_set = false; } Index: ompi/mca/btl/openib/btl_openib_ini.h =================================================================== --- ompi/mca/btl/openib/btl_openib_ini.h (revision 18437) +++ ompi/mca/btl/openib/btl_openib_ini.h (working copy) @@ -25,6 +25,9 @@ uint32_t use_eager_rdma; bool use_eager_rdma_set; + + char *receive_queues; + bool receive_queues_set; } ompi_btl_openib_ini_values_t; Index: ompi/mca/btl/openib/btl_openib.h =================================================================== --- ompi/mca/btl/openib/btl_openib.h (revision 18437) +++ ompi/mca/btl/openib/btl_openib.h (working copy) @@ -110,6 +110,13 @@ #define BTL_OPENIB_QP_TYPE_XRC(Q) \ (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP) +typedef enum { + BTL_OPENIB_RQ_SOURCE_DEFAULT, + BTL_OPENIB_RQ_SOURCE_MCA, + BTL_OPENIB_RQ_SOURCE_HCA_INI, + BTL_OPENIB_RQ_SOURCE_HCA_MAX +} btl_openib_receive_queues_source_t; + struct mca_btl_openib_component_t { mca_btl_base_component_1_0_1_t super; /**< base BTL component */ @@ -197,6 +204,11 @@ char *if_exclude; char **if_exclude_list; + /* MCA param btl_openib_receive_queues */ + char *receive_queues; + /* Whether we got a non-default value of btl_openib_receive_queues */ + btl_openib_receive_queues_source_t receive_queues_source; + /** Colon-delimited list of filenames for HCA parameters */ char *hca_params_file_names; Index: ompi/mca/btl/openib/btl_openib_mca.c =================================================================== --- ompi/mca/btl/openib/btl_openib_mca.c (revision 18437) +++ ompi/mca/btl/openib/btl_openib_mca.c (working copy) @@ -52,9 +52,7 @@ REGSTR_MAX = 0x88 }; -static int mca_btl_openib_mca_setup_qps(void); - /* * utility routine for string parameter registration */ @@ -109,6 +107,9 @@ */ int btl_openib_register_mca_params(void) { + char default_qps[100]; + uint32_t mid_qp_size; + int i; char *msg, *str; int ival, ival2, ret, tmp; @@ -485,246 +486,49 @@ &mca_btl_openib_module.super)); /* setup all the qp stuff */ - CHECK(mca_btl_openib_mca_setup_qps()); - - CHECK(reg_string("if_include", - "Comma-delimited list of HCAs/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with btl_openib_if_exclude.", - NULL, &mca_btl_openib_component.if_include, - 0)); - - CHECK(reg_string("if_exclude", - "Comma-delimited list of HCAs/ports to be excluded (empty value means to not exclude any ports). Mutually exclusive with btl_openib_if_include.", - NULL, &mca_btl_openib_component.if_exclude, - 0)); - - return ret; -} - -static int32_t atoi_param(char *param, int32_t dflt) -{ - if(NULL == param || '\0' == param[0]) - return dflt ? dflt : 1; - - return atoi(param); -} - -static int mca_btl_openib_mca_setup_qps(void) -{ - /* All the multi-qp stuff.. */ - char *str; - char **queues, **params = NULL; - int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0; - char default_qps[100]; - uint32_t max_qp_size, max_size_needed; - int32_t min_freelist_size = 0; - int smallest_pp_qp = 0, ret = OMPI_ERROR, i; - uint32_t mid_qp_size; - mid_qp_size = mca_btl_openib_module.super.btl_eager_limit / 4; /* round mid_qp_size to smallest power of two */ for(i = 31; i > 0; i--) { - if(!(mid_qp_size & (1< qp) - smallest_pp_qp = qp; - } else if (0 == strncmp("S,", queues[qp], 2)) { - num_srq_qps++; - } else if (0 == strncmp("X,", queues[qp], 2)) { -#if HAVE_XRC - num_xrc_qps++; -#else - orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true, - orte_process_info.nodename, str); - goto error; -#endif - } else { - orte_show_help("help-mpi-btl-openib.txt", - "invalid qp type in receive_queues", true, - orte_process_info.nodename, str, queues[qp]); - goto error; - } - qp++; - } - /* Current XRC implementation can't used with other QP types - PP and SRQ */ - if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) { - orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true, - orte_process_info.nodename, str); - goto error; - } + CHECK(reg_string("if_exclude", + "Comma-delimited list of HCAs/ports to be excluded (empty value means to not exclude any ports). Mutually exclusive with btl_openib_if_include.", + NULL, &mca_btl_openib_component.if_exclude, + 0)); - /* Current XRC implementation can't used with btls_per_lid > 1 */ - if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) { - orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", true, - orte_process_info.nodename, str, num_xrc_qps); - goto error; - } - mca_btl_openib_component.num_pp_qps = num_pp_qps; - mca_btl_openib_component.num_srq_qps = num_srq_qps; - mca_btl_openib_component.num_xrc_qps = num_xrc_qps; - mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps; - - mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*) - malloc(sizeof(mca_btl_openib_qp_info_t) * - mca_btl_openib_component.num_qps); - - qp = 0; -#define P(N) (((N) > count)?NULL:params[(N)]) - while(queues[qp] != NULL) { - int i = 0, count; - int32_t rd_low, rd_num; - params = opal_argv_split_with_empty(queues[qp], ','); - count = opal_argv_count(params); - - if ('P' == params[0][0]) { - int32_t rd_win, rd_rsv; - if (count < 3 || count > 6) { - orte_show_help("help-mpi-btl-openib.txt", - "invalid pp qp specification", true, - orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP; - mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); - rd_num = atoi_param(P(2), 256); - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); - rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); - - BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d", - rd_num, rd_low, rd_win, rd_rsv)); - - /* Calculate the smallest freelist size that can be allowed */ - if (rd_num + rd_rsv > min_freelist_size) - min_freelist_size = rd_num + rd_rsv; - - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; - if((rd_num - rd_low) > rd_win) - orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", - true, rd_win, rd_num - rd_low); - } else { - int32_t sd_max; - if(count < 3 || count > 5) { - orte_show_help("help-mpi-btl-openib.txt", - "invalid srq specification", true, - orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ? - MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP; - mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); - rd_num = atoi_param(P(2), 256); - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - sd_max = atoi_param(P(4), rd_low / 4); - BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d", - rd_num, rd_low, sd_max)); - - /* Calculate the smallest freelist size that can be allowed */ - if (rd_num > min_freelist_size) - min_freelist_size = rd_num; - - mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; - } - - if (rd_num <= rd_low) { - orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", - true, orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].rd_num = rd_num; - mca_btl_openib_component.qp_infos[qp].rd_low = rd_low; - while (NULL != params[i]) { - free(params[i++]); - } - free(params); - qp++; - } - params = NULL; - - /* Sanity check some sizes */ - - max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size; - max_size_needed = (mca_btl_openib_module.super.btl_eager_limit > - mca_btl_openib_module.super.btl_max_send_size) ? - mca_btl_openib_module.super.btl_eager_limit : - mca_btl_openib_module.super.btl_max_send_size; - if (max_qp_size < max_size_needed) { - orte_show_help("help-mpi-btl-openib.txt", - "biggest qp size is too small", true, - orte_process_info.nodename, max_qp_size, - max_size_needed); - ret = OMPI_ERROR; - goto error; - } else if (max_qp_size > max_size_needed) { - orte_show_help("help-mpi-btl-openib.txt", - "biggest qp size is too big", true, - orte_process_info.nodename, max_qp_size, - max_size_needed); - orte_output(0, "The biggest QP size is bigger than maximum send size. " - "This is not optimal configuration as memory will be wasted."); - } - - if (mca_btl_openib_component.ib_free_list_max > 0 && - min_freelist_size > mca_btl_openib_component.ib_free_list_max) { - orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true, - orte_process_info.nodename, - mca_btl_openib_component.ib_free_list_max, - min_freelist_size); - goto error; - } - - mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1; - mca_btl_openib_component.credits_qp = smallest_pp_qp; - /* Register any MCA params for the connect pseudo-components */ - if (OMPI_SUCCESS != ompi_btl_openib_connect_base_register()) - goto error; - - ret = OMPI_SUCCESS; -error: - if(params) { - qp = 0; - while(params[qp] != NULL) - free(params[qp++]); - free(params); + if (OMPI_SUCCESS == ret) { + ret = ompi_btl_openib_connect_base_register(); } - if(queues) { - qp = 0; - while(queues[qp] != NULL) - free(queues[qp++]); - free(queues); - } - return ret; } Index: ompi/mca/btl/openib/mca-btl-openib-hca-params.ini =================================================================== --- ompi/mca/btl/openib/mca-btl-openib-hca-params.ini (revision 18437) +++ ompi/mca/btl/openib/mca-btl-openib-hca-params.ini (working copy) @@ -139,3 +139,4 @@ vendor_part_id = 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0030,0x0031,0x0032 use_eager_rdma = 1 mtu = 2048 +receive_queues = P,65536,256,192,128