Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [OMPI svn-full] svn:open-mpi r22313
From: Jeff Squyres (jsquyres_at_[hidden])
Date: 2009-12-15 20:56:39


Hmm. I'm a little disappointed that this was applied without answering my questions first...

    http://www.open-mpi.org/community/lists/devel/2009/12/7187.php

Can you at least answer my questions after this fact?

On Dec 15, 2009, at 10:52 AM, <vasily_at_[hidden]> <vasily_at_[hidden]> wrote:

> Author: vasily
> Date: 2009-12-15 10:52:10 EST (Tue, 15 Dec 2009)
> New Revision: 22313
> URL: https://svn.open-mpi.org/trac/ompi/changeset/22313
>
> Log:
> Adding support for on-demand SRQ pre-post (receive wqe allocation)
>
>
> Text files modified:
> trunk/ompi/mca/btl/openib/btl_openib.c | 19 +++++++++++
> trunk/ompi/mca/btl/openib/btl_openib.h | 18 ++++++++++
> trunk/ompi/mca/btl/openib/btl_openib_async.c | 57 ++++++++++++++++++++++++++++++++-
> trunk/ompi/mca/btl/openib/btl_openib_component.c | 67 +++++++++++++++++++++++++++++++++++----
> trunk/ompi/mca/btl/openib/btl_openib_mca.c | 5 ++
> trunk/ompi/mca/btl/openib/help-mpi-btl-openib.txt | 35 +++++++++++++++++++-
> 6 files changed, 189 insertions(+), 12 deletions(-)
>
> Modified: trunk/ompi/mca/btl/openib/btl_openib.c
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/btl_openib.c (original)
> +++ trunk/ompi/mca/btl/openib/btl_openib.c 2009-12-15 10:52:10 EST (Tue, 15 Dec 2009)
> @@ -223,6 +223,7 @@
> static int create_srq(mca_btl_openib_module_t *openib_btl)
> {
> int qp;
> + int32_t rd_num, rd_curr_num;
>
> /* create the SRQ's */
> for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
> @@ -251,6 +252,24 @@
> ibv_get_device_name(openib_btl->device->ib_dev));
> return OMPI_ERROR;
> }
> +
> + rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
> + rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init;
> +
> + if(true == mca_btl_openib_component.enable_srq_resize) {
> + if(0 == rd_curr_num) {
> + openib_btl->qps[qp].u.srq_qp.rd_curr_num = 1;
> + }
> +
> + openib_btl->qps[qp].u.srq_qp.rd_low_local = rd_curr_num - (rd_curr_num >> 2);
> + openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
> + } else {
> + openib_btl->qps[qp].u.srq_qp.rd_curr_num = rd_num;
> + openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;
> + /* Not used in this case, but we don't want to leave garbage in it */
> + mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = 0;
> + openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
> + }
> }
> }
>
>
> Modified: trunk/ompi/mca/btl/openib/btl_openib.h
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/btl_openib.h (original)
> +++ trunk/ompi/mca/btl/openib/btl_openib.h 2009-12-15 10:52:10 EST (Tue, 15 Dec 2009)
> @@ -96,6 +96,12 @@
>
> struct mca_btl_openib_srq_qp_info_t {
> int32_t sd_max;
> + /* The initial value for the rd_curr_num variables of all SRQs */
> + int32_t rd_init;
> + /* The watermark threshold - if the number of WQEs in the SRQ drops below this value,
> + the SRQ limit event (IBV_EVENT_SRQ_LIMIT_REACHED) will be generated on the corresponding SRQ.
> + As a result, the maximal number of pre-posted WQEs on the SRQ will be increased */
> + int32_t srq_limit;
> }; typedef struct mca_btl_openib_srq_qp_info_t mca_btl_openib_srq_qp_info_t;
>
> struct mca_btl_openib_qp_info_t {
> @@ -263,6 +269,8 @@
> ompi_free_list_t send_free_coalesced;
> /** Default receive queues */
> char* default_recv_qps;
> + /** Whether we want a dynamically resizing srq, enabled by default */
> + bool enable_srq_resize;
> }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
>
> OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
> @@ -363,6 +371,16 @@
> int32_t sd_credits; /* the max number of outstanding sends on a QP when using SRQ */
> /* i.e. the number of frags that can be outstanding (down counter) */
> opal_list_t pending_frags[2]; /**< list of high/low prio frags */
> + /** The number of receive buffers that can be posted at the current time.
> + The value may be increased in the IBV_EVENT_SRQ_LIMIT_REACHED
> + event handler. The value starts from (rd_num / 4) and is increased up to rd_num */
> + int32_t rd_curr_num;
> + /** We post additional WQEs only if the number of WQEs (in the specific SRQ) is less than this value.
> + The value is increased together with rd_curr_num. The value is unique for every SRQ. */
> + int32_t rd_low_local;
> + /** The flag indicates whether we want to receive the
> + IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing the SRQ */
> + bool srq_limit_event_flag;
> }; typedef struct mca_btl_openib_module_srq_qp_t mca_btl_openib_module_srq_qp_t;
>
> struct mca_btl_openib_module_qp_t {
>
> Modified: trunk/ompi/mca/btl/openib/btl_openib_async.c
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/btl_openib_async.c (original)
> +++ trunk/ompi/mca/btl/openib/btl_openib_async.c 2009-12-15 10:52:10 EST (Tue, 15 Dec 2009)
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2008 Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
> * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
> * Copyright (c) 2006-2007 Voltaire All rights reserved.
> * $COPYRIGHT$
> @@ -226,10 +226,53 @@
> return OMPI_SUCCESS;
> }
>
> +/* The main idea of the SRQ resizing algorithm -
> + We create an SRQ with size = rd_num, but for efficient usage of resources
> + the number of WQEs that we post is rd_curr_num < rd_num, and this value is
> + increased (as needed) in the IBV_EVENT_SRQ_LIMIT_REACHED event handler (i.e. in this function);
> + the event will be thrown by the device if the number of WQEs in the SRQ drops below srq_limit */
> +static int btl_openib_async_srq_limit_event(struct ibv_srq* srq,
> + mca_btl_openib_module_t *openib_btl)
> +{
> + int qp;
> +
> + for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
> + if (!BTL_OPENIB_QP_TYPE_PP(qp)) {
> + if(openib_btl->qps[qp].u.srq_qp.srq == srq) {
> + break;
> + }
> + }
> + }
> +
> + if(qp >= mca_btl_openib_component.num_qps) {
> + orte_show_help("help-mpi-btl-openib.txt", "SRQ doesn't found",
> + true,orte_process_info.nodename,
> + ibv_get_device_name(openib_btl->device->ib_dev));
> + return OMPI_ERROR;
> + }
> +
> + /* dynamically re-size the SRQ to be larger */
> + openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1;
> +
> + if(openib_btl->qps[qp].u.srq_qp.rd_curr_num >= mca_btl_openib_component.qp_infos[qp].rd_num) {
> + openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].rd_num;
> + openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;
> +
> + openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
> +
> + return OMPI_SUCCESS;
> + }
> +
> + openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1;
> + openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
> +
> + return OMPI_SUCCESS;
> +}
> +
> /* Function handle async device events */
> static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index)
> {
> - int j;
> + int j, btl_index = 0;
> mca_btl_openib_device_t *device = NULL;
> struct ibv_async_event event;
> bool xrc_event = false;
> @@ -240,6 +283,8 @@
> if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
> devices_poll->async_pollfd[index].fd ) {
> device = mca_btl_openib_component.openib_btls[j]->device;
> + btl_index = j;
> +
> break;
> }
> }
> @@ -306,7 +351,15 @@
> #if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
> case IBV_EVENT_CLIENT_REREGISTER:
> #endif
> + break;
> + /* The event is signaled when the number of pre-posted receive WQEs goes
> + under the predefined threshold - srq_limit */
> case IBV_EVENT_SRQ_LIMIT_REACHED:
> + if(OMPI_SUCCESS != btl_openib_async_srq_limit_event(event.element.srq,
> + mca_btl_openib_component.openib_btls[btl_index])) {
> + return OMPI_ERROR;
> + }
> +
> break;
> default:
> orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
>
> Modified: trunk/ompi/mca/btl/openib/btl_openib_component.c
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/btl_openib_component.c (original)
> +++ trunk/ompi/mca/btl/openib/btl_openib_component.c 2009-12-15 10:52:10 EST (Tue, 15 Dec 2009)
> @@ -1376,8 +1376,8 @@
> true, rd_win, rd_num - rd_low);
> }
> } else {
> - int32_t sd_max;
> - if (count < 3 || count > 5) {
> + int32_t sd_max, rd_init, srq_limit;
> + if (count < 3 || count > 7) {
> orte_show_help("help-mpi-btl-openib.txt",
> "invalid srq specification", true,
> orte_process_info.nodename, queues[qp]);
> @@ -1391,15 +1391,47 @@
> /* by default set rd_low to be 3/4 of rd_num */
> rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
> sd_max = atoi_param(P(4), rd_low / 4);
> - BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
> - rd_num, rd_low, sd_max));
> + /* rd_init is the initial value for rd_curr_num of all SRQs, 1/4 of rd_num by default */
> + rd_init = atoi_param(P(5), rd_num / 4);
> + /* by default set srq_limit to be 3/16 of rd_init (it's 1/4 of rd_low_local,
> + the value of rd_low_local we calculate in create_srq function) */
> + srq_limit = atoi_param(P(6), (rd_init - (rd_init / 4)) / 4);
> +
> + /* If we set srq_limit greater than rd_init
> + (the initial value for rd_curr_num), we receive the IBV_EVENT_SRQ_LIMIT_REACHED
> + event immediately and the value of rd_curr_num will be increased */
> +
> + /* If we set srq_limit to zero, but size of SRQ greater than 1 and
> + it is not a user request (param number 6 in --mca btl_openib_receive_queues) => set it to be 1 */
> + if((0 == srq_limit) && (1 < rd_num) && (0 != P(6))) {
> + srq_limit = 1;
> + }
> +
> + BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d rd_init is %d srq_limit is %d",
> + rd_num, rd_low, sd_max, rd_init, srq_limit));
>
> /* Calculate the smallest freelist size that can be allowed */
> if (rd_num > min_freelist_size) {
> min_freelist_size = rd_num;
> }
>
> + if (rd_num < rd_init) {
> + orte_show_help("help-mpi-btl-openib.txt", "rd_num must be >= rd_init",
> + true, orte_process_info.nodename, queues[qp]);
> + ret = OMPI_ERR_BAD_PARAM;
> + goto error;
> + }
> +
> + if (rd_num < srq_limit) {
> + orte_show_help("help-mpi-btl-openib.txt", "srq_limit must be > rd_num",
> + true, orte_process_info.nodename, queues[qp]);
> + ret = OMPI_ERR_BAD_PARAM;
> + goto error;
> + }
> +
> mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
> + mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init = rd_init;
> + mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = srq_limit;
> }
>
> if (rd_num <= rd_low) {
> @@ -3200,19 +3232,19 @@
>
> int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
> {
> - int rd_low = mca_btl_openib_component.qp_infos[qp].rd_low;
> - int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
> + int rd_low_local = openib_btl->qps[qp].u.srq_qp.rd_low_local;
> + int rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
> int num_post, i, rc;
> struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL;
>
> assert(!BTL_OPENIB_QP_TYPE_PP(qp));
>
> OPAL_THREAD_LOCK(&openib_btl->ib_lock);
> - if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low) {
> + if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low_local) {
> OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
> return OMPI_SUCCESS;
> }
> - num_post = rd_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
> + num_post = rd_curr_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
>
> for(i = 0; i < num_post; i++) {
> ompi_free_list_item_t* item;
> @@ -3229,7 +3261,26 @@
>
> rc = ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, wr_list, &bad_wr);
> if(OPAL_LIKELY(0 == rc)) {
> + struct ibv_srq_attr srq_attr;
> +
> OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);
> +
> + if(true == openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag) {
> + srq_attr.max_wr = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
> + srq_attr.max_sge = 1;
> + srq_attr.srq_limit = mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit;
> +
> + openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
> + if(ibv_modify_srq(openib_btl->qps[qp].u.srq_qp.srq, &srq_attr, IBV_SRQ_LIMIT)) {
> + BTL_ERROR(("Failed to request limit event for srq on %s. "
> + "Fatal error, stopping async event thread",
> + ibv_get_device_name(openib_btl->device->ib_dev)));
> +
> + OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
> + return OMPI_ERROR;
> + }
> + }
> +
> OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
> return OMPI_SUCCESS;
> }
>
> Modified: trunk/ompi/mca/btl/openib/btl_openib_mca.c
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/btl_openib_mca.c (original)
> +++ trunk/ompi/mca/btl/openib/btl_openib_mca.c 2009-12-15 10:52:10 EST (Tue, 15 Dec 2009)
> @@ -163,6 +163,11 @@
> 1, &ival, 0));
> mca_btl_openib_component.warn_nonexistent_if = (0 != ival);
>
> + CHECK(reg_int("enable_srq_resize", NULL,
> + "Enable/Disable on demand SRQ resize. "
> + "(0 = without resizing, nonzero = with resizing)", 1, &ival, 0));
> + mca_btl_openib_component.enable_srq_resize = (0 != ival);
> +
> if (OMPI_HAVE_IBV_FORK_INIT) {
> ival2 = -1;
> } else {
>
> Modified: trunk/ompi/mca/btl/openib/help-mpi-btl-openib.txt
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/help-mpi-btl-openib.txt (original)
> +++ trunk/ompi/mca/btl/openib/help-mpi-btl-openib.txt 2009-12-15 10:52:10 EST (Tue, 15 Dec 2009)
> @@ -168,6 +168,13 @@
> You may need to consult with your system administrator to get this
> problem fixed.
> #
> +[SRQ doesn't found]
> +The SRQ was not found.
> +Below is some information about the host that raised the error:
> +
> + Local host: %s
> + Local device: %s
> +#
> [srq rnr retry exceeded]
> The OpenFabrics "receiver not ready" retry count on a shared receive
> queue or XRC receive queue has been exceeded. This error can occur if
> @@ -386,21 +393,27 @@
> part of the btl_openib_receive_queues MCA parameter. The OpenFabrics
> (openib) BTL will therefore be deactivated for this run.
>
> -Shared receive queues can take between 2 and 4 parameters:
> +Shared receive queues can take between 2 and 6 parameters:
>
> 1. Buffer size in bytes (mandatory)
> 2. Number of buffers (mandatory)
> 3. Low buffer count watermark (optional; defaults to (num_buffers / 2))
> 4. Maximum number of outstanding sends a sender can have (optional;
> defaults to (low_watermark / 4)
> + 5. Initial number of receive buffers that will be pre-posted (optional; defaults to (num_buffers / 4))
> + 6. Event limit buffer count watermark (optional; defaults to 3/16 of the initial number of buffers)
>
> - Example: S,1024,256,128,32
> + Example: S,1024,256,128,32,32,8
> - 1024 byte buffers
> - 256 buffers to receive incoming MPI messages
> - When the number of available buffers reaches 128, re-post 128 more
> buffers to reach a total of 256
> - A sender will not send to a peer unless it has less than 32
> outstanding sends to that peer.
> + - 32 receive buffers will be pre-posted.
> + - When the number of unused receive buffers decreases to 8,
> + the IBV_EVENT_SRQ_LIMIT_REACHED event will be signaled and the number
> + of receive buffers that we can pre-post will be increased.
>
> Local host: %s
> Bad queue specification: %s
> @@ -414,6 +427,24 @@
> Local host: %s
> Bad queue specification: %s
> #
> +[rd_num must be >= rd_init]
> +WARNING: The number of buffers for a queue pair specified via the
> +btl_openib_receive_queues MCA parameter (parameter #2) must be
> +greater or equal to the initial SRQ size (parameter #5).
> +The OpenFabrics (openib) BTL will therefore be deactivated for this run.
> +
> + Local host: %s
> + Bad queue specification: %s
> +#
> +[srq_limit must be > rd_num]
> +WARNING: The number of buffers for a queue pair specified via the
> +btl_openib_receive_queues MCA parameter (parameter #2) must be greater than
> +or equal to the limit buffer count (parameter #6). The OpenFabrics (openib)
> +BTL will therefore be deactivated for this run.
> +
> + Local host: %s
> + Bad queue specification: %s
> +#
> [biggest qp size is too small]
> WARNING: The largest queue pair buffer size specified in the
> btl_openib_receive_queues MCA parameter is smaller than the maximum
> _______________________________________________
> svn-full mailing list
> svn-full_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
>

-- 
Jeff Squyres
jsquyres_at_[hidden]