Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] [PATCH] Not optimal SRQ resource allocation
From: Vasily Philipov (vasily_at_[hidden])
Date: 2009-12-02 10:20:08


The attached patch should resolve the long-pending issue that we have in
our tracker: https://svn.open-mpi.org/trac/ompi/ticket/1912.

The issue: As part of OpenIB BTL creation we also create a set of SRQs,
and corresponding receive fragments are allocated and posted on all
SRQs. This means that processes that do not have active communication
will keep a bunch of unused memory tied up in their SRQs.

The Solution: The patch modifies the openib BTL to pre-post a very limited
number of receive fragments on each SRQ. If the number of receive
buffers turns out not to be enough, the openib BTL will get an SRQ limit
event and will pre-post additional fragments.

 

Please review.

diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib.c
--- a/ompi/mca/btl/openib/btl_openib.c Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib.c Wed Dec 02 16:24:55 2009 +0200
@@ -214,6 +214,7 @@
 static int create_srq(mca_btl_openib_module_t *openib_btl)
 {
     int qp;
+ int32_t rd_num, rd_curr_num;
 
     /* create the SRQ's */
     for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
@@ -242,6 +243,24 @@
                                                ibv_get_device_name(openib_btl->device->ib_dev));
                 return OMPI_ERROR;
             }
+
+ rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
+ rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init;
+
+ if(true == mca_btl_openib_component.enable_srq_resize) {
+ if(0 == rd_curr_num) {
+ openib_btl->qps[qp].u.srq_qp.rd_curr_num = 1;
+ }
+
+ openib_btl->qps[qp].u.srq_qp.rd_low_local = rd_curr_num - (rd_curr_num >> 2);
+ openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
+ } else {
+ openib_btl->qps[qp].u.srq_qp.rd_curr_num = rd_num;
+ openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;
+ /* Not used in this case, but we don't need a garbage */
+ mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = 0;
+ openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+ }
         }
     }
 
diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib.h
--- a/ompi/mca/btl/openib/btl_openib.h Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib.h Wed Dec 02 16:24:55 2009 +0200
@@ -87,6 +87,12 @@
 
 struct mca_btl_openib_srq_qp_info_t {
     int32_t sd_max;
+ /* The init value for rd_curr_num variables of all SRQs */
+ int32_t rd_init;
+ /* The watermark/threshold - if the number of WQEs in the SRQ is less than this value =>
+ the SRQ limit event (IBV_EVENT_SRQ_LIMIT_REACHED) will be generated on the corresponding SRQ.
+ As a result the maximal number of pre-posted WQEs on the SRQ will be increased */
+ int32_t srq_limit;
 }; typedef struct mca_btl_openib_srq_qp_info_t mca_btl_openib_srq_qp_info_t;
 
 struct mca_btl_openib_qp_info_t {
@@ -254,6 +260,8 @@
     ompi_free_list_t recv_user_free;
     /**< frags for coalesced massages */
     ompi_free_list_t send_free_coalesced;
+ /**< Whether we want a dynamically resizing srq, enabled by default */
+ bool enable_srq_resize;
 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
 
 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
@@ -348,6 +356,16 @@
     int32_t sd_credits; /* the max number of outstanding sends on a QP when using SRQ */
                          /* i.e. the number of frags that can be outstanding (down counter) */
     opal_list_t pending_frags[2]; /**< list of high/low prio frags */
+ /**< The number of max rd that we can post in the current time.
+ The value may be increased in the IBV_EVENT_SRQ_LIMIT_REACHED
+ event handler. The value starts from (rd_num / 4) and increased up to rd_num */
+ int32_t rd_curr_num;
+ /**< We post additional WQEs only if a number of WQEs (in specific SRQ) is less of this value.
+ The value increased together with rd_curr_num. The value is unique for every SRQ. */
+ int32_t rd_low_local;
+ /**< The flag points if we want to get the
+ IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing SRQ */
+ bool srq_limit_event_flag;
 }; typedef struct mca_btl_openib_module_srq_qp_t mca_btl_openib_module_srq_qp_t;
 
 struct mca_btl_openib_module_qp_t {
diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib_async.c
--- a/ompi/mca/btl/openib/btl_openib_async.c Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_async.c Wed Dec 02 16:24:55 2009 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2006-2007 Voltaire All rights reserved.
  * $COPYRIGHT$
@@ -226,10 +226,51 @@
     return OMPI_SUCCESS;
 }
 
+/* The main idea of the SRQ resizing algorithm -
+ We create a SRQ with size = rd_num, but for efficient usage of resources
+ the number of WQEs that we post = rd_curr_num < rd_num and this value is
+ increased (as needed) in the IBV_EVENT_SRQ_LIMIT_REACHED event handler (i.e. in this function);
+ the event will be thrown by the device if the number of WQEs in the SRQ drops below srq_limit */
+static int btl_openib_async_srq_limit_event(struct ibv_srq* srq,
+ mca_btl_openib_module_t *openib_btl)
+{
+ int qp;
+
+ for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
+ if (!BTL_OPENIB_QP_TYPE_PP(qp)) {
+ if(openib_btl->qps[qp].u.srq_qp.srq == srq) {
+ break;
+ }
+ }
+ }
+
+ if(qp >= mca_btl_openib_component.num_qps) {
+ BTL_ERROR(("The srq doesn't found on %s.", ibv_get_device_name(openib_btl->device->ib_dev)));
+ return OMPI_ERROR;
+ }
+
+ /* dynamically re-size the SRQ to be larger */
+ openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1;
+
+ if(openib_btl->qps[qp].u.srq_qp.rd_curr_num >= mca_btl_openib_component.qp_infos[qp].rd_num) {
+ openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].rd_num;
+ openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;
+
+ openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+
+ return OMPI_SUCCESS;
+ }
+
+ openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1;
+ openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
+
+ return OMPI_SUCCESS;
+}
+
 /* Function handle async device events */
 static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index)
 {
- int j;
+ int j, btl_index = 0;
     mca_btl_openib_device_t *device = NULL;
     struct ibv_async_event event;
     bool xrc_event = false;
@@ -240,6 +281,8 @@
         if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
                 devices_poll->async_pollfd[index].fd ) {
             device = mca_btl_openib_component.openib_btls[j]->device;
+ btl_index = j;
+
             break;
         }
     }
@@ -306,7 +349,15 @@
 #if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
             case IBV_EVENT_CLIENT_REREGISTER:
 #endif
+ break;
+ /* The event is signaled when the number of pre-posted receive WQEs goes
+ under the predefined threshold - srq_limit */
             case IBV_EVENT_SRQ_LIMIT_REACHED:
+ if(OMPI_SUCCESS != btl_openib_async_srq_limit_event(event.element.srq,
+ mca_btl_openib_component.openib_btls[btl_index])) {
+ return OMPI_ERROR;
+ }
+
                 break;
             default:
                 orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib_component.c
--- a/ompi/mca/btl/openib/btl_openib_component.c Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_component.c Wed Dec 02 16:24:55 2009 +0200
@@ -1361,8 +1361,8 @@
                         true, rd_win, rd_num - rd_low);
             }
         } else {
- int32_t sd_max;
- if (count < 3 || count > 5) {
+ int32_t sd_max, rd_init, srq_limit;
+ if (count < 3 || count > 7) {
                 orte_show_help("help-mpi-btl-openib.txt",
                                "invalid srq specification", true,
                                orte_process_info.nodename, queues[qp]);
@@ -1376,15 +1376,47 @@
             /* by default set rd_low to be 3/4 of rd_num */
             rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
             sd_max = atoi_param(P(4), rd_low / 4);
- BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
- rd_num, rd_low, sd_max));
+ /* rd_init is initial value for rd_curr_num of all SRQs, 1/4 of rd_num by default */
+ rd_init = atoi_param(P(5), rd_num / 4);
+ /* by default set srq_limit to be 3/16 of rd_init (it's 1/4 of rd_low_local,
+ the value of rd_low_local we calculate in create_srq function) */
+ srq_limit = atoi_param(P(6), (rd_init - (rd_init / 4)) / 4);
+
+ /* If we set srq_limit less or greater than rd_init
+ (init value for rd_curr_num) => we receive the IBV_EVENT_SRQ_LIMIT_REACHED
+ event immediately and the value of rd_curr_num will be increased */
+
+ /* If we set srq_limit to zero, but size of SRQ greater than 1 and
+ it is not a user request (param number 6 in --mca btl_openib_receive_queues) => set it to be 1 */
+ if((0 == srq_limit) && (1 < rd_num) && (0 != P(6))) {
+ srq_limit = 1;
+ }
+
+ BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d rd_max is %d srq_limit is %d",
+ rd_num, rd_low, sd_max, rd_init, srq_limit));
 
             /* Calculate the smallest freelist size that can be allowed */
             if (rd_num > min_freelist_size) {
                 min_freelist_size = rd_num;
             }
 
+ if (rd_num < rd_init) {
+ orte_show_help("help-mpi-btl-openib.txt", "rd_num must be >= rd_init",
+ true, orte_process_info.nodename, queues[qp]);
+ ret = OMPI_ERR_BAD_PARAM;
+ goto error;
+ }
+
+ if (rd_num < srq_limit) {
+ orte_show_help("help-mpi-btl-openib.txt", "srq_limit must be > rd_num",
+ true, orte_process_info.nodename, queues[qp]);
+ ret = OMPI_ERR_BAD_PARAM;
+ goto error;
+ }
+
             mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
+ mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init = rd_init;
+ mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = srq_limit;
         }
 
         if (rd_num <= rd_low) {
@@ -3185,19 +3217,19 @@
 
 int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
 {
- int rd_low = mca_btl_openib_component.qp_infos[qp].rd_low;
- int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
+ int rd_low_local = openib_btl->qps[qp].u.srq_qp.rd_low_local;
+ int rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
     int num_post, i, rc;
     struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL;
 
     assert(!BTL_OPENIB_QP_TYPE_PP(qp));
 
     OPAL_THREAD_LOCK(&openib_btl->ib_lock);
- if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low) {
+ if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low_local) {
         OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
         return OMPI_SUCCESS;
     }
- num_post = rd_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
+ num_post = rd_curr_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
 
     for(i = 0; i < num_post; i++) {
         ompi_free_list_item_t* item;
@@ -3214,7 +3246,26 @@
 
     rc = ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, wr_list, &bad_wr);
     if(OPAL_LIKELY(0 == rc)) {
+ struct ibv_srq_attr srq_attr;
+
         OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);
+
+ if(true == openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag) {
+ srq_attr.max_wr = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
+ srq_attr.max_sge = 1;
+ srq_attr.srq_limit = mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit;
+
+ openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+ if(ibv_modify_srq(openib_btl->qps[qp].u.srq_qp.srq, &srq_attr, IBV_SRQ_LIMIT)) {
+ BTL_ERROR(("Failed to request limit event for srq on %s. "
+ "Fatal error, stoping asynch event thread",
+ ibv_get_device_name(openib_btl->device->ib_dev)));
+
+ OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
+ return OMPI_ERROR;
+ }
+ }
+
         OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
         return OMPI_SUCCESS;
     }
diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib_mca.c
--- a/ompi/mca/btl/openib/btl_openib_mca.c Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_mca.c Wed Dec 02 16:24:55 2009 +0200
@@ -10,7 +10,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2006-2007 Voltaire All rights reserved.
@@ -163,6 +163,11 @@
                   1, &ival, 0));
     mca_btl_openib_component.warn_nonexistent_if = (0 != ival);
 
+ CHECK(reg_int("enable_srq_resize", NULL,
+ "Enable/Disable on demand SRQ resize. "
+ "(0 = without resizing, nonzero = with resizing)", 1, &ival, 0));
+ mca_btl_openib_component.enable_srq_resize = (0 != ival);
+
     if (OMPI_HAVE_IBV_FORK_INIT) {
         ival2 = -1;
     } else {
diff -r a5938d9dcada ompi/mca/btl/openib/help-mpi-btl-openib.txt
--- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt Wed Dec 02 16:24:55 2009 +0200
@@ -11,7 +11,7 @@
 # Copyright (c) 2004-2006 The Regents of the University of California.
 # All rights reserved.
 # Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
-# Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
+# Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
 # Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
 # $COPYRIGHT$
 #
@@ -414,6 +414,24 @@
   Local host: %s
   Bad queue specification: %s
 #
+[rd_num must be >= rd_init]
+WARNING: The number of buffers for a queue pair specified via the
+btl_openib_receive_queues MCA parameter (parameter #2) must be
+greater than or equal to the initial SRQ size (parameter #5).
+The OpenFabrics (openib) BTL will therefore be deactivated for this run.
+
+ Local host: %s
+ Bad queue specification: %s
+#
+[srq_limit must be > rd_num]
+WARNING: The number of buffers for a queue pair specified via the
+btl_openib_receive_queues MCA parameter (parameter #2) must be greater than the limit
+buffer count (parameter #6). The OpenFabrics (openib) BTL will therefore
+be deactivated for this run.
+
+ Local host: %s
+ Bad queue specification: %s
+#
 [biggest qp size is too small]
 WARNING: The largest queue pair buffer size specified in the
 btl_openib_receive_queues MCA parameter is smaller than the maximum