Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [PATCH] Improving heterogeneous IB clusters support.
From: Vasily Philipov (vasily_at_[hidden])
Date: 2009-11-16 10:46:21


Vasily Filipov wrote:

Hello.
Here is a new patch for heterogeneous cluster support.
 
Please comment.

Regards,
Vasily

> Hello,
>
> Some time ago Mellanox proposed a design that should improve the current
> support for heterogeneous clusters (see Design.txt). The design was
> accepted by IB vendors, and now we propose a patch that adds
> heterogeneous cluster support. The patch leaves one issue that we do
> not resolve completely. If two different procs have different QP
> configurations (P/S/X), we print a nice warning message explaining
> that such a configuration is not supported and proposing a way to resolve
> the issue. In theory it would be best to provide a solution that
> resolves the problem automatically, but that would require
> significant changes to the openib BTL that we don't want to introduce at
> this stage.
>
> Please comment.
>
> Regards,
> Vasily
>

diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib.c
--- a/ompi/mca/btl/openib/btl_openib.c Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib.c Mon Nov 16 17:41:48 2009 +0200
@@ -39,6 +39,8 @@
 #include "ompi/runtime/ompi_cr.h"
 #endif
 
+#include "btl_openib_ini.h"
+
 #include "btl_openib.h"
 #include "btl_openib_frag.h"
 #include "btl_openib_proc.h"
@@ -287,6 +289,158 @@
     return rc;
 }
 
+const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type)
+{
+ switch(transport_type) {
+ case MCA_BTL_OPENIB_TRANSPORT_RDMAOE:
+ return "MCA_BTL_OPENIB_TRANSPORT_RDMAOE";
+
+ case MCA_BTL_OPENIB_TRANSPORT_IB:
+ return "MCA_BTL_OPENIB_TRANSPORT_IB";
+
+ case MCA_BTL_OPENIB_TRANSPORT_IWARP:
+ return "MCA_BTL_OPENIB_TRANSPORT_IWARP";
+
+ case MCA_BTL_OPENIB_TRANSPORT_UNKNOWN:
+ default:
+ return "MCA_BTL_OPENIB_TRANSPORT_UNKNOWN";
+ }
+}
+
+mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl)
+{
+#ifdef OMPI_HAVE_RDMAOE
+ switch(openib_btl->ib_port_attr.transport) {
+ case RDMA_TRANSPORT_IB:
+ return MCA_BTL_OPENIB_TRANSPORT_IB;
+
+ case RDMA_TRANSPORT_IWARP:
+ return MCA_BTL_OPENIB_TRANSPORT_IWARP;
+
+ case RDMA_TRANSPORT_RDMAOE:
+ return MCA_BTL_OPENIB_TRANSPORT_RDMAOE;
+
+ default:
+ return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
+ }
+#else
+#ifdef HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE
+ switch(openib_btl->device->ib_dev->transport_type) {
+ case IBV_TRANSPORT_IB:
+ return MCA_BTL_OPENIB_TRANSPORT_IB;
+
+ case IBV_TRANSPORT_IWARP:
+ return MCA_BTL_OPENIB_TRANSPORT_IWARP;
+
+ case IBV_TRANSPORT_UNKNOWN:
+ default:
+ return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
+ }
+#endif
+ return MCA_BTL_OPENIB_TRANSPORT_IB;
+#endif
+}
+
+static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
+ mca_btl_base_endpoint_t* endpoint)
+{
+ int ret = OMPI_SUCCESS;
+
+ char* recv_qps = NULL;
+
+ ompi_btl_openib_ini_values_t values;
+
+ if(mca_btl_openib_get_transport_type(openib_btl) != endpoint->rem_info.rem_transport_type) {
+ orte_show_help("help-mpi-btl-openib.txt",
+ "conflicting transport types", true,
+ orte_process_info.nodename,
+ ibv_get_device_name(openib_btl->device->ib_dev),
+ (openib_btl->device->ib_dev_attr).vendor_id,
+ (openib_btl->device->ib_dev_attr).vendor_part_id,
+ btl_openib_get_transport_name(mca_btl_openib_get_transport_type(openib_btl)),
+ endpoint->endpoint_proc->proc_ompi->proc_hostname,
+ endpoint->rem_info.rem_vendor_id,
+ endpoint->rem_info.rem_vendor_part_id,
+ btl_openib_get_transport_name(endpoint->rem_info.rem_transport_type));
+
+ return OMPI_ERROR;
+ }
+
+ memset(&values, 0, sizeof(ompi_btl_openib_ini_values_t));
+ ret = ompi_btl_openib_ini_query(endpoint->rem_info.rem_vendor_id,
+ endpoint->rem_info.rem_vendor_part_id, &values);
+
+ if (OMPI_SUCCESS != ret && OMPI_ERR_NOT_FOUND != ret) {
+ orte_show_help("help-mpi-btl-openib.txt",
+ "error in device init", true,
+ orte_process_info.nodename,
+ ibv_get_device_name(openib_btl->device->ib_dev));
+ return ret;
+ }
+
+ if(openib_btl->device->mtu < endpoint->rem_info.rem_mtu) {
+ endpoint->rem_info.rem_mtu = openib_btl->device->mtu;
+ }
+
+ endpoint->use_eager_rdma = openib_btl->device->use_eager_rdma &
+ endpoint->use_eager_rdma;
+
+ /* Receive queues checking */
+ switch(mca_btl_openib_component.receive_queues_source) {
+ case BTL_OPENIB_RQ_SOURCE_MCA:
+ case BTL_OPENIB_RQ_SOURCE_MAX:
+ break;
+
+ case BTL_OPENIB_RQ_SOURCE_DEVICE_INI:
+ if(NULL != values.receive_queues) {
+ recv_qps = values.receive_queues;
+ } else {
+ recv_qps = mca_btl_openib_component.default_recv_qps;
+ }
+
+ if(0 != strcmp(mca_btl_openib_component.receive_queues,
+ recv_qps)) {
+ orte_show_help("help-mpi-btl-openib.txt",
+ "unsupported queues configuration", true,
+ orte_process_info.nodename,
+ ibv_get_device_name(openib_btl->device->ib_dev),
+ (openib_btl->device->ib_dev_attr).vendor_id,
+ (openib_btl->device->ib_dev_attr).vendor_part_id,
+ mca_btl_openib_component.receive_queues,
+ endpoint->endpoint_proc->proc_ompi->proc_hostname,
+ endpoint->rem_info.rem_vendor_id,
+ endpoint->rem_info.rem_vendor_part_id,
+ recv_qps);
+
+ return OMPI_ERROR;
+ }
+ break;
+
+ case BTL_OPENIB_RQ_SOURCE_DEFAULT:
+ if(NULL != values.receive_queues) {
+ if(0 != strcmp(mca_btl_openib_component.receive_queues,
+ values.receive_queues)) {
+ orte_show_help("help-mpi-btl-openib.txt",
+ "unsupported queues configuration", true,
+ orte_process_info.nodename,
+ ibv_get_device_name(openib_btl->device->ib_dev),
+ (openib_btl->device->ib_dev_attr).vendor_id,
+ (openib_btl->device->ib_dev_attr).vendor_part_id,
+ mca_btl_openib_component.receive_queues,
+ endpoint->endpoint_proc->proc_ompi->proc_hostname,
+ endpoint->rem_info.rem_vendor_id,
+ endpoint->rem_info.rem_vendor_part_id,
+ values.receive_queues);
+
+ return OMPI_ERROR;
+ }
+ }
+ break;
+ }
+
+ return OMPI_SUCCESS;
+}
+
 /*
  * add a proc to this btl module
  * creates an endpoint that is setup on the
@@ -471,6 +625,10 @@
             continue;
         }
 
+ if(OMPI_SUCCESS != mca_btl_openib_tune_endpoint(openib_btl, endpoint)) {
+ return OMPI_ERROR;
+ }
+
         endpoint->index = opal_pointer_array_add(openib_btl->device->endpoints, (void*)endpoint);
         if( 0 > endpoint->index ) {
             OBJ_RELEASE(endpoint);
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib.h
--- a/ompi/mca/btl/openib/btl_openib.h Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib.h Mon Nov 16 17:41:48 2009 +0200
@@ -75,6 +75,13 @@
  */
 
 typedef enum {
+ MCA_BTL_OPENIB_TRANSPORT_UNKNOWN = -1,
+ MCA_BTL_OPENIB_TRANSPORT_IB = 0,
+ MCA_BTL_OPENIB_TRANSPORT_IWARP,
+ MCA_BTL_OPENIB_TRANSPORT_RDMAOE
+} mca_btl_openib_transport_type_t;
+
+typedef enum {
     MCA_BTL_OPENIB_PP_QP,
     MCA_BTL_OPENIB_SRQ_QP,
     MCA_BTL_OPENIB_XRC_QP
@@ -254,6 +261,8 @@
     ompi_free_list_t recv_user_free;
     /**< frags for coalesced massages */
     ompi_free_list_t send_free_coalesced;
+ /**< Default receive queues */
+ char* default_recv_qps;
 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
 
 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
@@ -272,6 +281,12 @@
     uint16_t apm_lid;
     /** The MTU used by this port */
     uint8_t mtu;
+ /** vendor id define device type and tuning */
+ uint32_t vendor_id;
+ /** vendor part id define device type and tuning */
+ uint32_t vendor_part_id;
+ /** Transport type of remote port */
+ uint8_t transport_type;
     /** Dummy field used to calculate the real length */
     uint8_t end;
 } mca_btl_openib_modex_message_t;
@@ -633,6 +648,18 @@
 
 int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
 
+/**
+ * Get a transport name of btl by its transport type.
+ */
+
+const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
+
+/**
+ * Get a transport type of btl.
+ */
+
+mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl);
+
 static inline int qp_cq_prio(const int qp)
 {
     if(0 == qp)
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib_component.c
--- a/ompi/mca/btl/openib/btl_openib_component.c Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_component.c Mon Nov 16 17:41:48 2009 +0200
@@ -143,6 +143,7 @@
     OBJ_CONSTRUCT(&mca_btl_openib_component.devices, opal_pointer_array_t);
     mca_btl_openib_component.devices_count = 0;
     mca_btl_openib_component.cpc_explicitly_defined = false;
+ mca_btl_openib_component.default_recv_qps = NULL;
 
     /* initialize objects */
     OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
@@ -196,6 +197,10 @@
         free(mca_btl_openib_component.receive_queues);
     }
 
+ if (NULL != mca_btl_openib_component.default_recv_qps) {
+ free(mca_btl_openib_component.default_recv_qps);
+ }
+
     return rc;
 }
 
@@ -303,6 +308,16 @@
 
         /* Pack the modex common message struct. */
         size = modex_message_size;
+
+ (mca_btl_openib_component.openib_btls[i]->port_info).vendor_id =
+ (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_id;
+
+ (mca_btl_openib_component.openib_btls[i]->port_info).vendor_part_id =
+ (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_part_id;
+
+ (mca_btl_openib_component.openib_btls[i]->port_info).transport_type =
+ mca_btl_openib_get_transport_type(mca_btl_openib_component.openib_btls[i]);
+
         memcpy(offset,
                &(mca_btl_openib_component.openib_btls[i]->port_info),
                size);
@@ -1657,45 +1672,6 @@
         ibv_destroy_cq(cq);
     }
 
- /* If the user specified btl_openib_receive_queues MCA param, it
- overrides all device INI params */
- if (BTL_OPENIB_RQ_SOURCE_MCA !=
- mca_btl_openib_component.receive_queues_source &&
- NULL != values.receive_queues) {
- /* If a prior device's INI values set a different value for
- receive_queues, this is unsupported (see
- https://svn.open-mpi.org/trac/ompi/ticket/1285) */
- if (BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
- mca_btl_openib_component.receive_queues_source) {
- if (0 != strcmp(values.receive_queues,
- mca_btl_openib_component.receive_queues)) {
- orte_show_help("help-mpi-btl-openib.txt",
- "conflicting receive_queues", true,
- orte_process_info.nodename,
- ibv_get_device_name(device->ib_dev),
- device->ib_dev_attr.vendor_id,
- device->ib_dev_attr.vendor_part_id,
- values.receive_queues,
- ibv_get_device_name(receive_queues_device->ib_dev),
- receive_queues_device->ib_dev_attr.vendor_id,
- receive_queues_device->ib_dev_attr.vendor_part_id,
- mca_btl_openib_component.receive_queues,
- opal_install_dirs.pkgdatadir);
- ret = OMPI_ERR_RESOURCE_BUSY;
- goto error;
- }
- } else {
- if (NULL != mca_btl_openib_component.receive_queues) {
- free(mca_btl_openib_component.receive_queues);
- }
- receive_queues_device = device;
- mca_btl_openib_component.receive_queues =
- strdup(values.receive_queues);
- mca_btl_openib_component.receive_queues_source =
- BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
- }
- }
-
     /* Should we use RDMA for short / eager messages? First check MCA
        param, then check INI file values. */
     if (mca_btl_openib_component.use_eager_rdma >= 0) {
@@ -1795,6 +1771,45 @@
                            "apm not enough ports", true);
             mca_btl_openib_component.apm_ports = 0;
         }
+
+ /* If the user specified btl_openib_receive_queues MCA param, it
+ overrides all device INI params */
+ if (BTL_OPENIB_RQ_SOURCE_MCA !=
+ mca_btl_openib_component.receive_queues_source &&
+ NULL != values.receive_queues) {
+ /* If a prior device's INI values set a different value for
+ receive_queues, this is unsupported (see
+ https://svn.open-mpi.org/trac/ompi/ticket/1285) */
+ if (BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
+ mca_btl_openib_component.receive_queues_source) {
+ if (0 != strcmp(values.receive_queues,
+ mca_btl_openib_component.receive_queues)) {
+ orte_show_help("help-mpi-btl-openib.txt",
+ "conflicting receive_queues", true,
+ orte_process_info.nodename,
+ ibv_get_device_name(device->ib_dev),
+ device->ib_dev_attr.vendor_id,
+ device->ib_dev_attr.vendor_part_id,
+ values.receive_queues,
+ ibv_get_device_name(receive_queues_device->ib_dev),
+ receive_queues_device->ib_dev_attr.vendor_id,
+ receive_queues_device->ib_dev_attr.vendor_part_id,
+ mca_btl_openib_component.receive_queues,
+ opal_install_dirs.pkgdatadir);
+ ret = OMPI_ERR_RESOURCE_BUSY;
+ goto error;
+ }
+ } else {
+ if (NULL != mca_btl_openib_component.receive_queues) {
+ free(mca_btl_openib_component.receive_queues);
+ }
+ receive_queues_device = device;
+ mca_btl_openib_component.receive_queues =
+ strdup(values.receive_queues);
+ mca_btl_openib_component.receive_queues_source =
+ BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
+ }
+ }
         return OMPI_SUCCESS;
     }
 
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib_endpoint.c
--- a/ompi/mca/btl/openib/btl_openib_endpoint.c Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_endpoint.c Mon Nov 16 17:41:48 2009 +0200
@@ -310,6 +310,11 @@
                 ep->rem_info.rem_subnet_id,
                 ep->rem_info.rem_mtu);
 
+ ep->rem_info.rem_vendor_id = (remote_proc_info->pm_port_info).vendor_id;
+ ep->rem_info.rem_vendor_part_id = (remote_proc_info->pm_port_info).vendor_part_id;
+
+ ep->rem_info.rem_transport_type = (remote_proc_info->pm_port_info).transport_type;
+
     for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
         endpoint_init_qp(ep, qp);
     }
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib_endpoint.h
--- a/ompi/mca/btl/openib/btl_openib_endpoint.h Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_endpoint.h Mon Nov 16 17:41:48 2009 +0200
@@ -94,6 +94,12 @@
     mca_btl_openib_rem_qp_info_t *rem_qps;
     /* Remote xrc_srq info, used only with XRC connections */
     mca_btl_openib_rem_srq_info_t *rem_srqs;
+ /* Vendor id of remote HCA */
+ uint32_t rem_vendor_id;
+ /* Vendor part id of remote HCA */
+ uint32_t rem_vendor_part_id;
+ /* Transport type of remote port */
+ mca_btl_openib_transport_type_t rem_transport_type;
 } mca_btl_openib_rem_info_t;
 
 
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib_mca.c
--- a/ompi/mca/btl/openib/btl_openib_mca.c Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_mca.c Mon Nov 16 17:41:48 2009 +0200
@@ -10,7 +10,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2006-2007 Voltaire All rights reserved.
@@ -526,6 +526,13 @@
             mid_qp_size,
             (uint32_t)mca_btl_openib_module.super.btl_eager_limit,
             (uint32_t)mca_btl_openib_module.super.btl_max_send_size);
+
+ mca_btl_openib_component.default_recv_qps = strdup(default_qps);
+ if(NULL == mca_btl_openib_component.default_recv_qps) {
+ BTL_ERROR(("Unable to allocate memory for default receive queues string.\n"));
+ return OMPI_ERROR;
+ }
+
     CHECK(reg_string("receive_queues", NULL,
                      "Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
                      default_qps, &mca_btl_openib_component.receive_queues,
diff -r 521e5f4b161a ompi/mca/btl/openib/help-mpi-btl-openib.txt
--- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt Mon Nov 16 17:41:48 2009 +0200
@@ -11,7 +11,7 @@
 # Copyright (c) 2004-2006 The Regents of the University of California.
 # All rights reserved.
 # Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
-# Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
+# Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
 # Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
 # $COPYRIGHT$
 #
@@ -590,3 +590,28 @@
   Local host: %s
   Value: %s
   Message: %s
+#
+[unsupported queues configuration]
+The remote and local queues were automatically configured for different
+devices and as a result Open MPI failed to find an optimal configuration.
+Please use MCA parameters in order to define the Open Fabrics queues configuration.
+
+ Local host: %s
+ Local adapter: %s (vendor 0x%x, part ID %d)
+ Local queues: %s
+
+ Remote host: %s
+ Remote adapter: remote adapter (vendor 0x%x, part ID %d)
+ Remote queues: %s
+#
+[conflicting transport types]
+Open MPI detected two different OpenFabrics transport types in the same Infiniband network.
+Such mixed network transport configuration is not supported by Open MPI.
+
+ Local host: %s
+ Local adapter: %s (vendor 0x%x, part ID %d)
+ Local transport type: %s
+
+ Remote host: %s
+ Remote Adapter: remote adapter (vendor 0x%x, part ID %d)
+ Remote transport type: %s