Vasily Filipov wrote:
Hello.
Here is a new patch for heterogeneous cluster support.
Please comment.
Regards,
Vasily
> Hello,
>
> Some time ago Mellanox proposed a design that should improve the current
> support for heterogeneous clusters (see Design.txt). The design was
> accepted by IB vendors, and now we propose a patch that adds
> heterogeneous cluster support. The patch leaves one issue that we do
> not resolve completely. If 2 different procs have different QP
> configurations (P/S/X), we print a nice warning message that explains
> that such a configuration is not supported and proposes a way to resolve
> the issue. Theoretically it would be best to provide a solution that
> automatically resolves the problem, but that would require
> significant changes to the openib btl that we don't want to introduce at
> this stage.
>
> Please comment.
>
> Regards,
> Vasily
>
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib.c
--- a/ompi/mca/btl/openib/btl_openib.c Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib.c Mon Nov 16 17:41:48 2009 +0200
@@ -39,6 +39,8 @@
#include "ompi/runtime/ompi_cr.h"
#endif
+#include "btl_openib_ini.h"
+
#include "btl_openib.h"
#include "btl_openib_frag.h"
#include "btl_openib_proc.h"
@@ -287,6 +289,158 @@
return rc;
}
+/**
+ * Translate a BTL transport type into a printable name.
+ *
+ * @param transport_type  transport type to describe
+ * @return the name of the matching enumerator as a constant string; every
+ *         value other than IB, iWARP and RDMAoE is reported as
+ *         "MCA_BTL_OPENIB_TRANSPORT_UNKNOWN". The string must not be freed.
+ */
+const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type)
+{
+    /* Start from the fallback and override it for the known transports. */
+    const char* label = "MCA_BTL_OPENIB_TRANSPORT_UNKNOWN";
+
+    if (MCA_BTL_OPENIB_TRANSPORT_IB == transport_type) {
+        label = "MCA_BTL_OPENIB_TRANSPORT_IB";
+    } else if (MCA_BTL_OPENIB_TRANSPORT_IWARP == transport_type) {
+        label = "MCA_BTL_OPENIB_TRANSPORT_IWARP";
+    } else if (MCA_BTL_OPENIB_TRANSPORT_RDMAOE == transport_type) {
+        label = "MCA_BTL_OPENIB_TRANSPORT_RDMAOE";
+    }
+
+    return label;
+}
+
+/**
+ * Determine the transport type used by a BTL module.
+ *
+ * The information source is selected at compile time:
+ *  - with OMPI_HAVE_RDMAOE, the transport field of the module's port
+ *    attributes is used;
+ *  - otherwise, with HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE, the
+ *    transport_type field of the underlying ibv device is used;
+ *  - otherwise nothing can be queried and InfiniBand is reported.
+ *
+ * @param openib_btl  module to query
+ * @return the matching MCA_BTL_OPENIB_TRANSPORT_* value, or
+ *         MCA_BTL_OPENIB_TRANSPORT_UNKNOWN when the reported value is not
+ *         one of the recognised transports.
+ */
+mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl)
+{
+#if defined(OMPI_HAVE_RDMAOE)
+    switch (openib_btl->ib_port_attr.transport) {
+    case RDMA_TRANSPORT_IB:
+        return MCA_BTL_OPENIB_TRANSPORT_IB;
+
+    case RDMA_TRANSPORT_IWARP:
+        return MCA_BTL_OPENIB_TRANSPORT_IWARP;
+
+    case RDMA_TRANSPORT_RDMAOE:
+        return MCA_BTL_OPENIB_TRANSPORT_RDMAOE;
+
+    default:
+        return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
+    }
+#elif defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
+    switch (openib_btl->device->ib_dev->transport_type) {
+    case IBV_TRANSPORT_IB:
+        return MCA_BTL_OPENIB_TRANSPORT_IB;
+
+    case IBV_TRANSPORT_IWARP:
+        return MCA_BTL_OPENIB_TRANSPORT_IWARP;
+
+    case IBV_TRANSPORT_UNKNOWN:
+    default:
+        return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
+    }
+#else
+    /* No way to query the transport in this build: report InfiniBand.
+       Kept in its own #else branch so it is never compiled as dead code
+       behind a switch whose every arm already returns. */
+    (void) openib_btl;
+    return MCA_BTL_OPENIB_TRANSPORT_IB;
+#endif
+}
+
+/**
+ * Check that a remote endpoint can work with the local BTL module and adjust
+ * the endpoint's parameters to values both sides can use.
+ *
+ * Steps performed:
+ *  - reject the endpoint if its transport type differs from the local one;
+ *  - look up the remote device (vendor id / vendor part id) in the device
+ *    INI data;
+ *  - lower endpoint->rem_info.rem_mtu to the local device MTU if the local
+ *    one is smaller;
+ *  - keep eager RDMA enabled on the endpoint only if the local device also
+ *    has it enabled;
+ *  - unless the receive queues came from an MCA parameter, verify that the
+ *    receive queues string expected for the remote device equals the local
+ *    one.
+ *
+ * @param openib_btl  local BTL module
+ * @param endpoint    remote endpoint; rem_info must already be filled in
+ * @return OMPI_SUCCESS if the endpoint is usable; OMPI_ERROR on a transport
+ *         or receive queues mismatch (a help message is shown); the error
+ *         code of ompi_btl_openib_ini_query() if the INI lookup fails with
+ *         anything other than OMPI_ERR_NOT_FOUND.
+ */
+static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
+                                        mca_btl_base_endpoint_t* endpoint)
+{
+    int ret = OMPI_SUCCESS;
+    char* recv_qps = NULL;
+    ompi_btl_openib_ini_values_t values;
+    mca_btl_openib_transport_type_t local_transport =
+        mca_btl_openib_get_transport_type(openib_btl);
+
+    /* The modex message stores the transport type in a uint8_t field, so a
+       remote MCA_BTL_OPENIB_TRANSPORT_UNKNOWN (-1) is received as 255.
+       Compare both sides at that width; otherwise two peers that are both
+       UNKNOWN would be reported as conflicting. */
+    if ((uint8_t) local_transport !=
+        (uint8_t) endpoint->rem_info.rem_transport_type) {
+        orte_show_help("help-mpi-btl-openib.txt",
+                       "conflicting transport types", true,
+                       orte_process_info.nodename,
+                       ibv_get_device_name(openib_btl->device->ib_dev),
+                       (openib_btl->device->ib_dev_attr).vendor_id,
+                       (openib_btl->device->ib_dev_attr).vendor_part_id,
+                       btl_openib_get_transport_name(local_transport),
+                       endpoint->endpoint_proc->proc_ompi->proc_hostname,
+                       endpoint->rem_info.rem_vendor_id,
+                       endpoint->rem_info.rem_vendor_part_id,
+                       btl_openib_get_transport_name(endpoint->rem_info.rem_transport_type));
+
+        return OMPI_ERROR;
+    }
+
+    /* Zero the result first: when the device is not found in the INI data
+       the fields stay NULL/0 and are treated as "no INI value" below. */
+    memset(&values, 0, sizeof(ompi_btl_openib_ini_values_t));
+    ret = ompi_btl_openib_ini_query(endpoint->rem_info.rem_vendor_id,
+                                    endpoint->rem_info.rem_vendor_part_id,
+                                    &values);
+
+    if (OMPI_SUCCESS != ret && OMPI_ERR_NOT_FOUND != ret) {
+        orte_show_help("help-mpi-btl-openib.txt",
+                       "error in device init", true,
+                       orte_process_info.nodename,
+                       ibv_get_device_name(openib_btl->device->ib_dev));
+        return ret;
+    }
+
+    /* Never use an MTU larger than the local device supports. */
+    if (openib_btl->device->mtu < endpoint->rem_info.rem_mtu) {
+        endpoint->rem_info.rem_mtu = openib_btl->device->mtu;
+    }
+
+    endpoint->use_eager_rdma = openib_btl->device->use_eager_rdma &
+                               endpoint->use_eager_rdma;
+
+    /* Receive queues checking: work out which receive queues string the
+       remote device is expected to use. NULL means "nothing to compare". */
+    switch (mca_btl_openib_component.receive_queues_source) {
+    case BTL_OPENIB_RQ_SOURCE_DEVICE_INI:
+        /* Remote INI entry if it has one, the built-in default otherwise. */
+        if (NULL != values.receive_queues) {
+            recv_qps = values.receive_queues;
+        } else {
+            recv_qps = mca_btl_openib_component.default_recv_qps;
+        }
+        break;
+
+    case BTL_OPENIB_RQ_SOURCE_DEFAULT:
+        /* Only a remote INI entry can differ from the local default. */
+        recv_qps = values.receive_queues;
+        break;
+
+    case BTL_OPENIB_RQ_SOURCE_MCA:
+    case BTL_OPENIB_RQ_SOURCE_MAX:
+    default:
+        break;
+    }
+
+    if (NULL != recv_qps &&
+        0 != strcmp(mca_btl_openib_component.receive_queues, recv_qps)) {
+        orte_show_help("help-mpi-btl-openib.txt",
+                       "unsupported queues configuration", true,
+                       orte_process_info.nodename,
+                       ibv_get_device_name(openib_btl->device->ib_dev),
+                       (openib_btl->device->ib_dev_attr).vendor_id,
+                       (openib_btl->device->ib_dev_attr).vendor_part_id,
+                       mca_btl_openib_component.receive_queues,
+                       endpoint->endpoint_proc->proc_ompi->proc_hostname,
+                       endpoint->rem_info.rem_vendor_id,
+                       endpoint->rem_info.rem_vendor_part_id,
+                       recv_qps);
+
+        return OMPI_ERROR;
+    }
+
+    return OMPI_SUCCESS;
+}
+
/*
* add a proc to this btl module
* creates an endpoint that is setup on the
@@ -471,6 +625,10 @@
continue;
}
+ if(OMPI_SUCCESS != mca_btl_openib_tune_endpoint(openib_btl, endpoint)) {
+ return OMPI_ERROR;
+ }
+
endpoint->index = opal_pointer_array_add(openib_btl->device->endpoints, (void*)endpoint);
if( 0 > endpoint->index ) {
OBJ_RELEASE(endpoint);
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib.h
--- a/ompi/mca/btl/openib/btl_openib.h Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib.h Mon Nov 16 17:41:48 2009 +0200
@@ -75,6 +75,13 @@
*/
typedef enum {
+ MCA_BTL_OPENIB_TRANSPORT_UNKNOWN = -1, /* not identified; negative, so an unsigned field cannot hold it as-is */
+ MCA_BTL_OPENIB_TRANSPORT_IB = 0, /* InfiniBand */
+ MCA_BTL_OPENIB_TRANSPORT_IWARP, /* iWARP */
+ MCA_BTL_OPENIB_TRANSPORT_RDMAOE /* RDMAoE */
+} mca_btl_openib_transport_type_t; /* transport kinds distinguished by the openib BTL */
+
+typedef enum {
MCA_BTL_OPENIB_PP_QP,
MCA_BTL_OPENIB_SRQ_QP,
MCA_BTL_OPENIB_XRC_QP
@@ -254,6 +261,8 @@
ompi_free_list_t recv_user_free;
/**< frags for coalesced massages */
ompi_free_list_t send_free_coalesced;
+ /**< Default receive queues */
+ char* default_recv_qps;
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
@@ -272,6 +281,12 @@
uint16_t apm_lid;
/** The MTU used by this port */
uint8_t mtu;
+ /** vendor id define device type and tuning */
+ uint32_t vendor_id;
+ /** vendor part id define device type and tuning */
+ uint32_t vendor_part_id;
+ /** Transport type of remote port */
+ uint8_t transport_type;
/** Dummy field used to calculate the real length */
uint8_t end;
} mca_btl_openib_modex_message_t;
@@ -633,6 +648,18 @@
int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
+/**
+ * Get a transport name of btl by its transport type.
+ */
+
+const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
+
+/**
+ * Get a transport type of btl.
+ */
+
+mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl);
+
static inline int qp_cq_prio(const int qp)
{
if(0 == qp)
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib_component.c
--- a/ompi/mca/btl/openib/btl_openib_component.c Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_component.c Mon Nov 16 17:41:48 2009 +0200
@@ -143,6 +143,7 @@
OBJ_CONSTRUCT(&mca_btl_openib_component.devices, opal_pointer_array_t);
mca_btl_openib_component.devices_count = 0;
mca_btl_openib_component.cpc_explicitly_defined = false;
+ mca_btl_openib_component.default_recv_qps = NULL;
/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
@@ -196,6 +197,10 @@
free(mca_btl_openib_component.receive_queues);
}
+ if (NULL != mca_btl_openib_component.default_recv_qps) {
+ free(mca_btl_openib_component.default_recv_qps);
+ }
+
return rc;
}
@@ -303,6 +308,16 @@
/* Pack the modex common message struct. */
size = modex_message_size;
+
+ (mca_btl_openib_component.openib_btls[i]->port_info).vendor_id =
+ (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_id;
+
+ (mca_btl_openib_component.openib_btls[i]->port_info).vendor_part_id =
+ (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_part_id;
+
+ (mca_btl_openib_component.openib_btls[i]->port_info).transport_type =
+ mca_btl_openib_get_transport_type(mca_btl_openib_component.openib_btls[i]);
+
memcpy(offset,
&(mca_btl_openib_component.openib_btls[i]->port_info),
size);
@@ -1657,45 +1672,6 @@
ibv_destroy_cq(cq);
}
- /* If the user specified btl_openib_receive_queues MCA param, it
- overrides all device INI params */
- if (BTL_OPENIB_RQ_SOURCE_MCA !=
- mca_btl_openib_component.receive_queues_source &&
- NULL != values.receive_queues) {
- /* If a prior device's INI values set a different value for
- receive_queues, this is unsupported (see
- https://svn.open-mpi.org/trac/ompi/ticket/1285) */
- if (BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
- mca_btl_openib_component.receive_queues_source) {
- if (0 != strcmp(values.receive_queues,
- mca_btl_openib_component.receive_queues)) {
- orte_show_help("help-mpi-btl-openib.txt",
- "conflicting receive_queues", true,
- orte_process_info.nodename,
- ibv_get_device_name(device->ib_dev),
- device->ib_dev_attr.vendor_id,
- device->ib_dev_attr.vendor_part_id,
- values.receive_queues,
- ibv_get_device_name(receive_queues_device->ib_dev),
- receive_queues_device->ib_dev_attr.vendor_id,
- receive_queues_device->ib_dev_attr.vendor_part_id,
- mca_btl_openib_component.receive_queues,
- opal_install_dirs.pkgdatadir);
- ret = OMPI_ERR_RESOURCE_BUSY;
- goto error;
- }
- } else {
- if (NULL != mca_btl_openib_component.receive_queues) {
- free(mca_btl_openib_component.receive_queues);
- }
- receive_queues_device = device;
- mca_btl_openib_component.receive_queues =
- strdup(values.receive_queues);
- mca_btl_openib_component.receive_queues_source =
- BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
- }
- }
-
/* Should we use RDMA for short / eager messages? First check MCA
param, then check INI file values. */
if (mca_btl_openib_component.use_eager_rdma >= 0) {
@@ -1795,6 +1771,45 @@
"apm not enough ports", true);
mca_btl_openib_component.apm_ports = 0;
}
+
+ /* If the user specified btl_openib_receive_queues MCA param, it
+ overrides all device INI params */
+ if (BTL_OPENIB_RQ_SOURCE_MCA !=
+ mca_btl_openib_component.receive_queues_source &&
+ NULL != values.receive_queues) {
+ /* If a prior device's INI values set a different value for
+ receive_queues, this is unsupported (see
+ https://svn.open-mpi.org/trac/ompi/ticket/1285) */
+ if (BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
+ mca_btl_openib_component.receive_queues_source) {
+ if (0 != strcmp(values.receive_queues,
+ mca_btl_openib_component.receive_queues)) {
+ orte_show_help("help-mpi-btl-openib.txt",
+ "conflicting receive_queues", true,
+ orte_process_info.nodename,
+ ibv_get_device_name(device->ib_dev),
+ device->ib_dev_attr.vendor_id,
+ device->ib_dev_attr.vendor_part_id,
+ values.receive_queues,
+ ibv_get_device_name(receive_queues_device->ib_dev),
+ receive_queues_device->ib_dev_attr.vendor_id,
+ receive_queues_device->ib_dev_attr.vendor_part_id,
+ mca_btl_openib_component.receive_queues,
+ opal_install_dirs.pkgdatadir);
+ ret = OMPI_ERR_RESOURCE_BUSY;
+ goto error;
+ }
+ } else {
+ if (NULL != mca_btl_openib_component.receive_queues) {
+ free(mca_btl_openib_component.receive_queues);
+ }
+ receive_queues_device = device;
+ mca_btl_openib_component.receive_queues =
+ strdup(values.receive_queues);
+ mca_btl_openib_component.receive_queues_source =
+ BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
+ }
+ }
return OMPI_SUCCESS;
}
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib_endpoint.c
--- a/ompi/mca/btl/openib/btl_openib_endpoint.c Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_endpoint.c Mon Nov 16 17:41:48 2009 +0200
@@ -310,6 +310,11 @@
ep->rem_info.rem_subnet_id,
ep->rem_info.rem_mtu);
+ ep->rem_info.rem_vendor_id = (remote_proc_info->pm_port_info).vendor_id;
+ ep->rem_info.rem_vendor_part_id = (remote_proc_info->pm_port_info).vendor_part_id;
+
+ ep->rem_info.rem_transport_type = (remote_proc_info->pm_port_info).transport_type;
+
for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
endpoint_init_qp(ep, qp);
}
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib_endpoint.h
--- a/ompi/mca/btl/openib/btl_openib_endpoint.h Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_endpoint.h Mon Nov 16 17:41:48 2009 +0200
@@ -94,6 +94,12 @@
mca_btl_openib_rem_qp_info_t *rem_qps;
/* Remote xrc_srq info, used only with XRC connections */
mca_btl_openib_rem_srq_info_t *rem_srqs;
+ /* Vendor id of remote HCA */
+ uint32_t rem_vendor_id;
+ /* Vendor part id of remote HCA */
+ uint32_t rem_vendor_part_id;
+ /* Transport type of remote port */
+ mca_btl_openib_transport_type_t rem_transport_type;
} mca_btl_openib_rem_info_t;
diff -r 521e5f4b161a ompi/mca/btl/openib/btl_openib_mca.c
--- a/ompi/mca/btl/openib/btl_openib_mca.c Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_mca.c Mon Nov 16 17:41:48 2009 +0200
@@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
@@ -526,6 +526,13 @@
mid_qp_size,
(uint32_t)mca_btl_openib_module.super.btl_eager_limit,
(uint32_t)mca_btl_openib_module.super.btl_max_send_size);
+
+ mca_btl_openib_component.default_recv_qps = strdup(default_qps);
+ if(NULL == mca_btl_openib_component.default_recv_qps) {
+ BTL_ERROR(("Unable to allocate memory for default receive queues string.\n"));
+ return OMPI_ERROR;
+ }
+
CHECK(reg_string("receive_queues", NULL,
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
default_qps, &mca_btl_openib_component.receive_queues,
diff -r 521e5f4b161a ompi/mca/btl/openib/help-mpi-btl-openib.txt
--- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt Fri Nov 06 12:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt Mon Nov 16 17:41:48 2009 +0200
@@ -11,7 +11,7 @@
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
-# Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
+# Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
# $COPYRIGHT$
#
@@ -590,3 +590,28 @@
Local host: %s
Value: %s
Message: %s
+#
+[unsupported queues configuration]
+The remote and local queues were automatically configured for different
+devices, and as a result Open MPI failed to find an optimal configuration.
+Please use MCA parameters to define the OpenFabrics queue configuration.
+
+ Local host: %s
+ Local adapter: %s (vendor 0x%x, part ID %d)
+ Local queues: %s
+
+ Remote host: %s
+ Remote adapter: remote adapter (vendor 0x%x, part ID %d)
+ Remote queues: %s
+#
+[conflicting transport types]
+Open MPI detected two different OpenFabrics transport types in the same InfiniBand network.
+Such a mixed network transport configuration is not supported by Open MPI.
+
+ Local host: %s
+ Local adapter: %s (vendor 0x%x, part ID %d)
+ Local transport type: %s
+
+ Remote host: %s
+ Remote Adapter: remote adapter (vendor 0x%x, part ID %d)
+ Remote transport type: %s
|