Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] Adding support for RDMAoE devices.
From: Vasily Philipov (vasily_at_[hidden])
Date: 2009-11-01 06:44:24


The attached patch adds support for RDMAoE (RDMA over Ethernet) devices
to the openib BTL. The code changes are minimal: we only modified the
RDMACM code to provide better support for IB and RDMAoE devices.
Please let me know if you have any comments.

Regards, Vasily.

diff -r 9aad663adc9f ompi/config/ompi_check_openib.m4
--- a/ompi/config/ompi_check_openib.m4 Sun Oct 25 08:29:01 2009 -0700
+++ b/ompi/config/ompi_check_openib.m4 Sun Nov 01 12:17:03 2009 +0200
@@ -13,7 +13,7 @@
 # Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
 # Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
 # reserved.
-# Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
+# Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -204,6 +204,21 @@
                        [$1_have_ibcm=1
                        $1_LIBS="-libcm $$1_LIBS"])])
            fi
+
+ # Check support for RDMAoE devices
+ $1_have_rdmaoe=0
+ AC_CHECK_DECLS([RDMA_TRANSPORT_RDMAOE],
+ [$1_have_rdmaoe=1], [],
+ [#include <infiniband/verbs.h>])
+
+ AC_MSG_CHECKING([if RDMAoE support is enabled])
+ if test "1" = "$$1_have_rdmaoe"; then
+ AC_DEFINE_UNQUOTED([OMPI_HAVE_RDMAOE], [$$1_have_rdmaoe], [Enable RDMAoE support])
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ fi
+
           ])
 
     # Check to see if <infiniband/driver.h> works. It is known to
diff -r 9aad663adc9f ompi/mca/btl/openib/btl_openib.c
--- a/ompi/mca/btl/openib/btl_openib.c Sun Oct 25 08:29:01 2009 -0700
+++ b/ompi/mca/btl/openib/btl_openib.c Sun Nov 01 12:17:03 2009 +0200
@@ -354,6 +354,13 @@
         }
 #endif
 
+#ifdef OMPI_HAVE_RDMAOE
+ if(RDMA_TRANSPORT_RDMAOE == (openib_btl->ib_port_attr.transport) &&
+ OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
+ continue;
+ }
+#endif
+
         if(NULL == (ib_proc = mca_btl_openib_proc_create(ompi_proc))) {
             return OMPI_ERR_OUT_OF_RESOURCE;
         }
diff -r 9aad663adc9f ompi/mca/btl/openib/btl_openib_endpoint.c
--- a/ompi/mca/btl/openib/btl_openib_endpoint.c Sun Oct 25 08:29:01 2009 -0700
+++ b/ompi/mca/btl/openib/btl_openib_endpoint.c Sun Nov 01 12:17:03 2009 +0200
@@ -556,7 +556,6 @@
 {
     /* If the CPC uses the CTS protocol, then start it up */
     if (endpoint->endpoint_local_cpc->cbm_uses_cts) {
- int transport_type_ib_p = 0;
         /* Post our receives, which will make credit management happy
            (i.e., rd_credits will be 0) */
         if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(endpoint)) {
@@ -572,16 +571,13 @@
            receives this side's CTS). Also send the CTS if we already
            received the peer's CTS (e.g., if this process was slow to
            call cpc_complete(). */
-#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
- transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type);
-#endif
+
         OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiatior %d, cts received: %d",
                      endpoint->endpoint_proc->proc_ompi->proc_hostname,
                      transport_type_ib_p,
                      endpoint->endpoint_initiator,
                      endpoint->endpoint_cts_received));
- if (transport_type_ib_p ||
- endpoint->endpoint_initiator ||
+ if (endpoint->endpoint_initiator ||
             endpoint->endpoint_cts_received) {
             mca_btl_openib_endpoint_send_cts(endpoint);
 
diff -r 9aad663adc9f ompi/mca/btl/openib/connect/base.h
--- a/ompi/mca/btl/openib/connect/base.h Sun Oct 25 08:29:01 2009 -0700
+++ b/ompi/mca/btl/openib/connect/base.h Sun Nov 01 12:17:03 2009 +0200
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
- *
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -13,6 +13,17 @@
 
 #include "connect/connect.h"
 
+#ifdef OMPI_HAVE_RDMAOE
+/* Nonzero if the device transport is not IB or the port transport is RDMAoE */
+#define BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl) \
+ ((IBV_TRANSPORT_IB != ((btl)->device->ib_dev->transport_type)) || \
+ (RDMA_TRANSPORT_RDMAOE == ((btl)->ib_port_attr.transport)))
+#else
+/* No RDMAoE support compiled in: only the device transport type is checked */
+#define BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl) \
+ (IBV_TRANSPORT_IB != ((btl)->device->ib_dev->transport_type))
+#endif
+
 BEGIN_C_DECLS
 
 /*
diff -r 9aad663adc9f ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c
--- a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c Sun Oct 25 08:29:01 2009 -0700
+++ b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c Sun Nov 01 12:17:03 2009 +0200
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
  *
  * $COPYRIGHT$
  *
@@ -653,7 +653,7 @@
        we're in an old version of OFED that is IB only (i.e., no
        iWarp), so we can safely assume that we can use this CPC. */
 #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
- if (IBV_TRANSPORT_IB != btl->device->ib_dev->transport_type) {
+ if (BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl)) {
         BTL_VERBOSE(("ibcm CPC only supported on InfiniBand; skipped on %s:%d",
                      ibv_get_device_name(btl->device->ib_dev),
                      openib_btl->port_num));
diff -r 9aad663adc9f ompi/mca/btl/openib/connect/btl_openib_connect_oob.c
--- a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c Sun Oct 25 08:29:01 2009 -0700
+++ b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c Sun Nov 01 12:17:03 2009 +0200
@@ -12,7 +12,7 @@
  * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2006 Los Alamos National Security, LLC. All rights
  * reserved.
- * Copyright (c) 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
  *
  * $COPYRIGHT$
  *
@@ -120,7 +120,7 @@
        transport_type member, then we must be < OFED v1.2, and
        therefore we must be IB. */
 #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
- if (IBV_TRANSPORT_IB != btl->device->ib_dev->transport_type) {
+ if (BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl)) {
         opal_output_verbose(5, mca_btl_base_output,
                             "openib BTL: oob CPC only supported on InfiniBand; skipped on %s:%d",
                             ibv_get_device_name(btl->device->ib_dev),
diff -r 9aad663adc9f ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c
--- a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c Sun Oct 25 08:29:01 2009 -0700
+++ b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c Sun Nov 01 12:17:03 2009 +0200
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2007-2008 Chelsio, Inc. All rights reserved.
- * Copyright (c) 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
  *
  * $COPYRIGHT$
@@ -857,34 +857,38 @@
         goto out;
     }
 
- /* Post a single receive buffer on the smallest QP for the CTS
- protocol */
- if (mca_btl_openib_component.credits_qp == qpnum) {
- struct ibv_recv_wr *bad_wr, *wr;
+ if(endpoint->endpoint_local_cpc->cbm_uses_cts) {
+ /* Post a single receive buffer on the smallest QP for the CTS
+ protocol */
+ if (mca_btl_openib_component.credits_qp == qpnum) {
+ struct ibv_recv_wr *bad_wr, *wr;
 
- if (OMPI_SUCCESS !=
- ompi_btl_openib_connect_base_alloc_cts(endpoint)) {
- BTL_ERROR(("Failed to alloc CTS frag"));
- goto out1;
+ if (OMPI_SUCCESS !=
+ ompi_btl_openib_connect_base_alloc_cts(endpoint)) {
+ BTL_ERROR(("Failed to alloc CTS frag"));
+ goto out1;
+ }
+ wr = &(endpoint->endpoint_cts_frag.rd_desc);
+ assert(NULL != wr);
+ wr->next = NULL;
+
+ if (0 != ibv_post_recv(endpoint->qps[qpnum].qp->lcl_qp,
+ wr, &bad_wr)) {
+ BTL_ERROR(("failed to post CTS recv buffer"));
+ goto out1;
+ }
+ OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d",
+ (void*) wr->sg_list[0].addr,
+ endpoint->endpoint_proc->proc_ompi->proc_hostname,
+ qpnum,
+ endpoint->qps[qpnum].qp->lcl_qp->qp_num,
+ (void*) wr->wr_id,
+ (void*) wr->sg_list[0].addr,
+ wr->sg_list[0].length,
+ wr->sg_list[0].lkey));
         }
- wr = &(endpoint->endpoint_cts_frag.rd_desc);
- assert(NULL != wr);
- wr->next = NULL;
-
- if (0 != ibv_post_recv(endpoint->qps[qpnum].qp->lcl_qp,
- wr, &bad_wr)) {
- BTL_ERROR(("failed to post CTS recv buffer"));
- goto out1;
- }
- OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d",
- (void*) wr->sg_list[0].addr,
- endpoint->endpoint_proc->proc_ompi->proc_hostname,
- qpnum,
- endpoint->qps[qpnum].qp->lcl_qp->qp_num,
- (void*) wr->wr_id,
- (void*) wr->sg_list[0].addr,
- wr->sg_list[0].length,
- wr->sg_list[0].lkey));
+ } else { /* NOT IWARP */
+ mca_btl_openib_endpoint_post_recvs(endpoint);
     }
 
     /* Since the event id is already created (since we're the server),
@@ -1327,27 +1331,31 @@
             goto out;
         }
 
- if (mca_btl_openib_component.credits_qp == context->qpnum) {
- /* Post a single receive buffer on the smallest QP for the CTS
- protocol */
-
- struct ibv_recv_wr *bad_wr, *wr;
- assert(NULL != contents->endpoint->endpoint_cts_frag.super.super.base.super.ptr);
- wr = &(contents->endpoint->endpoint_cts_frag.rd_desc);
- assert(NULL != wr);
- wr->next = NULL;
-
- if (0 != ibv_post_recv(contents->endpoint->qps[context->qpnum].qp->lcl_qp,
- wr, &bad_wr)) {
- BTL_ERROR(("failed to post CTS recv buffer"));
- goto out1;
+ if (contents->endpoint->endpoint_local_cpc->cbm_uses_cts) {
+ if (mca_btl_openib_component.credits_qp == context->qpnum) {
+ /* Post a single receive buffer on the smallest QP for the CTS
+ protocol */
+
+ struct ibv_recv_wr *bad_wr, *wr;
+ assert(NULL != contents->endpoint->endpoint_cts_frag.super.super.base.super.ptr);
+ wr = &(contents->endpoint->endpoint_cts_frag.rd_desc);
+ assert(NULL != wr);
+ wr->next = NULL;
+
+ if (0 != ibv_post_recv(contents->endpoint->qps[context->qpnum].qp->lcl_qp,
+ wr, &bad_wr)) {
+ BTL_ERROR(("failed to post CTS recv buffer"));
+ goto out1;
+ }
+ OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)",
+ (void*) wr->sg_list[0].addr,
+ wr->sg_list[0].length,
+ contents->endpoint->endpoint_proc->proc_ompi->proc_hostname,
+ context->qpnum,
+ contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num));
             }
- OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)",
- (void*) wr->sg_list[0].addr,
- wr->sg_list[0].length,
- contents->endpoint->endpoint_proc->proc_ompi->proc_hostname,
- context->qpnum,
- contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num));
+ } else { /* NOT IWARP */
+ mca_btl_openib_endpoint_post_recvs(contents->endpoint);
         }
     } else {
         /* If we are establishing a connection in the "wrong" direction,
@@ -1809,7 +1817,12 @@
     (*cpc)->cbm_finalize = NULL;
     /* Setting uses_cts=true also guarantees that we'll only be
        selected if QP 0 is PP */
- (*cpc)->cbm_uses_cts = true;
+
+ if(IBV_TRANSPORT_IWARP == (openib_btl->device->ib_dev->transport_type)) {
+ (*cpc)->cbm_uses_cts = true;
+ } else {
+ (*cpc)->cbm_uses_cts = false;
+ }
 
     server = OBJ_NEW(rdmacm_contents_t);
     if (NULL == server) {