Hi,
We would like to add SDP support to Open MPI.
SDP (Sockets Direct Protocol) is a byte-stream transport
protocol that implements TCP SOCK_STREAM semantics while utilizing the
transport offloading capabilities of the InfiniBand fabric
(http://www.mellanox.com/pdf/whitepapers/SDP_Whitepaper.pdf,
http://www.openfabrics.org/archives/aug2005datacenter/das_SDP_Linux.pdf).
SDP can be used to accelerate job start (OOB over SDP) and
IPoIB performance.
The main idea is to use AF_INET_SDP protocol family instead
of AF_INET and AF_INET6 when opening sockets.
SDP will be used in the OOB and the BTL when the appropriate MCA
parameters are set:
-mca btl_tcp_sdp_enable 1
-mca oob_tcp_sdp_enable 1
Since not all functions support this family, the changes
were made only in critical sections of the code.
Since SDP support is relevant only for InfiniBand fabrics,
you need to configure SDP support with the --enable-sdp flag. SDP will be
disabled by default.
./configure --enable-sdp
Test results of running bandwidth (BW) and latency (LT) tests of SDP on 2
DDR nodes:

BW size   VERBS     IPoIB connected   IPoIB datagram   btl SDP
1000000   1507.68   665.70            425.21           1272.37

LT size   VERBS     IPoIB connected   IPoIB datagram   btl SDP
5         3.82      28.83             28.24            25.73
Index:
opal/include/opal_config_bottom.h
===================================================================
---
opal/include/opal_config_bottom.h (revision 17027)
+++
opal/include/opal_config_bottom.h (working copy)
@@ -509,7 +509,15 @@
#if !HAVE_DECL_PF_INET6
#define PF_INET6 PF_UNSPEC
#endif
+#if !HAVE_DECL_AF_INET_SDP
+#define AF_INET_SDP 27
+#endif
+#if OPAL_ENABLE_SDP
+#define OPAL_WANT_SDP 1
+#else
+#define OPAL_WANT_SDP 0
+#endif
#if defined(__APPLE__)
&& defined(HAVE_INTTYPES_H)
/* Prior to Mac OS X 10.3,
the length modifier "ll" wasn't
supported, but
"q" was for long long. This isn't ANSI
Index: configure.ac
===================================================================
--- configure.ac (revision
17027)
+++ configure.ac (working
copy)
@@ -674,7 +674,7 @@
#include
<netinet/in.h>
#endif])
-AC_CHECK_DECLS([AF_UNSPEC,
PF_UNSPEC, AF_INET6, PF_INET6],
+AC_CHECK_DECLS([AF_UNSPEC,
PF_UNSPEC, AF_INET6, PF_INET6, AF_INET_SDP],
[], [],
[AC_INCLUDES_DEFAULT
#if HAVE_SYS_SOCKET_H
#include
<sys/socket.h>
Index:
ompi/mca/btl/tcp/btl_tcp_component.c
===================================================================
---
ompi/mca/btl/tcp/btl_tcp_component.c (revision 17027)
+++
ompi/mca/btl/tcp/btl_tcp_component.c (working copy)
@@ -263,6 +263,10 @@
mca_btl_tcp_component.tcp_disable_family =
mca_btl_tcp_param_register_int ("disable_family", NULL, 0);
+#if OPAL_WANT_SDP
+
mca_btl_tcp_component.sdp_enable =
+
mca_btl_tcp_param_register_int("sdp_enable", "Enable SDP for TCP
connections", 0);
+#endif
return OMPI_SUCCESS;
}
@@ -527,6 +531,11 @@
memset (&hints,
0, sizeof(hints));
hints.ai_family =
af_family;
+#if OPAL_WANT_SDP
+ if (
mca_btl_tcp_component.sdp_enable ) {
+ hints.ai_family =
AF_INET6;
+ }
+#endif
hints.ai_socktype =
SOCK_STREAM;
hints.ai_flags =
AI_PASSIVE;
@@ -555,7 +564,7 @@
#endif /* IPV6_V6ONLY */
}
#else
- ((struct sockaddr_in*)
&inaddr)->sin_family = AF_INET;
+ ((struct sockaddr_in*)
&inaddr)->sin_family = af_family;
((struct sockaddr_in*)
&inaddr)->sin_addr.s_addr = INADDR_ANY;
addrlen = sizeof(struct
sockaddr_in);
#endif
@@ -600,7 +609,11 @@
}
goto
socket_binded;
}
- if( AF_INET ==
af_family ) {
+#if OPAL_WANT_SDP
+ if( AF_INET ==
af_family|| AF_INET_SDP == af_family) {
+#else
+ if( AF_INET ==
af_family) {
+#endif
BTL_ERROR(("bind() failed: no port available in the range [%d..%d]",
mca_btl_tcp_component.tcp_port_min,
mca_btl_tcp_component.tcp_port_min + range));
@@ -624,7 +637,11 @@
return OMPI_ERROR;
}
- if (AF_INET ==
af_family) {
+#if OPAL_WANT_SDP
+ if( AF_INET ==
af_family|| AF_INET_SDP == af_family) {
+#else
+ if( AF_INET ==
af_family) {
+#endif
mca_btl_tcp_component.tcp_listen_port = ((struct sockaddr_in*)
&inaddr)->sin_port;
mca_btl_tcp_component.tcp_listen_sd
= sd;
}
@@ -660,7 +677,11 @@
}
/* register listen port
*/
- if (AF_INET ==
af_family) {
+#if OPAL_WANT_SDP
+ if( AF_INET ==
af_family|| AF_INET_SDP == af_family) {
+#else
+ if( AF_INET ==
af_family) {
+#endif
opal_event_set(
&mca_btl_tcp_component.tcp_recv_event,
mca_btl_tcp_component.tcp_listen_sd,
OPAL_EV_READ|OPAL_EV_PERSIST,
@@ -822,6 +843,12 @@
}
/* create a TCP listen
socket for incoming connection attempts */
+#if OPAL_WANT_SDP
+ if
(mca_btl_tcp_component.sdp_enable) {
+ if(OMPI_SUCCESS !=
(ret = mca_btl_tcp_component_create_listen(AF_INET_SDP) )) {
+ return 0;
+ }
+ } else {
if(OMPI_SUCCESS != (ret
= mca_btl_tcp_component_create_listen(AF_INET) )) {
return 0;
}
@@ -833,7 +860,21 @@
}
}
#endif
+ }
+#else
+ if(OMPI_SUCCESS != (ret
= mca_btl_tcp_component_create_listen(AF_INET) )) {
+ return 0;
+ }
+#if OPAL_WANT_IPV6
+ if((ret =
mca_btl_tcp_component_create_listen(AF_INET6)) != OMPI_SUCCESS) {
+ if
(!(OMPI_ERR_IN_ERRNO == ret && EAFNOSUPPORT == opal_socket_errno)) {
+ opal_output (0,
"mca_btl_tcp_component: IPv6 listening socket failed\n");
+ return 0;
+ }
+ }
+#endif
+#endif
/* publish TCP
parameters with the MCA framework */
if(OMPI_SUCCESS != (ret
= mca_btl_tcp_component_exchange() )) {
return 0;
Index:
ompi/mca/btl/tcp/btl_tcp_endpoint.c
===================================================================
---
ompi/mca/btl/tcp/btl_tcp_endpoint.c (revision 17027)
+++
ompi/mca/btl/tcp/btl_tcp_endpoint.c (working copy)
@@ -535,7 +535,14 @@
addrlen = sizeof
(struct sockaddr_in6);
}
#endif
-
+
+#if OPAL_WANT_SDP
+ if (
mca_btl_tcp_component.sdp_enable){
+ af_family =
AF_INET_SDP;
+ addrlen =
sizeof(struct sockaddr_in);
+ }
+#endif
+
btl_endpoint->endpoint_sd = socket(af_family, SOCK_STREAM, 0);
if
(btl_endpoint->endpoint_sd < 0) {
btl_endpoint->endpoint_retries++;
Index:
ompi/mca/btl/tcp/btl_tcp.h
===================================================================
---
ompi/mca/btl/tcp/btl_tcp.h (revision 17027)
+++ ompi/mca/btl/tcp/btl_tcp.h (working
copy)
@@ -90,6 +90,9 @@
int
tcp_sndbuf; /**< socket sndbuf size */
int
tcp_rcvbuf; /**< socket rcvbuf size */
int
tcp_disable_family; /**< disabled AF_family */
+#if OPAL_WANT_SDP
+ int
sdp_enable; /**< enable SDP */
+#endif /* OPAL_WANT_SDP */
/* free list of
fragment descriptors */
ompi_free_list_t
tcp_frag_eager;
Index:
config/ompi_configure_options.m4
===================================================================
---
config/ompi_configure_options.m4 (revision 17027)
+++
config/ompi_configure_options.m4 (working copy)
@@ -683,6 +683,23 @@
[Enable
IPv6 support, but only if the underlying system supports it])
#
+# Do we want to enable SDP
support?
+#
+AC_MSG_CHECKING([if want
SDP support])
+AC_ARG_ENABLE([sdp],
+
[AC_HELP_STRING([--enable-sdp],
+ [Enable SDP support
(default: disabled)])])
+if test
"$enable_sdp" = "yes"; then
+ AC_MSG_RESULT([yes])
+ opal_want_sdp=1
+else
+ AC_MSG_RESULT([no])
+ opal_want_sdp=0
+fi
+AC_DEFINE_UNQUOTED([OPAL_ENABLE_SDP],
[$opal_want_sdp],
+ [Enable
SDP support])
+
+#
# Do we want orterun's
--prefix behavior to be enabled by default?
#
AC_MSG_CHECKING([if want
orterun "--prefix" behavior to be enabled by default])
Index:
orte/mca/oob/tcp/oob_tcp_peer.c
===================================================================
---
orte/mca/oob/tcp/oob_tcp_peer.c (revision 17027)
+++
orte/mca/oob/tcp/oob_tcp_peer.c (working copy)
@@ -371,7 +371,15 @@
opal_net_get_port((struct sockaddr*) &inaddr));
}
- rc =
mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);
+#if OPAL_WANT_SDP
+ if
(mca_oob_tcp_component.sdp_enable) {
+ rc =
mca_oob_tcp_peer_create_socket(peer, AF_INET_SDP);
+ } else {
+ rc =
mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);
+ }
+#else
+ rc =
mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);
+#endif
if (ORTE_SUCCESS !=
rc) {
struct timeval
tv = { 1,0 };
opal_evtimer_add(&peer->peer_timer_event, &tv);
Index:
orte/mca/oob/tcp/oob_tcp.c
===================================================================
---
orte/mca/oob/tcp/oob_tcp.c (revision 17027)
+++
orte/mca/oob/tcp/oob_tcp.c (working copy)
@@ -380,6 +380,13 @@
mca_oob_tcp_component.tcp6_listen_sd = -1;
#endif /* OPAL_WANT_IPV6
*/
+#if OPAL_WANT_SDP
+
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
+
"sdp_enable","Enable SDP for TCP connections",
+
false, false,
+
0,
+
&mca_oob_tcp_component.sdp_enable);
+#endif
/* initialize state */
mca_oob_tcp_component.tcp_shutdown = false;
mca_oob_tcp_component.tcp_listen_sd = -1;
@@ -514,7 +521,7 @@
int error;
memset(&hints,
0, sizeof(hints));
- hints.ai_family =
af_family;
+ hints.ai_family =
AF_INET6;
hints.ai_socktype =
SOCK_STREAM;
hints.ai_flags =
AI_PASSIVE;
@@ -542,9 +549,6 @@
}
#endif /* IPV6_V6ONLY */
#else
- if (AF_INET !=
af_family) {
- return ORTE_ERROR;
- }
((struct sockaddr_in*)
&inaddr)->sin_family = af_family;
((struct sockaddr_in*)
&inaddr)->sin_addr.s_addr = INADDR_ANY;
addrlen = sizeof(struct
sockaddr_in);
@@ -590,7 +594,11 @@
}
goto
socket_binded;
}
- if( AF_INET ==
af_family ) {
+#if OPAL_WANT_SDP
+ if( AF_INET ==
af_family || AF_INET_SDP == af_family) {
+#else
+ if( AF_INET ==
af_family) {
+#endif
opal_output( 0,
"bind() failed: no port available in the range [%d..%d]",
mca_oob_tcp_component.tcp_port_min,
mca_oob_tcp_component.tcp_port_min + range);
@@ -614,7 +622,11 @@
return ORTE_ERROR;
}
- if (AF_INET ==
af_family) {
+#if OPAL_WANT_SDP
+ if( AF_INET ==
af_family || AF_INET_SDP == af_family) {
+#else
+ if( AF_INET ==
af_family) {
+#endif
mca_oob_tcp_component.tcp_listen_port = ((struct sockaddr_in*)
&inaddr)->sin_port;
mca_oob_tcp_component.tcp_listen_sd = *target_sd;
}
@@ -647,7 +659,11 @@
}
/* register listen port
*/
- if (AF_INET ==
af_family) {
+#if OPAL_WANT_SDP
+ if( AF_INET ==
af_family || AF_INET_SDP == af_family) {
+#else
+ if( AF_INET ==
af_family) {
+#endif
opal_event_set(&mca_oob_tcp_component.tcp_recv_event,
*target_sd,
OPAL_EV_READ|OPAL_EV_PERSIST,
@@ -822,6 +838,7 @@
int flags;
/* create a listen
socket for incoming connections */
+ /* FIXME add support
for SDP */
mca_oob_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0);
if(mca_oob_tcp_component.tcp_listen_sd < 0) {
opal_output(0,"mca_oob_tcp_component_init: socket() failed: %s (%d)",
@@ -1194,26 +1211,41 @@
but can't do
that since we weren't the HNP. */
mca_oob_tcp_component.tcp_listen_type = OOB_TCP_EVENT;
- rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp_listen_sd,
-
AF_INET);
- if (ORTE_SUCCESS !=
rc &&
- (EAFNOSUPPORT
!= opal_socket_errno ||
-
mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT)) {
- opal_output(0,
-
"mca_oob_tcp_init: unable to create IPv4 listen socket: %s\n",
+#if OPAL_WANT_SDP
+ if (
mca_oob_tcp_component.sdp_enable){
+ rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp_listen_sd,
+
AF_INET_SDP);
+ if (ORTE_SUCCESS
!= rc &&
+ (EAFNOSUPPORT
!= opal_socket_errno ||
+
mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT)) {
+
opal_output(0,
+
"mca_oob_tcp_init: unable to create SDP listen socket: %s\n",
opal_strerror(rc));
- }
+ }
+ } else
+#endif
+ {
+ rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp_listen_sd,
+
AF_INET);
+ if (ORTE_SUCCESS !=
rc &&
+ (EAFNOSUPPORT
!= opal_socket_errno ||
+
mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT)) {
+
opal_output(0,
+
"mca_oob_tcp_init: unable to create IPv4 listen socket: %s\n",
+
opal_strerror(rc));
+ }
#if OPAL_WANT_IPV6
- rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp6_listen_sd,
-
AF_INET6);
- if (ORTE_SUCCESS !=
rc &&
- (EAFNOSUPPORT
!= opal_socket_errno ||
-
mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT)) {
- opal_output(0,
-
"mca_oob_tcp_init: unable to create IPv6 listen socket: %s\n",
- opal_strerror(rc));
- }
+ rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp6_listen_sd,
+
AF_INET6);
+ if (ORTE_SUCCESS
!= rc &&
+ (EAFNOSUPPORT
!= opal_socket_errno ||
+
mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT)) {
+
opal_output(0,
+
"mca_oob_tcp_init: unable to create IPv6 listen socket: %s\n",
+ opal_strerror(rc));
+ }
#endif
+ }
if
(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
opal_output(0,
"%s accepting connections via event library",
ORTE_NAME_PRINT(orte_process_info.my_name));
Index:
orte/mca/oob/tcp/oob_tcp.h
===================================================================
---
orte/mca/oob/tcp/oob_tcp.h (revision 17027)
+++
orte/mca/oob/tcp/oob_tcp.h (working copy)
@@ -217,6 +217,9 @@
int tcp6_port_min;
/**< Minimum allowed port for the OOB listen socket */
int
tcp6_port_range; /**< Range of allowed TCP ports */
#endif /* OPAL_WANT_IPV6
*/
+#if OPAL_WANT_SDP
+ int
sdp_enable; /**< support for SDP */
+#endif /* OPAL_WANT_SDP */
opal_mutex_t
tcp_lock; /**< lock for accessing module state */
opal_list_t
tcp_events; /**< list of pending events (accepts) */
opal_list_t tcp_msg_post;
/**< list of recieves user has posted */
Thanks,
Verkhovsky Lenny.