Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] SDP support for OPEN-MPI
From: Brian Barrett (brbarret_at_[hidden])
Date: 2007-12-31 16:54:18


Since I used to be the OOB guy, I wanted to throw my $0.02 out
there. I think this is the right approach for adding such support.
I haven't tested it, but if it works, I see no reason not to commit.

Brian

On Dec 31, 2007, at 1:41 AM, Lenny Verkhovsky wrote:

>
>
> Hi,
>
> We would like to add SDP support for OPENMPI.
>
>
>
> SDP (Sockets Direct Protocol) is a byte-stream transport protocol
> implementing the TCP SOCK_STREAM semantics while utilizing the
> transport-offloading capabilities of the InfiniBand fabric
>
> (http://www.mellanox.com/pdf/whitepapers/SDP_Whitepaper.pdf,
> http://www.openfabrics.org/archives/aug2005datacenter/das_SDP_Linux.pdf).
>
>
>
> SDP can be used to accelerate job start (OOB over SDP) and IPoIB
> performance.
>
>
>
> The main idea is to use AF_INET_SDP protocol family instead of
> AF_INET and AF_INET6 when opening sockets.
>
> SDP will be used in OOB and BTL with the appropriate MCA parameters:
>
> -mca btl_tcp_sdp_enable 1
>
> -mca oob_tcp_sdp_enable 1
>
>
>
> Since not all functions support this family, the changes were made
> only in critical sections of the code.
>
>
>
> Since SDP support is relevant only for InfiniBand fabrics, you need
> to configure SDP support with the --enable-sdp flag. SDP will be
> disabled by default.
>
> ./configure --enable-sdp
>
>
>
> Test results of running bandwidth and latency of SDP on 2 DDR nodes.
>
>
>
> Bandwidth (BW):
>
> size      VERBS     IPoIB connected   IPoIB datagram   btl SDP
> 1000000   1507.68   665.70            425.21           1272.37
>
>
>
> Latency (LT):
>
> size      VERBS     IPoIB connected   IPoIB datagram   btl SDP
> 5         3.82      28.83             28.24            25.73
>
>
>
>
>
>
>
>
>
>
>
> Index: opal/include/opal_config_bottom.h
>
> ===================================================================
>
> --- opal/include/opal_config_bottom.h (revision 17027)
>
> +++ opal/include/opal_config_bottom.h (working copy)
>
> @@ -509,7 +509,15 @@
>
> #if !HAVE_DECL_PF_INET6
>
> #define PF_INET6 PF_UNSPEC
>
> #endif
>
> +#if !HAVE_DECL_AF_INET_SDP
>
> +#define AF_INET_SDP 27
>
> +#endif
>
>
>
> +#if OPAL_ENABLE_SDP
>
> +#define OPAL_WANT_SDP 1
>
> +#else
>
> +#define OPAL_WANT_SDP 0
>
> +#endif
>
> #if defined(__APPLE__) && defined(HAVE_INTTYPES_H)
>
> /* Prior to Mac OS X 10.3, the length modifier "ll" wasn't
>
> supported, but "q" was for long long. This isn't ANSI
>
> Index: configure.ac
>
> ===================================================================
>
> --- configure.ac (revision 17027)
>
> +++ configure.ac (working copy)
>
> @@ -674,7 +674,7 @@
>
> #include <netinet/in.h>
>
> #endif])
>
>
>
> -AC_CHECK_DECLS([AF_UNSPEC, PF_UNSPEC, AF_INET6, PF_INET6],
>
> +AC_CHECK_DECLS([AF_UNSPEC, PF_UNSPEC, AF_INET6, PF_INET6,
> AF_INET_SDP],
>
> [], [], [AC_INCLUDES_DEFAULT
>
> #if HAVE_SYS_SOCKET_H
>
> #include <sys/socket.h>
>
> Index: ompi/mca/btl/tcp/btl_tcp_component.c
>
> ===================================================================
>
> --- ompi/mca/btl/tcp/btl_tcp_component.c (revision 17027)
>
> +++ ompi/mca/btl/tcp/btl_tcp_component.c (working copy)
>
> @@ -263,6 +263,10 @@
>
>
>
> mca_btl_tcp_component.tcp_disable_family =
>
> mca_btl_tcp_param_register_int ("disable_family", NULL, 0);
>
> +#if OPAL_WANT_SDP
>
> + mca_btl_tcp_component.sdp_enable =
>
> + mca_btl_tcp_param_register_int("sdp_enable", "Enable SDP
> for TCP connections", 0);
>
> +#endif
>
> return OMPI_SUCCESS;
>
> }
>
>
>
> @@ -527,6 +531,11 @@
>
>
>
> memset (&hints, 0, sizeof(hints));
>
> hints.ai_family = af_family;
>
> +#if OPAL_WANT_SDP
>
> + if ( mca_btl_tcp_component.sdp_enable ) {
>
> + hints.ai_family = AF_INET6;
>
> + }
>
> +#endif
>
> hints.ai_socktype = SOCK_STREAM;
>
> hints.ai_flags = AI_PASSIVE;
>
>
>
> @@ -555,7 +564,7 @@
>
> #endif /* IPV6_V6ONLY */
>
> }
>
> #else
>
> - ((struct sockaddr_in*) &inaddr)->sin_family = AF_INET;
>
> + ((struct sockaddr_in*) &inaddr)->sin_family = af_family;
>
> ((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY;
>
> addrlen = sizeof(struct sockaddr_in);
>
> #endif
>
> @@ -600,7 +609,11 @@
>
> }
>
> goto socket_binded;
>
> }
>
> - if( AF_INET == af_family ) {
>
> +#if OPAL_WANT_SDP
>
> + if( AF_INET == af_family|| AF_INET_SDP == af_family) {
>
> +#else
>
> + if( AF_INET == af_family) {
>
> +#endif
>
> BTL_ERROR(("bind() failed: no port available in the
> range [%d..%d]",
>
> mca_btl_tcp_component.tcp_port_min,
>
> mca_btl_tcp_component.tcp_port_min + range));
>
> @@ -624,7 +637,11 @@
>
> return OMPI_ERROR;
>
> }
>
>
>
> - if (AF_INET == af_family) {
>
> +#if OPAL_WANT_SDP
>
> + if( AF_INET == af_family|| AF_INET_SDP == af_family) {
>
> +#else
>
> + if( AF_INET == af_family) {
>
> +#endif
>
> mca_btl_tcp_component.tcp_listen_port = ((struct
> sockaddr_in*) &inaddr)->sin_port;
>
> mca_btl_tcp_component.tcp_listen_sd = sd;
>
> }
>
> @@ -660,7 +677,11 @@
>
> }
>
>
>
> /* register listen port */
>
> - if (AF_INET == af_family) {
>
> +#if OPAL_WANT_SDP
>
> + if( AF_INET == af_family|| AF_INET_SDP == af_family) {
>
> +#else
>
> + if( AF_INET == af_family) {
>
> +#endif
>
> opal_event_set( &mca_btl_tcp_component.tcp_recv_event,
>
> mca_btl_tcp_component.tcp_listen_sd,
>
> OPAL_EV_READ|OPAL_EV_PERSIST,
>
> @@ -822,6 +843,12 @@
>
> }
>
>
>
> /* create a TCP listen socket for incoming connection attempts */
>
> +#if OPAL_WANT_SDP
>
> + if (mca_btl_tcp_component.sdp_enable) {
>
> + if(OMPI_SUCCESS != (ret =
> mca_btl_tcp_component_create_listen(AF_INET_SDP) )) {
>
> + return 0;
>
> + }
>
> + } else {
>
> if(OMPI_SUCCESS != (ret = mca_btl_tcp_component_create_listen
> (AF_INET) )) {
>
> return 0;
>
> }
>
> @@ -833,7 +860,21 @@
>
> }
>
> }
>
> #endif
>
> + }
>
>
>
> +#else
>
> + if(OMPI_SUCCESS != (ret = mca_btl_tcp_component_create_listen
> (AF_INET) )) {
>
> + return 0;
>
> + }
>
> +#if OPAL_WANT_IPV6
>
> + if((ret = mca_btl_tcp_component_create_listen(AF_INET6)) !=
> OMPI_SUCCESS) {
>
> + if (!(OMPI_ERR_IN_ERRNO == ret && EAFNOSUPPORT ==
> opal_socket_errno)) {
>
> + opal_output (0, "mca_btl_tcp_component: IPv6 listening
> socket failed\n");
>
> + return 0;
>
> + }
>
> + }
>
> +#endif
>
> +#endif
>
> /* publish TCP parameters with the MCA framework */
>
> if(OMPI_SUCCESS != (ret = mca_btl_tcp_component_exchange() )) {
>
> return 0;
>
> Index: ompi/mca/btl/tcp/btl_tcp_endpoint.c
>
> ===================================================================
>
> --- ompi/mca/btl/tcp/btl_tcp_endpoint.c (revision 17027)
>
> +++ ompi/mca/btl/tcp/btl_tcp_endpoint.c (working copy)
>
> @@ -535,7 +535,14 @@
>
> addrlen = sizeof (struct sockaddr_in6);
>
> }
>
> #endif
>
> -
>
> +
>
> +#if OPAL_WANT_SDP
>
> + if ( mca_btl_tcp_component.sdp_enable){
>
> + af_family = AF_INET_SDP;
>
> + addrlen = sizeof(struct sockaddr_in);
>
> + }
>
> +#endif
>
> +
>
> btl_endpoint->endpoint_sd = socket(af_family, SOCK_STREAM, 0);
>
> if (btl_endpoint->endpoint_sd < 0) {
>
> btl_endpoint->endpoint_retries++;
>
> Index: ompi/mca/btl/tcp/btl_tcp.h
>
> ===================================================================
>
> --- ompi/mca/btl/tcp/btl_tcp.h (revision 17027)
>
> +++ ompi/mca/btl/tcp/btl_tcp.h (working copy)
>
> @@ -90,6 +90,9 @@
>
> int tcp_sndbuf; /**< socket sndbuf
> size */
>
> int tcp_rcvbuf; /**< socket rcvbuf
> size */
>
> int tcp_disable_family; /**< disabled
> AF_family */
>
> +#if OPAL_WANT_SDP
>
> + int sdp_enable; /**< enable
> SDP */
>
> +#endif /* OPAL_WANT_SDP */
>
>
>
> /* free list of fragment descriptors */
>
> ompi_free_list_t tcp_frag_eager;
>
> Index: config/ompi_configure_options.m4
>
> ===================================================================
>
> --- config/ompi_configure_options.m4 (revision 17027)
>
> +++ config/ompi_configure_options.m4 (working copy)
>
> @@ -683,6 +683,23 @@
>
> [Enable IPv6 support, but only if the
> underlying system supports it])
>
>
>
> #
>
> +# Do we want to enable SDP support?
>
> +#
>
> +AC_MSG_CHECKING([if want SDP support])
>
> +AC_ARG_ENABLE([sdp],
>
> + [AC_HELP_STRING([--enable-sdp],
>
> + [Enable SDP support (default: disabled)])])
>
> +if test "$enable_sdp" = "yes"; then
>
> + AC_MSG_RESULT([yes])
>
> + opal_want_sdp=1
>
> +else
>
> + AC_MSG_RESULT([no])
>
> + opal_want_sdp=0
>
> +fi
>
> +AC_DEFINE_UNQUOTED([OPAL_ENABLE_SDP], [$opal_want_sdp],
>
> + [Enable SDP support])
>
> +
>
> +#
>
> # Do we want orterun's --prefix behavior to be enabled by default?
>
> #
>
> AC_MSG_CHECKING([if want orterun "--prefix" behavior to be enabled
> by default])
>
> Index: orte/mca/oob/tcp/oob_tcp_peer.c
>
> ===================================================================
>
> --- orte/mca/oob/tcp/oob_tcp_peer.c (revision 17027)
>
> +++ orte/mca/oob/tcp/oob_tcp_peer.c (working copy)
>
> @@ -371,7 +371,15 @@
>
> opal_net_get_port((struct sockaddr*)
> &inaddr));
>
> }
>
>
>
> - rc = mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);
>
> +#if OPAL_WANT_SDP
>
> + if (mca_oob_tcp_component.sdp_enable) {
>
> + rc = mca_oob_tcp_peer_create_socket(peer, AF_INET_SDP);
>
> + } else {
>
> + rc = mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);
>
> + }
>
> +#else
>
> + rc = mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);
>
> +#endif
>
> if (ORTE_SUCCESS != rc) {
>
> struct timeval tv = { 1,0 };
>
> opal_evtimer_add(&peer->peer_timer_event, &tv);
>
> Index: orte/mca/oob/tcp/oob_tcp.c
>
> ===================================================================
>
> --- orte/mca/oob/tcp/oob_tcp.c (revision 17027)
>
> +++ orte/mca/oob/tcp/oob_tcp.c (working copy)
>
> @@ -380,6 +380,13 @@
>
> mca_oob_tcp_component.tcp6_listen_sd = -1;
>
> #endif /* OPAL_WANT_IPV6 */
>
>
>
> +#if OPAL_WANT_SDP
>
> + mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
>
> + "sdp_enable","Enable SDP for TCP
> connections",
>
> + false, false,
>
> + 0,
>
> + &mca_oob_tcp_component.sdp_enable);
>
> +#endif
>
> /* initialize state */
>
> mca_oob_tcp_component.tcp_shutdown = false;
>
> mca_oob_tcp_component.tcp_listen_sd = -1;
>
> @@ -514,7 +521,7 @@
>
> int error;
>
>
>
> memset(&hints, 0, sizeof(hints));
>
> - hints.ai_family = af_family;
>
> + hints.ai_family = AF_INET6;
>
> hints.ai_socktype = SOCK_STREAM;
>
> hints.ai_flags = AI_PASSIVE;
>
>
>
> @@ -542,9 +549,6 @@
>
> }
>
> #endif /* IPV6_V6ONLY */
>
> #else
>
> - if (AF_INET != af_family) {
>
> - return ORTE_ERROR;
>
> - }
>
> ((struct sockaddr_in*) &inaddr)->sin_family = af_family;
>
> ((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY;
>
> addrlen = sizeof(struct sockaddr_in);
>
> @@ -590,7 +594,11 @@
>
> }
>
> goto socket_binded;
>
> }
>
> - if( AF_INET == af_family ) {
>
> +#if OPAL_WANT_SDP
>
> + if( AF_INET == af_family || AF_INET_SDP == af_family) {
>
> +#else
>
> + if( AF_INET == af_family) {
>
> +#endif
>
> opal_output( 0, "bind() failed: no port available in
> the range [%d..%d]",
>
> mca_oob_tcp_component.tcp_port_min,
>
> mca_oob_tcp_component.tcp_port_min + range);
>
> @@ -614,7 +622,11 @@
>
> return ORTE_ERROR;
>
> }
>
>
>
> - if (AF_INET == af_family) {
>
> +#if OPAL_WANT_SDP
>
> + if( AF_INET == af_family || AF_INET_SDP == af_family) {
>
> +#else
>
> + if( AF_INET == af_family) {
>
> +#endif
>
> mca_oob_tcp_component.tcp_listen_port = ((struct
> sockaddr_in*) &inaddr)->sin_port;
>
> mca_oob_tcp_component.tcp_listen_sd = *target_sd;
>
> }
>
> @@ -647,7 +659,11 @@
>
> }
>
>
>
> /* register listen port */
>
> - if (AF_INET == af_family) {
>
> +#if OPAL_WANT_SDP
>
> + if( AF_INET == af_family || AF_INET_SDP == af_family) {
>
> +#else
>
> + if( AF_INET == af_family) {
>
> +#endif
>
> opal_event_set(&mca_oob_tcp_component.tcp_recv_event,
>
> *target_sd,
>
> OPAL_EV_READ|OPAL_EV_PERSIST,
>
> @@ -822,6 +838,7 @@
>
> int flags;
>
>
>
> /* create a listen socket for incoming connections */
>
> + /* FIXME add support for SDP */
>
> mca_oob_tcp_component.tcp_listen_sd = socket(AF_INET,
> SOCK_STREAM, 0);
>
> if(mca_oob_tcp_component.tcp_listen_sd < 0) {
>
> opal_output(0,"mca_oob_tcp_component_init: socket()
> failed: %s (%d)",
>
> @@ -1194,26 +1211,41 @@
>
> but can't do that since we weren't the HNP. */
>
> mca_oob_tcp_component.tcp_listen_type = OOB_TCP_EVENT;
>
>
>
> - rc = mca_oob_tcp_create_listen
> (&mca_oob_tcp_component.tcp_listen_sd,
>
> - AF_INET);
>
> - if (ORTE_SUCCESS != rc &&
>
> - (EAFNOSUPPORT != opal_socket_errno ||
>
> - mca_oob_tcp_component.tcp_debug >=
> OOB_TCP_DEBUG_CONNECT)) {
>
> - opal_output(0,
>
> - "mca_oob_tcp_init: unable to create IPv4
> listen socket: %s\n",
>
> +#if OPAL_WANT_SDP
>
> + if ( mca_oob_tcp_component.sdp_enable){
>
> + rc = mca_oob_tcp_create_listen
> (&mca_oob_tcp_component.tcp_listen_sd,
>
> + AF_INET_SDP);
>
> + if (ORTE_SUCCESS != rc &&
>
> + (EAFNOSUPPORT != opal_socket_errno ||
>
> + mca_oob_tcp_component.tcp_debug >=
> OOB_TCP_DEBUG_CONNECT)) {
>
> + opal_output(0,
>
> + "mca_oob_tcp_init: unable to create SDP
> listen socket: %s\n",
>
> opal_strerror(rc));
>
> - }
>
> + }
>
> + } else
>
> +#endif
>
> + {
>
> + rc = mca_oob_tcp_create_listen
> (&mca_oob_tcp_component.tcp_listen_sd,
>
> + AF_INET);
>
> + if (ORTE_SUCCESS != rc &&
>
> + (EAFNOSUPPORT != opal_socket_errno ||
>
> + mca_oob_tcp_component.tcp_debug >=
> OOB_TCP_DEBUG_CONNECT)) {
>
> + opal_output(0,
>
> + "mca_oob_tcp_init: unable to create IPv4
> listen socket: %s\n",
>
> + opal_strerror(rc));
>
> + }
>
> #if OPAL_WANT_IPV6
>
> - rc = mca_oob_tcp_create_listen
> (&mca_oob_tcp_component.tcp6_listen_sd,
>
> - AF_INET6);
>
> - if (ORTE_SUCCESS != rc &&
>
> - (EAFNOSUPPORT != opal_socket_errno ||
>
> - mca_oob_tcp_component.tcp_debug >=
> OOB_TCP_DEBUG_CONNECT)) {
>
> - opal_output(0,
>
> - "mca_oob_tcp_init: unable to create IPv6
> listen socket: %s\n",
>
> - opal_strerror(rc));
>
> - }
>
> + rc = mca_oob_tcp_create_listen
> (&mca_oob_tcp_component.tcp6_listen_sd,
>
> + AF_INET6);
>
> + if (ORTE_SUCCESS != rc &&
>
> + (EAFNOSUPPORT != opal_socket_errno ||
>
> + mca_oob_tcp_component.tcp_debug >=
> OOB_TCP_DEBUG_CONNECT)) {
>
> + opal_output(0,
>
> + "mca_oob_tcp_init: unable to create IPv6
> listen socket: %s\n",
>
> + opal_strerror(rc));
>
> + }
>
> #endif
>
> + }
>
> if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
>
> opal_output(0, "%s accepting connections via event
> library",
>
> ORTE_NAME_PRINT(orte_process_info.my_name));
>
> Index: orte/mca/oob/tcp/oob_tcp.h
>
> ===================================================================
>
> --- orte/mca/oob/tcp/oob_tcp.h (revision 17027)
>
> +++ orte/mca/oob/tcp/oob_tcp.h (working copy)
>
> @@ -217,6 +217,9 @@
>
> int tcp6_port_min; /**< Minimum allowed
> port for the OOB listen socket */
>
> int tcp6_port_range; /**< Range of allowed
> TCP ports */
>
> #endif /* OPAL_WANT_IPV6 */
>
> +#if OPAL_WANT_SDP
>
> + int sdp_enable; /**< support for SDP */
>
> +#endif /* OPAL_WANT_SDP */
>
> opal_mutex_t tcp_lock; /**< lock for
> accessing module state */
>
> opal_list_t tcp_events; /**< list of pending
> events (accepts) */
>
> opal_list_t tcp_msg_post; /**< list of recieves
> user has posted */
>
>
>
>
>
>
>
> Thanks,
>
> Verkhovsky Lenny.
>
>
>
> _______________________________________________
> devel mailing list
> devel_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/devel

-- 
   Brian Barrett
   Open MPI developer
   http://www.open-mpi.org/