Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] Branch for iWARP uDAPL enablement
From: Jon Mason (jon_at_[hidden])
Date: 2007-11-29 20:49:56


I created a public branch to make available the patch which gets OMPI
uDAPL to partially work on iWARP. The branch can be found at:
http://svn.open-mpi.org/svn/ompi/tmp-public/iwarp-ompi-v1.2/

The branch contains an updated version of the patch Steve Wise sent out
some time ago. Below is the patch (on top of the ompi v1.2 tree) that
enables this.

I am currently focusing on other issues, and might not be able to get
back to it for a while. Therefore, I wanted to make this patch
available to anyone who might need it or want to work on flushing out
the existing bugs. Feel free to contact me if there are any questions.

Thanks,
Jon

======================

This patch gets OMPI uDAPL to partially work on iWARP.

Specifically, this patch addresses 3 issues needed for iWARP to work:
1. Force the first DTO from connecting side
2. Post receive buffers for the connection
3. Flush outstanding writes with a 0B read

This patch enforces the rule that the first DTO on a connection must
come from the connecting side.

On iWARP, the connection may be TERMINATED if a SEND arrives on a QP
and no corresponding RECV buffer is posted. This patch posts the
receive buffers prior to the connection setup completing.

There is a race condition where the receive buffers for a large
write may be freed prior to the completion of the write. This patch
posts a 0B read after a large write and uses the 0B read completion
to trigger the write completion to the upper layers.

With this patch some MPI test cases using the uDAPL BTL will run,
while others continue to fail. Without this patch, no MPI programs
will run if using the uDAPL BTL.

This patch breaks IB support, and should not be checked in to the
regular tree until that is fixed.

Index: ompi/mca/btl/udapl/btl_udapl_endpoint.c
===================================================================
--- ompi/mca/btl/udapl/btl_udapl_endpoint.c (revision 16805)
+++ ompi/mca/btl/udapl/btl_udapl_endpoint.c (working copy)
@@ -130,7 +130,7 @@
     remote_buffer.segment_length = frag->triplet.segment_length;
 
     /* write the data out */
- cookie.as_ptr = frag;
+ cookie.as_ptr = frag;
     rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
         1,
         &(frag->triplet),
@@ -367,7 +367,9 @@
         }
     }
 
- (*ep_attr).max_recv_dtos = btl->udapl_max_recv_dtos;
+ (*ep_attr).max_recv_dtos = btl->udapl_max_recv_dtos + 1;
+ (*ep_attr).max_rdma_read_in = 4;
+ (*ep_attr).max_rdma_read_out = 4;
 
     /* Set max_request_dtos :
      * The max_request_dtos should equal the max number of
@@ -429,6 +431,74 @@
     return rc;
 }
 
+int mca_btl_udapl_addrdata_send(mca_btl_udapl_module_t* btl,
+ DAT_EP_HANDLE endpoint)
+{
+ mca_btl_udapl_frag_t* frag;
+ DAT_DTO_COOKIE cookie;
+ static int32_t connection_seq = 1;
+ int rc;
+
+ /* Send our local address data over this EP */
+ frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
+ (mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t) +
+ sizeof(int32_t));
+ cookie.as_ptr = frag;
+
+ memcpy(frag->segment.seg_addr.pval,
+ &btl->udapl_addr, sizeof(mca_btl_udapl_addr_t));
+ memcpy((char *)frag->segment.seg_addr.pval + sizeof(mca_btl_udapl_addr_t),
+ &connection_seq, sizeof(int32_t));
+ connection_seq++;
+
+ frag->type = MCA_BTL_UDAPL_CONN_SEND;
+
+ rc = dat_ep_post_send(endpoint, 1,
+ &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
+ if(DAT_SUCCESS != rc) {
+ char* major;
+ char* minor;
+
+ dat_strerror(rc, (const char**)&major,
+ (const char**)&minor);
+ BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send",
+ major, minor));
+ return OMPI_ERROR;
+ }
+
+ return OMPI_SUCCESS;
+}
+
+static inline int mca_btl_udapl_addrdata_recv(mca_btl_udapl_module_t* btl,
+ DAT_EP_HANDLE endpoint)
+{
+ mca_btl_udapl_frag_t* frag;
+ DAT_DTO_COOKIE cookie;
+ int rc;
+
+ /* Post a receive to get the peer's address data */
+ frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
+ (mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t) +
+ sizeof(int32_t));
+ cookie.as_ptr = frag;
+
+ frag->type = MCA_BTL_UDAPL_CONN_RECV;
+
+ rc = dat_ep_post_recv(endpoint, 1,
+ &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
+ if(DAT_SUCCESS != rc) {
+ char* major;
+ char* minor;
+
+ dat_strerror(rc, (const char**)&major,
+ (const char**)&minor);
+ BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_recv",
+ major, minor));
+ return OMPI_ERROR;
+ }
+ return OMPI_SUCCESS;
+}
+
 /*
  * Create a uDAPL endpoint
  *
@@ -457,6 +527,15 @@
             major, minor));
         dat_ep_free(udapl_endpoint);
         udapl_endpoint = DAT_HANDLE_NULL;
+ } else {
+ DAT_CONTEXT c;
+
+ /* pre-post recv buffer for exchanging address data */
+ mca_btl_udapl_addrdata_recv(btl, *udapl_endpoint);
+
+ /* context gets set to 1 for active connections */
+ c.as_64 = 0;
+ dat_set_consumer_context(*udapl_endpoint, c);
     }
 
     return rc;
@@ -567,6 +646,7 @@
 {
     mca_btl_udapl_module_t* btl = endpoint->endpoint_btl;
     int rc;
+ DAT_CONTEXT c;
 
     OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
     OPAL_THREAD_ADD32(&(btl->udapl_connect_inprogress), 1);
@@ -590,6 +670,10 @@
         goto failure_create;
     }
 
+ /* post eager recv buffers */
+ mca_btl_udapl_endpoint_post_recv(endpoint,
+ mca_btl_udapl_component.udapl_eager_frag_size);
+
     rc = dat_ep_connect(endpoint->endpoint_eager, &endpoint->endpoint_addr.addr,
             endpoint->endpoint_addr.port, mca_btl_udapl_component.udapl_timeout,
             sizeof(mca_btl_udapl_addr_t), &btl->udapl_addr, 0, DAT_CONNECT_DEFAULT_FLAG);
@@ -604,6 +688,9 @@
         goto failure;
     }
 
+ /* set context to 1 indicating active connect */
+ c.as_64 = 1;
+ dat_set_consumer_context(endpoint->endpoint_eager, c);
     endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_EAGER;
     OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
     return;
@@ -633,7 +720,21 @@
     mca_btl_base_endpoint_t* ep;
     size_t i;
     int rc;
+ DAT_CONTEXT c;
+ bool passive;
 
+ /*
+ * get consumer context from the endpoint.
+ * 1 == active side, 0 == passive side.
+ */
+ dat_get_consumer_context(endpoint, &c);
+ passive = (bool)!c.as_64;
+
+ /*
+ * active side - finish the connection.
+ * server side - post recvs, send addr data and finish the connection.
+ */
+
     /* Search for the matching BTL EP */
     for(proc = (mca_btl_udapl_proc_t*)
                 opal_list_get_first(&mca_btl_udapl_component.udapl_procs);
@@ -653,6 +754,15 @@
                     ep->endpoint_connection_seq = (NULL != connection_seq) ?
                         *connection_seq:0;
                     ep->endpoint_eager = endpoint;
+ if (passive) {
+
+ /* post max recv buffers */
+ mca_btl_udapl_endpoint_post_recv(ep,
+ mca_btl_udapl_component.udapl_eager_frag_size);
+
+ /* send address data */
+ mca_btl_udapl_addrdata_send(btl, endpoint);
+ }
                     rc = mca_btl_udapl_endpoint_finish_eager(ep);
                } else if(MCA_BTL_UDAPL_CONN_MAX == ep->endpoint_state) {
                     /* Check to see order of messages received are in
@@ -673,6 +783,15 @@
                         ep->endpoint_eager = endpoint;
                     }
 
+ if(passive) {
+
+ /* post max recv buffers */
+ mca_btl_udapl_endpoint_post_recv(ep,
+ mca_btl_udapl_component.udapl_max_frag_size);
+
+ /* send address data */
+ mca_btl_udapl_addrdata_send(btl, endpoint);
+ }
                     rc = mca_btl_udapl_endpoint_finish_max(ep);
                 } else {
                     OPAL_OUTPUT((0, "btl_udapl ERROR invalid EP state %d\n",
@@ -699,6 +818,7 @@
 {
     mca_btl_udapl_module_t* btl = endpoint->endpoint_btl;
     int rc;
+ DAT_CONTEXT c;
 
     endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_MAX;
     OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
@@ -721,6 +841,10 @@
             return OMPI_ERROR;
         }
 
+ /* post max recv buffers */
+ mca_btl_udapl_endpoint_post_recv(endpoint,
+ mca_btl_udapl_component.udapl_max_frag_size);
+
         rc = dat_ep_connect(endpoint->endpoint_max,
             &endpoint->endpoint_addr.addr, endpoint->endpoint_addr.port,
             mca_btl_udapl_component.udapl_timeout,
@@ -737,6 +861,10 @@
             dat_ep_free(endpoint->endpoint_max);
             return OMPI_ERROR;
         }
+
+ /* set context to 1 indicating active connect */
+ c.as_64 = 1;
+ dat_set_consumer_context(endpoint->endpoint_max, c);
     }
     
     return OMPI_SUCCESS;
@@ -753,12 +881,6 @@
     endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTED;
     OPAL_THREAD_ADD32(&(endpoint->endpoint_btl->udapl_connect_inprogress), -1);
 
- /* post eager/max recv buffers */
- mca_btl_udapl_endpoint_post_recv(endpoint,
- mca_btl_udapl_component.udapl_eager_frag_size);
- mca_btl_udapl_endpoint_post_recv(endpoint,
- mca_btl_udapl_component.udapl_max_frag_size);
-
     /* post queued sends */
     assert(endpoint->endpoint_eager_sends ==
             mca_btl_udapl_component.udapl_num_sends);
@@ -835,7 +957,7 @@
  * Post receive buffers for a newly established endpoint connection.
  */
 
-static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint,
+int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint,
                                             size_t size)
 {
     mca_btl_udapl_frag_t* frag = NULL;
Index: ompi/mca/btl/udapl/btl_udapl.c
===================================================================
--- ompi/mca/btl/udapl/btl_udapl.c (revision 16805)
+++ ompi/mca/btl/udapl/btl_udapl.c (working copy)
@@ -497,7 +497,13 @@
                 major, minor));
         }
 
- if (udapl_btl->udapl_dto_evd_qlen > evd_param.evd_qlen) {
+ /* evd queue length is stored as the length requested minus
+ * one. Therefore, we must compare against the length returned
+ * above plus one or we could try to alloc the same size we
+ * already have (and thus throw an error and bail....and no
+ * one wants that).
+ */
+ if (udapl_btl->udapl_dto_evd_qlen > evd_param.evd_qlen + 1) {
             /* resize dto event dispatcher queue length */
             dat_rc = dat_evd_resize(udapl_btl->udapl_evd_dto,
                 udapl_btl->udapl_dto_evd_qlen);
@@ -1030,8 +1036,8 @@
     frag->endpoint = endpoint;
     frag->type = MCA_BTL_UDAPL_PUT;
 
- if(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], -1) < 0) {
- OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], 1);
+ if(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], -2) < 0) {
+ OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], 2);
         OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
         opal_list_append(&endpoint->endpoint_max_frags,
             (opal_list_item_t*)frag);
@@ -1046,7 +1052,7 @@
             (DAT_VADDR)dst_segment->seg_addr.lval;
         remote_buffer.segment_length = dst_segment->seg_len;
 
- cookie.as_ptr = frag;
+ cookie.as_ptr = 0;
         
         OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
         rc = dat_ep_post_rdma_write(endpoint->endpoint_max,
@@ -1054,8 +1060,36 @@
             &frag->triplet,
             cookie,
             &remote_buffer,
+ DAT_COMPLETION_SUPPRESS_FLAG);
+ if(DAT_SUCCESS != rc) {
+ char* major;
+ char* minor;
+
+ OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
+ dat_strerror(rc, (const char**)&major,
+ (const char**)&minor);
+ BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
+ major, minor));
+ rc = OMPI_ERROR;
+ goto out;
+ }
+
+ remote_buffer.rmr_context =
+ (DAT_RMR_CONTEXT)dst_segment->seg_key.key32[0];
+ remote_buffer.target_address =
+ (DAT_VADDR)dst_segment->seg_addr.lval;
+ remote_buffer.segment_length = 0;
+ cookie.as_ptr = frag;
+
+ rc = dat_ep_post_rdma_read(endpoint->endpoint_max,
+ 1,
+ &frag->triplet,
+ cookie,
+ &remote_buffer,
             DAT_COMPLETION_DEFAULT_FLAG);
+
         OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
+
         if(DAT_SUCCESS != rc) {
             char* major;
             char* minor;
@@ -1067,7 +1101,7 @@
             rc = OMPI_ERROR;
         }
     }
-
+out:
     return rc;
 }
 
Index: ompi/mca/btl/udapl/btl_udapl_endpoint.h
===================================================================
--- ompi/mca/btl/udapl/btl_udapl_endpoint.h (revision 16805)
+++ ompi/mca/btl/udapl/btl_udapl_endpoint.h (working copy)
@@ -187,6 +187,9 @@
 int mca_btl_udapl_endpoint_send_sr_credits(mca_btl_base_endpoint_t* endpoint,
                                            const int connection);
 
+int mca_btl_udapl_addrdata_send(struct mca_btl_udapl_module_t* btl,
+ DAT_EP_HANDLE endpoint);
+
 #if defined(c_plusplus) || defined(__cplusplus)
 }
 #endif
Index: ompi/mca/btl/udapl/btl_udapl_component.c
===================================================================
--- ompi/mca/btl/udapl/btl_udapl_component.c (revision 16805)
+++ ompi/mca/btl/udapl/btl_udapl_component.c (working copy)
@@ -420,66 +420,6 @@
 }
 
 
-static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl,
- DAT_EP_HANDLE* endpoint)
-{
- mca_btl_udapl_frag_t* frag;
- DAT_DTO_COOKIE cookie;
- static int32_t connection_seq = 1;
- int rc;
-
- /* Post a receive to get the peer's address data */
- frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
- (mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t) +
- sizeof(int32_t));
- cookie.as_ptr = frag;
-
- frag->type = MCA_BTL_UDAPL_CONN_RECV;
-
- rc = dat_ep_post_recv(endpoint, 1,
- &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
- if(DAT_SUCCESS != rc) {
- char* major;
- char* minor;
-
- dat_strerror(rc, (const char**)&major,
- (const char**)&minor);
- BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_recv",
- major, minor));
- return OMPI_ERROR;
- }
-
-
- /* Send our local address data over this EP */
- frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
- (mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t) +
- sizeof(int32_t));
- cookie.as_ptr = frag;
-
- memcpy(frag->segment.seg_addr.pval,
- &btl->udapl_addr, sizeof(mca_btl_udapl_addr_t));
- memcpy((char *)frag->segment.seg_addr.pval + sizeof(mca_btl_udapl_addr_t),
- &connection_seq, sizeof(int32_t));
- connection_seq++;
-
- frag->type = MCA_BTL_UDAPL_CONN_SEND;
-
- rc = dat_ep_post_send(endpoint, 1,
- &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
- if(DAT_SUCCESS != rc) {
- char* major;
- char* minor;
-
- dat_strerror(rc, (const char**)&major,
- (const char**)&minor);
- BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send",
- major, minor));
- return OMPI_ERROR;
- }
-
- return OMPI_SUCCESS;
-}
-
 static inline int mca_btl_udapl_frag_progress_one(
         mca_btl_udapl_module_t* udapl_btl,
         mca_btl_udapl_frag_t* frag)
@@ -744,7 +684,7 @@
                     frag->base.des_cbfunc(&btl->super, frag->endpoint,
                         &frag->base, OMPI_SUCCESS);
 
- OPAL_THREAD_ADD32(&(frag->endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION]), 1);
+ OPAL_THREAD_ADD32(&(frag->endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION]), 2); /* LSW */
 
                     mca_btl_udapl_frag_progress_pending(btl,
                         frag->endpoint,
@@ -789,6 +729,7 @@
                     count++;
                     break;
                 case DAT_CONNECTION_EVENT_ESTABLISHED:
+# if NOT_IWARP
                     /* Both the client and server side of a connection generate
                        this event */
                     if (mca_btl_udapl_component.udapl_conn_priv_data) {
@@ -802,6 +743,20 @@
                         mca_btl_udapl_sendrecv(btl,
                             event.event_data.connect_event_data.ep_handle);
                     }
+#else
+ {
+ DAT_CONTEXT c;
+
+ /* Both the client and server side of a connection generate
+ this event */
+ dat_get_consumer_context(event.event_data.connect_event_data.ep_handle, &c);
+
+ /* active side starts sendrecv exchange */
+ if (c.as_64) {
+ mca_btl_udapl_addrdata_send(btl, event.event_data.connect_event_data.ep_handle);
+ }
+ }
+#endif
                     count++;
                     break;
                 case DAT_CONNECTION_EVENT_PEER_REJECTED: