Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

From: Jeremy Buisson (jbuisson_at_[hidden])
Date: 2007-04-02 12:34:26


Ralph Castain a écrit :
> The runtime underneath Open MPI (called OpenRTE) will not allow you to spawn
> processes on nodes outside of your allocation. This is for several reasons,
> but primarily because (a) we only know about the nodes that were allocated,
> so we have no idea how to spawn a process anywhere else, and (b) most
> resource managers wouldn't let us do it anyway.
>
> I gather you have some node that you know about and have hard-coded into
> your application? How do you know the name of the node if it isn't in your
> allocation??

Because I can give those names to OpenMPI (or OpenRTE, or whatever). I
would also like to do the same, and I don't want OpenMPI to restrict me
to what it thinks the allocation is when I am sure that I know better
than it does what I am doing.
The concept of nodes being in allocations fixed at launch time is really
rigid, and it prevents the application (or anything else) from modifying
the allocation at runtime, which would be quite a nice thing to have.

Here is an ugly patch I quickly wrote for my own use. It changes the
round-robin rmaps so that it first allocates the hosts to the rmgr,
using code copied and pasted from the dash_host ras component. It's
far from bug-free, but it can be a starting point for hacking.

Jeremy

> Ralph
>
>
> On 4/2/07 10:05 AM, "Prakash Velayutham" <Prakash.Velayutham_at_[hidden]>
> wrote:
>
>> Hello,
>>
>> I have built Open MPI (1.2) with run-time environment enabled for Torque
>> (2.1.6) resource manager. Initially I am requesting 4 nodes (1 CPU each)
>> from Torque. Then, from inside my MPI code, I am trying to spawn more
>> processes on nodes outside of the Torque-assigned nodes using
>> MPI_Comm_spawn, but this is failing with the error below:
>>
>> [wins04:13564] *** An error occurred in MPI_Comm_spawn
>> [wins04:13564] *** on communicator MPI_COMM_WORLD
>> [wins04:13564] *** MPI_ERR_ARG: invalid argument of some other kind
>> [wins04:13564] *** MPI_ERRORS_ARE_FATAL (goodbye)
>> mpirun noticed that job rank 1 with PID 15070 on node wins03 exited on
>> signal 15 (Terminated).
>> 2 additional processes aborted (not shown)
>>
>> #################################
>>
>> MPI_Info info;
>> MPI_Comm comm, *intercomm;
>> ...
>> ...
>> char *key, *value;
>> key = "host";
>> value = "wins08";
>> rc1 = MPI_Info_create(&info);
>> rc1 = MPI_Info_set(info, key, value);
>> rc1 = MPI_Comm_spawn(slave,MPI_ARGV_NULL, 1, info, 0,
>> MPI_COMM_WORLD, intercomm, arr);
>> ...
>> }
>>
>> ###################################################
>>
>> Would this work as it is or is something wrong with my assumption? Is
>> OpenRTE stopping me from spawning processes outside of the initially
>> allocated nodes through Torque?
>>
>> Thanks,
>> Prakash
>>
>> _______________________________________________
>> users mailing list
>> users_at_[hidden]
>> http://www.open-mpi.org/mailman/listinfo.cgi/users
>
>
> _______________________________________________
> users mailing list
> users_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/users

diff -ru openmpi-1.2/ompi/mca/btl/tcp/btl_tcp.c openmpi-1.2-custom/ompi/mca/btl/tcp/btl_tcp.c
--- openmpi-1.2/ompi/mca/btl/tcp/btl_tcp.c 2006-11-09 19:53:44.000000000 +0100
+++ openmpi-1.2-custom/ompi/mca/btl/tcp/btl_tcp.c 2007-03-28 14:02:10.000000000 +0200
@@ -117,8 +117,8 @@
         tcp_endpoint->endpoint_btl = tcp_btl;
         rc = mca_btl_tcp_proc_insert(tcp_proc, tcp_endpoint);
         if(rc != OMPI_SUCCESS) {
- OBJ_RELEASE(tcp_endpoint);
             OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock);
+ OBJ_RELEASE(tcp_endpoint);
             continue;
         }
 
diff -ru openmpi-1.2/opal/threads/mutex.c openmpi-1.2-custom/opal/threads/mutex.c
--- openmpi-1.2/opal/threads/mutex.c 2006-11-09 19:53:32.000000000 +0100
+++ openmpi-1.2-custom/opal/threads/mutex.c 2007-03-28 15:59:25.000000000 +0200
@@ -54,6 +54,8 @@
 #elif OMPI_ENABLE_DEBUG && OMPI_HAVE_PTHREAD_MUTEX_ERRORCHECK
     /* set type to ERRORCHECK so that we catch recursive locks */
     pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
+#else
+ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 #endif
     
     pthread_mutex_init(&m->m_lock_pthread, &attr);
diff -ru openmpi-1.2/opal/threads/mutex_unix.h openmpi-1.2-custom/opal/threads/mutex_unix.h
--- openmpi-1.2/opal/threads/mutex_unix.h 2006-11-09 19:53:32.000000000 +0100
+++ openmpi-1.2-custom/opal/threads/mutex_unix.h 2007-03-28 15:36:13.000000000 +0200
@@ -76,7 +76,7 @@
 
 static inline int opal_mutex_trylock(opal_mutex_t *m)
 {
-#if OMPI_ENABLE_DEBUG
+#if 1 // OMPI_ENABLE_DEBUG
     int ret = pthread_mutex_trylock(&m->m_lock_pthread);
     if (ret == EDEADLK) {
         errno = ret;
@@ -91,7 +91,7 @@
 
 static inline void opal_mutex_lock(opal_mutex_t *m)
 {
-#if OMPI_ENABLE_DEBUG
+#if 1 // OMPI_ENABLE_DEBUG
     int ret = pthread_mutex_lock(&m->m_lock_pthread);
     if (ret == EDEADLK) {
         errno = ret;
diff -ru openmpi-1.2/opal/util/stacktrace.c openmpi-1.2-custom/opal/util/stacktrace.c
--- openmpi-1.2/opal/util/stacktrace.c 2007-01-24 19:16:07.000000000 +0100
+++ openmpi-1.2-custom/opal/util/stacktrace.c 2007-03-28 14:02:10.000000000 +0200
@@ -344,6 +344,8 @@
                    stacktrace_hostname, getpid());
     write(fileno(stderr), print_buffer, ret);
     fflush(stderr);
+ for(;;)
+ pause();
 }
 
 #endif /* OMPI_WANT_PRETTY_PRINT_STACKTRACE && ! defined(__WINDOWS__) */
diff -ru openmpi-1.2/orte/mca/rmaps/round_robin/rmaps_rr.c openmpi-1.2-custom/orte/mca/rmaps/round_robin/rmaps_rr.c
--- openmpi-1.2/orte/mca/rmaps/round_robin/rmaps_rr.c 2007-01-24 19:16:10.000000000 +0100
+++ openmpi-1.2-custom/orte/mca/rmaps/round_robin/rmaps_rr.c 2007-03-28 15:11:57.000000000 +0200
@@ -265,6 +265,161 @@
 
     return ORTE_SUCCESS;
 }
+
+/*
+ * Tell whether a node called `name` is already known to the RAS node
+ * registry (the lookup is done with cell id 0).
+ */
+static bool orte_rmaps_rr_is_host_allocated(char* name)
+{
+    orte_ras_node_t* node = orte_ras_base_node_lookup(0, name);
+
+    /* An unknown host gives no object, and OBJ_RELEASE must not see NULL */
+    if (NULL == node) {
+        return false;
+    }
+    /* OBJ_RELEASE may reset its argument to NULL when the last reference
+       goes away, so the pointer must not be tested after this point */
+    OBJ_RELEASE(node);
+    return true;
+}
+
+/*
+ * Collect the host names requested in the app contexts of `jobid` that are
+ * not yet known to the RAS, insert them as nodes (one slot per mention)
+ * and allocate them to the job.
+ *
+ * Returns ORTE_SUCCESS, or the error code of the first failing step.
+ */
+static int orte_rmaps_rr_host_allocate(orte_jobid_t jobid)
+{
+    opal_list_t nodes;
+    opal_list_item_t* item;
+    orte_app_context_t **context;
+    size_t i, j, k;
+    orte_std_cntr_t num_context = 0;
+    int rc;
+    char **mapped_nodes = NULL, **mini_map = NULL;
+    orte_ras_node_t *node;
+
+    /* get the context */
+
+    rc = orte_rmgr.get_app_context(jobid, &context, &num_context);
+    if (ORTE_SUCCESS != rc) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    OBJ_CONSTRUCT(&nodes, opal_list_t);
+
+    /* If there's nothing to do, skip to the end */
+
+    if (0 == num_context) {
+        rc = ORTE_SUCCESS;
+        goto cleanup;
+    }
+
+    /* Otherwise, go through the contexts */
+
+    for (i = 0; i < num_context; ++i) {
+        if (context[i] != 0) {
+            if (context[i]->num_map > 0) {
+                orte_app_context_map_t** map = context[i]->map_data;
+
+                /* Accumulate all of the host name mappings */
+                for (j = 0; j < context[i]->num_map; ++j) {
+                    if (ORTE_APP_CONTEXT_MAP_HOSTNAME == map[j]->map_type) {
+                        mini_map = opal_argv_split(map[j]->map_data, ',');
+                        /* the split may yield NULL (e.g. empty host list) */
+                        for (k = 0; NULL != mini_map && NULL != mini_map[k]; ++k) {
+                            if(!orte_rmaps_rr_is_host_allocated(mini_map[k]))
+                            {
+                                rc = opal_argv_append_nosize(&mapped_nodes,
+                                                             mini_map[k]);
+                                if (OPAL_SUCCESS != rc) {
+                                    /* mini_map is released at cleanup */
+                                    goto cleanup;
+                                }
+                            }
+                        }
+                        opal_argv_free(mini_map);
+                        mini_map = NULL;
+                    }
+                }
+            }
+        }
+    }
+
+    /* Did we find anything? */
+
+    if (NULL != mapped_nodes) {
+
+        /* Go through the names found and add them to the host list.
+           If they're not unique, then bump the slots count for each
+           duplicate */
+
+        for (i = 0; NULL != mapped_nodes[i]; ++i) {
+            for (item = opal_list_get_first(&nodes);
+                 item != opal_list_get_end(&nodes);
+                 item = opal_list_get_next(item)) {
+                node = (orte_ras_node_t*) item;
+                if (0 == strcmp(node->node_name, mapped_nodes[i])) {
+                    ++node->node_slots;
+                    break;
+                }
+            }
+
+            /* If we didn't find it, add it to the list */
+
+            if (item == opal_list_get_end(&nodes)) {
+                node = OBJ_NEW(orte_ras_node_t);
+                if (NULL == node) {
+                    /* leave through cleanup so nothing gathered so far leaks */
+                    rc = ORTE_ERR_OUT_OF_RESOURCE;
+                    goto cleanup;
+                }
+                node->node_name = strdup(mapped_nodes[i]);
+                node->node_arch = NULL;
+                node->node_state = ORTE_NODE_STATE_UP;
+                /* JMS: this should not be hard-wired to 0, but there's no
+                   other value to put it to [yet]... */
+                node->node_cellid = 0;
+                node->node_slots_inuse = 0;
+                node->node_slots_max = 0;
+                node->node_slots = 1;
+                opal_list_append(&nodes, &node->super);
+            }
+        }
+
+        /* Put them on the segment and allocate them */
+
+        if (ORTE_SUCCESS !=
+            (rc = orte_ras_base_node_insert(&nodes)) ||
+            ORTE_SUCCESS !=
+            (rc = orte_ras_base_allocate_nodes(jobid, &nodes))) {
+            goto cleanup;
+        }
+    }
+
+cleanup:
+    if (NULL != mini_map) {
+        opal_argv_free(mini_map);
+    }
+    if (NULL != mapped_nodes) {
+        opal_argv_free(mapped_nodes);
+    }
+
+    while (NULL != (item = opal_list_remove_first(&nodes))) {
+        OBJ_RELEASE(item);
+    }
+    OBJ_DESTRUCT(&nodes);
+    for (i = 0; i < num_context; i++) {
+        if (NULL != context[i]) {
+            OBJ_RELEASE(context[i]);
+        }
+    }
+    free(context);
+    return rc;
+}
    
 
 /*
@@ -367,6 +522,11 @@
     orte_attribute_t *attr;
     orte_std_cntr_t slots_per_node;
 
+ if(ORTE_SUCCESS != (rc = orte_rmaps_rr_host_allocate(jobid))) {
+ ORTE_ERROR_LOG(rc);
+ return rc;
+ }
+
     OPAL_TRACE(1);
     
     /* setup the local environment from the attributes */