Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

Subject: Re: [OMPI users] Segfault in mca_odls_default.so with > ~100 process.
From: Ralph Castain (rhc_at_[hidden])
Date: 2010-02-27 12:46:42


I modified the patch (it missed a few places, some minor changes in implementation, etc) and committed it to the developer's trunk. I'll check with the release managers to see if this is something they want in the 1.4 series, or if they would rather defer it to the 1.5 series due out soon.

Thanks!

On Feb 27, 2010, at 7:02 AM, Oliver Ford wrote:

> Ralph Castain wrote:
>> Yeah, the system won't like this. Your approach makes it look like you are launching 136 app_contexts. We currently only support up to 128 app_contexts. I don't think anyone anticipated somebody trying to use the system this way.
>>
>> I can expand the number to something larger. Will have to see how big a change it requires (mostly a question of how many places are touched) before we know what release this might show up in.
>>
>>
> The app_context allocation is all dynamic so is fine; the problem is that 'app_idx' (in various structures and code), which appears to be some kind of index mapping, is defined as int8_t, so everything goes negative after 128 - hence the segfault.
>
> Attached is a patch to the openmpi-1.4.1 tarball on the website to make it all int32_t, which I've tested and works fine.
>
> I've also attached a patch for the current SVN head, which compiles but I can't test it because the current SVN head doesn't work for me at all at present (for an appfile with less than 128 entries).
>
> Sorry to send this here rather than the dev list, but I don't really have the time to sign up and get involved at the moment.
>
>
> Hope that helps a bit,
> Oliver
> diff -ur openmpi-1.4.1/orte/mca/odls/base/odls_base_default_fns.c openmpi-1.4.1-new/orte/mca/odls/base/odls_base_default_fns.c
> --- openmpi-1.4.1/orte/mca/odls/base/odls_base_default_fns.c 2009-12-08 20:36:37.000000000 +0000
> +++ openmpi-1.4.1-new/orte/mca/odls/base/odls_base_default_fns.c 2010-02-27 12:21:14.000000000 +0000
> @@ -74,7 +74,7 @@
> #include "orte/mca/odls/base/base.h"
> #include "orte/mca/odls/base/odls_private.h"
>
> -static int8_t *app_idx;
> +static int32_t *app_idx;
>
> /* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN
> * THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW
> @@ -1555,7 +1577,7 @@
> nrank = 0;
> opal_dss.pack(&buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */
> one8 = 0;
> - opal_dss.pack(&buffer, &one8, 1, OPAL_INT8); /* app_idx */
> + opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* app_idx */
> jobdat->pmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
> opal_dss.unload(&buffer, (void**)&jobdat->pmap->bytes, &jobdat->pmap->size);
> OBJ_DESTRUCT(&buffer);
> diff -ur openmpi-1.4.1/orte/runtime/orte_globals.h openmpi-1.4.1-new/orte/runtime/orte_globals.h
> --- openmpi-1.4.1/orte/runtime/orte_globals.h 2009-12-08 20:36:44.000000000 +0000
> +++ openmpi-1.4.1-new/orte/runtime/orte_globals.h 2010-02-27 12:30:20.000000000 +0000
> @@ -137,7 +137,7 @@
> /** Parent object */
> opal_object_t super;
> /** Unique index when multiple apps per job */
> - int8_t idx;
> + int32_t idx;
> /** Absolute pathname of argv[0] */
> char *app;
> /** Number of copies of this process that are to be launched */
> @@ -382,7 +382,7 @@
> /* exit code */
> orte_exit_code_t exit_code;
> /* the app_context that generated this proc */
> - int8_t app_idx;
> + int32_t app_idx;
> /* a cpu list, if specified by the user */
> char *slot_list;
> /* pointer to the node where this proc is executing */
> diff -ur openmpi-1.4.1/orte/util/nidmap.c openmpi-1.4.1-new/orte/util/nidmap.c
> --- openmpi-1.4.1/orte/util/nidmap.c 2009-12-08 20:36:44.000000000 +0000
> +++ openmpi-1.4.1-new/orte/util/nidmap.c 2010-02-27 12:23:18.000000000 +0000
> @@ -589,7 +589,7 @@
> int32_t *nodes;
> orte_proc_t **procs;
> orte_vpid_t i;
> - int8_t *tmp;
> + int32_t *tmp;
> opal_buffer_t buf;
> orte_local_rank_t *lrank;
> orte_node_rank_t *nrank;
> @@ -645,11 +645,11 @@
> free(nrank);
>
> /* transfer and pack the app_idx in one pack */
> - tmp = (int8_t*)malloc(jdata->num_procs);
> + tmp = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
> for (i=0; i < jdata->num_procs; i++) {
> tmp[i] = procs[i]->app_idx;
> }
> - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, tmp, jdata->num_procs, OPAL_INT8))) {
> + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, tmp, jdata->num_procs, OPAL_INT32))) {
> ORTE_ERROR_LOG(rc);
> return rc;
> }
> @@ -664,7 +665,7 @@
>
>
> int orte_util_decode_pidmap(opal_byte_object_t *bo, orte_vpid_t *nprocs,
> - opal_value_array_t *procs, int8_t **app_idx,
> + opal_value_array_t *procs, int32_t **app_idx,
> char ***slot_str)
> {
> orte_vpid_t i, num_procs;
> @@ -672,7 +673,7 @@
> int32_t *nodes;
> orte_local_rank_t *local_rank;
> orte_node_rank_t *node_rank;
> - int8_t *idx;
> + int32_t *idx;
> orte_std_cntr_t n;
> opal_buffer_t buf;
> int rc;
> @@ -746,10 +747,10 @@
> }
>
> /* allocate memory for app_idx */
> - idx = (int8_t*)malloc(num_procs);
> + idx = (int32_t*)malloc(num_procs * sizeof(int32_t));
> /* unpack app_idx in one shot */
> n=num_procs;
> - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, idx, &n, OPAL_INT8))) {
> + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, idx, &n, OPAL_INT32))) {
> ORTE_ERROR_LOG(rc);
> return rc;
> }
> diff -ur openmpi-1.4.1/orte/util/nidmap.h openmpi-1.4.1-new/orte/util/nidmap.h
> --- openmpi-1.4.1/orte/util/nidmap.h 2009-12-08 20:36:44.000000000 +0000
> +++ openmpi-1.4.1-new/orte/util/nidmap.h 2010-02-27 11:59:52.000000000 +0000
> @@ -49,7 +49,7 @@
>
> ORTE_DECLSPEC int orte_util_encode_pidmap(orte_job_t *jdata, opal_byte_object_t *boptr);
> ORTE_DECLSPEC int orte_util_decode_pidmap(opal_byte_object_t *boptr, orte_vpid_t *num_procs,
> - opal_value_array_t *procs, int8_t **app_idx,
> + opal_value_array_t *procs, int32_t **app_idx,
> char ***slot_str);
>
>
> Index: orte/mca/odls/base/odls_base_default_fns.c
> ===================================================================
> --- orte/mca/odls/base/odls_base_default_fns.c (revision 22725)
> +++ orte/mca/odls/base/odls_base_default_fns.c (working copy)
> @@ -89,9 +89,8 @@
> orte_job_map_t *map;
> opal_buffer_t *wireup;
> opal_byte_object_t bo, *boptr;
> - int32_t numbytes, *tmp32;
> + int32_t numbytes, *tmp32A, *tmp32B;
> int8_t flag;
> - int8_t *tmp;
> orte_vpid_t i;
> int j;
> orte_daemon_cmd_flag_t command;
> @@ -386,25 +385,25 @@
> free(bo.bytes);
>
> /* transfer and pack the app_idx and restart arrays for this job */
> - tmp = (int8_t*)malloc(jdata->num_procs);
> - tmp32 = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
> + tmp32A = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
> + tmp32B = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
> for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) {
> if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
> continue;
> }
> - tmp[i] = proc->app_idx;
> - tmp32[i++] = proc->restarts;
> + tmp32A[i] = proc->app_idx;
> + tmp32B[i++] = proc->restarts;
> }
> - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp, jdata->num_procs, OPAL_INT8))) {
> + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp32A, jdata->num_procs, OPAL_INT32))) {
> ORTE_ERROR_LOG(rc);
> return rc;
> }
> - free(tmp);
> - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp32, jdata->num_procs, OPAL_INT32))) {
> + free(tmp32A);
> + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp32B, jdata->num_procs, OPAL_INT32))) {
> ORTE_ERROR_LOG(rc);
> return rc;
> }
> - free(tmp32);
> + free(tmp32B);
>
> /* are there cpu_list strings? */
> if (jdata->map->cpu_lists) {
> @@ -579,7 +578,7 @@
> opal_buffer_t alert;
> opal_list_item_t *item;
> int8_t flag;
> - int8_t *app_idx=NULL;
> + int32_t *app_idx=NULL;
> int32_t *restarts=NULL;
> char **slot_str=NULL;
> orte_jobid_t debugger;
> @@ -643,8 +642,8 @@
> }
> }
> /* fake an app_idx array */
> - app_idx = (int8_t*)malloc(jobdat->num_procs * sizeof(int8_t));
> - memset(app_idx, 0, jobdat->num_procs * sizeof(int8_t));
> + app_idx = (int32_t*)malloc(jobdat->num_procs * sizeof(int32_t));
> + memset(app_idx, 0, jobdat->num_procs * sizeof(int32_t));
> /* if we are doing a timing test, store the time the msg was recvd */
> if (orte_timing) {
> jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
> @@ -846,10 +845,10 @@
> }
>
> /* allocate memory for app_idx */
> - app_idx = (int8_t*)malloc(jobdat->num_procs);
> + app_idx = (int32_t*)malloc(jobdat->num_procs * sizeof(int32_t));
> /* unpack app_idx in one shot */
> cnt=jobdat->num_procs;
> - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, OPAL_INT8))) {
> + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, OPAL_INT32))) {
> ORTE_ERROR_LOG(rc);
> goto REPORT_ERROR;
> }
> @@ -2213,8 +2212,8 @@
> opal_dss.pack(&buffer, &lrank, 1, ORTE_LOCAL_RANK); /* local rank */
> nrank = 0;
> opal_dss.pack(&buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */
> - one8 = 0;
> - opal_dss.pack(&buffer, &one8, 1, OPAL_INT8); /* app_idx */
> + one32 = 0;
> + opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* app_idx */
> /* setup a byte object and unload the packed data to it */
> bo = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
> opal_dss.unload(&buffer, (void**)&bo->bytes, &bo->size);
> Index: orte/runtime/orte_globals.h
> ===================================================================
> --- orte/runtime/orte_globals.h (revision 22725)
> +++ orte/runtime/orte_globals.h (working copy)
> @@ -167,7 +167,7 @@
> /** Parent object */
> opal_object_t super;
> /** Unique index when multiple apps per job */
> - int8_t idx;
> + int32_t idx;
> /** Absolute pathname of argv[0] */
> char *app;
> /** Number of copies of this process that are to be launched */
> @@ -423,7 +423,7 @@
> /* exit code */
> orte_exit_code_t exit_code;
> /* the app_context that generated this proc */
> - int8_t app_idx;
> + int32_t app_idx;
> /* a cpu list, if specified by the user */
> char *slot_list;
> /* pointer to the node where this proc is executing */