Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [OMPI svn-full] svn:open-mpi r29079 - in trunk: opal/mca/hwloc/base orte/mca/rmaps/mindist
From: Joshua Ladd (joshual_at_[hidden])
Date: 2013-08-28 12:37:50


I have no objections to this.

Josh

-----Original Message-----
From: devel [mailto:devel-bounces_at_[hidden]] On Behalf Of Jeff Squyres (jsquyres)
Sent: Wednesday, August 28, 2013 12:37 PM
To: <devel_at_[hidden]>
Subject: Re: [OMPI devel] [OMPI svn-full] svn:open-mpi r29079 - in trunk: opal/mca/hwloc/base orte/mca/rmaps/mindist

Can we rename rmaps_base_dist_hca to something that is less specific to IB?

E.g., rmaps_base_dist_verbs_device? (admittedly, that's a little long, but...)

On Aug 28, 2013, at 12:23 PM, <svn-commit-mailer_at_[hidden]> wrote:

> Author: jladd (Joshua Ladd)
> Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) New Revision: 29079
> URL: https://svn.open-mpi.org/trac/ompi/changeset/29079
>
> Log:
> Add support for autodetecting an MLNX HCA in the rmaps min distance
> feature. In this way, .ini files distributed with software stacks need
> not specify a particular HCA but instead may select the keyword auto,
> which will automatically select the discovered device. To use this
> feature, simply pass the keyword auto instead of a specific device
> name, e.g. --mca rmaps_base_dist_hca auto. If more than one card is
> installed, the mapper will inform the user of this, and the user
> will then need to specify which card to use via the normal route,
> e.g. --mca rmaps_base_dist_hca <dev_name>. This should be added to
> \ncmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist
> mapping
>
> Text files modified:
> trunk/opal/mca/hwloc/base/base.h | 4 ++--
> trunk/opal/mca/hwloc/base/hwloc_base_util.c | 40 ++++++++++++++++++++++++++++++++++++----
> trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt | 8 ++++++++
> trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c | 11 +++++++++--
> 4 files changed, 55 insertions(+), 8 deletions(-)
>
> Modified: trunk/opal/mca/hwloc/base/base.h
> ==============================================================================
> --- trunk/opal/mca/hwloc/base/base.h Wed Aug 28 12:03:23 2013 (r29078)
> +++ trunk/opal/mca/hwloc/base/base.h 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) (r29079)
> @@ -169,8 +169,8 @@
> hwloc_obj_t obj,
>
> opal_hwloc_resource_type_t rtype);
>
> -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo,
> - const char* device_name,
> +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo,
> + char* device_name,
> opal_list_t *sorted_list);
>
> /**
>
> Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c
> ==============================================================================
> --- trunk/opal/mca/hwloc/base/hwloc_base_util.c Wed Aug 28 12:03:23 2013 (r29078)
> +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) (r29079)
> @@ -1729,7 +1729,7 @@
> }
> }
>
> -static void sort_by_dist(hwloc_topology_t topo, const char*
> device_name, opal_list_t *sorted_list)
> +static void sort_by_dist(hwloc_topology_t topo, char* device_name,
> +opal_list_t *sorted_list)
> {
> hwloc_obj_t device_obj = NULL;
> hwloc_obj_t obj = NULL, root = NULL; @@ -1751,6 +1751,9 @@
> obj = obj->parent;
> }
> if (obj == NULL) {
> + opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
> + "hwloc:base:get_sorted_numa_list: NUMA node closest to %s wasn't found.",
> + device_name);
> return;
> } else {
> close_node_index = obj->logical_index; @@ -1762,6
> +1765,8 @@
> /* we can try to find distances under group object. This info can be there. */
> depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE);
> if (depth < 0) {
> + opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
> + "hwloc:base:get_sorted_numa_list:
> + There is no information about distances on the node.");
> return;
> }
> root = hwloc_get_root_obj(topo); @@ -1779,6
> +1784,8 @@
> }
> /* find all distances for our close node with logical index = close_node_index as close_node_index + nbobjs*j */
> if ((NULL == distances) || (0 == distances->nbobjs)) {
> + opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
> + "hwloc:base:get_sorted_numa_list: There
> + is no information about distances on the node.");
> return;
> }
> /* fill list of numa nodes */ @@ -1797,13 +1804,28 @@
> }
> }
>
> -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const
> char* device_name, opal_list_t *sorted_list)
> +static int find_devices(hwloc_topology_t topo, char* device_name) {
> + hwloc_obj_t device_obj = NULL;
> + int count = 0;
> + for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) {
> + if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
> + count++;
> + free(device_name);
> + device_name = strdup(device_obj->name);
> + }
> + }
> + return count;
> +}
> +
> +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char*
> +device_name, opal_list_t *sorted_list)
> {
> hwloc_obj_t obj;
> opal_list_item_t *item;
> opal_hwloc_summary_t *sum;
> opal_hwloc_topo_data_t *data;
> orte_rmaps_numa_node_t *numa, *copy_numa;
> + int count;
>
> obj = hwloc_get_root_obj(topo);
>
> @@ -1823,9 +1845,19 @@
> copy_numa->dist_from_closed = numa->dist_from_closed;
> opal_list_append(sorted_list, &copy_numa->super);
> }
> - return;
> + return 0;
> }else {
> /* don't already know it - go get it */
> + /* firstly we check if we need to autodetect OpenFabrics devices or we have the specified one */
> + if (!strcmp(device_name, "auto")) {
> + count = find_devices(topo, device_name);
> + if (count > 1) {
> + return count;
> + }
> + }
> + if (!device_name || (strlen(device_name) == 0)) {
> + return 1;
> + }
> sort_by_dist(topo, device_name, sorted_list);
> /* store this info in summary object for later usage */
> OPAL_LIST_FOREACH(numa, sorted_list,
> orte_rmaps_numa_node_t) { @@ -1834,7 +1866,7 @@
> copy_numa->dist_from_closed = numa->dist_from_closed;
> opal_list_append(&(sum->sorted_by_dist_list), &copy_numa->super);
> }
> - return;
> + return 0;
> }
> }
> }
>
> Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt
> ==============================================================================
> --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt Wed Aug 28 12:03:23 2013 (r29078)
> +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) (r29079)
> @@ -29,3 +29,11 @@
> Node: %s
>
> Open MPI therefore cannot mapp the application as specified.
> +#
> +[orte-rmaps-mindist:several-hca-devices]
> +There are several OpenFabrics devices found on at least one node. Please specify the definite one.
> +
> + Devices: %d
> + Node: %s
> +
> +Open MPI therefore cannot mapp the application as specified.
>
> Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c
> ==============================================================================
> --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c Wed Aug 28 12:03:23 2013 (r29078)
> +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) (r29079)
> @@ -71,6 +71,7 @@
> mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
> bool initial_map=true;
> bool bynode = false;
> + int ret;
>
> /* this mapper can only handle initial launch
> * when mindist mapping is desired @@ -245,7 +246,13 @@
> * so we call opal_hwloc_base_get_nbobjs_by_type */
> opal_hwloc_base_get_nbobjs_by_type(node->topology, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
> OBJ_CONSTRUCT(&numa_list, opal_list_t);
> - opal_hwloc_get_sorted_numa_list(node->topology, orte_rmaps_base.device, &numa_list);
> + ret = opal_hwloc_get_sorted_numa_list(node->topology, orte_rmaps_base.device, &numa_list);
> + if (ret > 1) {
> + orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-hca-devices",
> + true, ret, node->name);
> + rc = ORTE_ERR_SILENT;
> + goto error;
> + }
> if (opal_list_get_size(&numa_list) > 0) {
> j = 0;
> required = 0;
> @@ -390,7 +397,7 @@
> }
> OBJ_DESTRUCT(&node_list);
> }
> -
> + free(orte_rmaps_base.device);
> return ORTE_SUCCESS;
>
> error:
> _______________________________________________
> svn-full mailing list
> svn-full_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full

--
Jeff Squyres
jsquyres_at_[hidden]
For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/
_______________________________________________
devel mailing list
devel_at_[hidden]
http://www.open-mpi.org/mailman/listinfo.cgi/devel