Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [OMPI svn-full] svn:open-mpi r29079 - in trunk: opal/mca/hwloc/base orte/mca/rmaps/mindist
From: Jeff Squyres (jsquyres) (jsquyres_at_[hidden])
Date: 2013-08-28 12:36:22


Can we rename rmaps_base_dist_hca to something that is less specific to IB?

E.g., rmaps_base_dist_verbs_device? (admittedly, that's a little long, but...)

On Aug 28, 2013, at 12:23 PM, <svn-commit-mailer_at_[hidden]> wrote:

> Author: jladd (Joshua Ladd)
> Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013)
> New Revision: 29079
> URL: https://svn.open-mpi.org/trac/ompi/changeset/29079
>
> Log:
> Add support for autodetecting an MLNX HCA in the rmaps min distance feature. In this way, .ini files distributed with software stacks need not specify a particular HCA but may instead use the keyword "auto", which automatically selects the discovered device. To use this feature, simply pass the keyword "auto" instead of a specific device name, e.g. --mca rmaps_base_dist_hca auto. If more than one card is installed, the mapper will inform the user of this, and the user will then need to specify which card to use via the normal route, e.g. --mca rmaps_base_dist_hca <dev_name>. This should be added to v1.7.4.
> cmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist mapping
>
> Text files modified:
> trunk/opal/mca/hwloc/base/base.h | 4 ++--
> trunk/opal/mca/hwloc/base/hwloc_base_util.c | 40 ++++++++++++++++++++++++++++++++++++----
> trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt | 8 ++++++++
> trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c | 11 +++++++++--
> 4 files changed, 55 insertions(+), 8 deletions(-)
>
> Modified: trunk/opal/mca/hwloc/base/base.h
> ==============================================================================
> --- trunk/opal/mca/hwloc/base/base.h Wed Aug 28 12:03:23 2013 (r29078)
> +++ trunk/opal/mca/hwloc/base/base.h 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) (r29079)
> @@ -169,8 +169,8 @@
> hwloc_obj_t obj,
> opal_hwloc_resource_type_t rtype);
>
> -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo,
> - const char* device_name,
> +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo,
> + char* device_name,
> opal_list_t *sorted_list);
>
> /**
>
> Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c
> ==============================================================================
> --- trunk/opal/mca/hwloc/base/hwloc_base_util.c Wed Aug 28 12:03:23 2013 (r29078)
> +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) (r29079)
> @@ -1729,7 +1729,7 @@
> }
> }
>
> -static void sort_by_dist(hwloc_topology_t topo, const char* device_name, opal_list_t *sorted_list)
> +static void sort_by_dist(hwloc_topology_t topo, char* device_name, opal_list_t *sorted_list)
> {
> hwloc_obj_t device_obj = NULL;
> hwloc_obj_t obj = NULL, root = NULL;
> @@ -1751,6 +1751,9 @@
> obj = obj->parent;
> }
> if (obj == NULL) {
> + opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
> + "hwloc:base:get_sorted_numa_list: NUMA node closest to %s wasn't found.",
> + device_name);
> return;
> } else {
> close_node_index = obj->logical_index;
> @@ -1762,6 +1765,8 @@
> /* we can try to find distances under group object. This info can be there. */
> depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE);
> if (depth < 0) {
> + opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
> + "hwloc:base:get_sorted_numa_list: There is no information about distances on the node.");
> return;
> }
> root = hwloc_get_root_obj(topo);
> @@ -1779,6 +1784,8 @@
> }
> /* find all distances for our close node with logical index = close_node_index as close_node_index + nbobjs*j */
> if ((NULL == distances) || (0 == distances->nbobjs)) {
> + opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
> + "hwloc:base:get_sorted_numa_list: There is no information about distances on the node.");
> return;
> }
> /* fill list of numa nodes */
> @@ -1797,13 +1804,28 @@
> }
> }
>
> -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* device_name, opal_list_t *sorted_list)
> +static int find_devices(hwloc_topology_t topo, char* device_name)
> +{
> + hwloc_obj_t device_obj = NULL;
> + int count = 0;
> + for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) {
> + if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
> + count++;
> + free(device_name);
> + device_name = strdup(device_obj->name);
> + }
> + }
> + return count;
> +}
> +
> +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, opal_list_t *sorted_list)
> {
> hwloc_obj_t obj;
> opal_list_item_t *item;
> opal_hwloc_summary_t *sum;
> opal_hwloc_topo_data_t *data;
> orte_rmaps_numa_node_t *numa, *copy_numa;
> + int count;
>
> obj = hwloc_get_root_obj(topo);
>
> @@ -1823,9 +1845,19 @@
> copy_numa->dist_from_closed = numa->dist_from_closed;
> opal_list_append(sorted_list, &copy_numa->super);
> }
> - return;
> + return 0;
> }else {
> /* don't already know it - go get it */
> + /* firstly we check if we need to autodetect OpenFabrics devices or we have the specified one */
> + if (!strcmp(device_name, "auto")) {
> + count = find_devices(topo, device_name);
> + if (count > 1) {
> + return count;
> + }
> + }
> + if (!device_name || (strlen(device_name) == 0)) {
> + return 1;
> + }
> sort_by_dist(topo, device_name, sorted_list);
> /* store this info in summary object for later usage */
> OPAL_LIST_FOREACH(numa, sorted_list, orte_rmaps_numa_node_t) {
> @@ -1834,7 +1866,7 @@
> copy_numa->dist_from_closed = numa->dist_from_closed;
> opal_list_append(&(sum->sorted_by_dist_list), &copy_numa->super);
> }
> - return;
> + return 0;
> }
> }
> }
>
> Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt
> ==============================================================================
> --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt Wed Aug 28 12:03:23 2013 (r29078)
> +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) (r29079)
> @@ -29,3 +29,11 @@
> Node: %s
>
> Open MPI therefore cannot mapp the application as specified.
> +#
> +[orte-rmaps-mindist:several-hca-devices]
> +There are several OpenFabrics devices found on at least one node. Please specify the definite one.
> +
> + Devices: %d
> + Node: %s
> +
> +Open MPI therefore cannot mapp the application as specified.
>
> Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c
> ==============================================================================
> --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c Wed Aug 28 12:03:23 2013 (r29078)
> +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) (r29079)
> @@ -71,6 +71,7 @@
> mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
> bool initial_map=true;
> bool bynode = false;
> + int ret;
>
> /* this mapper can only handle initial launch
> * when mindist mapping is desired
> @@ -245,7 +246,13 @@
> * so we call opal_hwloc_base_get_nbobjs_by_type */
> opal_hwloc_base_get_nbobjs_by_type(node->topology, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
> OBJ_CONSTRUCT(&numa_list, opal_list_t);
> - opal_hwloc_get_sorted_numa_list(node->topology, orte_rmaps_base.device, &numa_list);
> + ret = opal_hwloc_get_sorted_numa_list(node->topology, orte_rmaps_base.device, &numa_list);
> + if (ret > 1) {
> + orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-hca-devices",
> + true, ret, node->name);
> + rc = ORTE_ERR_SILENT;
> + goto error;
> + }
> if (opal_list_get_size(&numa_list) > 0) {
> j = 0;
> required = 0;
> @@ -390,7 +397,7 @@
> }
> OBJ_DESTRUCT(&node_list);
> }
> -
> + free(orte_rmaps_base.device);
> return ORTE_SUCCESS;
>
> error:
> _______________________________________________
> svn-full mailing list
> svn-full_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full

-- 
Jeff Squyres
jsquyres_at_[hidden]
For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/