Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [OMPI svn] svn:open-mpi r26804 - trunk/ompi/mca/btl/openib
From: Ralph Castain (rhc_at_[hidden])
Date: 2012-07-19 12:18:46


I've spent hours trying to fix this commit so openib would even compile again, but failed. Just too many errors. Setting aside the need to include <sys/types.h>, <sys/stat.h>, and <unistd.h> to handle the stat call under Linux, there is no function "read_module_param" anywhere, nor is "device" defined in btl_openib_component.c.

Please - a tad more care in what gets committed??

I finally just reverted it so the trunk could build.

On Jul 18, 2012, at 10:29 AM, svn-commit-mailer_at_[hidden] wrote:

> Author: hjelmn (Nathan Hjelm)
> Date: 2012-07-18 13:29:48 EDT (Wed, 18 Jul 2012)
> New Revision: 26804
> URL: https://svn.open-mpi.org/trac/ompi/changeset/26804
>
> Log:
> btl/openib: limit each process to a ppn fraction of the available registered memory when using mellanox hardware (mlx4 and mthca)
>
> Text files modified:
> trunk/ompi/mca/btl/openib/btl_openib.c | 74 ++++++++++++++++++++++++++++++++++++++-
> trunk/ompi/mca/btl/openib/btl_openib.h | 4 ++
> trunk/ompi/mca/btl/openib/btl_openib_component.c | 15 ++++++++
> trunk/ompi/mca/btl/openib/help-mpi-btl-openib.txt | 19 ++++++++++
> 4 files changed, 110 insertions(+), 2 deletions(-)
>
> Modified: trunk/ompi/mca/btl/openib/btl_openib.c
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/btl_openib.c Wed Jul 18 13:29:37 2012 (r26803)
> +++ trunk/ompi/mca/btl/openib/btl_openib.c 2012-07-18 13:29:48 EDT (Wed, 18 Jul 2012) (r26804)
> @@ -70,6 +70,10 @@
> #ifdef HAVE_UNISTD_H
> #include <unistd.h>
> #endif
> +#ifdef OPAL_HAVE_HWLOC
> +#include "opal/mca/hwloc/hwloc.h"
> +#endif
> +
> #ifndef MIN
> #define MIN(a,b) ((a)<(b)?(a):(b))
> #endif
> @@ -579,6 +583,65 @@
> return OMPI_SUCCESS;
> }
>
> +/* calculate memory registration limits */
> +static uint64_t calculate_total_mem (void)
> +{
> +#if OPAL_HAVE_HWLOC
> + hwloc_obj_t machine;
> +
> + machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
> + if (NULL == machine) {
> + return 0;
> + }
> +
> + return machine->memory.total_memory;
> +#else
> + return 0;
> +#endif
> +}
> +
> +static uint64_t calculate_max_reg (void)
> +{
> + struct stat statinfo;
> + uint64_t mtts_per_seg = 1;
> + uint64_t num_mtt = 1 << 19;
> + uint64_t reserved_mtt = 0;
> + uint64_t max_reg, mem_total;
> +
> + mem_total = calculate_total_mem ();
> +
> + if (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) {
> + mtts_per_seg = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
> + num_mtt = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);
> + if (1 == num_mtt) {
> + /* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */
> + num_mtt = 1 << 20;
> + }
> +
> + max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
> + } else if (0 == stat("/sys/module/ib_mthca/parameters", &statinfo)) {
> + mtts_per_seg = 1 << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1);
> + num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20);
> + reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0);
> +
> + max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
> + } else {
> + /* need to update to determine the registration limit for this configuration */
> + max_reg = mem_total;
> + }
> +
> + /* NTH: print a warning if we can't register more than 75% of physical memory */
> + if (max_reg < mem_total * 3 / 4) {
> + orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
> + orte_process_info.nodename, (unsigned long)(max_reg >> 20),
> + (unsigned long)(mem_total >> 20));
> + }
> +
> + /* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */
> + return (max_reg * 7) >> 3;
> +}
> +
> +
> /*
> * add a proc to this btl module
> * creates an endpoint that is setup on the
> @@ -592,7 +655,7 @@
> opal_bitmap_t* reachable)
> {
> mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
> - int i,j, rc;
> + int i,j, rc, local_procs;
> int rem_subnet_id_port_cnt;
> int lcl_subnet_id_port_cnt = 0;
> int btl_rank = 0;
> @@ -621,13 +684,17 @@
> }
> #endif
>
> - for (i = 0; i < (int) nprocs; i++) {
> + for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
> struct ompi_proc_t* ompi_proc = ompi_procs[i];
> mca_btl_openib_proc_t* ib_proc;
> int remote_matching_port;
>
> opal_output(-1, "add procs: adding proc %d", i);
>
> + if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
> + local_procs ++;
> + }
> +
> /* OOB, XOOB, and RDMACM do not support SELF communication, so
> * mark the proc as unreachable by openib btl */
> if (OPAL_EQUAL == orte_util_compare_name_fields
> @@ -794,6 +861,9 @@
> peers[i] = endpoint;
> }
>
> + openib_btl->local_procs += local_procs;
> + openib_btl->device->mem_reg_max = calculate_max_reg () / openib_btl->local_procs;
> +
> return mca_btl_openib_size_queues(openib_btl, nprocs);
> }
>
>
> Modified: trunk/ompi/mca/btl/openib/btl_openib.h
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/btl_openib.h Wed Jul 18 13:29:37 2012 (r26803)
> +++ trunk/ompi/mca/btl/openib/btl_openib.h 2012-07-18 13:29:48 EDT (Wed, 18 Jul 2012) (r26804)
> @@ -390,6 +390,8 @@
> mca_btl_openib_device_qp_t *qps;
> /* Maximum value supported by this device for max_inline_data */
> uint32_t max_inline_data;
> + /* Registration limit and current count */
> + uint64_t mem_reg_max, mem_reg_active;
> } mca_btl_openib_device_t;
> OBJ_CLASS_DECLARATION(mca_btl_openib_device_t);
>
> @@ -467,6 +469,8 @@
> mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
>
> mca_btl_openib_module_qp_t * qps;
> +
> + int local_procs; /** number of local procs */
> };
> typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
>
>
> Modified: trunk/ompi/mca/btl/openib/btl_openib_component.c
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/btl_openib_component.c Wed Jul 18 13:29:37 2012 (r26803)
> +++ trunk/ompi/mca/btl/openib/btl_openib_component.c 2012-07-18 13:29:48 EDT (Wed, 18 Jul 2012) (r26804)
> @@ -596,6 +596,13 @@
> enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
> IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
>
> + if (device->mem_reg_max &&
> + device->mem_reg_max < (device->mem_reg_active + size)) {
> + return OMPI_ERR_OUT_OF_RESOURCE;
> + }
> +
> + device->mem_reg_active += size;
> +
> #if HAVE_DECL_IBV_ACCESS_SO
> if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) {
> access_flag |= IBV_ACCESS_SO;
> @@ -637,6 +644,9 @@
> #endif
>
> }
> +
> + device->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1);
> +
> openib_reg->mr = NULL;
> return OMPI_SUCCESS;
> }
> @@ -818,6 +828,7 @@
>
> openib_btl->cpcs = NULL;
> openib_btl->num_cpcs = 0;
> + openib_btl->local_procs = 0;
>
> mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
> mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
> @@ -1670,6 +1681,10 @@
> return OMPI_ERR_OUT_OF_RESOURCE;
> }
>
> + device->mem_reg_active = 0;
> + /* NTH: set some high default until we know how many local peers we have */
> + device->mem_reg_max = 1ull << 48;
> +
> device->ib_dev = ib_dev;
> device->ib_dev_context = ibv_open_device(ib_dev);
> device->ib_pd = NULL;
>
> Modified: trunk/ompi/mca/btl/openib/help-mpi-btl-openib.txt
> ==============================================================================
> --- trunk/ompi/mca/btl/openib/help-mpi-btl-openib.txt Wed Jul 18 13:29:37 2012 (r26803)
> +++ trunk/ompi/mca/btl/openib/help-mpi-btl-openib.txt 2012-07-18 13:29:48 EDT (Wed, 18 Jul 2012) (r26804)
> @@ -689,3 +689,22 @@
>
> Use "ibv_devinfo -v" on the local host to see the GID table of this
> device.
> +[reg mem limit low]
> +WARNING: It appears that your OpenFabrics subsystem is configured to only
> +allow registering part of your physical memory. This can cause MPI jobs to
> +run with erratic performance, hang, and/or crash.
> +
> +This may be caused by your OpenFabrics vendor limiting the amount of
> +physical memory that can be registered. You should investigate the
> +relevant Linux kernel module parameters that control how much physical
> +memory can be registered, and increase them to allow registering all
> +physical memory on your machine.
> +
> +See this Open MPI FAQ item for more information on these Linux kernel module
> +parameters:
> +
> + http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
> +
> + Local host: %s
> + Registerable memory: %lu MiB
> + Total memory: %lu MiB
> _______________________________________________
> svn mailing list
> svn_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/svn