Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] hcoll destruction via MPI attribute
From: Jeff Squyres (jsquyres) (jsquyres_at_[hidden])
Date: 2014-01-09 06:45:29


Mike / Devendar --

I'd like to understand the use case for this a bit more, from the perspective of: is there more infrastructure that we need to provide in the coll framework? Can you explain what you're trying to do, and when you need your cleanup action(s) to run?

I ask because this seems a little odd (use an MPI attribute to trigger something during shutdown). Also, you commented out a bunch of code that doesn't seem to have been replaced with anything -- the attribute delete function simply calls hcoll_group_destroy_notify(); it doesn't do any of the things that the now-commented-out code in the hcoll module destructor does.

Thanks.

On Jan 9, 2014, at 6:27 AM, <svn-commit-mailer_at_[hidden]> wrote:

> Author: miked (Mike Dubman)
> Date: 2014-01-09 06:27:24 EST (Thu, 09 Jan 2014)
> New Revision: 30175
> URL: https://svn.open-mpi.org/trac/ompi/changeset/30175
>
> Log:
> Create a comm keyval for hcoll component with delete callback function.
> Set comm attribute with keyval.
> Wait for pending hcoll module tasks in comm delete callback where PML
> still valid on the communicator. safely destroy hcoll context during
> hcoll module destructor.
>
> Author: Devendar Bureddy
> reviewed by miked
>
> cmr=v1.7.4:reviewer=ompi-rm1.7
>
> Text files modified:
> trunk/ompi/mca/coll/hcoll/coll_hcoll.h | 1
> trunk/ompi/mca/coll/hcoll/coll_hcoll_module.c | 51 ++++++++++++++++++++++++++++++++++++---
> 2 files changed, 48 insertions(+), 4 deletions(-)
>
> Modified: trunk/ompi/mca/coll/hcoll/coll_hcoll.h
> ==============================================================================
> --- trunk/ompi/mca/coll/hcoll/coll_hcoll.h Thu Jan 9 00:56:29 2014 (r30174)
> +++ trunk/ompi/mca/coll/hcoll/coll_hcoll.h 2014-01-09 06:27:24 EST (Thu, 09 Jan 2014) (r30175)
> @@ -19,6 +19,7 @@
> #include "ompi/mca/pml/pml.h"
> #include "ompi/mca/coll/base/coll_tags.h"
> #include "ompi/communicator/communicator.h"
> +#include "ompi/attribute/attribute.h"
> #include "ompi/op/op.h"
>
> #include "orte/runtime/orte_globals.h"
>
> Modified: trunk/ompi/mca/coll/hcoll/coll_hcoll_module.c
> ==============================================================================
> --- trunk/ompi/mca/coll/hcoll/coll_hcoll_module.c Thu Jan 9 00:56:29 2014 (r30174)
> +++ trunk/ompi/mca/coll/hcoll/coll_hcoll_module.c 2014-01-09 06:27:24 EST (Thu, 09 Jan 2014) (r30175)
> @@ -10,6 +10,8 @@
> #include "ompi_config.h"
> #include "coll_hcoll.h"
>
> +int hcoll_comm_attr_keyval;
> +
> /*
> * Initial query function that is invoked during MPI_INIT, allowing
> * this module to indicate what level of thread support it provides.
> @@ -57,6 +59,7 @@
> am = &mca_coll_hcoll_component.active_modules;
>
> if (hcoll_module->comm == &ompi_mpi_comm_world.comm){
> +#if 0
> /* If we get here then we are detroying MPI_COMM_WORLD now. So,
> * it is safe to destory all the other communicators and corresponding
> * hcoll contexts that could still be on the "active_modules" list.
> @@ -86,6 +89,10 @@
> (rte_grp_handle_t)hcoll_module->comm,
> &context_destroyed);
> }
> +#endif
> + if (OMPI_SUCCESS != ompi_attr_free_keyval(COMM_ATTR, &hcoll_comm_attr_keyval, 0)) {
> + HCOL_VERBOSE(1,"hcoll ompi_attr_free_keyval failed");
> + }
> }
>
> OBJ_RELEASE(hcoll_module->previous_barrier_module);
> @@ -104,6 +111,11 @@
> OBJ_RELEASE(hcoll_module->previous_ibcast_module);
> OBJ_RELEASE(hcoll_module->previous_iallreduce_module);
> OBJ_RELEASE(hcoll_module->previous_iallgather_module);
> + context_destroyed = 0;
> + hcoll_destroy_context(hcoll_module->hcoll_context,
> + (rte_grp_handle_t)hcoll_module->comm,
> + &context_destroyed);
> + assert(context_destroyed);
>
> mca_coll_hcoll_module_clear(hcoll_module);
> }
> @@ -142,7 +154,18 @@
> return OMPI_SUCCESS;
> }
>
> +/*
> +** Communicator free callback
> +*/
> +int hcoll_comm_attr_del_fn(MPI_Comm comm, int keyval, void *attr_val, void *extra)
> +{
> +
> + mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) attr_val;
>
> + hcoll_group_destroy_notify(hcoll_module->hcoll_context);
> + return OMPI_SUCCESS;
> +
> +}
> /*
> * Initialize module on the communicator
> */
> @@ -150,6 +173,7 @@
> struct ompi_communicator_t *comm)
> {
> mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) module;
> + int ret;
> hcoll_module->comm = comm;
> if (OMPI_SUCCESS != __save_coll_handlers(hcoll_module)){
> HCOL_ERROR("coll_hcol: __save_coll_handlers failed");
> @@ -166,6 +190,7 @@
> return OMPI_ERROR;
> }
>
> +#if 0
> if (comm != &ompi_mpi_comm_world.comm){
> mca_coll_hcoll_module_list_item_wrapper_t *mw =
> OBJ_NEW(mca_coll_hcoll_module_list_item_wrapper_t);
> @@ -174,6 +199,13 @@
> opal_list_append(&mca_coll_hcoll_component.active_modules,
> (opal_list_item_t*)mw);
> }
> +#endif
> +
> + ret = ompi_attr_set_c(COMM_ATTR, comm, &comm->c_keyhash, hcoll_comm_attr_keyval, (void *)hcoll_module, false);
> + if (OMPI_SUCCESS != ret) {
> + HCOL_VERBOSE(1,"hcoll ompi_attr_set_c failed");
> + return OMPI_ERROR;
> + }
>
> return OMPI_SUCCESS;
> }
> @@ -197,7 +229,7 @@
> if (ompi_mpi_finalized){
> hcoll_rte_p2p_disabled_notify();
> }
> -
> +#if 0
> item = opal_list_get_first(am);
> while (item != opal_list_get_end(am)){
> item_next = opal_list_get_next(item);
> @@ -225,10 +257,9 @@
> }
> item = item_next;
> }
> +#endif
>
> - if (!ompi_mpi_finalized) {
> - (*hcoll_progress_fn)();
> - }
> + (*hcoll_progress_fn)();
> OPAL_THREAD_ADD32(&mca_coll_hcoll_component.progress_lock,-1);
> return OMPI_SUCCESS;
> }
> @@ -245,6 +276,9 @@
> mca_coll_base_module_t *module;
> mca_coll_hcoll_module_t *hcoll_module;
> static bool libhcoll_initialized = false;
> + ompi_attribute_fn_ptr_union_t del_fn;
> + ompi_attribute_fn_ptr_union_t copy_fn;
> + int err;
> int rc;
>
> *priority = 0;
> @@ -269,6 +303,15 @@
> HCOL_VERBOSE(0,"Hcol library init failed");
> return NULL;
> }
> +
> + copy_fn.attr_communicator_copy_fn = (MPI_Comm_internal_copy_attr_function*) MPI_COMM_NULL_COPY_FN;
> + del_fn.attr_communicator_delete_fn = hcoll_comm_attr_del_fn;
> + err = ompi_attr_create_keyval(COMM_ATTR, copy_fn, del_fn, &hcoll_comm_attr_keyval, NULL ,0, NULL);
> + if (OMPI_SUCCESS != err) {
> + HCOL_VERBOSE(0,"Hcol comm keyval create failed");
> + return NULL;
> + }
> +
> libhcoll_initialized = true;
> }
> hcoll_module = OBJ_NEW(mca_coll_hcoll_module_t);
> _______________________________________________
> svn-full mailing list
> svn-full_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full

-- 
Jeff Squyres
jsquyres_at_[hidden]
For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/