Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [OMPI svn-full] svn:open-mpi r25005
From: Jeff Squyres (jsquyres_at_[hidden])
Date: 2011-08-08 11:28:08


Mike --

Does mxm_init() do Reasonable Things to check whether the local OpenFabrics-capable devices are unsuitable for MXM? E.g., does it check whether the local OpenFabrics devices are MXM-capable and, if not, fail gracefully?

Also, I would suggest NOT showing a show_help message if there are OF devices available such that CM/MXM can (probably) fail over to OB1/openib. I.e., only show a show_help message if devices are available for MXM but an actual error occurs during MXM initialization.

Otherwise, if I mpirun (with the MXM MTL installed) on a system with only RoCE or iWARP devices present, MXM will complain but then fail over to OB1/openib. That would probably be confusing.

On Aug 7, 2011, at 8:06 AM, miked_at_[hidden] wrote:

> Author: miked
> Date: 2011-08-07 08:06:49 EDT (Sun, 07 Aug 2011)
> New Revision: 25005
> URL: https://svn.open-mpi.org/trac/ompi/changeset/25005
>
> Log:
> better mxm selection mechanism, some refactoring
> Text files modified:
> trunk/ompi/mca/mtl/mxm/mtl_mxm_cancel.c | 4 ++--
> trunk/ompi/mca/mtl/mxm/mtl_mxm_component.c | 32 ++++++++++++++------------------
> trunk/ompi/mca/mtl/mxm/mtl_mxm_recv.c | 6 +++---
> trunk/ompi/mca/mtl/mxm/mtl_mxm_request.h | 6 +++++-
> trunk/ompi/mca/mtl/mxm/mtl_mxm_send.c | 4 ++--
> 5 files changed, 26 insertions(+), 26 deletions(-)
>
> Modified: trunk/ompi/mca/mtl/mxm/mtl_mxm_cancel.c
> ==============================================================================
> --- trunk/ompi/mca/mtl/mxm/mtl_mxm_cancel.c (original)
> +++ trunk/ompi/mca/mtl/mxm/mtl_mxm_cancel.c 2011-08-07 08:06:49 EDT (Sun, 07 Aug 2011)
> @@ -18,9 +18,9 @@
> mxm_error_t err;
> mca_mtl_mxm_request_t *mtl_mxm_request = (mca_mtl_mxm_request_t*) mtl_request;
>
> - err = mxm_req_cancel(mtl_mxm_request->mxm_base_request);
> + err = mxm_req_cancel(&mtl_mxm_request->mxm.base);
> if (MXM_OK == err) {
> - err = mxm_req_test(mtl_mxm_request->mxm_base_request);
> + err = mxm_req_test(&mtl_mxm_request->mxm.base);
> if (MXM_OK == err) {
> mtl_request->ompi_req->req_status._cancelled = true;
> mtl_mxm_request->super.completion_callback(&mtl_mxm_request->super);
>
> Modified: trunk/ompi/mca/mtl/mxm/mtl_mxm_component.c
> ==============================================================================
> --- trunk/ompi/mca/mtl/mxm/mtl_mxm_component.c (original)
> +++ trunk/ompi/mca/mtl/mxm/mtl_mxm_component.c 2011-08-07 08:06:49 EDT (Sun, 07 Aug 2011)
> @@ -72,18 +72,27 @@
>
> static int ompi_mtl_mxm_component_open(void)
> {
> - struct stat st;
>
> - /* Component available only if IB hardware is present */
> - if (0 == stat("/dev/infiniband/uverbs0", &st)) {
> - return OMPI_SUCCESS;
> - } else {
> + mxm_context_opts_t mxm_opts;
> + mxm_error_t err;
> +
> + mca_mtl_mxm_output = opal_output_open(NULL);
> + opal_output_set_verbosity(mca_mtl_mxm_output, ompi_mtl_mxm.verbose);
> +
> + mxm_fill_context_opts(&mxm_opts);
> + err = mxm_init(&mxm_opts, &ompi_mtl_mxm.mxm_context);
> + if (MXM_OK != err) {
> + orte_show_help("help-mtl-mxm.txt", "mxm init", true,
> + mxm_error_string(err));
> return OPAL_ERR_NOT_AVAILABLE;
> }
> + return OMPI_SUCCESS;
> }
>
> static int ompi_mtl_mxm_component_close(void)
> {
> + mxm_cleanup(ompi_mtl_mxm.mxm_context);
> + ompi_mtl_mxm.mxm_context = NULL;
> return OMPI_SUCCESS;
> }
>
> @@ -91,21 +100,8 @@
> ompi_mtl_mxm_component_init(bool enable_progress_threads,
> bool enable_mpi_threads)
> {
> - mxm_context_opts_t mxm_opts;
> - mxm_error_t err;
> int rc;
>
> - mca_mtl_mxm_output = opal_output_open(NULL);
> - opal_output_set_verbosity(mca_mtl_mxm_output, ompi_mtl_mxm.verbose);
> -
> - mxm_fill_context_opts(&mxm_opts);
> - err = mxm_init(&mxm_opts, &ompi_mtl_mxm.mxm_context);
> - if (MXM_OK != err) {
> - orte_show_help("help-mtl-mxm.txt", "mxm init", true,
> - mxm_error_string(err));
> - return NULL;
> - }
> -
> rc = ompi_mtl_mxm_module_init();
> if (OMPI_SUCCESS != rc) {
> return NULL;
>
> Modified: trunk/ompi/mca/mtl/mxm/mtl_mxm_recv.c
> ==============================================================================
> --- trunk/ompi/mca/mtl/mxm/mtl_mxm_recv.c (original)
> +++ trunk/ompi/mca/mtl/mxm/mtl_mxm_recv.c 2011-08-07 08:06:49 EDT (Sun, 07 Aug 2011)
> @@ -22,12 +22,12 @@
> {
> mca_mtl_mxm_request_t *req = (mca_mtl_mxm_request_t *) context;
> struct ompi_request_t *ompi_req = req->super.ompi_req;
> - mxm_recv_req_t *mxm_recv_req = (mxm_recv_req_t *)req->mxm_base_request;
> + mxm_recv_req_t *mxm_recv_req = &req->mxm.recv;
>
> /* Set completion status and envelope */
> ompi_req->req_status.MPI_TAG = mxm_recv_req->completion.sender_tag;
> ompi_req->req_status.MPI_SOURCE = mxm_recv_req->completion.sender_imm;
> - ompi_req->req_status.MPI_ERROR = ompi_mtl_mxm_to_mpi_status(req->mxm_base_request->error);
> + ompi_req->req_status.MPI_ERROR = ompi_mtl_mxm_to_mpi_status(mxm_recv_req->base.error);
> ompi_req->req_status._ucount = mxm_recv_req->completion.actual_len;
>
> /* Copy data */
> @@ -63,7 +63,7 @@
> return ret;
> }
>
> - mxm_recv_req = (mxm_recv_req_t *)mtl_mxm_request->mxm_base_request;
> + mxm_recv_req = &mtl_mxm_request->mxm.recv;
>
> /* prepare a receive request embedded in the MTL request */
> mxm_recv_req->base.state = MXM_REQ_NEW;
>
> Modified: trunk/ompi/mca/mtl/mxm/mtl_mxm_request.h
> ==============================================================================
> --- trunk/ompi/mca/mtl/mxm/mtl_mxm_request.h (original)
> +++ trunk/ompi/mca/mtl/mxm/mtl_mxm_request.h 2011-08-07 08:06:49 EDT (Sun, 07 Aug 2011)
> @@ -16,7 +16,11 @@
>
> struct mca_mtl_mxm_request_t {
> struct mca_mtl_request_t super;
> - mxm_req_base_t *mxm_base_request;
> + union {
> + mxm_req_base_t base;
> + mxm_send_req_t send;
> + mxm_recv_req_t recv;
> + } mxm;
> /* mxm_segment_t mxm_segment[1]; */
> void *buf;
> size_t length;
>
> Modified: trunk/ompi/mca/mtl/mxm/mtl_mxm_send.c
> ==============================================================================
> --- trunk/ompi/mca/mtl/mxm/mtl_mxm_send.c (original)
> +++ trunk/ompi/mca/mtl/mxm/mtl_mxm_send.c 2011-08-07 08:06:49 EDT (Sun, 07 Aug 2011)
> @@ -25,7 +25,7 @@
> free(mtl_mxm_request->buf);
> }
>
> - mtl_mxm_request->super.ompi_req->req_status.MPI_ERROR = ompi_mtl_mxm_to_mpi_status(mtl_mxm_request->mxm_base_request->error);
> + mtl_mxm_request->super.ompi_req->req_status.MPI_ERROR = ompi_mtl_mxm_to_mpi_status(mtl_mxm_request->mxm.base.error);
>
> mtl_mxm_request->super.completion_callback(&mtl_mxm_request->super);
> }
> @@ -93,7 +93,7 @@
> return ret;
> }
>
> - mxm_send_req = (mxm_send_req_t *) mtl_mxm_request->mxm_base_request;
> + mxm_send_req = &mtl_mxm_request->mxm.send;
>
> /* prepare a send request embedded in the MTL request */
> mxm_send_req->base.state = MXM_REQ_NEW;
> _______________________________________________
> svn-full mailing list
> svn-full_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full

-- 
Jeff Squyres
jsquyres_at_[hidden]
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/