Adrian - are you going to commit this? Or do you want one of us to do so for you?


On Dec 9, 2013, at 1:39 PM, Josh Hursey <jjhursey@open-mpi.org> wrote:

With the modification that Ralph mentioned below, I think the patch it good to go.

Thanks!


On Mon, Dec 9, 2013 at 2:19 PM, Ralph Castain <rhc@open-mpi.org> wrote:
On Dec 9, 2013, at 10:07 AM, Ralph Castain <rhc@open-mpi.org> wrote:

> I see some things in here that concern me. First, there are variables being added to functions that would appear to generate "not used" warnings if ft is not enabled - they need to be properly protected. Second, I see references like this one:
>
> -        (ret = orte_oob.ft_event(state)) ) {
> +    if( ORTE_SUCCESS != (ret = orte_rml_oob_module.super.ft_event(state)) ) {

Took me awhile to grok what you were doing with the above line - it's actually okay, but you could avoid going thru the module dereference by just calling

orte_rml_oob_ft_event

No need to reference thru the module unless you want to for some reason.


>
> This doesn't seem right - if we are referencing the OOB, then we need to go directly to it. I'll have to check/correct the code, but the RML shouldn't even be storing a pointer to the OOB in it as there no longer is a direct linkage.
>
>
> On Dec 9, 2013, at 5:38 AM, Adrian Reber <adrian@lisas.de> wrote:
>
>> From: Adrian Reber <adrian.reber@hs-esslingen.de>
>>
>> This are the remaining changes to get C/R to compile again. This patch
>> includes various fixes all over the C/R code and are hard to group
>> like the previous patches.
>>
>> Changes from V1:
>> * explain why mca_base_component_distill_checkpoint_ready no longer works
>> * compare return result of opal functions with OPAL_* values
>>
>> Signed-off-by: Adrian Reber <adrian.reber@hs-esslingen.de>
>> ---
>> ompi/mca/bml/r2/bml_r2_ft.c                    | 10 +++++-----
>> opal/mca/base/mca_base_components_open.c       |  9 +++++++++
>> opal/mca/crs/self/crs_self_component.c         | 16 ++++++++--------
>> opal/tools/opal-restart/opal-restart.c         |  2 +-
>> orte/mca/errmgr/base/errmgr_base_fns.c         |  2 +-
>> orte/mca/ess/env/ess_env_module.c              |  2 +-
>> orte/mca/plm/base/plm_base_launch_support.c    |  1 +
>> orte/mca/rml/oob/rml_oob_component.c           |  9 ++-------
>> orte/mca/snapc/base/snapc_base_frame.c         |  4 ++--
>> orte/mca/snapc/full/snapc_full_app.c           | 15 +++++++++++++++
>> orte/mca/snapc/full/snapc_full_component.c     |  7 ++++---
>> orte/mca/snapc/full/snapc_full_global.c        |  8 ++++----
>> orte/mca/snapc/full/snapc_full_local.c         |  2 +-
>> orte/mca/sstore/stage/sstore_stage_component.c |  5 +++++
>> 14 files changed, 59 insertions(+), 33 deletions(-)
>>
>> diff --git a/ompi/mca/bml/r2/bml_r2_ft.c b/ompi/mca/bml/r2/bml_r2_ft.c
>> index 1448c04..fc16452 100644
>> --- a/ompi/mca/bml/r2/bml_r2_ft.c
>> +++ b/ompi/mca/bml/r2/bml_r2_ft.c
>> @@ -191,7 +191,7 @@ int mca_bml_r2_ft_event(int state)
>>
>>            for(p = 0; p < (int)num_procs; ++p) {
>>                if( NULL != procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
>> -                    OBJ_RELEASE((mca_bml_base_endpoint_t*) procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);
>> +                    OBJ_RELEASE(procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);
>>                    procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
>>                }
>>
>> @@ -263,9 +263,9 @@ int mca_bml_r2_ft_event(int state)
>>        mca_base_var_get_value(param_type, &btl_list, NULL, NULL);
>>        opal_output_verbose(11, ompi_cr_output,
>>                            "Restart (New BTL MCA): <%s>\n", btl_list ? btl_list[0] : "");
>> -        if( NULL != param_list ) {
>> -            free(param_list);
>> -            param_list = NULL;
>> +        if( NULL != btl_list ) {
>> +            free(btl_list);
>> +            btl_list = NULL;
>>        }
>>
>>        /*
>> @@ -286,7 +286,7 @@ int mca_bml_r2_ft_event(int state)
>>
>>        for(p = 0; p < (int)num_procs; ++p) {
>>            if( NULL != procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
>> -                OBJ_RELEASE((mca_bml_base_endpoint_t*) procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);
>> +                OBJ_RELEASE(procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);
>>                procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
>>            }
>>
>> diff --git a/opal/mca/base/mca_base_components_open.c b/opal/mca/base/mca_base_components_open.c
>> index e46e0f3..4568a51 100644
>> --- a/opal/mca/base/mca_base_components_open.c
>> +++ b/opal/mca/base/mca_base_components_open.c
>> @@ -141,9 +141,18 @@ static int open_components(mca_base_framework_t *framework)
>>     * NTH: Logic moved to mca_base_components_filter.
>>     */
>> #if (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1)
>> +#ifdef ENABLE_FT_FIXED
>> +    /* FIXME_FT
>> +     *
>> +     * the variable mca_base_component_distill_checkpoint_ready
>> +     * was removed by commit 8181c8273c486bba59b3dead324939eac1a58b8c (r28237)
>> +     * "Introduce the MCA framework system. This formalizes the interface frameworks must provide."
>> +     *
>> +     * */
>>    if (mca_base_component_distill_checkpoint_ready) {
>>        open_only_flags |= MCA_BASE_METADATA_PARAM_CHECKPOINT;
>>    }
>> +#endif /* ENABLE_FT_FIXED */
>> #endif  /* (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1) */
>>
>>    /* If mca_base_framework_register_components was called with the MCA_BASE_COMPONENTS_ALL flag
>> diff --git a/opal/mca/crs/self/crs_self_component.c b/opal/mca/crs/self/crs_self_component.c
>> index e0ca1ab..eb45d59 100644
>> --- a/opal/mca/crs/self/crs_self_component.c
>> +++ b/opal/mca/crs/self/crs_self_component.c
>> @@ -90,9 +90,9 @@ static int crs_self_register (void)
>>    mca_crs_self_component.super.priority = 20;
>>    ret = mca_base_component_var_register (&mca_crs_self_component.super.base_version,
>>                                           "priority", "Priority of the CRS self component "
>> -                                           "(default: 20)", MCA_BASE_VAR_TYPE_INT, NULL,
>> +                                           "(default: 20)", MCA_BASE_VAR_TYPE_INT, NULL, 0,
>>                                           MCA_BASE_VAR_FLAG_SETTABLE,
>> -                                           OPAL_INFO_LVL_9, MPI_BASE_VAR_SCOPE_ALL_EQ,
>> +                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
>>                                           &mca_crs_self_component.super.priority);
>>    if (0 > ret) {
>>        return ret;
>> @@ -102,8 +102,8 @@ static int crs_self_register (void)
>>    ret = mca_base_component_var_register (&mca_crs_self_component.super.base_version,
>>                                           "verbose",
>>                                           "Verbose level for the CRS self component",
>> -                                           MCA_BASE_VAR_TYPE_INT, NULL,MCA_BASE_VAR_FLAG_SETTABLE,
>> -                                           OPAL_INFO_LVL_9, MPI_BASE_VAR_SCOPE_LOCAL,
>> +                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
>> +                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
>>                                           &mca_crs_self_component.super.verbose);
>>    if (0 > ret) {
>>        return ret;
>> @@ -116,8 +116,8 @@ static int crs_self_register (void)
>>    ret = mca_base_component_var_register (&mca_crs_self_component.super.base_version,
>>                                           "prefix",
>>                                           "Prefix for user defined callback functions",
>> -                                           MCA_BASE_VAR_TYPE_STRING, NULL, MCA_BASE_VAR_FLAG_SETTABLE,
>> -                                           OPAL_INFO_LVL_9, MPI_BASE_VAR_SCOPE_LOCAL,
>> +                                           MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
>> +                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
>>                                           &mca_crs_self_component.prefix);
>>    if (0 > ret) {
>>        return ret;
>> @@ -126,8 +126,8 @@ static int crs_self_register (void)
>>    ret = mca_base_component_var_register (&mca_crs_self_component.super.base_version,
>>                                           "do_restart",
>>                                           "Start execution by calling restart callback",
>> -                                           MCA_BASE_VAR_TYPE_BOOL, NULL, MCA_BASE_VAR_FLAG_SETTABLE,
>> -                                           OPAL_INFO_LVL_9, MPI_BASE_VAR_SCOPE_LOCAL,
>> +                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
>> +                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
>>                                           &mca_crs_self_component.do_restart);
>>    return (0 > ret) ? ret : OPAL_SUCCESS;
>> }
>> diff --git a/opal/tools/opal-restart/opal-restart.c b/opal/tools/opal-restart/opal-restart.c
>> index 35b7843..53da7f3 100644
>> --- a/opal/tools/opal-restart/opal-restart.c
>> +++ b/opal/tools/opal-restart/opal-restart.c
>> @@ -247,7 +247,7 @@ main(int argc, char *argv[])
>>     * restart on this node because it doesn't have the proper checkpointer
>>     * available.
>>     */
>> -    if( OPAL_SUCCESS != (ret = opal_crs_base_open()) ) {
>> +    if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) {
>>        opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
>>                       "crs", ret);
>>        exit_status = ret;
>> diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c
>> index 399c237..e8f41a2 100644
>> --- a/orte/mca/errmgr/base/errmgr_base_fns.c
>> +++ b/orte/mca/errmgr/base/errmgr_base_fns.c
>> @@ -366,7 +366,7 @@ void orte_errmgr_base_migrate_state_notify(int state)
>>    }
>> }
>>
>> -void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, orte_process_name_t *proc)
>> +int orte_errmgr_base_proc_state_notify(orte_proc_state_t state, orte_process_name_t *proc)
>> {
>>    if (NULL != proc) {
>>        switch(state) {
>> diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c
>> index 6a71230..9b80099 100644
>> --- a/orte/mca/ess/env/ess_env_module.c
>> +++ b/orte/mca/ess/env/ess_env_module.c
>> @@ -422,7 +422,7 @@ static int rte_ft_event(int state)
>>            exit_status = ret;
>>            goto cleanup;
>>        }
>> -        if (ORTE_SUCCESS != (ret = orte_db.remove(NULL, NULL))) {
>> +        if (OPAL_SUCCESS != (ret = opal_db.remove(NULL, NULL))) {
>>            ORTE_ERROR_LOG(ret);
>>            exit_status = ret;
>>            goto cleanup;
>> diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c
>> index 3deee11..6aba2c2 100644
>> --- a/orte/mca/plm/base/plm_base_launch_support.c
>> +++ b/orte/mca/plm/base/plm_base_launch_support.c
>> @@ -333,6 +333,7 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
>> {
>>    orte_job_t *jdata, *jdatorted;
>>    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
>> +    int rc;
>>
>>    /* if we don't want to launch the apps, now is the time to leave */
>>    if (orte_do_not_launch) {
>> diff --git a/orte/mca/rml/oob/rml_oob_component.c b/orte/mca/rml/oob/rml_oob_component.c
>> index dd539cd..b91f4a3 100644
>> --- a/orte/mca/rml/oob/rml_oob_component.c
>> +++ b/orte/mca/rml/oob/rml_oob_component.c
>> @@ -11,11 +11,7 @@
>> * Copyright (c) 2004-2005 The Regents of the University of California.
>> *                         All rights reserved.
>> * Copyright (c) 2007      Cisco Systems, Inc.  All rights reserved.
>> -<<<<<<< .mine
>> - * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
>> -=======
>> * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
>> ->>>>>>> .r28253
>> *                         All rights reserved.
>> * $COPYRIGHT$
>> *
>> @@ -189,8 +185,7 @@ orte_rml_oob_ft_event(int state) {
>>        ;
>>    }
>>
>> -    if( ORTE_SUCCESS !=
>> -        (ret = orte_oob.ft_event(state)) ) {
>> +    if( ORTE_SUCCESS != (ret = orte_rml_oob_module.super.ft_event(state)) ) {
>>        ORTE_ERROR_LOG(ret);
>>        exit_status = ret;
>>        goto cleanup;
>> @@ -212,7 +207,7 @@ orte_rml_oob_ft_event(int state) {
>>            goto cleanup;
>>        }
>>
>> -        if( ORTE_SUCCESS != (ret = mca_oob_base_select())) {
>> +        if( ORTE_SUCCESS != (ret = orte_oob_base_select())) {
>>            ORTE_ERROR_LOG(ret);
>>            exit_status = ret;
>>            goto cleanup;
>> diff --git a/orte/mca/snapc/base/snapc_base_frame.c b/orte/mca/snapc/base/snapc_base_frame.c
>> index edb8e6e..a46c77a 100644
>> --- a/orte/mca/snapc/base/snapc_base_frame.c
>> +++ b/orte/mca/snapc/base/snapc_base_frame.c
>> @@ -79,7 +79,7 @@ static int orte_snapc_base_register(mca_base_register_flag_t flags)
>>    return ORTE_SUCCESS;
>> }
>>
>> -static int orte_snapc_base_close(void)
>> +int orte_snapc_base_close(void)
>> {
>>    /* Close the selected component */
>>    if( NULL != orte_snapc.snapc_finalize ) {
>> @@ -93,7 +93,7 @@ static int orte_snapc_base_close(void)
>> * Function for finding and opening either all MCA components,
>> * or the one that was specifically requested via a MCA parameter.
>> */
>> -static int orte_snapc_base_open(mca_base_open_flag_t flags)
>> +int orte_snapc_base_open(mca_base_open_flag_t flags)
>> {
>>    /* Init the sequence (interval) number */
>>    orte_snapc_base_snapshot_seq_number = 0;
>> diff --git a/orte/mca/snapc/full/snapc_full_app.c b/orte/mca/snapc/full/snapc_full_app.c
>> index 1ff036e..c7438f1 100644
>> --- a/orte/mca/snapc/full/snapc_full_app.c
>> +++ b/orte/mca/snapc/full/snapc_full_app.c
>> @@ -99,6 +99,12 @@ static int current_cr_state = OPAL_CRS_NONE;
>> static orte_sstore_base_handle_t current_ss_handle = ORTE_SSTORE_HANDLE_INVALID, last_ss_handle = ORTE_SSTORE_HANDLE_INVALID;
>> static opal_crs_base_ckpt_options_t *current_options = NULL;
>>
>> +static void snapc_full_app_callback_recv(int status,
>> +                                         orte_process_name_t* sender,
>> +                                         opal_buffer_t* buffer,
>> +                                         orte_rml_tag_t tag,
>> +                                         void* cbdata);
>> +
>> /************************
>> * Function Definitions
>> ************************/
>> @@ -1673,3 +1679,12 @@ int app_coord_request_op(orte_snapc_base_request_op_t *datum)
>>
>>    return exit_status;
>> }
>> +
>> +/* dummy implementation of a callback function to get it to compile again */
>> +static void snapc_full_app_callback_recv(int status,
>> +                                         orte_process_name_t* sender,
>> +                                         opal_buffer_t* buffer,
>> +                                         orte_rml_tag_t tag,
>> +                                         void* cbdata)
>> +{
>> +}
>> diff --git a/orte/mca/snapc/full/snapc_full_component.c b/orte/mca/snapc/full/snapc_full_component.c
>> index 7815363..b953e17 100644
>> --- a/orte/mca/snapc/full/snapc_full_component.c
>> +++ b/orte/mca/snapc/full/snapc_full_component.c
>> @@ -32,6 +32,7 @@ const char *orte_snapc_full_component_version_string =
>> */
>> static int snapc_full_open(void);
>> static int snapc_full_close(void);
>> +static int snapc_full_register(void);
>>
>> bool orte_snapc_full_skip_app   = false;
>> bool orte_snapc_full_timing_enabled = false;
>> @@ -74,7 +75,7 @@ orte_snapc_full_component_t mca_snapc_full_component = {
>>    }
>> };
>>
>> -static int snaps_full_register (void)
>> +static int snapc_full_register (void)
>> {
>>    mca_base_component_t *component = &mca_snapc_full_component.super.base_version;
>>    /*
>> @@ -129,7 +130,7 @@ static int snaps_full_register (void)
>>                                            OPAL_INFO_LVL_9,
>>                                            MCA_BASE_VAR_SCOPE_LOCAL,
>>                                            &orte_snapc_full_progress_meter);
>> -    orte_snapc_full_progress_meter = (value % 101);
>> +    orte_snapc_full_progress_meter %= 101;
>>
>>    return ORTE_SUCCESS;
>> }
>> @@ -148,7 +149,7 @@ static int snapc_full_open(void)
>>    }
>>
>>    /* recheck the progress meter (it may have changed between register and open) */
>> -    orte_snapc_full_progress_meter = (value % 101);
>> +    orte_snapc_full_progress_meter %= 101;
>>
>>    /*
>>     * Debug Output
>> diff --git a/orte/mca/snapc/full/snapc_full_global.c b/orte/mca/snapc/full/snapc_full_global.c
>> index c88c6db..9f6da34 100644
>> --- a/orte/mca/snapc/full/snapc_full_global.c
>> +++ b/orte/mca/snapc/full/snapc_full_global.c
>> @@ -513,7 +513,7 @@ int global_coord_end_ckpt(orte_snapc_base_quiesce_t *datum)
>>    if( currently_migrating ) {
>>        OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
>>                             "Global) End Ckpt: Flush the modex cached data\n"));
>> -        if (ORTE_SUCCESS != (ret = orte_db.remove(NULL, NULL))) {
>> +        if (OPAL_SUCCESS != (ret = opal_db.remove(NULL, NULL))) {
>>            ORTE_ERROR_LOG(ret);
>>            exit_status = ret;
>>            goto cleanup;
>> @@ -1138,7 +1138,7 @@ void snapc_full_global_orted_recv(int status,
>>            OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
>>                                 "Global) Command: Job State Update (quick)"));
>>
>> -            snapc_full_process_job_update_cmd(&sender, buffer, true);
>> +            snapc_full_process_job_update_cmd(sender, buffer, true);
>>            break;
>>
>>        case ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD:
>> @@ -1974,7 +1974,7 @@ static void snapc_full_process_job_update_cmd(orte_process_name_t* sender,
>>
>> static int snapc_full_establish_snapshot_dir(bool empty_metadata)
>> {
>> -    const char **value = NULL;
>> +    char **value = NULL;
>>    int idx = 0;
>>
>>    /*********************
>> @@ -1998,7 +1998,7 @@ static int snapc_full_establish_snapshot_dir(bool empty_metadata)
>>        opal_show_help("help-orte-restart.txt", "amca_param_not_found", true);
>>    }
>>    if( 0 < idx ) {
>> -        mca_base_var_get_value (idx, &value, sizeof (value), NULL, NULL);
>> +        mca_base_var_get_value (idx, &value, NULL, NULL);
>>
>>        if (*value) {
>>            orte_sstore.set_attr(global_snapshot.ss_handle,
>> diff --git a/orte/mca/snapc/full/snapc_full_local.c b/orte/mca/snapc/full/snapc_full_local.c
>> index c0b168a..b13fce9 100644
>> --- a/orte/mca/snapc/full/snapc_full_local.c
>> +++ b/orte/mca/snapc/full/snapc_full_local.c
>> @@ -1776,7 +1776,7 @@ static void snapc_full_local_comm_read_event(int fd, short flags, void *arg)
>>    if( currently_migrating && !flushed_modex ) {
>>        OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
>>                             "Local) Read Event: Flush the modex cached data\n"));
>> -        if (ORTE_SUCCESS != (ret = orte_db.remove(NULL, NULL))) {
>> +        if (OPAL_SUCCESS != (ret = opal_db.remove(NULL, NULL))) {
>>            ORTE_ERROR_LOG(ret);
>>            exit_status = ret;
>>            goto cleanup;
>> diff --git a/orte/mca/sstore/stage/sstore_stage_component.c b/orte/mca/sstore/stage/sstore_stage_component.c
>> index 19d7c75..aca2b46 100644
>> --- a/orte/mca/sstore/stage/sstore_stage_component.c
>> +++ b/orte/mca/sstore/stage/sstore_stage_component.c
>> @@ -235,3 +235,8 @@ static int sstore_stage_close(void)
>>
>>    return ORTE_SUCCESS;
>> }
>> +
>> +static int sstore_stage_register(void)
>> +{
>> +    return ORTE_SUCCESS;
>> +}
>> --
>> 1.8.4.2
>>
>> _______________________________________________
>> devel mailing list
>> devel@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>

_______________________________________________
devel mailing list
devel@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel



--
Joshua Hursey
Assistant Professor of Computer Science
University of Wisconsin-La Crosse
http://cs.uwlax.edu/~jjhursey
_______________________________________________
devel mailing list
devel@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel