From ba65cb790f2046e89a0559b178ef655f48583623 Mon Sep 17 00:00:00 2001 From: luzhexuan <635426116@qq.com> Date: Wed, 26 Feb 2025 20:50:04 +0800 Subject: [PATCH] [huawei] adapt to donau --- config/prte_check_donau.m4 | 43 + src/mca/ess/donau/Makefile.am | 36 + src/mca/ess/donau/configure.m4 | 29 + src/mca/ess/donau/ess_donau.h | 30 + src/mca/ess/donau/ess_donau_component.c | 82 ++ src/mca/ess/donau/ess_donau_module.c | 123 +++ src/mca/ess/donau/owner.txt | 7 + src/mca/grpcomm/grpcomm.h | 1 + src/mca/plm/donau/Makefile.am | 38 + src/mca/plm/donau/configure.m4 | 30 + src/mca/plm/donau/help-plm-donau.txt | 54 ++ src/mca/plm/donau/owner.txt | 7 + src/mca/plm/donau/plm_donau.h | 36 + src/mca/plm/donau/plm_donau_component.c | 121 +++ src/mca/plm/donau/plm_donau_module.c | 829 ++++++++++++++++++ src/mca/ras/donau/Makefile.am | 46 + src/mca/ras/donau/configure.m4 | 29 + src/mca/ras/donau/help-ras-donau.txt | 26 + src/mca/ras/donau/owner.txt | 7 + src/mca/ras/donau/ras_donau.h | 38 + src/mca/ras/donau/ras_donau_component.c | 96 ++ src/mca/ras/donau/ras_donau_module.c | 126 +++ src/mca/rmaps/base/rmaps_base_map_job.c | 7 + .../rank_file/rmaps_rank_file_component.c | 2 + src/prted/pmix/pmix_server_fence.c | 2 + src/runtime/prte_globals.c | 7 + src/runtime/prte_globals.h | 15 + src/tools/prte/prte.c | 7 + 28 files changed, 1874 insertions(+) create mode 100644 config/prte_check_donau.m4 create mode 100644 src/mca/ess/donau/Makefile.am create mode 100644 src/mca/ess/donau/configure.m4 create mode 100644 src/mca/ess/donau/ess_donau.h create mode 100644 src/mca/ess/donau/ess_donau_component.c create mode 100644 src/mca/ess/donau/ess_donau_module.c create mode 100644 src/mca/ess/donau/owner.txt create mode 100644 src/mca/plm/donau/Makefile.am create mode 100644 src/mca/plm/donau/configure.m4 create mode 100644 src/mca/plm/donau/help-plm-donau.txt create mode 100644 src/mca/plm/donau/owner.txt create mode 100644 src/mca/plm/donau/plm_donau.h create mode 100644 
src/mca/plm/donau/plm_donau_component.c
 create mode 100644 src/mca/plm/donau/plm_donau_module.c
 create mode 100644 src/mca/ras/donau/Makefile.am
 create mode 100644 src/mca/ras/donau/configure.m4
 create mode 100644 src/mca/ras/donau/help-ras-donau.txt
 create mode 100644 src/mca/ras/donau/owner.txt
 create mode 100644 src/mca/ras/donau/ras_donau.h
 create mode 100644 src/mca/ras/donau/ras_donau_component.c
 create mode 100644 src/mca/ras/donau/ras_donau_module.c

diff --git a/config/prte_check_donau.m4 b/config/prte_check_donau.m4
new file mode 100644
index 00000000000..1680910a3e7
--- /dev/null
+++ b/config/prte_check_donau.m4
@@ -0,0 +1,47 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+#                    All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# PRTE_CHECK_DONAU(prefix, [action-if-found], [action-if-not-found])
+# --------------------------------------------------------
+# Sets prte_check_donau_happy to "no" when --without-donau is given or
+# when fork, execve or setpgid is unavailable, and to "yes" otherwise.
+# The result is kept in that variable, so the probes run only the first
+# time the macro is expanded; the "prefix" argument ($1) is not used.
+AC_DEFUN([PRTE_CHECK_DONAU],[
+    if test -z "$prte_check_donau_happy" ; then
+        AC_ARG_WITH([donau],
+            [AS_HELP_STRING([--with-donau],
+                            [Build DONAU scheduler component (default: yes)])])
+        if test "$with_donau" = "no" ; then
+            prte_check_donau_happy="no"
+        else
+            prte_check_donau_happy="yes"
+        fi
+
+        AS_IF([test "$prte_check_donau_happy" = "yes"],
+              [AC_CHECK_FUNC([fork],
+                             [prte_check_donau_happy="yes"],
+                             [prte_check_donau_happy="no"])])
+
+        AS_IF([test "$prte_check_donau_happy" = "yes"],
+              [AC_CHECK_FUNC([execve],
+                             [prte_check_donau_happy="yes"],
+                             [prte_check_donau_happy="no"])])
+
+        AS_IF([test "$prte_check_donau_happy" = "yes"],
+              [AC_CHECK_FUNC([setpgid],
+                             [prte_check_donau_happy="yes"],
+                             [prte_check_donau_happy="no"])])
+    fi
+    AS_IF([test "$prte_check_donau_happy" = "yes"],
+          [$2],
+          [$3])
+])
\ No newline at end of file
diff --git a/src/mca/ess/donau/Makefile.am b/src/mca/ess/donau/Makefile.am
new file mode 100644
index 00000000000..a0a9badb8b8
--- /dev/null
+++ b/src/mca/ess/donau/Makefile.am
@@ -0,0 +1,36 @@
+#
+# 
Copyright (c) 2024 Huawei Technologies Co., Ltd.
+#                    All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+sources = \
+        ess_donau.h \
+        ess_donau_component.c \
+        ess_donau_module.c
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_prte_ess_donau_DSO
+component_noinst =
+component_install = prte_mca_ess_donau.la
+else
+component_noinst = libprtemca_ess_donau.la
+component_install =
+endif
+
+mcacomponentdir = $(prtelibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+prte_mca_ess_donau_la_SOURCES = $(sources)
+prte_mca_ess_donau_la_LDFLAGS = -module -avoid-version
+prte_mca_ess_donau_la_LIBADD = $(top_builddir)/src/libprrte.la
+
+noinst_LTLIBRARIES = $(component_noinst)
+libprtemca_ess_donau_la_SOURCES = $(sources)
+libprtemca_ess_donau_la_LDFLAGS = -module -avoid-version
\ No newline at end of file
diff --git a/src/mca/ess/donau/configure.m4 b/src/mca/ess/donau/configure.m4
new file mode 100644
index 00000000000..4e164d07e3b
--- /dev/null
+++ b/src/mca/ess/donau/configure.m4
@@ -0,0 +1,29 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+#                    All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_prte_ess_donau_CONFIG([action-if-found], [action-if-not-found])
+# -----------------------------------------------------------
+AC_DEFUN([MCA_prte_ess_donau_CONFIG],[
+    AC_CONFIG_FILES([src/mca/ess/donau/Makefile])
+
+    PRTE_CHECK_DONAU([ess_donau], [ess_donau_good=1], [ess_donau_good=0])
+
+    # if check worked, set wrapper flags if so.
+    # Evaluate succeed / fail
+    AS_IF([test "$ess_donau_good" = "1"],
+          [$1],
+          [$2])
+
+    # set build flags to use in makefile
+    AC_SUBST([ess_donau_CPPFLAGS])
+    AC_SUBST([ess_donau_LDFLAGS])
+    AC_SUBST([ess_donau_LIBS])
+])dnl
\ No newline at end of file
diff --git a/src/mca/ess/donau/ess_donau.h b/src/mca/ess/donau/ess_donau.h
new file mode 100644
index 00000000000..889d74d3962
--- /dev/null
+++ b/src/mca/ess/donau/ess_donau.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "prte_config.h"
+#include "src/mca/mca.h"
+#include "src/mca/ess/ess.h"
+
+#ifndef PRTE_ESS_DONAU_H
+#define PRTE_ESS_DONAU_H
+
+BEGIN_C_DECLS
+
+PRTE_MODULE_EXPORT extern prte_ess_base_component_t prte_mca_ess_donau_component;
+
+/*
+ * Component open / close / query
+ */
+int prte_ess_donau_component_open(void);
+int prte_ess_donau_component_close(void);
+int prte_ess_donau_component_query(pmix_mca_base_module_t **module, int *priority);
+
+END_C_DECLS
+
+#endif /* PRTE_ESS_DONAU_H */
\ No newline at end of file
diff --git a/src/mca/ess/donau/ess_donau_component.c b/src/mca/ess/donau/ess_donau_component.c
new file mode 100644
index 00000000000..1fbc661df78
--- /dev/null
+++ b/src/mca/ess/donau/ess_donau_component.c
@@ -0,0 +1,82 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * These symbols are in a file by themselves to provide nice linker
+ * semantics. Since linkers generally pull in symbols by object
+ * files, keeping these symbols as the only symbols in this file
+ * prevents utility programs such as "ompi_info" from having to import
+ * entire component just to query their version and parameters. 
+*/
+
+#include "prte_config.h"
+#include "constants.h"
+
+#include "src/util/proc_info.h"
+#include "src/runtime/prte_globals.h"
+#include "src/mca/ess/ess.h"
+#include "src/mca/ess/donau/ess_donau.h"
+#include <string.h>
+
+extern prte_ess_base_module_t prte_ess_donau_module;
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+
+prte_ess_base_component_t prte_mca_ess_donau_component = {
+    PRTE_ESS_BASE_VERSION_3_0_0,
+
+    /* Component name and version */
+    .pmix_mca_component_name = "donau",
+    PMIX_MCA_BASE_MAKE_VERSION(component,
+                               PRTE_MAJOR_VERSION,
+                               PRTE_MINOR_VERSION,
+                               PRTE_RELEASE_VERSION),
+
+    /* Component open and close functions */
+    .pmix_mca_open_component = prte_ess_donau_component_open,
+    .pmix_mca_close_component = prte_ess_donau_component_close,
+    .pmix_mca_query_component = prte_ess_donau_component_query,
+};
+
+int prte_ess_donau_component_open(void)
+{
+    return PRTE_SUCCESS;
+}
+
+int prte_ess_donau_component_query(pmix_mca_base_module_t **module, int *priority)
+{
+    /* Are we running under a DONAU job? Were
+     * we given a path back to the HNP? If the
+     * answer to both is "yes", then we were launched
+     * by mpirun in a donau world, so make ourselves available
+     */
+    char *donau_job_id = getenv("CCS_JOB_ID");
+    if (PRTE_PROC_IS_DAEMON &&
+        NULL != donau_job_id &&
+        0 != strlen(donau_job_id) &&
+        NULL != prte_process_info.my_hnp_uri &&
+        DONAU_DRUN == prte_donau_launch_type) {
+        *priority = 100;
+        *module = (pmix_mca_base_module_t *)&prte_ess_donau_module;
+        return PRTE_SUCCESS;
+    }
+
+    /* Sadly, no */
+    *priority = -1;
+    *module = NULL;
+    return PRTE_ERROR;
+}
+
+int prte_ess_donau_component_close(void)
+{
+    return PRTE_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/mca/ess/donau/ess_donau_module.c b/src/mca/ess/donau/ess_donau_module.c
new file mode 100644
index 00000000000..ceee36c1832
--- /dev/null
+++ b/src/mca/ess/donau/ess_donau_module.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ */
+
+#include "prte_config.h"
+#include "constants.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif /* HAVE_UNISTD_H */
+/* NOTE(review): the next two header names were missing from this copy of
+ * the patch; stdlib.h and string.h are a best guess - confirm */
+#include <stdlib.h>
+#include <string.h>
+
+#include "src/util/proc_info.h"
+#include "src/util/pmix_show_help.h"
+#include "src/mca/errmgr/errmgr.h"
+#include "src/rml/rml.h"
+#include "src/util/name_fns.h"
+#include "src/runtime/prte_globals.h"
+
+#include "src/mca/ess/ess.h"
+#include "src/mca/ess/base/base.h"
+#include "src/mca/ess/donau/ess_donau.h"
+
+static int donau_set_name(void);
+static int rte_init(void);
+static int rte_finalize(void);
+
+prte_ess_base_module_t prte_ess_donau_module = {
+    rte_init,
+    rte_finalize,
+};
+
+/*
+ * Bring up a DONAU-launched daemon: run the standard prolog, compute this
+ * daemon's name, then do the common prted setup.  On failure the failing
+ * step is reported through show_help (unless suppressed) and its error
+ * code is returned.
+ */
+static int rte_init(void)
+{
+    int ret;
+    char *error = NULL;
+
+    /* run the prolog */
+    if (PRTE_SUCCESS != (ret = prte_ess_base_std_prolog())) {
+        error = "prte_ess_base_std_prolog";
+        goto error;
+    }
+    /* Start by getting a unique name */
+    if (PRTE_SUCCESS != (ret = donau_set_name())) {
+        PRTE_ERROR_LOG(ret);
+        error = "donau_set_name";
+        goto error;
+    }
+    if (PRTE_SUCCESS != (ret = prte_ess_base_prted_setup())) {
+        PRTE_ERROR_LOG(ret);
+        error = "prte_ess_base_prted_setup";
+        goto error;
+    }
+    return PRTE_SUCCESS;
+
+error:
+    if (PRTE_ERR_SILENT != ret && !prte_report_silent_errors) {
+        pmix_show_help("help-prte-runtime.txt",
+                       "prte_init:startup:internal-failure",
+                       true, error, PRTE_ERROR_NAME(ret), ret);
+    }
+
+    return ret;
+}
+
+/* Run the common prted teardown; a failure there is logged but this
+ * function still returns PRTE_SUCCESS. */
+static int rte_finalize(void)
+{
+    int ret;
+
+    if (PRTE_SUCCESS != (ret = prte_ess_base_prted_finalize())) {
+        PRTE_ERROR_LOG(ret);
+    }
+
+    return PRTE_SUCCESS;
+}
+
+/*
+ * Fill in PRTE_PROC_MY_NAME: the namespace is prte_ess_base_nspace and the
+ * rank is prte_ess_base_vpid + CCS_NODE_RANK - 1.  Returns
+ * PRTE_ERR_NOT_FOUND when the namespace, the vpid or CCS_NODE_RANK is
+ * missing, and PRTE_ERR_INVALID_NODE_RANK when the node rank is negative.
+ * Also records prte_ess_base_num_procs as the number of daemons.
+ */
+static int donau_set_name(void)
+{
+    pmix_rank_t vpid;
+    int donau_nodeid;
+    const char *node_rank;
+
+    PMIX_OUTPUT_VERBOSE((1, prte_ess_base_framework.framework_output,
+                         "ess:donau setting name"));
+
+    if (NULL == prte_ess_base_nspace) {
+        PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
+        return PRTE_ERR_NOT_FOUND;
+    }
+
+    PMIX_LOAD_NSPACE(PRTE_PROC_MY_NAME->nspace, prte_ess_base_nspace);
+
+    if (NULL == prte_ess_base_vpid) {
+        PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
+        return PRTE_ERR_NOT_FOUND;
+    }
+    vpid = strtoul(prte_ess_base_vpid, NULL, 10);
+
+    /* getenv() returns NULL when the variable is unset and atoi(NULL)
+     * is undefined behaviour, so check before converting */
+    node_rank = getenv("CCS_NODE_RANK");
+    if (NULL == node_rank) {
+        PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
+        return PRTE_ERR_NOT_FOUND;
+    }
+    donau_nodeid = atoi(node_rank);
+    if (donau_nodeid < 0) {
+        PRTE_ERROR_LOG(PRTE_ERR_INVALID_NODE_RANK);
+        return PRTE_ERR_INVALID_NODE_RANK;
+    }
+
+    PRTE_PROC_MY_NAME->rank = vpid + donau_nodeid - 1;
+
+    PMIX_OUTPUT_VERBOSE((1, prte_ess_base_framework.framework_output,
+                         "ess:donau set name to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
+
+    prte_process_info.num_daemons = prte_ess_base_num_procs;
+
+    return PRTE_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/mca/ess/donau/owner.txt b/src/mca/ess/donau/owner.txt
new file mode 100644
index 00000000000..9c08c938c34
--- /dev/null
+++ b/src/mca/ess/donau/owner.txt
@@ -0,0 +1,7 @@
+#
+# owner/status file
+# owner: institution that is responsible for this package
+# status: e.g. 
active, maintenance, unmaintained
+#
+owner: HUAWEI
+status: active
\ No newline at end of file
diff --git a/src/mca/grpcomm/grpcomm.h b/src/mca/grpcomm/grpcomm.h
index 49fb364c3b3..a8e048dff81 100644
--- a/src/mca/grpcomm/grpcomm.h
+++ b/src/mca/grpcomm/grpcomm.h
@@ -65,6 +65,7 @@ typedef struct {
     pmix_object_t super;
     pmix_proc_t *signature;
     size_t sz;
+    size_t coll_id;
 } prte_grpcomm_signature_t;
 
 PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_grpcomm_signature_t);
diff --git a/src/mca/plm/donau/Makefile.am b/src/mca/plm/donau/Makefile.am
new file mode 100644
index 00000000000..83feddff371
--- /dev/null
+++ b/src/mca/plm/donau/Makefile.am
@@ -0,0 +1,38 @@
+#
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+#                    All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+sources = \
+        plm_donau.h \
+        plm_donau_component.c \
+        plm_donau_module.c
+
+dist_prtedata_DATA = help-plm-donau.txt
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_prte_plm_donau_DSO
+component_noinst =
+component_install = prte_mca_plm_donau.la
+else
+component_noinst = libprtemca_plm_donau.la
+component_install =
+endif
+
+mcacomponentdir = $(prtelibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+prte_mca_plm_donau_la_SOURCES = $(sources)
+prte_mca_plm_donau_la_LDFLAGS = -module -avoid-version
+prte_mca_plm_donau_la_LIBADD = $(top_builddir)/src/libprrte.la
+
+noinst_LTLIBRARIES = $(component_noinst)
+libprtemca_plm_donau_la_SOURCES = $(sources)
+libprtemca_plm_donau_la_LDFLAGS = -module -avoid-version
\ No newline at end of file
diff --git a/src/mca/plm/donau/configure.m4 b/src/mca/plm/donau/configure.m4
new file mode 100644
index 00000000000..aada5605c89
--- /dev/null
+++ b/src/mca/plm/donau/configure.m4
@@ -0,0 +1,30 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+#                    All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_prte_plm_donau_CONFIG([action-if-found], [action-if-not-found])
+# -----------------------------------------------------------
+AC_DEFUN([MCA_prte_plm_donau_CONFIG],[
+    AC_CONFIG_FILES([src/mca/plm/donau/Makefile])
+
+    PRTE_CHECK_DONAU([plm_donau], [plm_donau_good=1], [plm_donau_good=0])
+
+    # if check worked, set wrapper flags if so.
+    # Evaluate succeed / fail
+    AS_IF([test "$plm_donau_good" = "1"],
+          [$1],
+          [$2])
+
+    # set build flags to use in makefile
+    AC_SUBST([plm_donau_CPPFLAGS])
+    AC_SUBST([plm_donau_LDFLAGS])
+    AC_SUBST([plm_donau_LIBS])
+
+])dnl
\ No newline at end of file
diff --git a/src/mca/plm/donau/help-plm-donau.txt b/src/mca/plm/donau/help-plm-donau.txt
new file mode 100644
index 00000000000..3aa9ea21250
--- /dev/null
+++ b/src/mca/plm/donau/help-plm-donau.txt
@@ -0,0 +1,54 @@
+#
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+#                    All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+[multiple-prefixes]
+The DONAU process starter for Open MPI does not support multiple
+different --prefix options to mpirun.  You can specify at most one
+unique value for the --prefix option (in any of the application
+contexts); it will be applied to all the application contexts of your
+parallel job.
+
+Put simply, you must have Open MPI installed in the same location on
+all of your DONAU nodes.
+
+Multiple different --prefix options were specified to mpirun.  This is
+a fatal error for the DONAU process starter in Open MPI.
+
+The first two prefix values supplied were:
+    %s
+and %s
+#
+[no-hosts-in-list]
+The DONAU process starter for Open MPI didn't find any hosts in
+the map for this application. This can be caused by a lack of
+an allocation, or by an error in the Open MPI code. Please check
+to ensure you have a DONAU allocation. If you do, then please pass
+the error to the Open MPI user's mailing list for assistance.
+#
+[multiple-prefixes]
+The DONAU process starter for Open MPI does not support multiple
+different --prefix options to mpirun.  You can specify at most one
+unique value for the --prefix option (in any of the application
+contexts); it will be applied to all the application contexts of your
+parallel job.
+
+Put simply, you must have Open MPI installed in the same location on
+all of your DONAU nodes.
+
+Multiple different --prefix options were specified to mpirun.  This is
+a fatal error for the DONAU process starter in Open MPI.
+
+The first two prefix values supplied were:
+    %s
+and %s
+#
+[no-drun]
+The DONAU process starter for Open MPI was unable to locate a
+usable "drun" command in its path. Please check your path
+and try again.
\ No newline at end of file
diff --git a/src/mca/plm/donau/owner.txt b/src/mca/plm/donau/owner.txt
new file mode 100644
index 00000000000..9c08c938c34
--- /dev/null
+++ b/src/mca/plm/donau/owner.txt
@@ -0,0 +1,7 @@
+#
+# owner/status file
+# owner: institution that is responsible for this package
+# status: e.g. active, maintenance, unmaintained
+#
+owner: HUAWEI
+status: active
\ No newline at end of file
diff --git a/src/mca/plm/donau/plm_donau.h b/src/mca/plm/donau/plm_donau.h
new file mode 100644
index 00000000000..6cb5d6cab1a
--- /dev/null
+++ b/src/mca/plm/donau/plm_donau.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef PRTE_PLM_DONAU_EXPORT_H
+#define PRTE_PLM_DONAU_EXPORT_H
+
+#include "prte_config.h"
+
+#include "src/mca/mca.h"
+#include "src/mca/plm/plm.h"
+BEGIN_C_DECLS
+
+struct prte_mca_plm_donau_component_t {
+    prte_plm_base_component_t super; /* base PLM component; must stay the first member */
+    char *custom_args;               /* user-supplied extra arguments for drun, or NULL */
+    bool donau_warning_msg;          /* warning-message switch - TODO confirm exact meaning with its user */
+};
+typedef struct prte_mca_plm_donau_component_t prte_mca_plm_donau_component_t;
+
+/*
+ * Globally exported variables: the component descriptor and its module
+ */
+
+PRTE_MODULE_EXPORT extern prte_mca_plm_donau_component_t prte_mca_plm_donau_component;
+PRTE_EXPORT extern prte_plm_base_module_t prte_plm_donau_module;
+
+END_C_DECLS
+
+#endif /* PRTE_PLM_DONAU_EXPORT_H */
\ No newline at end of file
diff --git a/src/mca/plm/donau/plm_donau_component.c b/src/mca/plm/donau/plm_donau_component.c
new file mode 100644
index 00000000000..1c220345f13
--- /dev/null
+++ b/src/mca/plm/donau/plm_donau_component.c
@@ -0,0 +1,121 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * These symbols are in a file by themselves to provide nice linker
+ * semantics. Since linkers generally pull in symbols by object
+ * files, keeping these symbols as the only symbols in this file
+ * prevents utility programs such as "ompi_info" from having to import
+ * entire components just to query their version and parameters. 
+ */
+
+#include "prte_config.h"
+#include "constants.h"
+
+#include "src/mca/base/pmix_mca_base_var.h"
+#include "src/util/pmix_environ.h"
+#include "src/util/name_fns.h"
+#include "src/util/pmix_show_help.h"
+#include "src/runtime/prte_globals.h"
+
+#include "src/mca/plm/plm.h"
+#include "src/mca/plm/base/plm_private.h"
+#include "plm_donau.h"
+#include <string.h>
+
+
+/*
+ * Public string showing the plm donau component version number
+ */
+const char *mca_plm_donau_component_version_string =
+    "Open MPI donau plm MCA component version " PRTE_VERSION;
+
+
+/*
+ * Local functions
+ */
+static int plm_donau_register(void);
+static int plm_donau_open(void);
+static int plm_donau_close(void);
+static int prte_plm_donau_component_query(pmix_mca_base_module_t **module, int *priority);
+
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it.
+ */
+
+prte_mca_plm_donau_component_t prte_mca_plm_donau_component = {
+    .super = {
+        PRTE_PLM_BASE_VERSION_2_0_0,
+
+        /* Component name and version */
+        .pmix_mca_component_name = "donau",
+        PMIX_MCA_BASE_MAKE_VERSION(component,
+                                   PRTE_MAJOR_VERSION,
+                                   PRTE_MINOR_VERSION,
+                                   PRTE_RELEASE_VERSION),
+
+        /* Component open and close functions */
+        .pmix_mca_open_component = plm_donau_open,
+        .pmix_mca_close_component = plm_donau_close,
+        .pmix_mca_query_component = prte_plm_donau_component_query,
+        .pmix_mca_register_component_params = plm_donau_register,
+    }
+
+    /* Other prte_mca_plm_donau_component_t items -- left uninitialized
+       here; they are given their defaults in plm_donau_register() */
+};
+
+
+static int plm_donau_register(void)
+{
+    pmix_mca_base_component_t *comp = &prte_mca_plm_donau_component.super;
+
+    prte_mca_plm_donau_component.custom_args = NULL;
+    (void) pmix_mca_base_component_var_register (comp, "args", "Custom arguments to drun",
+                                                 PMIX_MCA_BASE_VAR_TYPE_STRING,
+                                                 &prte_mca_plm_donau_component.custom_args);
+    prte_mca_plm_donau_component.donau_warning_msg = true;
+    (void) pmix_mca_base_component_var_register (comp, "warning", "Turn off warning message",
+                                                 PMIX_MCA_BASE_VAR_TYPE_BOOL,
+                                                 &prte_mca_plm_donau_component.donau_warning_msg);
+    return PRTE_SUCCESS;
+}
+
+static int plm_donau_open(void)
+{
+    return PRTE_SUCCESS;
+}
+
+static int prte_plm_donau_component_query(pmix_mca_base_module_t **module, int *priority)
+{
+    /* Selectable only when CCS_JOB_ID is non-empty and the launch type is DONAU_DRUN */
+    char *donau_job_id = getenv("CCS_JOB_ID");
+    if (NULL != donau_job_id &&
+        0 != strlen(donau_job_id) &&
+        DONAU_DRUN == prte_donau_launch_type) {
+        *priority = 100;
+        PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output,
+                             "%s plm:donau: available for selection",
+                             PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
+
+        *module = (pmix_mca_base_module_t*)&prte_plm_donau_module;
+        return PRTE_SUCCESS;
+    }
+
+    /* Sadly, no */
+    *module = NULL;
+    return PRTE_ERROR;
+}
+
+static int plm_donau_close(void)
+{
+    return PRTE_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/mca/plm/donau/plm_donau_module.c b/src/mca/plm/donau/plm_donau_module.c
new file mode 100644
index 00000000000..b49857c307c
--- /dev/null
+++ b/src/mca/plm/donau/plm_donau_module.c
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * These symbols are in a file by themselves to provide nice linker
+ * semantics. Since linkers generally pull in symbols by object
+ * files, keeping these symbols as the only symbols in this file
+ * prevents utility programs such as "ompi_info" from having to import
+ * entire components just to query their version and parameters. 
+ */
+
+#include "src/mca/prteinstalldirs/prteinstalldirs.h"
+#include "src/mca/plm/plm_types.h"
+#include "prte_config.h"
+#include "src/runtime/prte_globals.h"
+/* NOTE(review): the <...> names of the system headers below were missing
+ * from this copy of the patch.  The HAVE_*-guarded ones follow from their
+ * guards; the five unguarded ones are a best guess - confirm against the
+ * original commit. */
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <signal.h>
+#include <stdlib.h>
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+
+#include "constants.h"
+#include "types.h"
+#include "src/util/name_fns.h"
+#include "src/util/pmix_show_help.h"
+#include "src/threads/pmix_threads.h"
+#include "src/runtime/prte_wait.h"
+#include "src/runtime/prte_quit.h"
+#include "src/mca/errmgr/errmgr.h"
+#include "src/mca/rmaps/base/base.h"
+#include "src/mca/state/state.h"
+
+#include "src/prted/prted.h"
+
+#include "src/mca/plm/plm.h"
+#include "src/mca/plm/base/plm_private.h"
+#include "plm_donau.h"
+
+/*
+ * Local functions
+ */
+static int plm_donau_init(void);
+static int plm_donau_launch_job(prte_job_t *jdata);
+static int plm_donau_terminate_prteds(void);
+static int plm_donau_signal_job(pmix_nspace_t jobid, int32_t signal);
+static int plm_donau_finalize(void);
+static int plm_donau_start_proc(int argc, char **argv, char *prefix);
+
+/*
+ * Global variable
+ */
+prte_plm_base_module_1_0_0_t prte_plm_donau_module = {
+    .init = plm_donau_init,
+    .set_hnp_name = prte_plm_base_set_hnp_name,
+    .spawn = plm_donau_launch_job,
+    .terminate_job = prte_plm_base_prted_terminate_job,
+    .terminate_orteds = plm_donau_terminate_prteds,
+    .terminate_procs = prte_plm_base_prted_kill_local_procs,
+    .signal_job = plm_donau_signal_job,
+    .finalize = plm_donau_finalize
+};
+
+/*
+ * Local variables
+ */
+
+typedef struct Node {
+    char name[DONAU_MAX_NODENAME_LENGTH];
+    int len;
+    char pre[DONAU_MAX_NODENAME_LENGTH];
+    int pre_len;
+    int num;
+} nod;
+
+typedef enum {
+    SIMP_SUCCESS = 0,
+    SIMP_OUT_OF_RESOURCE,
+    SIMP_NULL
+} simp_state;
+
+/* NOTE(review): file-scope array named "node"; the local "node" in
+ * launch_daemons() shadows it */
+nod node[DONAU_MAX_NODELIST_LENGTH];
+
+static pid_t primary_drun_pid = 0;
+static bool primary_pid_set = false;
+static void launch_daemons(int fd, short args, void *cbdata);
+static int cmp(const void *a, const void *b);
+static void get_pre(char *s, char *result);
+static int get_id_num(char *s);
+static simp_state donau_nodelist_simp(char *node_list, char *nodelist_result);
+
+/*
+ * Init the module: start the PLM base communication, record whether the
+ * daemon job carries PRTE_JOB_DO_NOT_LAUNCH, and register launch_daemons()
+ * as the handler for PRTE_JOB_STATE_LAUNCH_DAEMONS.
+ */
+static int plm_donau_init(void)
+{
+    int rc;
+    prte_job_t *jdata;
+    if (PRTE_SUCCESS != (rc = prte_plm_base_comm_start())) {
+        PRTE_ERROR_LOG(rc);
+        return rc;
+    }
+    jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
+    if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) {
+        prte_plm_globals.daemon_nodes_assigned_at_launch = true;
+    } else {
+        prte_plm_globals.daemon_nodes_assigned_at_launch = false;
+    }
+
+    /* point to our launch command */
+    rc = prte_state.add_job_state(PRTE_JOB_STATE_LAUNCH_DAEMONS, launch_daemons);
+    if (PRTE_SUCCESS != rc) {
+        PRTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    return rc;
+}
+
+/*
+ * Spawn entry point: only activates the next job state (MAP for a
+ * restarted job, INIT for a new one); the launch itself happens in
+ * launch_daemons().
+ */
+static int plm_donau_launch_job(prte_job_t *jdata)
+{
+    if (PRTE_FLAG_TEST(jdata, PRTE_JOB_FLAG_RESTART)) {
+        /* this is a restart situation - skip to the mapping stage */
+        PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP);
+    } else {
+        /* new job - set it up */
+        PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_INIT);
+    }
+    return PRTE_SUCCESS;
+}
+
+/*
+ * Handler for PRTE_JOB_STATE_LAUNCH_DAEMONS: build the drun command line
+ * for the daemons that still have to be started and pass it to
+ * plm_donau_start_proc().  cbdata is the prte_state_caddy_t for the job;
+ * it is released before returning.
+ *
+ * When working in this function, ALWAYS jump to "cleanup" if
+ * you encounter an error so that orterun will be woken up and
+ * the job can cleanly terminate
+ */
+static void launch_daemons(int fd, short args, void *cbdata)
+{
+    prte_app_context_t *app = NULL;
+    prte_node_t *node = NULL;
+    prte_node_t *hnp_node = NULL;
+    int32_t nnode;
+    int node_size;
+    prte_job_map_t *map = NULL;
+    char *param = NULL;
+    char *param1 = NULL;
+    char **argv = NULL;
+    int argc;
+    int rc;
+    char *nodelist_flat = NULL;
+    char *nodelist_simp = NULL;
+    char **nodelist_argv = NULL;
+    char *name_string = NULL;
+    char **custom_strings = NULL;
+    int num_args;
+    simp_state error_num;
+    char *cur_prefix = NULL;
+    int proc_vpid_index;
+    bool failed_launch = true;
+    prte_job_t *daemons = NULL;
+    prte_state_caddy_t *state = (prte_state_caddy_t*)cbdata;
+
+    PMIX_ACQUIRE_OBJECT(state);
+
+    PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output,
+                         "%s plm:donau: LAUNCH DAEMONS CALLED",
+                         PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
+
+    /* start by setting up the virtual machine */
+    daemons = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
+    if (NULL == daemons) {
+        PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
+        goto cleanup;
+    }
+    if (PRTE_SUCCESS != (rc = prte_plm_base_setup_virtual_machine(state->jdata))) {
+        PRTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+    /* if we don't want to launch, then don't attempt to
+     * launch the daemons - the user really wants to just
+     * look at the proposed process map
+     */
+    if (prte_get_attribute(&daemons->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) {
+        state->jdata->state = PRTE_JOB_STATE_DAEMONS_LAUNCHED;
+        PRTE_ACTIVATE_JOB_STATE(state->jdata, PRTE_JOB_STATE_DAEMONS_REPORTED);
+        PMIX_RELEASE(state);
+        return;
+    }
+
+    // OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
+    // "%s plm:donau: launching vm",
+    // ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    /* get the map for the job */
+    if (NULL == (map = daemons->map)) {
+        PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
+        rc = PRTE_ERR_NOT_FOUND;
+        goto cleanup;
+    }
+
+    if (0 == map->num_new_daemons) {
+        /* set the state to indicate the daemons reported - this
+         * will trigger the daemons_reported event and cause the
+         * job to move the following step
+         */
+        PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output,
+                             "%s plm:donau: no new daemons to launch",
+                             PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
+        state->jdata->state = PRTE_JOB_STATE_DAEMONS_LAUNCHED;
+        PRTE_ACTIVATE_JOB_STATE(state->jdata, PRTE_JOB_STATE_DAEMONS_REPORTED);
+        PMIX_RELEASE(state);
+        return;
+    }
+
+    /*
+     * start building argv array
+     */
+    argv = NULL;
+    argc = 0;
+
+    /*
+     * DONAU drun OPTIONS
+     */
+
+    /* add the drun command */
+    pmix_argv_append(&argc, &argv, donau_launch_exec);
+
+    /* Append user defined arguments to drun */
+    if (NULL != prte_mca_plm_donau_component.custom_args) {
+        custom_strings = PMIX_ARGV_SPLIT_COMPAT(prte_mca_plm_donau_component.custom_args, ' ');
+        num_args = PMIX_ARGV_COUNT_COMPAT(custom_strings);
+        for (int i = 0; i < num_args; ++i) {
+            pmix_argv_append(&argc, &argv, custom_strings[i]);
+        }
+        PMIX_ARGV_FREE_COMPAT(custom_strings);
+    }
+    /* create nodelist */
+    nodelist_argv = NULL;
+
+    /* get the hnp node and send to donau */
+    hnp_node = (prte_node_t*)pmix_pointer_array_get_item(prte_node_pool, 0);
+    if (NULL == hnp_node) {
+        PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
+        rc = PRTE_ERR_NOT_FOUND;
+        goto cleanup;
+    }
+    pmix_argv_append_nosize(&nodelist_argv, hnp_node->name);
+
+    node_size = map->nodes->size;
+    for (nnode = 0; nnode < node_size; nnode++) {
+        if (NULL == (node = (prte_node_t*)pmix_pointer_array_get_item(map->nodes, nnode))) {
+            continue;
+        }
+        /* if the daemon already exists on this node, then
+         * don't include it
+         */
+        if (PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_DAEMON_LAUNCHED)) {
+            continue;
+        }
+
+        /* otherwise, add it to the list of nodes upon which
+         * we need to launch a daemon
+         */
+        PMIX_ARGV_APPEND_NOSIZE_COMPAT(&nodelist_argv, node->name);
+    }
+    if (0 == PMIX_ARGV_COUNT_COMPAT(nodelist_argv)) {
+        pmix_show_help("help-plm-donau.txt", "no-hosts-in-list", true);
+        if (NULL != nodelist_argv) {
+            PMIX_ARGV_FREE_COMPAT(nodelist_argv);
+        }
+        rc = PRTE_ERR_FAILED_TO_START;
+        goto cleanup;
+    }
+    nodelist_flat = PMIX_ARGV_JOIN_COMPAT(nodelist_argv, ',');
+    PMIX_ARGV_FREE_COMPAT(nodelist_argv);
+
+    /* simplify nodelist for donau; calloc zero-fills the whole buffer */
+    nodelist_simp = (char *)calloc(1, DONAU_MAX_NODELIST_LENGTH);
+    if (NULL == nodelist_simp) {
+        PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE);
+        free(nodelist_flat);
+        rc = PRTE_ERR_OUT_OF_RESOURCE;
+        goto cleanup;
+    }
+    error_num = donau_nodelist_simp(nodelist_flat, nodelist_simp);
+    if (error_num == SIMP_OUT_OF_RESOURCE) {
+        PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE);
+    } else if (error_num == SIMP_NULL) {
+        PRTE_ERROR_LOG(PRTE_ERR_NOT_AVAILABLE);
+    }
+    if (error_num != SIMP_SUCCESS) {
+        free(nodelist_simp);
+        free(nodelist_flat);
+        goto cleanup;
+    }
+
+    /* pmix_argv_append() stores its own copy of each string */
+    pmix_argv_append(&argc, &argv, "-nl");
+    pmix_argv_append(&argc, &argv, nodelist_simp);
+    pmix_argv_append(&argc, &argv, "-ao");
+    pmix_argv_append(&argc, &argv, hnp_node->name);
+    pmix_argv_append(&argc, &argv, "-rpn");
+    pmix_argv_append(&argc, &argv, "1");
+
+    PMIX_OUTPUT_VERBOSE((2, prte_plm_base_framework.framework_output,
+                         "%s plm:donau: launching on nodes %s",
+                         PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), nodelist_simp));
+    free(nodelist_simp);
+    free(nodelist_flat);
+    /*
+     * ORTED OPTIONS
+     */
+
+    /* add the daemon command (as specified by user) */
+    prte_plm_base_setup_prted_cmd(&argc, &argv);
+    /* add basic orted command line options, including debug flags */
+    prte_plm_base_prted_append_basic_args(&argc, &argv,
+                                          "donau", &proc_vpid_index);
+    /* tell the new daemons the base of the name list so they can compute
+     * their own name on the other end
+     */
+    rc = prte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
+    if (PRTE_SUCCESS != rc) {
+        pmix_output(0, "plm_donau: unable to get daemon vpid as string");
+        goto cleanup;
+    }
+
+    free(argv[proc_vpid_index]);
+    argv[proc_vpid_index] = strdup(name_string);
+    free(name_string);
+
+    prte_oob_base_get_addr(&param1);
+    if (param1 == NULL) {
+        pmix_output(0, "plm_donau: unable to get param1 from orte_oob_base_get_addr");
+        goto cleanup;
+    }
+    pmix_argv_append(&argc, &argv, "--prtemca");
+    pmix_argv_append(&argc, &argv, "prte_parent_uri");
+    pmix_argv_append(&argc, &argv, param1);
+    free(param1);
+    /* Copy the prefix-directory specified in the
+     * corresponding add_context. If there are multiple,
+     * different prefix's in the app context, complain (i.e., only
+     * allow one --prefix option for the entire donau run -- we
+     * don't support different --prefix'es for different nodes in
+     * the DONAU plm)
+     */
+    cur_prefix = NULL;
+    for (nnode = 0; nnode < state->jdata->apps->size; nnode++) {
+        char *app_prefix_dir = NULL;
+        if (NULL == (app = (prte_app_context_t*)pmix_pointer_array_get_item(state->jdata->apps, nnode))) {
+            continue;
+        }
+        app_prefix_dir = NULL;
+        prte_get_attribute(&app->attributes, PRTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, PMIX_STRING);
+        /* Check for already set cur_prefix_dir -- if different,
+           complain */
+        if (NULL != app_prefix_dir) {
+            if (NULL != cur_prefix &&
+                0 != strcmp(cur_prefix, app_prefix_dir)) {
+                pmix_show_help("help-plm-donau.txt", "multiple-prefixes",
+                               true, cur_prefix, app_prefix_dir);
+                /* this copy is ours to free, as on the normal path below */
+                free(app_prefix_dir);
+                goto cleanup;
+            }
+
+            /* If not yet set, copy it; if set, then it's the
+             * same way
+             */
+            if (NULL == cur_prefix) {
+                cur_prefix = strdup(app_prefix_dir);
+                PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output,
+                                     "%s plm:donau: Set prefix:%s",
+                                     PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
+                                     cur_prefix));
+            }
+            free(app_prefix_dir);
+        }
+    }
+
+    /* protect the args in case someone has a script wrapper around drun */
+    prte_plm_base_wrap_args(argv);
+
+    if (0 < pmix_output_get_verbosity(prte_plm_base_framework.framework_output)) {
+        param = PMIX_ARGV_JOIN_COMPAT(argv, ' ');
+        pmix_output(prte_plm_base_framework.framework_output,
+                    "%s plm:donau: final top-level argv:\n\t%s",
+                    PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
+                    (NULL == param) ? "NULL" : param);
+        if (NULL != param) {
+            free(param);
+        }
+    }
+
+    /* exec the daemon(s) */
+    if (PRTE_SUCCESS != (rc = plm_donau_start_proc(argc, argv, cur_prefix))) {
+        PRTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    /* indicate that the daemons for this job were launched */
+    state->jdata->state = PRTE_JOB_STATE_DAEMONS_LAUNCHED;
+    daemons->state = PRTE_JOB_STATE_DAEMONS_LAUNCHED;
+
+    /* flag that launch was successful, so far as we currently know */
+    failed_launch = false;
+
+cleanup:
+    if (NULL != argv) {
+        PMIX_ARGV_FREE_COMPAT(argv);
+    }
+    if (NULL != cur_prefix) {
+        free(cur_prefix);
+    }
+    /* check for failed launch - if so, force terminate */
+    if (failed_launch) {
+        PRTE_ACTIVATE_JOB_STATE(state->jdata, PRTE_JOB_STATE_FAILED_TO_LAUNCH);
+    }
+    /* cleanup the caddy */
+    PMIX_RELEASE(state);
+}
+
+/*
+ * Terminate the orteds for a given job
+ */
+static int plm_donau_terminate_prteds(void)
+{
+    int rc = PRTE_SUCCESS;
+    prte_job_t *jdata;
+
+    /* check to see if the primary pid is set. If not, this indicates
+     * that we never launched any additional daemons, so we cannot
+     * not wait for a waitpid to fire and tell us it's okay to
+     * exit. Instead, we simply trigger an exit for ourselves.
+ */ + if (primary_pid_set) { + if (PRTE_SUCCESS != (rc = prte_plm_base_prted_exit(PRTE_DAEMON_EXIT_CMD))) { + PRTE_ERROR_LOG(rc); + } + } else { + PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output, + "%s plm:donau: primary daemons complete!", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace); + /* count every proc as terminated (num_terminated = num_procs) to avoid an incorrect error msg */ + jdata->num_terminated = jdata->num_procs; + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_DAEMONS_TERMINATED); + } + + return rc; +} + +/* + * Signal the procs of the given job: the daemons are ordered to pass the + * signal to their local procs; a failure is logged and its code returned + */ +static int plm_donau_signal_job(pmix_nspace_t jobid, int32_t signal) +{ + int rc = PRTE_SUCCESS; + + /* order them to pass this signal to their local procs */ + if (PRTE_SUCCESS != (rc = prte_plm_base_prted_signal_local_procs(jobid, signal))) { + PRTE_ERROR_LOG(rc); + } + + return rc; +} + +/* + * Stop the plm base communication; always reports PRTE_SUCCESS + */ +static int plm_donau_finalize(void) +{ + int rc; + + /* cleanup any pending recvs - a failure is only logged */ + if (PRTE_SUCCESS != (rc = prte_plm_base_comm_stop())) { + PRTE_ERROR_LOG(rc); + } + + return PRTE_SUCCESS; +} + +/* + * Wait callback for a tracked drun process: cbdata is the + * prte_wait_tracker_t whose child is the proc being reported + */ +static void drun_wait_cb(int sd, short fd, void *cbdata) +{ + prte_wait_tracker_t *t2 = (prte_wait_tracker_t*)cbdata; + prte_proc_t *proc = t2->child; + prte_job_t *jdata; + + jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace); + + /* abort only if the status returned is non-zero - i.e., if + * the daemons exited with an error + */ + if (0 != proc->exit_code) { + /* a daemon must have died unexpectedly - report + * that the daemon has failed so we exit + */ + PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output, + "%s plm:donau: drun returned non-zero exit status (%d) from launching the per-node daemon", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + proc->exit_code)); + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_DAEMONS_TERMINATED); + } else { + /* otherwise, check to see if this is the primary pid */ + if
(primary_drun_pid == proc->pid) { + /* in this case, we just want to fire the proper trigger so + * mpirun can exit + */ + PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output, + "%s plm:donau: primary daemons complete!", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + /* count every proc as terminated (num_terminated = num_procs) to avoid an incorrect error msg */ + jdata->num_terminated = jdata->num_procs; + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_DAEMONS_TERMINATED); + } + } + + /* done with this dummy */ + PMIX_RELEASE(t2); +} + +/* + * Resolve argv[0] with pmix_path_findv, fork, and exec it in the child. + * When prefix is non-NULL the child first prepends prefix-based bin and + * lib directories to PATH and LD_LIBRARY_PATH. + * Returns PRTE_ERR_SILENT when the executable cannot be found, + * PRTE_ERR_SYS_LIMITS_CHILDREN when fork fails, PRTE_SUCCESS otherwise. + */ +static int plm_donau_start_proc(int argc, char **argv, char *prefix) +{ + int fd; + int drun_pid; + char *exec_argv = pmix_path_findv(argv[0], 0, environ, NULL); + prte_proc_t *dummy; + + if (NULL == exec_argv) { + pmix_show_help("help-plm-donau.txt", "no-drun", true); + return PRTE_ERR_SILENT; + } + + drun_pid = fork(); + if (-1 == drun_pid) { + PRTE_ERROR_LOG(PRTE_ERR_SYS_LIMITS_CHILDREN); + free(exec_argv); + return PRTE_ERR_SYS_LIMITS_CHILDREN; + } + /* if this is the primary launch - i.e., not a comm_spawn of a + * child job - then save the pid + */ + if (0 < drun_pid && !primary_pid_set) { + primary_drun_pid = drun_pid; + primary_pid_set = true; + } + + /* setup a dummy proc object to track the drun + * NOTE(review): this runs before the child/parent split below, so the + * child also creates and registers a dummy - confirm that is intended */ + dummy = PMIX_NEW(prte_proc_t); + dummy->pid = drun_pid; + /* be sure to mark it as alive so we don't instantly fire */ + PRTE_FLAG_SET(dummy, PRTE_PROC_FLAG_ALIVE); + /* setup the waitpid so we can find out if drun succeeds! */ + prte_wait_cb(dummy, drun_wait_cb, NULL); + + if (0 == drun_pid) { /* child */ + char *bin_base = NULL; + char *lib_base = NULL; + + /* Figure out the basenames for the libdir and bindir. There + * is a lengthy comment about this in plm_rsh_module.c + * explaining all the rationale for how / why we're doing + * this.
+ */ + lib_base = pmix_basename(prte_install_dirs.libdir); + bin_base = pmix_basename(prte_install_dirs.bindir); + + /* If we have a prefix, then modify the PATH and + * LD_LIBRARY_PATH environment variables. + */ + if (NULL != prefix) { + char *oldenv = NULL; + char *newenv = NULL; + + /* Put prefix/bin_base in front of any existing PATH */ + oldenv = getenv("PATH"); + if (NULL != oldenv) { + pmix_asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv); + } else { + pmix_asprintf(&newenv, "%s/%s", prefix, bin_base); + } + setenv("PATH", newenv, true); + PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output, + "%s plm:donau: reset PATH: %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + newenv)); + free(newenv); + + /* Put prefix/lib_base in front of any existing LD_LIBRARY_PATH */ + oldenv = getenv("LD_LIBRARY_PATH"); + if (NULL != oldenv) { + pmix_asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv); + } else { + pmix_asprintf(&newenv, "%s/%s", prefix, lib_base); + } + setenv("LD_LIBRARY_PATH", newenv, true); + PMIX_OUTPUT_VERBOSE((1, prte_plm_base_framework.framework_output, + "%s plm:donau: reset LD_LIBRARY_PATH: %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + newenv)); + free(newenv); + } + + fd = open("/dev/null", O_CREAT|O_RDWR|O_TRUNC, 0666); + if (fd >= 0) { + dup2(fd, 0); + /* When not in debug mode and --debug-daemons was not passed, + * tie stdout/stderr to dev null so we don't see messages from the daemons + * EXCEPT if the user has requested that we leave sessions attached + * NOTE(review): the test below is 0 > verbosity, i.e. it only holds for a negative verbosity - confirm intended + */ + if (0 > pmix_output_get_verbosity(prte_plm_base_framework.framework_output) && + !prte_debug_daemons_flag && !prte_leave_session_attached) { + dup2(fd, 1); + dup2(fd, 2); + } + + /* Don't leave the extra fd to /dev/null open */ + if (fd > 2) { + close(fd); + } + } + + /* get the drun process out of the parent's process group so that + * signals sent from the shell (like those resulting from + * cntl-c) don't get sent to drun + */ + setpgid(0, 0); + execvp(exec_argv, argv); + + pmix_output(0, "plm:donau:start_proc: exec failed"); + /* don't return - need to
exit - returning would be bad - + * we're not in the calling process anymore + */ + exit(1); + } else { /* parent */ + /* just in case, make sure that the drun process is not in our + * process group anymore. Stevens says always do this on both + * sides of the fork... + */ + setpgid(drun_pid, drun_pid); + + free(exec_argv); + } + + return PRTE_SUCCESS; +} + +// Structure sorting +static int cmp(const void *a, const void *b) { + nod c = *(nod *)a; + nod d = *(nod *)b; + if (strcmp(c.pre, d.pre) != 0) { + return strcmp(c.pre, d.pre); + } else { + return d.num - c.num; + } +} + +// Obtain the node prefix name +static void get_pre(char *s, char *result) { + int len = strlen(s); + int prelen = -1; + memset(result, '\0', sizeof(result)); + for (int i = len - 1; i >= 0; i--) { + if(s[i] >= '0' && s[i] <= '9'){ + continue; + } else { + prelen = i; + break; + } + } + if (prelen == -1) { + prelen = len - 1; + } + memcpy(result, s, prelen + 1); + return ; +} + +// Obtain the node number +static int get_id_num(char *s) { + int result = 0; + int len = strlen(s); + int last_non_zero = len; + for (int i = len - 1; i >= 0; i--) { + if (s[i] < '0' || s[i] > '9') { + break; + } else if (s[i] > '0' && s[i] <= '9') { + last_non_zero = i; + } + } + if (s[len - 1] < '0' || s[len - 1] > '9') { + // Use -2 to make a plain string at the front of the sort + return -2; + } else if (last_non_zero == len && s[len - 1] == '0' && + (len - 2 < 0 || (s[len - 2] < '0' || s[len - 2] > '9'))) { + // Valid number 0 + return 0; + } else if (last_non_zero != 0 && s[last_non_zero - 1] == '0') { + // Contain leading zeros + return -1; + } + for (int i = last_non_zero; i < len; i++) { + result = result * 10 + s[i] - '0'; + } + return result; +} + +// Simplify node name (Split with ",") +static simp_state donau_nodelist_simp(char *node_list, char *nodelist_result) { + int temp_num = 0; + char *temp_s; + + int node_stack[DONAU_MAX_NODELIST_LENGTH]; + int stack_size = 0; + + if (*node_list == '\0' || 
node_list == NULL) { + return SIMP_NULL; + } + temp_s = strtok(node_list, ","); + strcpy(node[temp_num].name, temp_s); + temp_num++; + while (1) { + temp_s = strtok(NULL, ","); + if(temp_s == NULL) { + break; + } + strcpy(node[temp_num].name, temp_s); + temp_num++; + } + for (int i = 0; i < temp_num; i++) { + node[i].len = strlen(node[i].name); + node[i].num = get_id_num(node[i].name); + get_pre(node[i].name, node[i].pre); + node[i].pre_len = strlen(node[i].pre); + } + + qsort(node, temp_num, sizeof(node[0]), cmp); + + for (int i = 0; i <= temp_num; i++) { + char temp_str[DONAU_MAX_NODELIST_LENGTH] = ""; + int str_len = node[i].pre_len; + + if (i < temp_num && stack_size == 0) { + node_stack[++stack_size] = i; + } else if (i < temp_num && strcmp(node[i].pre, node[i - 1].pre) == 0) { + node_stack[++stack_size] = i; + } else { + int temp_len = 0; + int last_str_len = strlen(node[i - 1].pre); + for (int j = 0; j < last_str_len; j++) { + temp_str[j] = node[i - 1].name[j]; + temp_len++; + } + if (node[node_stack[stack_size]].pre_len == node[node_stack[stack_size]].len) { + if(strlen(nodelist_result) + strlen(temp_str) >= DONAU_MAX_NODELIST_LENGTH) { + return SIMP_OUT_OF_RESOURCE; + } + strcat(nodelist_result, temp_str); + if (i < temp_num || i == temp_num && stack_size > 1) { + strcat(nodelist_result, ","); + } + stack_size--; + } + if (stack_size > 0) { + temp_str[temp_len++] = '['; + // Determine whether a character is at the beginning + int is_beginning = 0; + // Determine whether a character is at the end + int is_end = 0; + while (stack_size > 0) { + // Compress if adjacent to the previous number + if (stack_size >= 1 && node[node_stack[stack_size]].num >= 0 && + node[node_stack[stack_size]].num == node[node_stack[stack_size - 1]].num - 1) { + if (is_beginning == 0){ + is_beginning = 1; + } else { + stack_size--; + continue; + } + } else { + is_beginning =0; + } + if (is_end == 1) { + temp_str[temp_len++] = '-'; + is_end = 0; + } + for (int j = 
node[node_stack[stack_size]].pre_len; j < node[node_stack[stack_size]].len; j++) { + temp_str[temp_len++] = node[node_stack[stack_size]].name[j]; + } + if (stack_size > 1) { + if ((node[node_stack[stack_size]].num < 0) || + (node[node_stack[stack_size]].num != node[node_stack[stack_size - 1]].num - 1)) { + temp_str[temp_len++] = ','; + } + } + if (is_beginning == 1) { + is_end = 1; + } + + stack_size--; + } + temp_str[temp_len++] = ']'; + if(strlen(nodelist_result) + strlen(temp_str) >= DONAU_MAX_NODELIST_LENGTH) { + return SIMP_OUT_OF_RESOURCE; + } + strcat(nodelist_result, temp_str); + if (i < temp_num) { + strcat(nodelist_result, ","); + } + } + + node_stack[++stack_size] = i; + } + } + + return SIMP_SUCCESS; +} \ No newline at end of file diff --git a/src/mca/ras/donau/Makefile.am b/src/mca/ras/donau/Makefile.am new file mode 100644 index 00000000000..6eb449802f8 --- /dev/null +++ b/src/mca/ras/donau/Makefile.am @@ -0,0 +1,46 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = $(ras_donau_CPPFLAGS) + +dist_prtedata_DATA = help-ras-donau.txt +sources = \ + ras_donau.h \ + ras_donau_component.c \ + ras_donau_module.c + + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_prte_ras_donau_DSO +lib = +lib_sources = +component = prte_mca_ras_donau.la +component_sources = $(sources) +else +lib = libprtemca_ras_donau.la +lib_sources = $(sources) +component = +component_sources = +endif + +mcacomponentdir = $(prtelibdir) +mcacomponent_LTLIBRARIES = $(component) +prte_mca_ras_donau_la_SOURCES = $(component_sources) +prte_mca_ras_donau_la_LIBADD = ${ras_donau_LIBS} \ + $(top_builddir)/src/libprrte.la +prte_mca_ras_donau_la_LDFLAGS = -module -avoid-version $(ras_donau_LDFLAGS) + +noinst_LTLIBRARIES = $(lib) +libprtemca_ras_donau_la_SOURCES = $(lib_sources) +libprtemca_ras_donau_la_LDFLAGS = -module -avoid-version $(ras_donau_LDFLAGS) +libprtemca_ras_donau_la_LIBADD = $(ras_donau_LIBS) \ No newline at end of file diff --git a/src/mca/ras/donau/configure.m4 b/src/mca/ras/donau/configure.m4 new file mode 100644 index 00000000000..f3729d57f6d --- /dev/null +++ b/src/mca/ras/donau/configure.m4 @@ -0,0 +1,29 @@ +# -*- shell-script -*- +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ras_donau_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_prte_ras_donau_CONFIG],[ + AC_CONFIG_FILES([src/mca/ras/donau/Makefile]) + + PRTE_CHECK_DONAU([ras_donau], [ras_donau_good=1], [ras_donau_good=0]) + + # if check worked, set wrapper flags if so. 
+ # Evaluate succeed / fail + AS_IF([test "$ras_donau_good" = "1"], + [$1], + [$2]) + + # set build flags to use in makefile + AC_SUBST([ras_donau_CPPFLAGS]) + AC_SUBST([ras_donau_LDFLAGS]) + AC_SUBST([ras_donau_LIBS]) +])dnl \ No newline at end of file diff --git a/src/mca/ras/donau/help-ras-donau.txt b/src/mca/ras/donau/help-ras-donau.txt new file mode 100644 index 00000000000..b40e50c8534 --- /dev/null +++ b/src/mca/ras/donau/help-ras-donau.txt @@ -0,0 +1,26 @@ +# -*- text -*- +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI MCA error messages. +# +[nodelist-failed] +While trying to determine what resources are available, DONAU failed when +queried for a list of available nodes. This may indicate a problem with +DONAU or your cluster. +# +[affinity-file-not-found] +The affinity file provided in DONAU_AFFINITY_FILE could not be found: + + File: %s + +We cannot continue. + + + diff --git a/src/mca/ras/donau/owner.txt b/src/mca/ras/donau/owner.txt new file mode 100644 index 00000000000..9c08c938c34 --- /dev/null +++ b/src/mca/ras/donau/owner.txt @@ -0,0 +1,7 @@ +# +# owner/status file +# owner: institution that is responsible for this package +# status: e.g. active, maintenance, unmaintained +# +owner: HUAWEI +status: active \ No newline at end of file diff --git a/src/mca/ras/donau/ras_donau.h b/src/mca/ras/donau/ras_donau.h new file mode 100644 index 00000000000..613e653ec7c --- /dev/null +++ b/src/mca/ras/donau/ras_donau.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ +/** + * @file + * + * Resource Allocation(DONAU) + */ +#ifndef PRTE_RAS_DONAU_H +#define PRTE_RAS_DONAU_H + +#include "prte_config.h" +#include "src/mca/ras/ras.h" +#include "src/mca/ras/base/base.h" + +BEGIN_C_DECLS + +/** + * RAS Component: the base component plus the selection priority that + * the component registers as its "priority" MCA variable + */ +typedef struct { + prte_ras_base_component_t super; + int param_priority; +} prte_mca_ras_donau_component_t; + +PRTE_EXPORT extern prte_mca_ras_donau_component_t prte_mca_ras_donau_component; +PRTE_EXPORT extern prte_ras_base_module_t prte_ras_donau_module; + +END_C_DECLS + +#endif \ No newline at end of file diff --git a/src/mca/ras/donau/ras_donau_component.c b/src/mca/ras/donau/ras_donau_component.c new file mode 100644 index 00000000000..58d757ae1f8 --- /dev/null +++ b/src/mca/ras/donau/ras_donau_component.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "prte_config.h" +#include "constants.h" + +#include "src/mca/ras/base/base.h" +#include "src/util/pmix_net.h" +#include "src/include/prte_socket_errno.h" + +#include "src/util/name_fns.h" +#include "src/mca/errmgr/errmgr.h" +#include "src/runtime/prte_globals.h" + +#include "src/mca/ras/base/ras_private.h" +#include "ras_donau.h" + +/* + * Local functions + */ +static int ras_donau_register(void); +static int ras_donau_open(void); +static int ras_donau_close(void); +static int prte_mca_ras_donau_component_query(pmix_mca_base_module_t **module, int *priority); + + +prte_mca_ras_donau_component_t prte_mca_ras_donau_component = { + .super = { + PRTE_RAS_BASE_VERSION_2_0_0, + + /* Component name and version */ + .pmix_mca_component_name = "donau", + PMIX_MCA_BASE_MAKE_VERSION(component, + PRTE_MAJOR_VERSION, + PRTE_MINOR_VERSION, + PMIX_RELEASE_VERSION), + + /* Component open, close, query and parameter-registration functions */ + .pmix_mca_open_component = ras_donau_open, +
.pmix_mca_close_component = ras_donau_close, + .pmix_mca_query_component = prte_mca_ras_donau_component_query, + .pmix_mca_register_component_params = ras_donau_register + } +}; + +/* + * Register the "priority" MCA variable, backed by param_priority + * (default 100); the registration result is deliberately ignored + */ +static int ras_donau_register(void) +{ + pmix_mca_base_component_t *component = &prte_mca_ras_donau_component.super; + + prte_mca_ras_donau_component.param_priority = 100; + (void) pmix_mca_base_component_var_register (component, + "priority", "Priority of the donau ras component", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_mca_ras_donau_component.param_priority); + + return PRTE_SUCCESS; +} +/* nothing to set up when the component is opened */ +static int ras_donau_open(void) +{ + return PRTE_SUCCESS; +} + +/* nothing to tear down when the component is closed */ +static int ras_donau_close(void) +{ + return PRTE_SUCCESS; +} + +/* + * Offer the donau ras module for selection. + * On success *module points at prte_ras_donau_module and *priority is the + * registered priority; otherwise *module is NULL, *priority is 0 and + * PRTE_ERROR is returned. + */ +static int prte_mca_ras_donau_component_query(pmix_mca_base_module_t **module, int *priority) +{ + /* check if donau is running here: CCS_JOB_ID must be set and non-empty, + * and the launch type must not be DONAU_SSH */ + char *donau_job_id = getenv("CCS_JOB_ID"); + if (NULL == donau_job_id || 0 == strlen(donau_job_id) || DONAU_SSH == prte_donau_launch_type) { + /* disqualify ourselves */ + *priority = 0; + *module = NULL; + return PRTE_ERROR; + } + + PMIX_OUTPUT_VERBOSE((2, prte_ras_base_framework.framework_output, + "%s ras:donau: available for selection", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + /* since only one RM can exist on a cluster, just set + * my priority to something - the other components won't + * be responding anyway + */ + *priority = prte_mca_ras_donau_component.param_priority; + *module = (pmix_mca_base_module_t *)&prte_ras_donau_module; + return PRTE_SUCCESS; +} \ No newline at end of file diff --git a/src/mca/ras/donau/ras_donau_module.c b/src/mca/ras/donau/ras_donau_module.c new file mode 100644 index 00000000000..c393b520bd0 --- /dev/null +++ b/src/mca/ras/donau/ras_donau_module.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "constants.h" +#include "prte_config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "src/util/pmix_argv.h" +#include "src/util/pmix_net.h" + +#include "src/mca/errmgr/errmgr.h" +#include "src/mca/rmaps/base/base.h" +#include "src/mca/rmaps/rmaps_types.h" +#include "src/runtime/prte_globals.h" +#include "src/util/pmix_show_help.h" + +#include "src/mca/ras/base/base.h" +#include "src/mca/ras/base/ras_private.h" +#include "ras_donau.h" + +/* + * Local functions + */ +static int prte_ras_donau_allocate(prte_job_t *jdata, pmix_list_t *nodes); +static int prte_ras_donau_finalize(void); +static int donau_get_alloc(char *alloc_path, pmix_list_t *nodes); + +/* + * RAS donau module + */ +prte_ras_base_module_t prte_ras_donau_module = { + NULL, + prte_ras_donau_allocate, + NULL, + prte_ras_donau_finalize +};
+
+/* turn the numeric DONAU_MAX_NODENAME_LENGTH into a scanf field width */
+#define DONAU_RAS_STR_(x) #x
+#define DONAU_RAS_STR(x) DONAU_RAS_STR_(x)
+
+/*
+ * Read the allocation file at alloc_path and append one prte_node_t per
+ * parsed line to nodes.  Each line is expected to hold
+ * "hostname num_kernels slots"; parsing stops at the first line that
+ * does not match.  Returns the number of nodes appended, or 0 when the
+ * file cannot be opened or a node object cannot be created.
+ */
+static int donau_get_alloc(char *alloc_path, pmix_list_t *nodes)
+{
+    int num_nodes = 0;
+    prte_node_t *node = NULL;
+    FILE *fp;
+    fp = fopen(alloc_path, "r");
+    if (NULL == fp) {
+        return num_nodes;
+    }
+    char *line = NULL;
+    size_t len = 0;
+    ssize_t read;
+    while ((read = getline(&line, &len, fp)) != -1) {
+        /* one extra byte for the terminating NUL */
+        char hostname[DONAU_MAX_NODENAME_LENGTH + 1] = {0};
+        int num_kernels = 0;
+        int slots = 0;
+        /* the field width keeps an over-long first token from overflowing
+         * hostname[]; such a line then fails the 3-field match below */
+        if (sscanf(line, "%" DONAU_RAS_STR(DONAU_MAX_NODENAME_LENGTH) "s %d %d",
+                   hostname, &num_kernels, &slots) != 3) {
+            pmix_output_verbose(10, prte_ras_base_framework.framework_output,
+                                "ras/donau: Get the wrong num of params in CCS_ALLOC_FILE");
+            break;
+        }
+
+        node = PMIX_NEW(prte_node_t);
+        if (NULL == node) {
+            num_nodes = 0;
+            pmix_output_verbose(10, prte_ras_base_framework.framework_output,
+                                "ras/donau: Failed when create obj of orte_node_t");
+            goto cleanup;
+        }
+        node->name = strdup(hostname);
+        // Strip off the FQDN if present, ignore IP addresses
+        if (!prte_keep_fqdn_hostnames && !pmix_net_isaddr(node->name)) {
+            char *ptr;
+            if (NULL != (ptr = strchr(node->name, '.'))) {
+                *ptr = '\0';
+            }
+        }
+        node->state = PRTE_NODE_STATE_UP;
+        node->slots_inuse = 0;
+        node->slots_max = 0;
+        node->slots = slots;
+        pmix_list_append(nodes, &node->super);
+        num_nodes++;
+    }
+    free(line);
+    fclose(fp);
+    return num_nodes;
+cleanup:
+    /* NOTE(review): this releases the list handed in by the caller -
+     * confirm the caller does not touch or destruct it again afterwards */
+    if (NULL != nodes) {
+        PMIX_LIST_RELEASE(nodes);
+    }
+    free(line);
+    fclose(fp);
+    return num_nodes;
+}
+
+/*
+ * Fill nodes from the file named by the CCS_ALLOC_FILE environment
+ * variable.  Returns PRTE_ERR_NOT_AVAILABLE (after showing the
+ * "nodelist-failed" help text) when the variable is unset or empty or no
+ * node could be read, PRTE_SUCCESS otherwise.
+ */
+static int prte_ras_donau_allocate(prte_job_t *jdata, pmix_list_t *nodes) {
+    int num_nodes;
+    char *alloc_path = NULL;
+
+    /* get the list of allocated nodes */
+    alloc_path = getenv("CCS_ALLOC_FILE");
+    if (NULL == alloc_path || 0 == strlen(alloc_path) ||
+        ((num_nodes = donau_get_alloc(alloc_path, nodes))) <= 0) {
+        pmix_show_help("help-ras-donau.txt", "nodelist-failed", true);
+        return PRTE_ERR_NOT_AVAILABLE;
+    }
+
+    return PRTE_SUCCESS;
+}
+
+/* the module holds no state, so there is nothing to release */
+static int prte_ras_donau_finalize(void)
+{
+    return PRTE_SUCCESS;
+}
diff --git a/src/mca/rmaps/base/rmaps_base_map_job.c b/src/mca/rmaps/base/rmaps_base_map_job.c index 454879cda20..cc9770451f0 100644 --- a/src/mca/rmaps/base/rmaps_base_map_job.c +++ b/src/mca/rmaps/base/rmaps_base_map_job.c @@ -327,6 +327,13 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) inherit ? "TRUE" : "FALSE", options.use_hwthreads ?
"TRUE" : "FALSE"); + if ((PRTE_MAPPING_GIVEN & PRTE_GET_MAPPING_DIRECTIVE(prte_rmaps_base.mapping)) && + PRTE_MAPPING_BYUSER == PRTE_GET_MAPPING_POLICY(prte_rmaps_base.mapping) && + (PRTE_MAPPING_GIVEN & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) && + PRTE_MAPPING_BYL3CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + jdata->map->mapping = 0; + } + /* set the default mapping policy IFF it wasn't provided */ if (!PRTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { did_map = false; diff --git a/src/mca/rmaps/rank_file/rmaps_rank_file_component.c b/src/mca/rmaps/rank_file/rmaps_rank_file_component.c index 77c342eca9a..76e02afa745 100644 --- a/src/mca/rmaps/rank_file/rmaps_rank_file_component.c +++ b/src/mca/rmaps/rank_file/rmaps_rank_file_component.c @@ -18,6 +18,8 @@ * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/src/prted/pmix/pmix_server_fence.c b/src/prted/pmix/pmix_server_fence.c index 11623ea55b1..b08c0b00707 100644 --- a/src/prted/pmix/pmix_server_fence.c +++ b/src/prted/pmix/pmix_server_fence.c @@ -76,6 +76,7 @@ static void pmix_server_release(int status, pmix_data_buffer_t *buf, void *cbdat PMIX_RELEASE(cd); } +static __thread int coll_id = 0; /* this function is called when all the local participants have * called fence - thus, the collective is already locally * complete at this point. 
We therefore just need to create the @@ -105,6 +106,7 @@ pmix_status_t pmix_server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs, cd->sig->sz = nprocs; cd->sig->signature = (pmix_proc_t *) malloc(cd->sig->sz * sizeof(pmix_proc_t)); memcpy(cd->sig->signature, procs, cd->sig->sz * sizeof(pmix_proc_t)); + cd->sig->coll_id = coll_id++; } rc = prte_pack_ctrl_options(&cd->ctrls, info, ninfo); diff --git a/src/runtime/prte_globals.c b/src/runtime/prte_globals.c index 7e09ceb2503..9b62c83d792 100644 --- a/src/runtime/prte_globals.c +++ b/src/runtime/prte_globals.c @@ -172,6 +172,13 @@ bool prte_execute_quiet = false; bool prte_report_silent_errors = false; bool prte_hwloc_shmem_available = false; +/* flag about donau launch: DONAU_SSH - ssh/dstart + * DONAU_DRUN - drun(default) */ +int prte_donau_launch_type = DONAU_DRUN; + +/* exec path about drun/start from DONAU */ +char *donau_launch_exec = NULL; + /* See comment in src/tools/prun/debuggers.c about this MCA param */ bool prte_in_parallel_debugger = false; diff --git a/src/runtime/prte_globals.h b/src/runtime/prte_globals.h index 5ee815a9738..b02031bea7a 100644 --- a/src/runtime/prte_globals.h +++ b/src/runtime/prte_globals.h @@ -142,6 +142,10 @@ PRTE_EXPORT extern int prte_clean_output; /* define a default error return code for PRTE */ #define PRTE_ERROR_DEFAULT_EXIT_CODE 1 +/* define the default length of nodename and nodelist about donau */ +#define DONAU_MAX_NODENAME_LENGTH 63 +#define DONAU_MAX_NODELIST_LENGTH 36000 + /** * Define a macro for updating the prte_exit_status * The macro provides a convenient way of doing this @@ -627,6 +631,17 @@ PMIX_EXPORT void prte_hide_unused_params(int x, ...); #define PRTE_HIDE_UNUSED_PARAMS(...) 
#endif +/* flag about donau launch: DONAU_SSH - ssh/dstart + * DONAU_DRUN - drun(default) */ +__prte_attribute_visibility__("default") extern int prte_donau_launch_type; +enum { + DONAU_SSH = 0, + DONAU_DRUN = 1 +}; + +/* exec path about drun/start from DONAU */ +__prte_attribute_visibility__("default") extern char *donau_launch_exec; + END_C_DECLS #endif /* PRTE_RUNTIME_PRTE_GLOBALS_H */ diff --git a/src/tools/prte/prte.c b/src/tools/prte/prte.c index 4a5eea605e5..f343d8dd064 100644 --- a/src/tools/prte/prte.c +++ b/src/tools/prte/prte.c @@ -453,6 +453,13 @@ int main(int argc, char *argv[]) return rc; } } + + donau_launch_exec = getenv("OMPI_MCA_plm_rsh_agent"); + if (NULL != donau_launch_exec && (NULL != strstr(donau_launch_exec, "ssh") || + strstr(donau_launch_exec, "dstart"))) { + prte_donau_launch_type = DONAU_SSH; + } + // check if they asked for XML output from us opt = pmix_cmd_line_get_param(&results, PRTE_CLI_OUTPUT); if (NULL != opt) { -- Gitee