diff --git a/src/mca/rmaps/rank_file/rmaps_rank_file.c b/src/mca/rmaps/rank_file/rmaps_rank_file.c index ffd14d9a1fcfdec753a2c1421e5753d7b94c744c..35e5890f65bf3466f60bb32434c99c68b6591b1f 100644 --- a/src/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/src/mca/rmaps/rank_file/rmaps_rank_file.c @@ -64,6 +64,7 @@ prte_rmaps_base_module_t prte_rmaps_rank_file_module = { }; static int prte_rmaps_rank_file_parse(const char *); +static int prte_rmaps_donau_affinity_file_parse(const char *affinityfile); static char *prte_rmaps_rank_file_parse_string_or_int(void); static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, char **aff_rankfile); @@ -145,7 +146,11 @@ static int prte_rmaps_rf_map(prte_job_t *jdata, PRTE_JOBID_PRINT(jdata->nspace)); return PRTE_ERR_TAKE_NEXT_OPTION; } - if (PRTE_MAPPING_BYUSER != PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + + char *donau_affinity_path = getenv("CCS_COSCHED_MPI_AFFINITY_FILE"); + bool donau_affinity_avilable = (NULL != getenv("CCS_COSCHED_MPI_AFFINITY_FILE")) && + (0 != strlen(getenv("CCS_COSCHED_MPI_AFFINITY_FILE"))); + if ((!donau_affinity_avilable) && (PRTE_MAPPING_BYUSER != PRTE_GET_MAPPING_POLICY(jdata->map->mapping))) { /* NOT FOR US */ pmix_output_verbose(5, prte_rmaps_base_framework.framework_output, "mca:rmaps:rf: job %s not using rankfile policy", @@ -159,8 +164,8 @@ static int prte_rmaps_rf_map(prte_job_t *jdata, PRTE_JOBID_PRINT(jdata->nspace)); return PRTE_ERR_TAKE_NEXT_OPTION; } - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_FILE, (void **) &rankfile, PMIX_STRING) - || NULL == rankfile) { + if ((!prte_get_attribute(&jdata->attributes, PRTE_JOB_FILE, (void **) &rankfile, PMIX_STRING) + || NULL == rankfile) && !donau_affinity_avilable) { /* we cannot do it */ pmix_output_verbose(5, prte_rmaps_base_framework.framework_output, "mca:rmaps:rf: job %s no rankfile specified", @@ -208,9 +213,16 @@ static int prte_rmaps_rf_map(prte_job_t *jdata, PMIX_CONSTRUCT(&rankmap, pmix_pointer_array_t); /* 
parse the rankfile, storing its results in the rankmap */
-    if (PRTE_SUCCESS != (rc = prte_rmaps_rank_file_parse(rankfile))) {
-        rc = PRTE_ERR_SILENT;
-        goto error;
+    if (rankfile) {
+        if (PRTE_SUCCESS != (rc = prte_rmaps_rank_file_parse(rankfile))) {
+            rc = PRTE_ERR_SILENT;
+            goto error;
+        }
+    } else if (donau_affinity_avilable) {
+        if (PRTE_SUCCESS != (rc = prte_rmaps_donau_affinity_file_parse(donau_affinity_path))) {
+            PRTE_ERROR_LOG(rc);
+            goto error;
+        }
     }
 
     /* cycle through the app_contexts, mapping them sequentially */
@@ -486,6 +498,131 @@ error:
     return rc;
 }
 
+/*
+ * Parse the Donau affinity file and record one rank_file map entry per rank.
+ *
+ * Every line must carry four whitespace-separated fields:
+ *
+ *     <rank> <hostname> <physical-index> <logical-index>
+ *
+ * The physical index has to be present but is not used; the logical index
+ * is stored as the rank's slot list. Each entry is placed in rankmap at
+ * index <rank> and num_ranks is incremented (both are declared outside
+ * this function).
+ *
+ * Parsing stops quietly at the first line that does not have four fields
+ * or whose rank is not a decimal number; entries read before that line are
+ * kept and PRTE_SUCCESS is returned.
+ *
+ * @param affinityfile  path of the affinity file to read
+ *
+ * @return PRTE_SUCCESS              file was read (possibly only partially)
+ *         PRTE_ERR_BAD_PARAM        NULL path, negative or out-of-range rank,
+ *                                   or a logical index too long for slot_list
+ *         PRTE_ERR_OUT_OF_RESOURCE  an allocation or rankmap insertion failed
+ *         PRTE_ERROR                the file could not be opened
+ */
+static int prte_rmaps_donau_affinity_file_parse(const char *affinityfile)
+{
+    /* positions of the fields within one line of the file */
+    enum { DONAU_RANK, DONAU_HOST, DONAU_PHYSICAL, DONAU_LOGICAL, DONAU_NUM_FIELDS };
+    static const char delims[] = " \t\r\n";
+
+    int rc = PRTE_SUCCESS;
+    prte_rmaps_rank_file_map_t *rfmap = NULL;
+    prte_node_t *hnp_node;
+    FILE *fp;
+    char *line = NULL;
+    size_t len = 0;
+
+    if (NULL == affinityfile) {
+        return PRTE_ERR_BAD_PARAM;
+    }
+
+    /* get the hnp node's info */
+    hnp_node = (prte_node_t*)(prte_node_pool->addr[0]);
+
+    fp = fopen(affinityfile, "r");
+    if (NULL == fp) {
+        return PRTE_ERROR;
+    }
+
+    while (-1 != getline(&line, &len, fp)) {
+        char *field[DONAU_NUM_FIELDS];
+        char *cursor = line;
+        char *save = NULL;
+        char *endp = NULL;
+        long rank_val = -1;
+        int rank_ID;
+        int nfields;
+        size_t slot_len;
+
+        /* split in place; the tokens point into the getline() buffer */
+        for (nfields = 0; nfields < DONAU_NUM_FIELDS; nfields++) {
+            field[nfields] = strtok_r(cursor, delims, &save);
+            if (NULL == field[nfields]) {
+                break;
+            }
+            cursor = NULL;
+        }
+        if (DONAU_NUM_FIELDS == nfields) {
+            rank_val = strtol(field[DONAU_RANK], &endp, 10);
+        }
+        /* endp is still NULL when the line had too few fields */
+        if (NULL == endp || endp == field[DONAU_RANK] || '\0' != *endp) {
+            pmix_output_verbose(10, prte_rmaps_base_framework.framework_output,
+                                "rmaps/rank_file: Get the wrong num of params in CCS_COSCHED_MPI_AFFINITY_FILE");
+            break;
+        }
+
+        /* reject values that cannot be used as a rankmap index or that
+         * would overrun slot_list.
+         * NOTE(review): sizeof assumes slot_list is a char array member of
+         * prte_rmaps_rank_file_map_t - confirm against its declaration */
+        rank_ID = (int) rank_val;
+        slot_len = strlen(field[DONAU_LOGICAL]);
+        if (rank_val < 0 || (long) rank_ID != rank_val ||
+            slot_len >= sizeof(rfmap->slot_list)) {
+            pmix_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, affinityfile);
+            rc = PRTE_ERR_BAD_PARAM;
+            goto cleanup;
+        }
+
+        rfmap = PMIX_NEW(prte_rmaps_rank_file_map_t);
+        if (NULL == rfmap) {
+            rc = PRTE_ERR_OUT_OF_RESOURCE;
+            goto cleanup;
+        }
+
+        /* check if this is the local node */
+        if (NULL != hnp_node && pmix_ifislocal(field[DONAU_HOST])) {
+            rfmap->node_name = strdup(hnp_node->name);
+        } else {
+            rfmap->node_name = strdup(field[DONAU_HOST]);
+        }
+        if (NULL == rfmap->node_name) {
+            PMIX_RELEASE(rfmap);
+            rc = PRTE_ERR_OUT_OF_RESOURCE;
+            goto cleanup;
+        }
+        /* length was checked above; the copy includes the terminating NUL */
+        memcpy(rfmap->slot_list, field[DONAU_LOGICAL], slot_len + 1);
+
+        /* publish the entry only once it is complete */
+        if (PMIX_SUCCESS != pmix_pointer_array_set_item(&rankmap, rank_ID, rfmap)) {
+            PMIX_RELEASE(rfmap);
+            rc = PRTE_ERR_OUT_OF_RESOURCE;
+            goto cleanup;
+        }
+        num_ranks++; // keep track of number of provided ranks
+    }
+
+cleanup:
+    free(line);
+    fclose(fp);
+    return rc;
+}
+
 static int prte_rmaps_rank_file_parse(const char *rankfile)
 {
     int token;