diff --git a/.ci/README.md b/.ci/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c32a0fd6400c6a1cf62ba1a06786fd2d2274851f
--- /dev/null
+++ b/.ci/README.md
@@ -0,0 +1,4 @@
+Top-level directory for CI tests.
+
+Feel free to make your own subdirectory (e.g., for your organization)
+and put CI tests and supporting infrastructure here.
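+
+For example, this tree currently contains:
+
+* `community-jenkins/` - community Jenkins pipeline and PR builder script
+* `lanl/` - LANL GitLab CI configuration
+* `mellanox/` - Mellanox Azure Pipelines CI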
diff --git a/.ci/community-jenkins/Jenkinsfile b/.ci/community-jenkins/Jenkinsfile
new file mode 100644
index 0000000000000000000000000000000000000000..f05ae1b1a9ee9a27b1b8fb5d54cefa015d9b728b
--- /dev/null
+++ b/.ci/community-jenkins/Jenkinsfile
@@ -0,0 +1,102 @@
+// -*- groovy -*-
+//
+// Copyright (c) 2022-2023 Amazon.com, Inc. or its affiliates. All rights
+// reserved.
+// Copyright (c) 2022-2023 Joe Downs. All rights reserved.
+// $COPYRIGHT$
+//
+// Additional copyrights may follow
+//
+// $HEADER$
+//
+// Build an Open MPI Pull Request
+//
+//
+// WORKSPACE Layout:
+// autotools-install/ Autotools install for the builder
+// ompi/ Open MPI source tree
+
+// If we push changes to a PR, we don't need to keep old jobs running, so
+// we'll use the milestone step in Jenkins. Using an example from
+// https://stackoverflow.com/questions/40760716/jenkins-abort-running-build-if-new-one-is-started:
+//
+// - Build 1 runs and creates milestone 1.
+// - While build 1 is running, build 2 fires. It has milestone 1 and milestone
+// 2. It passes milestone 1, which causes build 1 to abort.
+def buildNumber = env.BUILD_NUMBER as int
+if (buildNumber > 1) {
+ milestone(buildNumber - 1)
+}
+milestone(buildNumber)
+
+// Add a build description linking back to the PR. This is redundant with the "GitHub"
+// link on the Pull Request page, but the Build page does not have a direct link
+// back to the PR. The "Details" link at the bottom of the GitHub PR page brings
+// you to the Jenkins Build page, so we're adding the link back to the GitHub PR
+// page.
+currentBuild.description = "This is a build of Open MPI PR #${CHANGE_ID}"
+
+check_stages = prepare_check_stages()
+println("Initialized Pipeline")
+
+// Today, we expect to have only one stage (do everything), but allow for
+// the possibility of splitting the build and test stages in the future.
+for (check_stage in check_stages) {
+ parallel(check_stage)
+}
+
+println('Tests Completed')
+
+// Returns a list of build stages ("Build Open MPI", "Build Tests", etc.),
+// where each build stage is a map of different configurations to test.
+// Currently, we only support the single "everything" stage.
+def prepare_check_stages() {
+ def configure_options = ["--disable-dlopen", "--disable-oshmem", "--enable-builtin-atomic", "--enable-ipv6"]
+ def compilers = ["clang10", "gcc5", "gcc6", "gcc7", "gcc8", "gcc9", "gcc10"]
+ def platforms = ["amazon_linux_2", "amazon_linux_2-arm64", "rhel7", "rhel8", "ubuntu_18.04"]
+ def check_stages_list = []
+
+ // Build everything stage
+ def build_parallel_map = [:]
+ for (platform in platforms) {
+ def name = "Platform: ${platform}".replaceAll("-", "")
+ build_parallel_map.put(name, prepare_build(name, platform, ""))
+ }
+
+ for (compiler in compilers) {
+ def name = "Compiler: ${compiler}".replaceAll("-", "")
+ build_parallel_map.put(name, prepare_build(name, compiler, "--compiler \\\"${compiler}\\\""))
+ }
+
+ for (configure_option in configure_options) {
+ def name = "Configure: ${configure_option}".replaceAll("-", "")
+ build_parallel_map.put(name, prepare_build(name, "(ec2&&linux)", "--configure-args \\\"${configure_option}\\\""))
+ }
+
+ build_parallel_map.put("distcheck", prepare_build("distcheck", "tarball_build", "--distcheck"))
+
+ check_stages_list.add(build_parallel_map)
+
+ return check_stages_list
+}
+
+def prepare_build(build_name, label, build_arg) {
+ return {
+ stage("${build_name}") {
+ node(label) {
+ checkout(changelog: false, poll: false, scm: scm)
+ // If pr-builder.sh fails, the sh step will throw an exception,
+ // which we catch so that the job doesn't abort and continues on
+ // to other steps - such as cleanup. Because we catch the
+ // exception, we need to tell Jenkins the overall job has
+ // failed.
+ try {
+ sh "/bin/bash -x .ci/community-jenkins/pr-builder.sh ${build_arg} ompi"
+ } catch (Exception e) {
+ currentBuild.result = "FAILURE"
+ }
+ cleanWs(notFailBuild: true)
+ }
+ }
+ }
+}
diff --git a/.ci/community-jenkins/pr-builder.sh b/.ci/community-jenkins/pr-builder.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d721e36562448d71c5737973332ef2d146c3f646
--- /dev/null
+++ b/.ci/community-jenkins/pr-builder.sh
@@ -0,0 +1,351 @@
+#!/bin/bash
+#
+# Copyright (c) 2022-2023 Amazon.com, Inc. or its affiliates. All rights
+# reserved.
+# Copyright (c) 2022-2023 Joe Downs. All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+
+# Abort on errors, unset variables, and failures in pipelines
+set -euo pipefail
+
+BUILD_32BIT=0
+COMPILER=
+DISTCHECK=0
+AUTOGEN_ARGS=
+CONFIGURE_ARGS=
+MAKE_ARGS=
+MAKE_J="-j 8"
+PREFIX="${WORKSPACE}/install"
+MPIRUN_MODE=${MPIRUN_MODE:-runall}
+
+#
+# Options Parsing
+#
+# For each option, we need to remove the quotes from their arguments. Without
+# quotes, the command-line options for later commands (such as
+# --disable-oshmem), are interpreted (in the following switch statement) as
+# options for this script.
+
+strip_quotes() {
+    echo "$1" | sed -e 's/"\([[:alnum:]_-]*\)"/\1/'
+}
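+
+# As an illustration (mirroring how the community Jenkinsfile invokes this
+# script), a call such as:
+#
+#   pr-builder.sh --configure-args \"--disable-dlopen\" ompi
+#
+# arrives with $2 set to the literal string "--disable-dlopen" (quote
+# characters included, which is what keeps the option-like argument from
+# being parsed as a flag), and strip_quotes reduces it to --disable-dlopen.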
+
+PARAMS=""
+while (( "$#" )); do
+ case "$1" in
+ --distcheck)
+ DISTCHECK=1
+ shift
+ ;;
+ --autogen-args)
+ if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
+ AUTOGEN_ARGS=$(strip_quotes $2)
+ shift 2
+ else
+ echo "Error: Argument for $1 is missing" >&2
+ exit 1
+ fi
+ ;;
+ --configure-args)
+ if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
+ CONFIGURE_ARGS=$(strip_quotes $2)
+ shift 2
+ else
+ echo "Error: Argument for $1 is missing" >&2
+ exit 1
+ fi
+ ;;
+ --compiler)
+ if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
+ COMPILER=$(strip_quotes $2)
+ shift 2
+ else
+ echo "Error: Argument for $1 is missing" >&2
+ exit 1
+ fi
+ ;;
+ --mpirun-mode)
+ if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
+ MPIRUN_MODE=$(strip_quotes $2)
+ shift 2
+ else
+ echo "Error: Argument for $1 is missing" >&2
+ exit 1
+ fi
+ ;;
+ -*|--*=) # Unsupported flags
+ echo "Error: Unsupported flag $1" >&2
+ exit 1
+ ;;
+ *) # Preserve positional arguments
+ PARAMS="$PARAMS $1"
+ shift
+ ;;
+ esac
+done
+# Set positional arguments in their proper place.
+eval set -- "$PARAMS"
+
+#
+# Start by figuring out what we are...
+#
+os=`uname -s`
+if test "${os}" = "Linux"; then
+ eval "PLATFORM_ID=`sed -n 's/^ID=//p' /etc/os-release`"
+ eval "VERSION_ID=`sed -n 's/^VERSION_ID=//p' /etc/os-release`"
+else
+ PLATFORM_ID=`uname -s`
+ VERSION_ID=`uname -r`
+fi
+
+echo "--> platform: $PLATFORM_ID"
+echo "--> version: $VERSION_ID"
+
+#
+# See if builder provided a compiler we should use, and translate it to
+# CONFIGURE_ARGS.
+#
+case ${PLATFORM_ID} in
+ rhel)
+ case "$COMPILER" in
+ gcc48|"")
+ echo "--> Using default compilers"
+ ;;
+ *)
+ echo "Unsupported compiler ${COMPILER}. Aborting"
+ exit 1
+ ;;
+ esac
+ ;;
+ amzn)
+ case "$COMPILER" in
+ "")
+ echo "--> Using default compilers"
+ ;;
+ gcc44)
+ CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc44 CXX=g++44 FC=gfortran44"
+ ;;
+ gcc48)
+ CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc48 CXX=g++48 FC=gfortran48"
+ ;;
+ clang36)
+ CONFIGURE_ARGS="$CONFIGURE_ARGS CC=clang CXX=clang++ --disable-mpi-fortran"
+ ;;
+ *)
+ echo "Unsupported compiler ${COMPILER}. Aborting"
+ exit 1
+ ;;
+ esac
+ ;;
+ ubuntu)
+ case "$COMPILER" in
+ "")
+ echo "--> Using default compilers"
+ ;;
+ gcc4*)
+ version=`echo "$COMPILER" | sed -e 's/gcc4\([0-9]*\)/4.\1/'`
+ CONFIGURE_ARGS="CC=gcc-${version} CXX=g++-${version} FC=gfortran-${version}"
+ ;;
+ gcc*)
+ version=`echo "$COMPILER" | sed -e 's/gcc\([0-9]*\)/\1/'`
+ CONFIGURE_ARGS="CC=gcc-${version} CXX=g++-${version} FC=gfortran-${version}"
+ ;;
+ clang3*|clang4*|clang5*|clang6*)
+ version=`echo "$COMPILER" | sed -e 's/clang\([0-9]\)\([0-9]*\)/\1.\2/'`
+ CONFIGURE_ARGS="CC=clang-${version} CXX=clang++-${version} --disable-mpi-fortran"
+ ;;
+ clang*)
+ version=`echo "$COMPILER" | sed -e 's/clang\([0-9]*\)/\1/'`
+ CONFIGURE_ARGS="CC=clang-${version} CXX=clang++-${version} --disable-mpi-fortran"
+ ;;
+ *)
+ echo "Unsupported compiler ${COMPILER}. Aborting"
+ exit 1
+ ;;
+ esac
+ ;;
+ sles)
+ case "$COMPILER" in
+ "")
+ echo "--> Using default compilers"
+ ;;
+ gcc48)
+ CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-48 CXX=g++-48 FC=gfortran-48"
+ ;;
+ gcc5)
+ CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-5 CXX=g++-5 FC=gfortran-5"
+ ;;
+ gcc6)
+ CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-6 CXX=g++-6 FC=gfortran-6"
+ ;;
+ *)
+ echo "Unsupported compiler ${COMPILER}. Aborting"
+ exit 1
+ ;;
+ esac
+ ;;
+ FreeBSD)
+ CONFIGURE_ARGS="$CONFIGURE_ARGS LDFLAGS=-Wl,-rpath,/usr/local/lib/gcc5 --with-wrapper-ldflags=-Wl,-rpath,/usr/local/lib/gcc5"
+ ;;
+esac
+
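+# As a worked example of the translation above: passing "--compiler gcc10"
+# on an Ubuntu builder yields CONFIGURE_ARGS="CC=gcc-10 CXX=g++-10 FC=gfortran-10".
+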
+echo "--> Compiler setup: $CONFIGURE_ARGS"
+
+#
+# Add any Autogen or Configure arguments provided by the builder job.
+#
+if test "$AUTOGEN_ARGS" != ""; then
+    # Special case: the OMPI layer cannot build without the runtime, so
+    # --no-orte alone must be expanded to also pass --no-ompi.
+ if test "$AUTOGEN_ARGS" = "--no-orte"; then
+ AUTOGEN_ARGS="--no-orte --no-ompi"
+ fi
+fi
+
+echo "--> Autogen arguments: $AUTOGEN_ARGS"
+echo "--> Configure arguments: $CONFIGURE_ARGS"
+
+# Build
+sha1=`git rev-parse HEAD`
+echo "--> Building commit ${sha1}"
+
+if test -f autogen.pl; then
+ echo "--> running ./autogen.pl ${AUTOGEN_ARGS}"
+ ./autogen.pl ${AUTOGEN_ARGS}
+else
+ if test "${AUTOGEN_ARGS}" != ""; then
+ echo "--> Being a coward and not running with special autogen arguments and autogen.sh"
+ exit 1
+ else
+ echo "--> running ./atogen.sh"
+ ./autogen.sh
+ fi
+fi
+
+echo "--> running ./configure --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS}"
+if ! ./configure --prefix="${PREFIX}" ${CONFIGURE_ARGS}; then
+ echo "./configure --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS} failed, ABORTING !"
+ if test -f config.log; then
+ echo "config.log content :"
+ cat config.log
+ else
+ echo "no config.log was generated"
+ fi
+ exit 1
+fi
+
+# Shortcut for the distcheck case, as it won't run any tests beyond the built-in
+# make check tests. We need to install the requirements (Sphinx) so we can build
+# the docs.
+if test "${DISTCHECK}" = "1"; then
+ echo "--> running make ${MAKE_ARGS} distcheck"
+ make ${MAKE_ARGS} distcheck
+ exit $?
+fi
+
+echo "--> running make ${MAKE_J} ${MAKE_ARGS} all"
+make ${MAKE_J} ${MAKE_ARGS} all
+echo "--> running make check"
+make ${MAKE_ARGS} check
+echo "--> running make install"
+make ${MAKE_ARGS} install
+
+export PATH="${PREFIX}/bin":${PATH}
+
+case "$AUTOGEN_ARGS" in
+ *--no-ompi*)
+ echo "--> Skipping MPI tests due to --no-ompi"
+ exit 0
+ ;;
+esac
+
+echo "--> running ompi_info"
+ompi_info
+
+echo "--> running make all in examples"
+cd "examples"
+make ${MAKE_ARGS} all
+cd ..
+
+# With set -e, the error code is not printed and it is hard to determine
+# which command failed, so for the tests, we do per-command checking...
+set +e
+
+run_example() {
+ example=`basename ${2}`
+ echo "--> Running example: $example"
+ ${1} ${2}
+ ret=$?
+ if test ${ret} -ne 0 ; then
+ echo "Example failed: ${ret}"
+ echo "Command was: ${1} ${2}"
+ exit ${ret}
+ fi
+}
+
+if test "${MPIRUN_MODE}" != "none"; then
+ echo "--> running examples"
+ echo "localhost cpu=2" > "${WORKSPACE}/hostfile"
+ # Note: using perl here because figuring out a portable sed regexp
+ # proved to be a little challenging.
+    mpirun_version=`"${WORKSPACE}/install/bin/mpirun" --version | perl -wnE 'say $1 if /mpirun [^\d]*(\d+\.\d+)/'`
+ echo "--> mpirun version: ${mpirun_version}"
+ case ${mpirun_version} in
+ 1.*|2.0*)
+ exec="timeout -s SIGSEGV 3m mpirun -hostfile ${WORKSPACE}/hostfile -np 2 "
+ ;;
+ *)
+ exec="timeout -s SIGSEGV 4m mpirun --get-stack-traces --timeout 180 --hostfile ${WORKSPACE}/hostfile -np 2 --bind-to none "
+ ;;
+ esac
+ singleton="timeout -s SIGSEGV 1m "
+ run_example "${exec}" ./examples/hello_c
+ run_example "${singleton}" ./examples/hello_c
+ run_example "${exec}" ./examples/ring_c
+ run_example "${singleton}" ./examples/ring_c
+ run_example "${exec}" ./examples/connectivity_c
+    if ompi_info --parsable | grep -q bindings:cxx:yes; then
+ echo "--> running C++ examples"
+ run_example "${exec}" ./examples/hello_cxx
+ run_example "${singleton}" ./examples/hello_cxx
+ run_example "${exec}" ./examples/ring_cxx
+ run_example "${singleton}" ./examples/ring_cxx
+ else
+ echo "--> skipping C++ examples"
+ fi
+    if ompi_info --parsable | grep -q bindings:mpif.h:yes; then
+ echo "--> running mpif examples"
+ run_example "${exec}" ./examples/hello_mpifh
+ run_example "${singleton}" ./examples/hello_mpifh
+ run_example "${exec}" ./examples/ring_mpifh
+ run_example "${singleton}" ./examples/ring_mpifh
+ else
+ echo "--> skipping mpif examples"
+ fi
+    if ompi_info --parsable | grep -E -q 'bindings:use_mpi:"?yes'; then
+ echo "--> running usempi examples"
+ run_example "${exec}" ./examples/hello_usempi
+ run_example "${singleton}" ./examples/hello_usempi
+ run_example "${exec}" ./examples/ring_usempi
+ run_example "${singleton}" ./examples/ring_usempi
+ else
+ echo "--> skipping usempi examples"
+ fi
+    if ompi_info --parsable | grep -q bindings:use_mpi_f08:yes; then
+ echo "--> running usempif08 examples"
+ run_example "${exec}" ./examples/hello_usempif08
+ run_example "${singleton}" ./examples/hello_usempif08
+ run_example "${exec}" ./examples/ring_usempif08
+ run_example "${singleton}" ./examples/ring_usempif08
+ else
+ echo "--> skipping usempif08 examples"
+ fi
+else
+ echo "--> Skipping examples (MPIRUN_MODE = none)"
+fi
+
+echo "--> All done!"
diff --git a/.ci/lanl/gitlab-darwin-ci.yml b/.ci/lanl/gitlab-darwin-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..57b78c29ba78110540da581a89c94a10b246e9f9
--- /dev/null
+++ b/.ci/lanl/gitlab-darwin-ci.yml
@@ -0,0 +1,164 @@
+variables:
+ SCHEDULER_PARAMETERS: "-pgeneral -t 4:00:00 -N 1 --ntasks-per-node=16"
+ GIT_STRATEGY: clone
+ NPROCS: 4
+
+stages:
+ - build
+ - test
+
+build:intel:
+ stage: build
+ tags: [darwin-slurm-shared]
+ script:
+ - module load intel/2022.0.1
+ - rm .gitmodules
+ - cp $GITSUBMODULEPATCH .gitmodules
+ - git submodule update --init --recursive
+ - ./autogen.pl
+ - ./configure CC=icx FC=ifx CXX=icpx --prefix=$PWD/install_test --with-libevent=internal
+ - make -j 8 install
+ - make check
+ - export PATH=$PWD/install_test/bin:$PATH
+ - cd examples
+ - make
+ artifacts:
+ name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+ untracked: true
+ paths:
+ - examples
+ - install_test
+ expire_in: 1 week
+
+build:amd:
+ stage: build
+ tags: [darwin-slurm-shared]
+ variables:
+ SCHEDULER_PARAMETERS: "-pamd-rome -t 4:00:00 -N 1 --ntasks-per-node=16"
+ script:
+ - module load aocc/3.0.0
+ - rm .gitmodules
+ - cp $GITSUBMODULEPATCH .gitmodules
+ - git submodule update --init --recursive
+ - ./autogen.pl
+ - ./configure CC=clang FC=flang CXX=clang++ --prefix=$PWD/install_test --with-libevent=internal LIBS="-lucm -lucs"
+ - make -j 8 install
+ - make check
+ - export PATH=$PWD/install_test/bin:$PATH
+ - cd examples
+ - make
+ artifacts:
+ name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+ untracked: true
+ paths:
+ - examples
+ - install_test
+ expire_in: 1 week
+
+build:gnu:
+ stage: build
+ tags: [darwin-slurm-shared]
+ script:
+ - module load gcc
+ - rm .gitmodules
+ - cp $GITSUBMODULEPATCH .gitmodules
+ - git submodule update --init --recursive
+ - ./autogen.pl
+ - ./configure --prefix=$PWD/install_test --with-libevent=internal
+ - make -j 8 install
+ - make check
+ - export PATH=$PWD/install_test/bin:$PATH
+ - cd examples
+ - make
+ artifacts:
+ name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+ untracked: true
+ paths:
+ - examples
+ - install_test
+ expire_in: 1 week
+
+test:intel:
+ stage: test
+ tags: [darwin-slurm-shared]
+ dependencies:
+ - build:intel
+ needs: ["build:intel"]
+ script:
+ - pwd
+ - ls
+ - module load intel/2022.0.1
+ - export PATH=$PWD/install_test/bin:$PATH
+ - which mpirun
+ - cd examples
+ - mpirun -np 4 hostname
+ - mpirun -np 4 ./hello_c
+ - mpirun -np 4 ./ring_c
+ - mpirun -np 4 ./hello_mpifh
+ - mpirun -np 4 ./ring_mpifh
+ - mpirun -np 4 ./hello_usempi
+ - mpirun -np 4 ./ring_usempi
+ - mpirun -np 4 ./hello_usempif08
+ - mpirun -np 4 ./ring_usempif08
+ - mpirun -np 4 ./connectivity_c
+ artifacts:
+ name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+ expire_in: 1 week
+
+test:amd:
+ stage: test
+ tags: [darwin-slurm-shared]
+ variables:
+ SCHEDULER_PARAMETERS: "-pamd-rome -t 2:00:00 -N 1 --ntasks-per-node=16"
+ dependencies:
+ - build:amd
+ needs: ["build:amd"]
+ script:
+ - pwd
+ - ls
+ - module load aocc/3.0.0
+ - export PATH=$PWD/install_test/bin:$PATH
+ - export LD_LIBRARY_PATH=$PWD/install_test/lib:$LD_LIBRARY_PATH
+ - which mpirun
+ - cd examples
+ - mpirun -np 4 hostname
+ - mpirun -np 4 ./hello_c
+ - mpirun -np 4 ./ring_c
+ - mpirun -np 4 ./hello_mpifh
+ - mpirun -np 4 ./ring_mpifh
+ - mpirun -np 4 ./hello_usempi
+ - mpirun -np 4 ./ring_usempi
+ - mpirun -np 4 ./hello_usempif08
+ - mpirun -np 4 ./ring_usempif08
+ - mpirun -np 4 ./connectivity_c
+ artifacts:
+ name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+ expire_in: 1 week
+
+test:gnu:
+ stage: test
+ tags: [darwin-slurm-shared]
+ dependencies:
+ - build:gnu
+ needs: ["build:gnu"]
+ script:
+ - pwd
+ - ls
+ - module load gcc
+ - export PATH=$PWD/install_test/bin:$PATH
+ - which mpirun
+ - cd examples
+ - mpirun -np 4 hostname
+ - mpirun -np 4 ./hello_c
+ - mpirun -np 4 ./ring_c
+ - mpirun -np 4 ./hello_mpifh
+ - mpirun -np 4 ./ring_mpifh
+ - mpirun -np 4 ./hello_usempi
+ - mpirun -np 4 ./ring_usempi
+ - mpirun -np 4 ./hello_usempif08
+ - mpirun -np 4 ./ring_usempif08
+ - mpirun -np 4 ./connectivity_c
+ artifacts:
+ name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+ expire_in: 1 week
+
diff --git a/.ci/mellanox/README.md b/.ci/mellanox/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2a9d5c09b15ee0e4aa71520578f60dd1ae8c60b9
--- /dev/null
+++ b/.ci/mellanox/README.md
@@ -0,0 +1,16 @@
+# Open MPI Continuous Integration (CI) Services
+## Mellanox Open MPI CI
+### Scope
+[Mellanox](https://www.mellanox.com/) Open MPI CI is intended to verify Open MPI with recent Mellanox SW components ([Mellanox OFED](https://www.mellanox.com/page/products_dyn?product_family=26), [UCX](https://www.mellanox.com/page/products_dyn?product_family=281&mtag=ucx) and other [HPC-X](https://www.mellanox.com/page/products_dyn?product_family=189&mtag=hpc-x) components) in the Mellanox lab environment.
+
+CI is managed by [Azure Pipelines](https://docs.microsoft.com/en-us/azure/devops/pipelines/?view=azure-devops) service.
+
+Mellanox Open MPI CI includes:
+* Building Open MPI with internal stable engineering versions of UCX and HCOLL. The build runs in a Docker-based environment.
+* Sanity functional testing.
+### How to Run CI
+Mellanox Open MPI CI is triggered upon the following events:
+* Creating a pull request (PR). CI status is visible in the PR status. CI restarts automatically upon each new commit within the PR. CI status and log files are also available on the Azure DevOps server.
+* Triggering CI with special PR comments (for example, `/azp run`). Comment triggers are available only if the comment author has write permission to the PR target repo. Detailed information about comment triggers is available in the official Azure DevOps [documentation](https://docs.microsoft.com/en-us/azure/devops/pipelines/repos/github?view=azure-devops&tabs=yaml#comment-triggers).
+### Support
+In case of any issues, questions, or suggestions, please contact the [Mellanox Open MPI CI support team](mailto:swx-azure-svc@mellanox.com).
diff --git a/.ci/mellanox/azure-pipelines.yml b/.ci/mellanox/azure-pipelines.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6f7651f015bb9b17e8324ff230b136ae2c5e54f7
--- /dev/null
+++ b/.ci/mellanox/azure-pipelines.yml
@@ -0,0 +1,38 @@
+trigger: none
+pr:
+ - main
+ - v*.*.x
+
+pool:
+ name: Default
+ demands:
+ - AGENT_CI_TARGET -equals ompi
+ - MLNX_IB_DEVICE -equals yes
+
+variables:
+ ompi_jenkins_scripts_git_repo_url: https://github.com/mellanox-hpc/jenkins_scripts.git
+ ompi_jenkins_scripts_git_branch: master
+ # Enable debug information, supported values: true, false
+ debug: true
+
+jobs:
+- job: mellanox_ompi_ci
+ displayName: Mellanox Open MPI CI
+ timeoutInMinutes: 90
+ container:
+ image: rdmz-harbor.rdmz.labs.mlnx/hpcx/ompi_ci:latest
+ options: -v /hpc/local:/hpc/local -v /opt:/opt --uts=host --ipc=host --ulimit stack=67108864
+ --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/
+ steps:
+ - checkout: self
+ submodules: recursive
+ path: ompi
+ clean: true
+ - bash: |
+ set -eE
+      if [ "$(debug)" = "true" ]; then set -x; fi
+ rm -rf $(Pipeline.Workspace)/jenkins_scripts
+ git clone $(ompi_jenkins_scripts_git_repo_url) --branch $(ompi_jenkins_scripts_git_branch) $(Pipeline.Workspace)/jenkins_scripts
+ export WORKSPACE=$(Pipeline.Workspace)/ompi
+ $(Pipeline.Workspace)/jenkins_scripts/jenkins/ompi/ompi_test.sh
+ displayName: Build and test Open MPI
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..d4249d80c2db2c0dfe0dc2ce307d1d914f8a0135
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,173 @@
+# This file represents the coding style enforced by Open MPI. This file
+# is based on the long-held, but not enforced, guidelines from the
+# beginning of the project. We will be requiring that all code going
+# forward uses this style. To check your code before attempting to open
+# a PR, install clang-format and run your commits through clang-format.
+#
+# To install clang-format:
+#
+# macOS:
+# Homebrew: brew install clang-format
+#   MacPorts: port install clang
+#
+# Linux:
+#   debian/ubuntu/raspbian: apt-get install clang-format
+# redhat/fedora: yum install clang-format
+#
+# To run against your code changes:
+#
+# unstaged changes: git clang-format --style file -f
+# staged changes: git clang-format --style file
+#
+# For interactive staging, add the -p option.
+#
+# To run against all of Open MPI:
+#
+# ./contrib/clang-format-ompi.sh
+#
+# This command is intended to be run only once.
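+#
+# As a rough illustration (example code, not part of the configuration),
+# the settings below -- IndentWidth: 4, BraceWrapping AfterFunction: true,
+# AfterControlStatement: Never, PointerAlignment: Right -- shape code like:
+#
+#     int my_function(int *arg)
+#     {
+#         if (NULL == arg) {
+#             return 1;
+#         }
+#         return 0;
+#     }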
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveMacros: true
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterCaseLabel: false
+ AfterClass: false
+ AfterControlStatement: Never
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: false
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ AfterExternBlock: false
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ SplitEmptyFunction: true
+ SplitEmptyRecord: true
+ SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: true
+BreakBeforeBraces: Custom
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 100
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: false
+DisableFormat: false
+FixNamespaceComments: true
+ForEachMacros:
+ - foreach
+ - Q_FOREACH
+ - BOOST_FOREACH
+ - OPAL_LIST_FOREACH
+ - OPAL_LIST_FOREACH_DECL
+ - OPAL_LIST_FOREACH_SAFE
+ - OPAL_LIST_FOREACH_REV
+ - OPAL_LIST_FOREACH_SAFE_REV
+ - OPAL_HASH_TABLE_FOREACH
+ - OPAL_HASH_TABLE_FOREACH_PTR
+IncludeBlocks: Preserve
+IncludeCategories:
+ # Ensure config includes always come first (opal_config.h, ompi_config.h, etc)
+ - Regex: '^".*_config\.h"'
+ Priority: -1
+ # In-tree includes come next (after main include)
+ - Regex: '^".*"'
+ Priority: 2
+ # System includes come last
+ - Regex: '^<.*>'
+ Priority: 3
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentCaseLabels: false
+IndentGotoLabels: true
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 250
+PenaltyBreakBeforeFirstCallParameter: 301
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard: Latest
+StatementMacros:
+ - Q_UNUSED
+ - QT_REQUIRE_VERSION
+ - BEGIN_C_DECLS
+ - END_C_DECLS
+TabWidth: 8
+UseCRLF: false
+UseTab: Never
+...
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..0d17ee08ec1ca44b29b7de1d8f2ef543cd8526ec
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,11 @@
+# How to contribute to Open MPI
+
+First off, thank you for taking the time to prepare a contribution to
+Open MPI!
+
+Open MPI is hosted on GitHub, and we gladly accept pull requests.
+Please see the [Contributing
+guidelines](https://docs.open-mpi.org/en/main/contributing.html) for
+details on how to contribute to Open MPI.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..276fa10c19ac42a180c837c1856e83db279d095b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,41 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+Thank you for taking the time to submit an issue!
+
+## Background information
+
+### What version of Open MPI are you using? (e.g., v3.0.5, v4.0.2, git branch name and hash, etc.)
+
+
+
+### Describe how Open MPI was installed (e.g., from a source/distribution tarball, from a git clone, from an operating system distribution package, etc.)
+
+
+
+### If you are building/installing from a git clone, please copy-n-paste the output from `git submodule status`.
+
+
+
+### Please describe the system on which you are running
+
+* Operating system/version:
+* Computer hardware:
+* Network type:
+
+-----------------------------
+
+## Details of the problem
+
+Please describe, in detail, the problem that you are having, including the behavior you expect to see, the actual behavior that you are seeing, steps to reproduce the problem, etc. It is most helpful if you can attach a small program that a developer can use to reproduce your problem.
+
+**Note**: If you include verbatim output (or a code block), please use a [GitHub Markdown](https://help.github.com/articles/creating-and-highlighting-code-blocks/) code block like below:
+```shell
+shell$ mpirun -n 2 ./hello_world
+```
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000000000000000000000000000000..bbcbbe7d61558adde3cbfd0c7a63a67c27ed6d30
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b9132361427ca4e8c26b2d18d8d314c38c0d6e98
--- /dev/null
+++ b/.github/workflows/README.md
@@ -0,0 +1,7 @@
+Be aware that changes to the contents of these files will affect the
+Pull Request in which you make the changes!
+
+For example, if you create a PR that changes one of the GitHub Actions
+in this directory, it will be used in the CI *for that PR*.
+
+You have been warned. :smile:
diff --git a/.github/workflows/compile-cuda.yaml b/.github/workflows/compile-cuda.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0bddcd3c744c4a99b03ed01fb3695535d2f7c84a
--- /dev/null
+++ b/.github/workflows/compile-cuda.yaml
@@ -0,0 +1,28 @@
+name: CUDA
+
+on: [pull_request]
+
+env:
+ CUDA_PATH: /usr/local/cuda
+jobs:
+ compile-cuda:
+ runs-on: ubuntu-22.04
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt update
+ sudo apt install -y --no-install-recommends wget
+ - name: Install extra dependencies
+ run: |
+ wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-keyring_1.1-1_all.deb
+ sudo dpkg -i cuda-keyring_1.1-1_all.deb
+ sudo apt update
+ sudo apt install -y cuda-toolkit
+ - uses: actions/checkout@v4
+ with:
+ submodules: recursive
+ - name: Build Open MPI
+ run: |
+ ./autogen.pl
+ ./configure --prefix=${PWD}/install --with-cuda=${CUDA_PATH} --with-cuda-libdir=${CUDA_PATH}/lib64/stubs
+ make -j
diff --git a/.github/workflows/compile-rocm.yaml b/.github/workflows/compile-rocm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2ce2a80f01ae253594a0e67e4b47628305a439c9
--- /dev/null
+++ b/.github/workflows/compile-rocm.yaml
@@ -0,0 +1,31 @@
+name: ROCM
+
+on: [pull_request]
+
+env:
+ ROCM_VER: 5-4
+jobs:
+ compile-rocm:
+ runs-on: ubuntu-22.04
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt update
+ sudo apt install -y --no-install-recommends wget lsb-core software-properties-common gpg curl
+ - name: Install extra dependencies
+ run: |
+ sudo mkdir --parents --mode=0755 /etc/apt/keyrings
+ wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+ echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/5.7.1/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
+ echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7.1 jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
+ echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+ sudo apt update
+ sudo apt install -y rocm-hip-runtime
+ - uses: actions/checkout@v4
+ with:
+ submodules: recursive
+ - name: Build Open MPI
+ run: |
+ ./autogen.pl
+ ./configure --prefix=${PWD}/install --with-rocm=/opt/rocm --disable-mpi-fortran
+ LD_LIBRARY_PATH=/opt/rocm/lib make -j
diff --git a/.github/workflows/macos-checks.yaml b/.github/workflows/macos-checks.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0b90adeb9a32149f10a1207ad54ad45618250234
--- /dev/null
+++ b/.github/workflows/macos-checks.yaml
@@ -0,0 +1,45 @@
+name: macOS
+
+on: [pull_request]
+
+jobs:
+ macOS:
+ runs-on: macos-latest
+ steps:
+ - name: Setup macOS
+ run: |
+ # Copied from mpi4py/mpi-publish
+ # create gfortran symlink
+ cd $(brew --prefix)/bin
+ gfortran=$(ls gfortran-* | sort | head -n 1)
+ sudo ln -s $gfortran gfortran
+ # install autotools
+ brew install autoconf
+ brew install automake
+ brew install libtool
+ # unlink libevent
+ brew unlink libevent || true
+ - uses: actions/checkout@v4
+ with:
+ submodules: recursive
+ - name: Build Open MPI
+ run: |
+ ./autogen.pl
+ ./configure --prefix=/opt/openmpi
+ make -j $(sysctl -n hw.logicalcpu)
+ - name: Run unit tests
+ run: |
+ make check
+ - name: Install Open MPI
+ run: |
+ sudo make install
+ - name: Add Open MPI to PATH
+ run: echo /opt/openmpi/bin >> $GITHUB_PATH
+ - name: Build examples
+ run: |
+ pushd examples
+ make
+ popd
+ - name: Test ring
+ run: |
+ mpirun --map-by ppr:1:core examples/ring_c
diff --git a/.github/workflows/ompi_mpi4py.yaml b/.github/workflows/ompi_mpi4py.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..da38e3ea22350e7400f2867b9da46dd0f134b933
--- /dev/null
+++ b/.github/workflows/ompi_mpi4py.yaml
@@ -0,0 +1,138 @@
+name: mpi4py
+
+on: [ pull_request ]
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ timeout-minutes: 30
+ steps:
+ - name: Configure hostname
+ run: echo 127.0.0.1 `hostname` | sudo tee -a /etc/hosts > /dev/null
+ if: ${{ runner.os == 'Linux' || runner.os == 'macOS' }}
+
+    - name: Install dependencies
+ run: sudo apt-get install -y -q
+ libnuma-dev
+ if: ${{ runner.os == 'Linux' }}
+
+ - name: Checkout Open MPI
+ uses: actions/checkout@v4
+ with:
+ path: mpi-build
+ submodules: recursive
+
+ - name: Bootstrap Open MPI
+ run: ./autogen.pl
+ working-directory: mpi-build
+
+ # Install into a separate directory (/opt/openmpi) so that we can
+ # bundle up that tree into an artifact to share with other jobs in
+ # this github action. Specifically don't use /usr/local, because
+ # there's a bunch of other stuff already installed in /usr/local,
+ # and we don't need to include that in our artifact.
+ - name: Configure Open MPI
+ run: ./configure
+ --disable-dependency-tracking
+ --enable-debug
+ --enable-mem-debug
+ --disable-sphinx
+ --disable-mpi-fortran
+ --disable-oshmem
+ --prefix=/opt/openmpi
+ LDFLAGS=-Wl,-rpath,/opt/openmpi/lib
+ working-directory: mpi-build
+
+ - name: Build MPI
+ run: make -j $(nproc)
+ working-directory: mpi-build
+
+ - name: Install MPI
+ run: sudo make install
+ working-directory: mpi-build
+
+ - name: Add Open MPI to PATH
+ run: echo /opt/openmpi/bin >> $GITHUB_PATH
+
+ - name: Tweak MPI
+ run: |
+ # Tweak MPI
+ mca_params="$HOME/.openmpi/mca-params.conf"
+ mkdir -p "$(dirname "$mca_params")"
+ echo mpi_param_check = true >> "$mca_params"
+ echo mpi_show_handle_leaks = true >> "$mca_params"
+ echo rmaps_base_oversubscribe = true >> "$mca_params"
+ mca_params="$HOME/.prte/mca-params.conf"
+ mkdir -p "$(dirname "$mca_params")"
+ echo rmaps_default_mapping_policy = :oversubscribe >> "$mca_params"
+
+ - name: Show MPI
+ run: ompi_info
+
+ - name: Show MPICC
+ run: mpicc -show
+
+ - name: Use Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: 3
+ architecture: x64
+
+ - name: Install Python packages (build)
+ run: python -m pip install --upgrade
+ setuptools pip wheel
+
+ - name: Install Python packages (test)
+ run: python -m pip install --upgrade
+ numpy cffi pyyaml
+
+ - name: Checkout mpi4py
+ uses: actions/checkout@v4
+ with:
+ repository: "mpi4py/mpi4py"
+
+ - name: Build mpi4py wheel
+ run: python -m pip wheel .
+ env:
+ CFLAGS: "-O0"
+
+ - name: Save the artifacts for other jobs
+ uses: actions/upload-artifact@v4
+ with:
+ path: |
+ /opt/openmpi
+ ~/.openmpi
+ ~/.prte
+ test
+ demo
+ mpi4py-*.whl
+ retention-days: 2
+ name: build-artifacts
+
+ #==============================================
+
+ run_defaults:
+    # This whole set of tests runs with mpi4py's defaults. As of March
+    # 2024, this means disabling the spawn and dynamic tests. We expect
+    # this set of tests to pass.
+ needs: [ build ]
+ uses: ./.github/workflows/ompi_mpi4py_tests.yaml
+ with:
+ # This parameter is required, so send a meaningless
+ # environment variable name that will not affect the tests at
+ # all (i.e., the tests will be run with default values).
+ env_name: MAKE_TODAY_AN_OMPI_DAY
+
+ #==============================================
+
+ run_spawn:
+    # This whole set of tests runs with the spawn tests explicitly
+    # enabled. As of March 2024, we know that Open MPI is failing
+    # these tests.
+ needs: [ build ]
+ # Only run if the label "mpi4py" is set on this PR.
+ if: ${{ contains(github.event.pull_request.labels.*.name, 'mpi4py-all') }}
+ uses: ./.github/workflows/ompi_mpi4py_tests.yaml
+ with:
+ # Enable the spawn tests
+ env_name: MPI4PY_TEST_SPAWN
diff --git a/.github/workflows/ompi_mpi4py_tests.yaml b/.github/workflows/ompi_mpi4py_tests.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6adabe9cd7fe9e94fc0cdcfe6a59e040e18d3374
--- /dev/null
+++ b/.github/workflows/ompi_mpi4py_tests.yaml
@@ -0,0 +1,69 @@
+# This is a reusable workflow that is invoked from mpi4py.yaml.
+#
+# It expects a single parameter: env_name, the name of an environment
+# variable that will be set to "1" in the test jobs.
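+#
+# For example, ompi_mpi4py.yaml invokes this workflow as:
+#
+#   run_spawn:
+#     needs: [ build ]
+#     uses: ./.github/workflows/ompi_mpi4py_tests.yaml
+#     with:
+#       env_name: MPI4PY_TEST_SPAWN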
+
+name: mpi4py-tests
+
+on:
+ workflow_call:
+ inputs:
+ env_name:
+ required: true
+ type: string
+
+jobs:
+ mpi4py-tests:
+ runs-on: ubuntu-latest
+ timeout-minutes: 30
+ env:
+      ${{ inputs.env_name }}: 1
+ steps:
+ - name: Use Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: 3
+ architecture: x64
+ - name: Get artifacts
+ uses: actions/download-artifact@v4
+ with:
+ path: /
+ name: build-artifacts
+ - name: Restore executable permissions
+ run: chmod a+x /opt/openmpi/bin/*
+ - name: Add Open MPI to PATH
+ run: echo /opt/openmpi/bin >> $GITHUB_PATH
+ - name: Install the mpi4py wheel
+ run: python -m pip install mpi4py --no-index --find-links=.
+
+ #----------------------------------------------
+
+ - name: Test mpi4py (singleton)
+ run: python test/main.py -v
+ if: ${{ true }}
+ timeout-minutes: 10
+ - name: Test mpi4py (np=1)
+ run: mpiexec -n 1 python test/main.py -v
+ if: ${{ true }}
+ timeout-minutes: 10
+ - name: Test mpi4py (np=2)
+ run: mpiexec -n 2 python test/main.py -v -f
+ if: ${{ true }}
+ timeout-minutes: 10
+ - name: Test mpi4py (np=3)
+ run: mpiexec -n 3 python test/main.py -v -f
+ if: ${{ true }}
+ timeout-minutes: 10
+ - name: Test mpi4py (np=4)
+ run: mpiexec -n 4 python test/main.py -v -f
+ if: ${{ true }}
+ timeout-minutes: 10
+ - name: Test mpi4py (np=5)
+ run: mpiexec -n 5 python test/main.py -v -f
+ if: ${{ true }}
+ timeout-minutes: 10
+
+ - name: Test mpi4py.run
+ run: python demo/test-run/test_run.py -v
+ if: ${{ true }}
+ timeout-minutes: 10
diff --git a/.github/workflows/ompi_nvidia.yaml b/.github/workflows/ompi_nvidia.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6a3201a36481359fbf894e627a0f98334eef4212
--- /dev/null
+++ b/.github/workflows/ompi_nvidia.yaml
@@ -0,0 +1,42 @@
+name: ompi_NVIDIA CI
+on: [pull_request]
+jobs:
+
+ deployment:
+ if: github.repository == 'open-mpi/ompi'
+ runs-on: [self-hosted, linux, x64, nvidia]
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ submodules: recursive
+ - name: Checkout CI scripts
+ uses: actions/checkout@v4
+ with:
+ repository: Mellanox/jenkins_scripts
+ path: ompi_ci
+ - name: Deployment infrastructure
+ run: /start deploy
+ build:
+ needs: [deployment]
+ runs-on: [self-hosted, linux, x64, nvidia]
+ steps:
+      - name: Building OMPI, UCX, and tests
+ run: /start build
+ test:
+ needs: [deployment, build]
+ runs-on: [self-hosted, linux, x64, nvidia]
+ steps:
+ - name: Running tests
+ run: /start test
+ clean:
+# always() is needed so that "clean" runs even when the workflow was canceled
+# (provided the repository name matches). On its own, the repository-name
+# condition is not evaluated when the workflow was canceled.
+
+ if: always() && (github.repository == 'open-mpi/ompi')
+ needs: [deployment, build, test]
+ runs-on: [self-hosted, linux, x64, nvidia]
+ steps:
+ - name: Cleaning
+ run: /start clean
diff --git a/.github/workflows/pr-checks.yaml b/.github/workflows/pr-checks.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7006979e7329db6528ad7ac5634a964d74b9c445
--- /dev/null
+++ b/.github/workflows/pr-checks.yaml
@@ -0,0 +1,51 @@
+name: Git commit checks
+
+# We're using pull_request_target here instead of just pull_request so that the
+# action runs in the context of the base of the pull request, rather than in the
+# context of the merge commit. For more detail about the differences, see:
+# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target
+on:
+ pull_request_target:
+ # We don't need this to be run on all types of PR behavior
+ # See https://docs.github.com/en/actions/reference/events-that-trigger-workflows#pull_request
+ types:
+ - opened
+ - synchronize
+ - edited
+
+permissions: {} # none
+
+jobs:
+ check:
+ permissions:
+ pull-requests: write
+ name: Check Commits
+ runs-on: ubuntu-latest
+ steps:
+ - name: Pull Request Commit Checker
+ uses: open-mpi/pr-git-commit-checker@v1.0.1
+ with:
+          token: "${{ secrets.GITHUB_TOKEN }}"
+ cherry-pick-required: true
+ label:
+ permissions:
+ pull-requests: write
+ name: Label Pull Request
+ runs-on: ubuntu-latest
+ steps:
+ - name: Pull Request Labeler
+ uses: open-mpi/pr-labeler@v1.0.1
+ with:
+ token: "${{ secrets.GITHUB_TOKEN }}"
+
+ milestone:
+ permissions:
+ issues: write
+ pull-requests: write
+ name: Milestone Pull Request
+ runs-on: ubuntu-latest
+ steps:
+ - name: Pull Request Milestoner
+ uses: open-mpi/pr-milestoner@v1.0.1
+ with:
+ token: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..24b4333af5a4c5e73c10ec7a0b4c201a5cc98e42
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,593 @@
+.libs
+.deps
+.dirstamp
+.DS_Store
+.cdt*
+.project
+.gdb*
+.idea
+
+*.la
+*.lo
+*.o
+*.so
+*.a
+*.dwarf
+*.dSYM
+*.S
+*.loT
+*.orig
+*.rej
+*.bak
+*.class
+*.xcscheme
+*.out
+*.plist
+*.obj
+*.mod
+*.i90
+*.ii
+*.ti
+*.exe
+*.log
+*.trs
+*.sapp
+*~
+*\\#
+
+Makefile
+Makefile.in
+
+# hwloc and pmix have been de-modularized, but still use parts of the
+# MCA system due to the amount of glue code that ended up in the
+# framework base. Until that is cleaned up, do not ignore the
+# hard-coded static-components.h file.
+static-components.h
+!opal/mca/hwloc/base/static-components.h
+!opal/mca/pmix/base/static-components.h
+
+config.cache
+aclocal.m4
+autom4te.cache
+config.log
+config.status
+configure
+libtool
+doxygen
+bin
+lib
+cscope.*
+etags
+GRTAGS
+GSYMS
+GTAGS
+GPATH
+vc70.pdb
+.hgrc
+.hgignore
+.hg
+.hgignore_local
+stamp-h?
+AUTHORS
+
+ar-lib
+ylwrap
+config.lt
+config.guess
+config.sub
+depcomp
+compile
+install-sh
+ltmain.sh
+missing
+mkinstalldirs
+libtool.m4
+lt~obsolete.m4
+ltdl.m4
+argz.m4
+ltargz.m4
+ltsugar.m4
+ltversion.m4
+ltoptions.m4
+
+# Libevent and hwloc are included as tarballs. Ignore any expanded
+# tarballs, since they are not included in git. Do not ignore the
+# tarballs themselves, as those are artifacts we store in git.
+3rd-party/libevent-*
+!3rd-party/libevent-*.tar.*
+3rd-party/hwloc-*
+!3rd-party/hwloc-*.tar.*
+3rd-party/treematch/config.h
+
+3rd-party/romio341/adio/include/romioconf.h
+3rd-party/romio341/adio/include/romioconf.h.in
+3rd-party/romio341/include/mpio.h
+3rd-party/romio341/localdefs
+3rd-party/romio341/mpl/confdb/test-driver
+3rd-party/romio341/mpl/include/config.h
+3rd-party/romio341/mpl/include/config.h.in
+3rd-party/romio341/mpl/include/mplconfig.h
+3rd-party/romio341/mpl/include/mpl_timer.h
+3rd-party/romio341/mpl/localdefs
+3rd-party/romio341/test/runtests
+3rd-party/romio341/test/fcoll_test.f
+3rd-party/romio341/test/fmisc.f
+3rd-party/romio341/test/fperf.f
+3rd-party/romio341/test/large_file.c
+3rd-party/romio341/test/misc.c
+3rd-party/romio341/test/pfcoll_test.f
+3rd-party/romio341/util/romioinstall
+
+config/project_list.m4
+config/autogen_found_items.m4
+config/opal_get_version.sh
+config/test-driver
+config/mca_no_configure_components.m4
+config/mca_m4_config_include.m4
+config/ext_no_configure_components.m4
+config/ext_m4_config_include.m4
+config/auto-extracted-pmix-configure-args.m4
+config/auto-extracted-prrte-configure-args.m4
+config/auto-generated-ompi-exclude.ini
+
+contrib/build-mca-comps-outside-of-tree/btl_tcp2_config.h
+contrib/build-mca-comps-outside-of-tree/btl_tcp2_config.h.in
+contrib/build-mca-comps-outside-of-tree/config
+contrib/build-mca-comps-outside-of-tree/aclocal.m4
+contrib/dist/linux/compile_debian_mlnx_example
+contrib/dist/mofed/compile_debian_mlnx_example
+contrib/dist/mofed/debian/changelog
+contrib/dist/mofed/debian/control
+contrib/dist/mofed/debian/copyright
+contrib/dist/mofed/debian/rules
+contrib/platform/intel/bend/*orcm*
+contrib/scaling/orte_no_op
+contrib/scaling/mpi_no_op
+contrib/scaling/mpi_barrier
+contrib/scaling/mpi_memprobe
+
+examples/hello_c
+examples/hello_cxx
+examples/hello_mpifh
+examples/hello_usempi
+examples/hello_usempif08
+examples/ring_c
+examples/ring_cxx
+examples/ring_mpifh
+examples/ring_usempi
+examples/ring_usempif08
+examples/connectivity_c
+examples/ring_oshmem
+examples/hello_oshmem
+examples/ring_oshmemfh
+examples/hello_oshmemfh
+examples/hello_oshmemcxx
+examples/oshmem_circular_shift
+examples/oshmem_max_reduction
+examples/oshmem_shmalloc
+examples/oshmem_strided_puts
+examples/oshmem_symmetric_data
+examples/spc_example
+
+ompi/debuggers/*.in
+ompi/debuggers/dlopen_test
+ompi/debuggers/predefined_gap_test
+ompi/debuggers/predefined_pad_test
+
+ompi/include/mpi.h
+ompi/include/mpif-config.h
+ompi/include/mpif.h
+ompi/include/mpif-c-constants-decl.h
+ompi/include/mpif-c-constants.h
+ompi/include/mpif-common.h
+ompi/include/mpi-ext.h
+ompi/include/mpif-ext.h
+ompi/include/mpif-f08-types.h
+ompi/include/mpif-handles.h
+ompi/include/mpif-io-constants.h
+ompi/include/mpif-constants.h
+ompi/include/mpif-io-handles.h
+ompi/include/mpif-sizeof.h
+ompi/include/mpi_portable_platform.h
+ompi/include/ompi/version.h
+ompi/include/ompi/frameworks.h
+
+ompi/mca/coll/basic/coll-basic-version.h*
+ompi/mca/coll/demo/config
+ompi/mca/coll/demo/coll_demo_config.h*
+ompi/mca/coll/demo/coll-demo-version.h*
+ompi/mca/coll/ml/coll_ml_lex.c
+ompi/mca/coll/self/coll-self-version.h*
+ompi/mca/coll/sm/coll-sm-version.h*
+
+ompi/mca/osc/monitoring/osc_monitoring_template_gen.h
+
+ompi/mca/pml/v/autogen.vprotocols
+ompi/mca/pml/v/mca_vprotocol_config_output
+
+ompi/mca/sharedfp/addproc/mca_sharedfp_addproc_control
+
+ompi/mca/topo/treematch/config.h
+
+ompi/mpi/c/profile/p*.c
+
+ompi/mpi/fortran/configure-fortran-output.h
+ompi/mpi/fortran/mpiext/mpi-ext-module.F90
+ompi/mpi/fortran/mpiext/mpi-f08-ext-module.F90
+ompi/mpi/fortran/mpiext-use-mpi/mpi-ext-module.F90
+ompi/mpi/fortran/mpiext-use-mpi-f08/mpi-f08-ext-module.F90
+
+ompi/mpi/fortran/mpif-h/sizeof_f.f90
+ompi/mpi/fortran/mpif-h/profile/p*.c
+ompi/mpi/fortran/mpif-h/profile/psizeof_f.f90
+
+ompi/mpi/fortran/use-mpi/mpi-types.F90
+
+ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-constants.h
+ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-interfaces.h
+ompi/mpi/fortran/use-mpi-f08/sizeof_f08.f90
+ompi/mpi/fortran/use-mpi-f08/sizeof_f08.h
+ompi/mpi/fortran/use-mpi-f08/profile/psizeof_f08.f90
+ompi/mpi/fortran/use-mpi-f08/profile/*.F90
+
+ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-interfaces.h
+ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-file-interfaces.h
+ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-sizeof.f90
+ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-sizeof.h
+ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-removed-interfaces.h
+
+ompi/mpi/fortran/use-mpi-tkr/fortran_kinds.sh
+ompi/mpi/fortran/use-mpi-tkr/fortran_sizes.h
+ompi/mpi/fortran/use-mpi-tkr/mpi_kinds.ompi_module
+ompi/mpi/fortran/use-mpi-tkr/mpi-tkr-sizeof.f90
+ompi/mpi/fortran/use-mpi-tkr/mpi-tkr-sizeof.h
+
+ompi/mpi/java/java/mpi
+ompi/mpi/java/java/*.jar
+ompi/mpi/java/java/*.h
+ompi/mpi/java/java/doc
+
+ompi/mpi/tool/profile/*.c
+
+ompi/mpiext/affinity/c/example
+
+ompi/mpiext/ftmpi/c/profile/pcomm_agree.c
+ompi/mpiext/ftmpi/c/profile/pcomm_failure_ack.c
+ompi/mpiext/ftmpi/c/profile/pcomm_failure_get_acked.c
+ompi/mpiext/ftmpi/c/profile/pcomm_iagree.c
+ompi/mpiext/ftmpi/c/profile/pcomm_is_revoked.c
+ompi/mpiext/ftmpi/c/profile/pcomm_revoke.c
+ompi/mpiext/ftmpi/c/profile/pcomm_shrink.c
+ompi/mpiext/ftmpi/c/profile/pcomm_ack_failed.c
+ompi/mpiext/ftmpi/c/profile/pcomm_get_failed.c
+ompi/mpiext/ftmpi/c/profile/pcomm_ishrink.c
+
+ompi/mpiext/example/tests/progress_c
+ompi/mpiext/example/tests/progress_mpifh
+ompi/mpiext/example/tests/progress_usempi
+ompi/mpiext/example/tests/progress_usempif08
+
+ompi/mpiext/cuda/c/mpiext_cuda_c.h
+ompi/mpiext/cuda/c/cuda_c.h
+ompi/mpiext/rocm/c/mpiext_rocm_c.h
+
+ompi/mpiext/pcollreq/c/MPIX_*.3
+ompi/mpiext/pcollreq/c/profile/pallgather_init.c
+ompi/mpiext/pcollreq/c/profile/pallgatherv_init.c
+ompi/mpiext/pcollreq/c/profile/pallreduce_init.c
+ompi/mpiext/pcollreq/c/profile/palltoall_init.c
+ompi/mpiext/pcollreq/c/profile/palltoallv_init.c
+ompi/mpiext/pcollreq/c/profile/palltoallw_init.c
+ompi/mpiext/pcollreq/c/profile/pbarrier_init.c
+ompi/mpiext/pcollreq/c/profile/pbcast_init.c
+ompi/mpiext/pcollreq/c/profile/pexscan_init.c
+ompi/mpiext/pcollreq/c/profile/pgather_init.c
+ompi/mpiext/pcollreq/c/profile/pgatherv_init.c
+ompi/mpiext/pcollreq/c/profile/pmpiext_pcollreq_c.h
+ompi/mpiext/pcollreq/c/profile/pneighbor_allgather_init.c
+ompi/mpiext/pcollreq/c/profile/pneighbor_allgatherv_init.c
+ompi/mpiext/pcollreq/c/profile/pneighbor_alltoall_init.c
+ompi/mpiext/pcollreq/c/profile/pneighbor_alltoallv_init.c
+ompi/mpiext/pcollreq/c/profile/pneighbor_alltoallw_init.c
+ompi/mpiext/pcollreq/c/profile/preduce_init.c
+ompi/mpiext/pcollreq/c/profile/preduce_scatter_block_init.c
+ompi/mpiext/pcollreq/c/profile/preduce_scatter_init.c
+ompi/mpiext/pcollreq/c/profile/pscan_init.c
+ompi/mpiext/pcollreq/c/profile/pscatter_init.c
+ompi/mpiext/pcollreq/c/profile/pscatterv_init.c
+ompi/mpiext/pcollreq/c/profile/ppcollreq_c.h
+
+ompi/mpiext/pcollreq/mpif-h/profile/pallgather_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pallgatherv_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pallreduce_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/palltoall_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/palltoallv_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/palltoallw_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pbarrier_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pbcast_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pexscan_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pgather_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pgatherv_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pneighbor_allgather_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pneighbor_allgatherv_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pneighbor_alltoall_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pneighbor_alltoallv_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pneighbor_alltoallw_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/preduce_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/preduce_scatter_block_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/preduce_scatter_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pscan_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pscatter_init_f.c
+ompi/mpiext/pcollreq/mpif-h/profile/pscatterv_init_f.c
+
+ompi/mpiext/shortfloat/c/mpiext_shortfloat_c.h
+ompi/mpiext/shortfloat/mpif-h/mpiext_shortfloat_mpifh.h
+ompi/mpiext/shortfloat/use-mpi-f08/mpiext_shortfloat_usempif08.h
+
+ompi/tools/mpisync/mpisync
+ompi/tools/mpisync/mpirun_prof
+ompi/tools/mpisync/ompi_timing_post
+ompi/tools/mpirun/mpirun
+
+ompi/tools/ompi_info/ompi_info
+
+ompi/tools/wrappers/mpic++-wrapper-data.txt
+ompi/tools/wrappers/mpicc-wrapper-data.txt
+ompi/tools/wrappers/mpifort-wrapper-data.txt
+ompi/tools/wrappers/ompi_wrapper_script
+ompi/tools/wrappers/ompi.pc
+ompi/tools/wrappers/ompi-c.pc
+ompi/tools/wrappers/ompi-cxx.pc
+ompi/tools/wrappers/ompi-fort.pc
+ompi/tools/wrappers/mpijavac.pl
+ompi/tools/wrappers/mpicxx-wrapper-data.txt
+ompi/tools/wrappers/mpif77-wrapper-data.txt
+ompi/tools/wrappers/mpif90-wrapper-data.txt
+
+opal/asm/atomic-asm.S
+opal/asm/atomic-test
+opal/asm/generated/atomic-*.s
+
+opal/include/opal_config.h
+opal/include/opal_config.h.in
+opal/include/opal/install_dirs.h
+opal/include/opal/version.h
+opal/include/opal/frameworks.h
+opal/include/opal/sys/powerpc/atomic-32.s
+opal/include/opal/sys/powerpc/atomic-64.s
+opal/include/opal/sys/powerpc/atomic-32-64.s
+
+opal/mca/base/mca_base_parse_paramfile_lex.c
+
+opal/mca/btl/usnic/usnic_btl_run_tests
+
+opal/mca/event/libevent*/libevent/config.h.in
+opal/mca/event/libevent*/libevent/config.h
+opal/mca/event/libevent*/libevent/libevent.pc
+opal/mca/event/libevent*/libevent/libevent_openssl.pc
+opal/mca/event/libevent*/libevent/libevent_pthreads.pc
+opal/mca/event/libevent*/libevent/include/event2/event-config.h
+
+opal/mca/installdirs/config/install_dirs.h
+
+opal/tools/wrappers/opalcc-wrapper-data.txt
+opal/tools/wrappers/opalc++-wrapper-data.txt
+opal/tools/wrappers/opalCC-wrapper-data.txt
+opal/tools/wrappers/opal_wrapper
+opal/tools/wrappers/opal.pc
+
+opal/util/show_help_lex.c
+opal/util/keyval/keyval_lex.c
+
+test/simple/abort
+test/simple/accept
+test/simple/attach
+test/simple/bad_exit
+test/simple/bcast_loop
+test/simple/binding
+test/simple/concurrent_spawn
+test/simple/connect
+test/simple/crisscross
+test/simple/delayed_abort
+test/simple/hello_barrier
+test/simple/hello_nodename
+test/simple/hello_output
+test/simple/hello_show_help
+test/simple/hello
+test/simple/hello++
+test/simple/intercomm1
+test/simple/interlib
+test/simple/loop_child
+test/simple/loop_spawn
+test/simple/mpi_barrier
+test/simple/mpi_no_op
+test/simple/mpi_spin
+test/simple/multi_abort
+test/simple/parallel_r8
+test/simple/parallel_r64
+test/simple/parallel_w8
+test/simple/parallel_w64
+test/simple/pinterlib
+test/simple/pmix
+test/simple/pubsub
+test/simple/read_write
+test/simple/reduce-hang
+test/simple/ring
+test/simple/segv
+test/simple/simple_spawn
+test/simple/slave
+test/simple/spawn_multiple
+test/simple/xlib
+test/simple/ziaprobe
+test/simple/ziatest
+test/simple/*.dwarf
+test/simple/junk*
+test/simple/sio
+test/simple/sendrecv_blaster
+test/simple/early_abort
+test/simple/spawn_problem/ch_rec
+test/simple/spawn_problem/output
+test/simple/spawn_problem/start
+test/simple/debugger
+test/simple/server_port_name.txt
+test/simple/singleton_client_server
+test/simple/intercomm_create
+test/simple/spawn_tree
+test/simple/init-exit77
+test/simple/mpi_info
+test/simple/info_spawn
+test/simple/client
+test/simple/server
+test/simple/paccept
+test/simple/pconnect
+test/simple/thread_init
+test/simple/memcached-dummy
+test/simple/coll_test
+test/simple/badcoll
+test/simple/iof
+test/simple/no-disconnect
+test/simple/nonzero
+test/simple/add_host
+
+oshmem/include/shmem.h
+oshmem/include/shmem_portable_platform.h
+oshmem/include/oshmem/frameworks.h
+oshmem/include/oshmem/version.h
+
+oshmem/mca/sshmem/base/static-components.h
+
+oshmem/shmem/c/profile/p*.c
+oshmem/shmem/c/profile/*.c
+
+oshmem/shmem/fortran/libshmem_fortran.la
+oshmem/shmem/fortran/profile/pshmem_*_f.c
+oshmem/shmem/fortran/profile/pshpdeallc_f.c
+oshmem/shmem/fortran/profile/pshpclmove_f.c
+oshmem/shmem/fortran/profile/pmy_pe_f.c
+oshmem/shmem/fortran/profile/pshpalloc_f.c
+oshmem/shmem/fortran/profile/pnum_pes_f.c
+oshmem/shmem/fortran/profile/pstart_pes_f.c
+
+oshmem/tools/oshmem_info/oshmem_info
+
+oshmem/tools/wrappers/oshmem-c.pc
+oshmem/tools/wrappers/oshmem-cxx.pc
+oshmem/tools/wrappers/oshmem-fort.pc
+oshmem/tools/wrappers/oshmem.pc
+oshmem/tools/wrappers/shmemcc-wrapper-data.txt
+oshmem/tools/wrappers/shmemfort-wrapper-data.txt
+oshmem/tools/wrappers/shmemc++-wrapper-data.txt
+
+test/asm/atomic_math_noinline
+test/asm/atomic_barrier
+test/asm/atomic_cmpset_noinline
+test/asm/atomic_math
+test/asm/atomic_cmpset
+test/asm/atomic_spinlock_noinline.c
+test/asm/atomic_barrier_noinline.c
+test/asm/atomic_math_noinline.c
+test/asm/atomic_cmpset_noinline.c
+test/asm/atomic_spinlock_noinline
+test/asm/atomic_barrier_noinline
+test/asm/atomic_spinlock
+
+test/class/*.txt
+test/class/ompi_bitmap_test_out.txt
+test/class/ompi_circular_buffer_fifo
+test/class/ompi_fifo
+test/class/ompi_rb_tree
+test/class/ompi_bitmap
+test/class/opal_bitmap
+test/class/opal_fifo
+test/class/opal_cstring
+test/class/opal_hash_table
+test/class/opal_lifo
+test/class/opal_list
+test/class/opal_pointer_array
+test/class/opal_proc_table
+test/class/opal_tree
+test/class/opal_value_array
+
+test/datatype/ddt_test
+test/datatype/ddt_pack
+test/datatype/external32
+test/datatype/to_self
+test/datatype/checksum
+test/datatype/position
+test/datatype/ddt_raw
+test/datatype/opal_datatype_test
+test/datatype/position_noncontig
+test/datatype/unpack_ooo
+test/datatype/unpack_hetero
+
+test/event/signal-test
+test/event/event-test
+test/event/time-test
+
+test/monitoring/monitoring_test
+test/monitoring/check_monitoring
+test/monitoring/example_reduce_count
+test/monitoring/test_overhead
+test/monitoring/test_pvar_access
+
+test/mpi/environment/chello
+
+test/runtime/parse_context
+test/runtime/sigchld
+test/runtime/start_shut
+test/runtime/opal_init_finalize
+test/runtime/orte_init_finalize
+
+test/spc/spc_test
+
+test/threads/opal_condition
+test/threads/opal_thread
+
+test/util/aaa
+test/util/test_session_dir_out
+test/util/opal_os_path
+test/util/opal_argv
+test/util/opal_os_create_dirpath
+test/util/opal_if
+test/util/opal_error
+test/util/opal_timer
+test/util/orte_sys_info
+test/util/orte_session_dir
+test/util/orte_universe_setup_file_io
+test/util/opal_basename
+test/util/ompi_numtostr
+test/util/ompi_pack
+test/util/test-file
+test/util/opal_sos
+test/util/opal_path_nfs
+test/util/opal_path_nfs.out
+test/util/opal_bit_ops
+test/util/bipartite_graph
+test/util/opal_sha256
+
+opal/test/reachable/reachable_netlink
+opal/test/reachable/reachable_weighted
+opal/mca/threads/argobots/threads_argobots.h
+opal/mca/threads/qthreads/threads_qthreads.h
+
+docs/_build
+docs/_static
+docs/_static/css/custom.css
+docs/_templates
+
+# Common Python virtual environment directory names
+venv
+py??
+
+# Copies of PRRTE RST files (i.e., not source controlled in this tree)
+docs/prrte-rst-content
+docs/schizo-ompi-rst-content
+
+# Copies of the built HTML docs and man pages (for distribution
+# tarballs)
+docs/html
+docs/man
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..22aa7f5a84a5b4f7210fe98a04a2de91822848a3
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,11 @@
+[submodule "prrte"]
+ path = 3rd-party/prrte
+ url = ../../openpmix/prrte
+ branch = v3.0
+[submodule "openpmix"]
+ path = 3rd-party/openpmix
+ url = ../../openpmix/openpmix.git
+ branch = v4.2
+[submodule "oac"]
+ path = config/oac
+ url = ../../open-mpi/oac
diff --git a/.mailmap b/.mailmap
new file mode 100644
index 0000000000000000000000000000000000000000..42895b1ddd6e94de6bdd059f0e57286aaaae9612
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,139 @@
+# This file exists to help consolidate names and email addresses
+# (e.g., when people accidentally commit with an incorrect or local
+# email address). Two common use cases:
+#
+# 1. Consolidate multiple email addresses from a single person.
+# Example: one commit from John Smith is from
+# <john.smith@company-a.example> and another is from
+# <jsmith@company-b.example>, and a third is from
+# <personal.address@email.example>. But they're all from
+# the same John Smith person.
+#
+# 2. Consolidate misspellings / alternate names from a single person.
+# Example: one commit is from "John Smith" and another is from
+# "John Smith, CONTRACTOR", and a third is from "RocketMan 9982". But
+# these are all really the same person, who can be listed once in
+# AUTHORS as "John Smith".
+#
+# The format of this file is documented in git-shortlog(1). Specifically,
+# a line like this:
+#
+# Proper Name <proper@email.xx> <commit@email.xx>
+#
+# means that when git sees "commit@email.xx" it will display
+# "Proper Name " instead in certain circumstances. Those
+# circumstances include:
+#
+# - git shortlog
+# - git blame
+# - git log --format=tformat:"%aN <%aE>" (and similar)
+#
+
+Jeff Squyres
+Jeff Squyres --quiet <--quiet>
+Jeff Squyres
+Jeff Squyres
+
+George Bosilca
+
+Howard Pritchard
+Howard Pritchard
+
+Andrew Friedley
+
+Devendar Bureddy
+
+Edgar Gabriel
+Edgar Gabriel
+Edgar Gabriel
+Edgar Gabriel
+
+Gilles Gouaillardet
+
+Matias A Cabral
+Matias A Cabral
+
+Pavel Shamis
+Pavel Shamis
+Pavel Shamis
+
+Todd Kordenbrock
+
+Yohann Burette
+Yohann Burette
+
+MPI Team (bot)
+MPI Team (bot)
+MPI Team (bot)
+
+Yossi Itigin
+
+Josh Hursey
+Josh Hursey
+
+Adrian Reber
+
+Elena Elkina
+Elena Elkina
+
+Igor Ivanov
+Igor Ivanov
+
+Mangala Jyothi Bhaskar
+Mangala Jyothi Bhaskar
+
+Ralph Castain
+Ralph Castain
+
+Rolf vandeVaart
+
+Karol Mroz
+
+Nadezhda Kogteva
+
+Thananon Patinyasakdikul
+
+Nysal Jan K A
+Nysal Jan K A
+
+Zhi Ming Wang
+
+Annapurna Dasari
+
+L. R. Rajeshnarayanan
+
+Aurelien Bouteiller
+Aurelien Bouteiller
+
+Alex Mikheev
+
+Thomas Naughton
+
+Geoffrey Paulsen
+
+Anandhi S Jayakumar
+
+Mohan Gandhi
+
+Harumi Kuno
+
+Nick Papior
+Nick Papior
+Nick Papior
+
+Wei-keng Liao
+
+Matthew G. F. Dosanjh
+
+Samuel K. Gutierrez
+Samuel K. Gutierrez
+
+Tomislav Janjusic Tomislavj Janjusic
+
+William P. LePera
+
+George Katevenis
+
+Brian Barrett
+
+Andrii Bilokur B-a-S
diff --git a/.readthedocs-pre-create-environment.sh b/.readthedocs-pre-create-environment.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ae9ef68a587725a73a7d0eb629a0cbf71f0e2df7
--- /dev/null
+++ b/.readthedocs-pre-create-environment.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+# The ReadTheDocs build process does not run autogen/configure/make.
+# Hence, we have to copy the PRRTE RST files (from the 3rd-party/prrte
+# tree) to our docs/ tree manually.
+
+# Ensure that we're in the RTD CI environment
+
+if [[ "${READTHEDOCS:-no}" == "no" ]]; then
+ echo "This script is only intended to be run in the ReadTheDocs CI environment"
+ exit 1
+fi
+
+SCHIZO_SRC_DIR=3rd-party/prrte/src/mca/schizo/ompi
+SCHIZO_TARGET_DIR=docs/schizo-ompi-rst-content
+
+PRRTE_RST_SRC_DIR=3rd-party/prrte/src/docs/prrte-rst-content
+PRRTE_RST_TARGET_DIR=docs/prrte-rst-content
+
+# Copy the OMPI schizo file from PRRTE
+#
+# See lengthy comment in docs/Makefile.am about copying in RST files
+# from PRRTE for a longer explanation of what is happening here.
+
+cp -rp $SCHIZO_SRC_DIR $SCHIZO_TARGET_DIR
+cp -rp $PRRTE_RST_SRC_DIR $PRRTE_RST_TARGET_DIR
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2ba1fc07842e408890d5d81fe7d6156c56467d79
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,31 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Currently, RTD needs to select an OS with OpenSSL>=1.1.1 because of
+# urllib3's dependence on that system library (alternatively, pin urllib3<2).
+# See https://github.com/urllib3/urllib3/issues/2168
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.10"
+ jobs:
+ # RTD doesn't run configure or make. So we have to manually copy
+ # in the PRRTE RST files to docs/.
+ pre_create_environment:
+ - ./.readthedocs-pre-create-environment.sh
+
+python:
+ install:
+ - requirements: docs/requirements.txt
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+ configuration: docs/conf.py
+ fail_on_warning: true
+
+submodules:
+ include: all
diff --git a/3rd-party/Makefile.am b/3rd-party/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..0fdb5d43d3dfec6c5c879104aa659a48780742ad
--- /dev/null
+++ b/3rd-party/Makefile.am
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
+# All Rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This Makefile exists (as opposed to being part of the top level
+# Makefile) solely to have a blank check-recursive rule. Otherwise, the
+# stability of Open MPI's ability to run "make check" or "make
+# distcheck" is dependent on the ability of our 3rd-party packages to
+# do the same. Libevent's tests are not stable enough for that kind
+# of choice.
+
+SUBDIRS = $(OPAL_3RDPARTY_SUBDIRS)
+DIST_SUBDIRS = $(OPAL_3RDPARTY_DIST_SUBDIRS)
+EXTRA_DIST = $(OPAL_3RDPARTY_EXTRA_DIST) autogen.subdirs
+
+distclean-local:
+ rm -rf $(OPAL_3RDPARTY_DISTCLEAN_DIRS)
+
+check:
+ @echo "auto-recursing into 3rd-party packages for check disabled"
+
+check-recursive:
+ @echo "auto-recursing into 3rd-party packages for check disabled"
diff --git a/3rd-party/autogen.subdirs b/3rd-party/autogen.subdirs
new file mode 100644
index 0000000000000000000000000000000000000000..ac181a64c7de148e8d3a349dde86cee4fb2c377a
--- /dev/null
+++ b/3rd-party/autogen.subdirs
@@ -0,0 +1 @@
+romio341
diff --git a/3rd-party/exclude-config.ini b/3rd-party/exclude-config.ini
new file mode 100644
index 0000000000000000000000000000000000000000..6eee1e41698facebf30cf504c279d22a529957d9
--- /dev/null
+++ b/3rd-party/exclude-config.ini
@@ -0,0 +1,68 @@
+#
+# Copyright (c) 2021 IBM Corporation. All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# List of m4 files and configure options to exclude when extracting configure
+# options from 3rd party packages.
+#
+# Exclude a whole file (prefix with 'FILE:'):
+# FILE: ltoptions.m4
+# FILE: 3rd-party/prrte/config/prte_check_slurm.m4
+# Accepts a relative path to the top level directory, or a single filename.
+# Relative path is useful for excluding an .m4 file from a specific package.
+#
+# Exclude an option from all 3rd party packages (prefix with 'OPTION:'):
+# OPTION: pkgconfigdir
+#
+# Exclude an option from a specific 3rd party package
+# (prefix with 'OPTION(pkg):' where 'pkg' is the package name as it is
+# passed to extract-3rd-party-configure.pl in autogen.pl (case insensitive).
+# OPTION(PMIx): max-
+# OPTION(PRRTE): hwloc
+#
+
+#------------------------------------
+# Generally excluded files
+# Example: FILE: ltoptions.m4
+FILE: ltoptions.m4
+FILE: ltsugar.m4
+FILE: ltversion.m4
+FILE: lt~obsolete.m4
+FILE: libtool.m4
+FILE: autogen_found_items.m4
+FILE: config/auto-extracted-pmix-configure-args.m4
+FILE: config/auto-extracted-prrte-configure-args.m4
+
+#------------------------------------
+# Generally excluded options
+# Example: OPTION: pkgconfigdir
+OPTION: pkgconfigdir
+OPTION: noarch-pkgconfigdir
+OPTION: libevent
+OPTION: libevent-header
+OPTION: libevent-libdir
+
+
+#------------------------------------
+# OpenPMIx
+OPTION(PMIx): max-
+
+#------------------------------------
+# PRRTE
+# Example: FILE: 3rd-party/prrte/config/prte_check_slurm.m4
+
+# Excluded options
+# Example: OPTION(PRRTE): zlib-libdir
+OPTION(PRRTE): hwloc
+OPTION(PRRTE): hwloc-libdir
+OPTION(PRRTE): hwloc-header
+OPTION(PRRTE): max-
+OPTION(PRRTE): pmix
+OPTION(PRRTE): pmix-libdir
+OPTION(PRRTE): pmix-header
+OPTION(PRRTE): pmix-devel-support
diff --git a/3rd-party/hwloc-2.7.1.tar.gz b/3rd-party/hwloc-2.7.1.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..d627caba9e4893d5621ef519f8ce709b734fa722
Binary files /dev/null and b/3rd-party/hwloc-2.7.1.tar.gz differ
diff --git a/3rd-party/libevent-2.1.12-stable-ompi.tar.gz b/3rd-party/libevent-2.1.12-stable-ompi.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..9a2e9694a3aee63373a2c0fc948cc44ad072f5c4
Binary files /dev/null and b/3rd-party/libevent-2.1.12-stable-ompi.tar.gz differ
diff --git a/3rd-party/openpmix b/3rd-party/openpmix
new file mode 160000
index 0000000000000000000000000000000000000000..8ab6d680b90afd6e61766220a8724065a1b554a7
--- /dev/null
+++ b/3rd-party/openpmix
@@ -0,0 +1 @@
+Subproject commit 8ab6d680b90afd6e61766220a8724065a1b554a7
diff --git a/3rd-party/prrte b/3rd-party/prrte
new file mode 160000
index 0000000000000000000000000000000000000000..b68a0acb32cfc0d3c19249e5514820555bcf438b
--- /dev/null
+++ b/3rd-party/prrte
@@ -0,0 +1 @@
+Subproject commit b68a0acb32cfc0d3c19249e5514820555bcf438b
diff --git a/3rd-party/romio341/.codingcheck b/3rd-party/romio341/.codingcheck
new file mode 100644
index 0000000000000000000000000000000000000000..bc77906acb328df0e371d474a21f4f6f2661419a
--- /dev/null
+++ b/3rd-party/romio341/.codingcheck
@@ -0,0 +1,54 @@
+# Here are names that at least at one point were used within ROMIO.
+# We should look at these and decide which we wish to allow and which
+# should be replaced with something more ROMIO-specific.
+%romioDefines = ( 'ROMIO_[A-Za-z0-9_]+' => romio,
+ 'PROFILE' => romio,
+ 'PRINT_ERR_MSG' => romio,
+ 'HPUX' => romio,
+ 'SPPUX'=> romio,
+ 'SX4'=> romio,
+ 'AIO_SUN'=> romio,
+ 'AIO_HANDLE_IN_AIOCB'=> romio,
+ 'NO_FD_IN_AIOCB'=> romio,
+ 'NO_AIO'=> romio,
+ 'AIO_PRIORITY_DEFAULT'=> romio,
+ 'AIO_SIGNOTIFY_NONE'=> romio,
+ 'MPISGI'=> romio,
+ 'CRAY'=> romio,
+ 'PARAGON'=> romio,
+ 'FREEBSD'=> romio,
+ 'LINUX'=> romio,
+ 'tflops'=> romio,
+ 'NFS'=> romio,
+ 'XFS'=> romio,
+ 'CB_CONFIG_LIST_DEBUG'=> romio,
+ 'SFS'=> romio,
+ 'HFS'=> romio,
+ 'UFS'=> romio,
+ 'PVFS_.+' => romio,
+ 'MPI_hpux'=> romio,
+ 'FORTRANCAPS'=> romio,
+ 'NEEDS_ADIOCB_T'=> romio,
+ 'AGG_DEBUG'=> romio,
+ 'SOLARIS'=> romio,
+ 'IRIX'=> romio,
+ 'AIX'=> romio,
+ 'DEC'=> romio,
+ 'NEEDS_MPI_TEST'=> romio,
+ 'PFS'=> romio,
+ 'PIOFS'=> romio,
+ 'MPICH'=> romio,
+ 'MPI_OFFSET_IS_INT'=> romio,
+ 'MPI_COMBINER_NAMED'=> romio,
+ '_UNICOS'=> romio,
+ 'MPIHP'=> romio,
+ );
+
+# Only invoke this function if the function is defined (in case the
+# user removed the cpp defines check with -rmchecks=cppdefines)
+if (defined(&PushDefinesNames)) {
+ &PushDefinesNames( "romioDefines", "tree", "add" );
+}
+
+1;
diff --git a/3rd-party/romio341/.config_params b/3rd-party/romio341/.config_params
new file mode 100644
index 0000000000000000000000000000000000000000..fcc2f9146d1fb8624121f8ec19a3f85a803f0cba
--- /dev/null
+++ b/3rd-party/romio341/.config_params
@@ -0,0 +1,39 @@
+__sun4_
+__rs6000_
+__paragon_
+__solaris_
+__solaris86_
+__tflop_
+__tflops_
+__hpux_
+__sppux_
+__SX4_
+__sgi_
+__sgi5_
+__IRIX_
+__IRIX32_
+__IRIXN32_
+__IRIX64_
+__alpha_
+__ALPHA_
+__freebsd_
+__netbsd_
+__LINUX_
+__LINUX_ALPHA_
+__CRAY_
+__Darwin_
+__nfs_
+__ufs_
+__pfs_
+__piofs_
+__pvfs_
+__testfs_
+__xfs_
+__hfs_
+__sfs_
+__mpich_mpi
+__sgi_mpi
+__hp_mpi
+__cray_mpi
+__lam_mpi
+__open_mpi
diff --git a/3rd-party/romio341/Makefile.am b/3rd-party/romio341/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..70e54c899b3e571e4fc50725e2f302a35071dd9a
--- /dev/null
+++ b/3rd-party/romio341/Makefile.am
@@ -0,0 +1,213 @@
+## Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+## University Research and Technology
+## Corporation. All rights reserved.
+## Copyright (c) 2004-2005 The University of Tennessee and The University
+## of Tennessee Research Foundation. All rights
+## reserved.
+## Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+## University of Stuttgart. All rights reserved.
+## Copyright (c) 2004-2005 The Regents of the University of California.
+## All rights reserved.
+## Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
+## Copyright (c) 2020 Research Organization for Information Science
+## and Technology (RIST). All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+# OMPI: include a top level makefile with some options
+include $(top_srcdir)/Makefile.options
+
+## TODO: need to write an automakefile that handles two primary cases:
+## 1) that ROMIO is being embedded within the MPI library, as in MPICH or Open
+## MPI
+## 2) that ROMIO is being built standalone, old-school style. This case is
+## basically unused in modern practice.
+
+# help autoreconf and friends realize where the macros live
+ACLOCAL_AMFLAGS = -I confdb
+
+# empty variable initializations so that later code can append (+=)
+include_HEADERS =
+nodist_include_HEADERS =
+noinst_HEADERS =
+AM_CFLAGS = @VISIBILITY_CFLAGS@
+EXTRA_DIST =
+SUFFIXES =
+doc1_src_txt =
+
+external_subdirs = @mpl_srcdir@
+external_dist_subdirs = @mpl_dist_srcdir@
+external_includes = @mpl_includedir@
+external_ldflags = @mpl_libdir@
+external_libs = @mpl_lib@
+
+# ------------------------------------------------------------------------
+# variables to be populated by the included Makefile.mk fragments:
+
+# These are files that contain MPI routines (e.g., MPI_File_open).
+# In MPICH these will have an MPI_ and a PMPI_ version. Other implementations
+# (like OMPI) only want these to be MPI_ routines, possibly with some
+# name-shifting prefix.
+romio_mpi_sources =
+
+# regular old source files that implement ROMIO, such as ADIO code
+romio_other_sources =
+
+# code that may need to be "up" called from the MPI library and/or is
+# MPI-implementation-specific in some way
+glue_sources =
+
+# ------------------------------------------------------------------------
+# when building under MPICH we must be able to find mpi.h
+AM_CPPFLAGS += $(MPI_H_INCLUDE)
+
+# ------------------------------------------------------------------------
+# handle the "include" directory here
+AM_CPPFLAGS += -I$(top_builddir)/include -I$(top_srcdir)/include $(external_includes)
+# nodist_ b/c these are created by config.status and should not be distributed
+# Open MPI: do not install mpio.h
+noinst_HEADERS += include/mpio.h
+noinst_HEADERS += include/io_romio_conv.h
+
+# ------------------------------------------------------------------------
+
+SUBDIRS = $(external_subdirs)
+DIST_SUBDIRS = test test-internal $(external_dist_subdirs)
+
+# for the sake of parallel make and avoiding an excessive number of convenience
+# libs, we use a subdir automake fragment strategy
+include mpi-io/Makefile.mk
+include adio/Makefile.mk
+
+EXTRA_DIST += autogen.sh
+
+if BUILD_ROMIO_EMBEDDED
+# Build a libtool convenience library that the enclosing MPI implementation can
+# use by adding it to the right _LIBADD variable.
+noinst_LTLIBRARIES = libromio_dist.la
+libromio_dist_la_SOURCES = $(romio_mpi_sources) $(romio_other_sources) $(glue_sources)
+
+## NOTE: ROMIO's old build system builds a bunch of _foo.o objects that contain
+## PMPI_ implementations as well as calls to only other PMPI routines. In
+## MPICH, these are the objects that need to go into libmpi, while the foo.o
+## objects should go into libpmpi. Furthermore, the -D option for ROMIO's
+## source files is different and inverted (in the boolean sense) compared with
+## MPICH's definition. And ROMIO was dumping all of the symbols into the main
+## libmpi library, regardless of the separate profiling library's existence.
+##
+## Annoying, right?
+if BUILD_PROFILING_LIB
+# The current best strategy for now is to build the PMPI symbols as a separate
+# convenience lib to permit adding the special "-D..." argument for all objects.
+# MPICH will then link both convenience libraries into libmpi, since it
+# won't work very well the other way around.
+noinst_LTLIBRARIES += libpromio.la
+libpromio_la_SOURCES = $(romio_mpi_sources)
+libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
+libpromio_la_LDFLAGS = $(external_ldflags)
+libpromio_la_LIBADD = $(external_libs)
+else !BUILD_PROFILING_LIB
+libromio_dist_la_LDFLAGS = $(external_ldflags)
+libromio_dist_la_LIBADD = $(external_libs)
+endif !BUILD_PROFILING_LIB
+
+else !BUILD_ROMIO_EMBEDDED
+lib_LTLIBRARIES = libromio.la
+libromio_la_SOURCES = $(romio_mpi_sources) $(romio_other_sources) $(glue_sources)
+if BUILD_PROFILING_LIB
+libpromio_la_SOURCES = $(romio_mpi_sources)
+libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
+endif BUILD_PROFILING_LIB
+
+endif
+
+# --------------------------------------------------------------------------
+.PHONY: coverage
+gcov_sources = $(libmpl_la_SOURCES)
+# assumes that these sources were compiled appropriately ("-fprofile-arcs"
+# and "-ftest-coverage")
+coverage:
+ @for file in $(gcov_sources) ; do \
+ dir=`dirname $$file` ; \
+ bname=`basename $$file` ; \
+ aux=`echo $$bname | sed -e 's,\.*$$,,'` ; \
+ echo "( $(GCOV) -b -f -o $$file $$file && mv $${bname}.gcov $$dir )" ; \
+ ( $(GCOV) -b -f -o $$file $$file && mv $${bname}.gcov $$dir ) ; \
+ rm -f *.gcov ; \
+ done
+ for subdir in $(SUBDIRS) - ; do \
+ if test $$subdir = "-" ; then break ; fi ; \
+ ( cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) coverage ) ; \
+ done
+# --------------------------------------------------------------------------
+.PHONY: mandoc mandoc-local htmldoc htmldoc-local
+SUFFIXES += .man-phony .html-phony .man1-phony .html1-phony .txt
+
+# "make V=1" support for our documentation recipes
+doctextman_verbose = $(doctextman_verbose_$(V))
+doctextman_verbose_ = $(doctextman_verbose_$(AM_DEFAULT_VERBOSITY))
+doctextman_verbose_0 = @echo " DOCTEXTMAN " $@;
+doctexthtml_verbose = $(doctexthtml_verbose_$(V))
+doctexthtml_verbose_ = $(doctexthtml_verbose_$(AM_DEFAULT_VERBOSITY))
+doctexthtml_verbose_0 = @echo " DOCTEXTHTML " $@;
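+# e.g., "make V=1 mandoc" echoes the full doctext command lines instead
+# of the abbreviated " DOCTEXTMAN " summary lines.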
+
+# Build dir paths where the man pages will be created. Will usually be
+# overridden by MPICH make.
+mandoc_path1=$(abs_top_builddir)/man/man1
+mandoc_path3=$(abs_top_builddir)/man/man3
+htmldoc_path1=$(abs_top_builddir)/www/www1
+htmldoc_path3=$(abs_top_builddir)/www/www3
+doctext_docnotes=
+# Provide an easily replaced url root for the generated index file.
+# You can override this with URL desired in the index file generated by doctext.
+# You can ignore this if you don't use mapnames or tohtml to add links
+# to the MPI manual pages to documents.
+htmldoc_root3="--your-url-here--"
+
+.c.man-phony:
+ $(doctextman_verbose)$(DOCTEXT) -man -mpath $(mandoc_path3) -ext 3 \
+ -heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
+.c.html-phony:
+ $(doctexthtml_verbose)$(DOCTEXT) -html -mpath $(htmldoc_path3) \
+ -heading MPI -quotefmt -nolocation \
+ -index $(htmldoc_path3)/mpi.cit -indexdir $(htmldoc_root3) \
+ $(doctext_docnotes) $<
+
+.txt.man1-phony:
+ $(doctextman_verbose)$(DOCTEXT) -man -mpath $(mandoc_path1) -ext 1 \
+ -heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
+.txt.html1-phony:
+ $(doctexthtml_verbose)$(DOCTEXT) -html -mpath $(htmldoc_path1) \
+ -heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
+
+# use mandoc-local target to force directory creation before running DOCTEXT
+mandoc:
+ test -d $(mandoc_path1) || $(MKDIR_P) $(mandoc_path1)
+ test -d $(mandoc_path3) || $(MKDIR_P) $(mandoc_path3)
+ $(MAKE) $(AM_MAKEFLAGS) mandoc-local
+mandoc-local: $(romio_mpi_sources:.c=.man-phony) $(doc1_src_txt:.txt=.man1-phony)
+
+# use htmldoc-local target to force directory creation before running DOCTEXT
+htmldoc:
+ test -d $(top_builddir)/www/www1 || $(MKDIR_P) $(top_builddir)/www/www1
+ test -d $(top_builddir)/www/www3 || $(MKDIR_P) $(top_builddir)/www/www3
+ $(MAKE) $(AM_MAKEFLAGS) htmldoc-local
+htmldoc-local: $(romio_mpi_sources:.c=.html-phony) $(doc1_src_txt:.txt=.html1-phony)
+
+# --------------------------------------------------------------------------
+
+# sometimes helpful when debugging macros to see the preprocessed output.
+# Also using '-CC' because comments provide useful landmarks
+
+SUFFIXES += .i
+
+.c.i:
+ $(COMPILE) -CC -E -o $@ $<
+
diff --git a/3rd-party/romio341/Makefile.options b/3rd-party/romio341/Makefile.options
new file mode 100644
index 0000000000000000000000000000000000000000..0b72829e152a9a779212becd31d80bb8e5a58456
--- /dev/null
+++ b/3rd-party/romio341/Makefile.options
@@ -0,0 +1,36 @@
+# -*- makefile -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+# University Research and Technology
+# Corporation. All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+# of Tennessee Research Foundation. All rights
+# reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+# University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+# All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AUTOMAKE_OPTIONS = foreign dist-bzip2
+
+# $(OMPI_TOP_SRCDIR) - mca_base_param.h
+# $(OMPI_TOP_SRCDIR)/opal/include - opal_config_bottom.h
+# $(OMPI_TOP_BUILDDIR)/opal/include - opal_config.h
+# $(OMPI_TOP_BUILDDIR)/ompi/include - mpi.h
+# $(top_srcdir)/include - vpath support
+# $(top_srcdir)/adio/include - vpath support
+
+AM_CPPFLAGS = \
+ -DOMPI_BUILDING=1 \
+ -I$(OMPI_TOP_SRCDIR) \
+ -I$(OMPI_TOP_SRCDIR)/opal/include \
+ -I$(OMPI_TOP_BUILDDIR)/opal/include \
+ -I$(OMPI_TOP_BUILDDIR)/ompi/include \
+ -I$(top_srcdir)/include \
+ -I$(top_srcdir)/adio/include
diff --git a/3rd-party/romio341/README b/3rd-party/romio341/README
new file mode 100644
index 0000000000000000000000000000000000000000..a6fb25a09a51a151685fe5df5e37f96faab05356
--- /dev/null
+++ b/3rd-party/romio341/README
@@ -0,0 +1,660 @@
+ ROMIO: A High-Performance, Portable MPI-IO Implementation
+
+ Version 2008-03-09
+
+Major Changes in this version:
+------------------------------
+* Fixed performance problems with the darray and subarray datatypes
+ when using MPICH.
+
+* Better support for building against existing MPICH and MPICH2 versions.
+
+ When building against an existing MPICH installation, use the
+ "--with-mpi=mpich" option to ROMIO configure. For MPICH2, use the
+ "--with-mpi=mpich2" option. These will allow ROMIO to take advantage
+ of internal features of these implementations.
+
+* Deprecation of SFS, HFS, and PIOFS implementations.
+
+ These are no longer actively supported, although the code will continue
+ to be distributed for now.
+
+* Initial support for the Panasas PanFS filesystem.
+
+ PanFS allows users to specify the layout of a file at file-creation time.
+ Layout information includes the number of StorageBlades (SB)
+ across which the data is stored, the number of SBs across which a
+ parity stripe is written, and the number of consecutive stripes that
+ are placed on the same set of SBs. The panfs_layout_* hints are only
+ used if supplied at file-creation time.
+
+ panfs_layout_type - Specifies the layout of a file:
+ 2 = RAID0
+ 3 = RAID5 Parity Stripes
+ panfs_layout_stripe_unit - The size of the stripe unit in bytes
+ panfs_layout_total_num_comps - The total number of StorageBlades a file
+ is striped across.
+ panfs_layout_parity_stripe_width - If the layout type is RAID5 Parity
+ Stripes, this hint specifies the
+ number of StorageBlades in a parity
+ stripe.
+ panfs_layout_parity_stripe_depth - If the layout type is RAID5 Parity
+ Stripes, this hint specifies the
+ number of contiguous parity stripes written
+ across the same set of SBs.
+ panfs_layout_visit_policy - If the layout type is RAID5 Parity Stripes,
+ the policy used to determine the parity
+ stripe a given file offset is written to:
+ 1 = Round Robin
+
+ PanFS supports the "concurrent write" (CW) mode, where groups of cooperating
+ clients can disable the PanFS consistency mechanisms and use their own
+ consistency protocol. Clients participating in concurrent write mode use
+ application specific information to improve performance while maintaining
+ file consistency. All clients accessing the file(s) must enable concurrent
+ write mode. If any client does not enable concurrent write mode, then the
+ PanFS consistency protocol will be invoked. Once a file is opened in CW mode
+ on a machine, attempts to open a file in non-CW mode will fail with
+ EACCES. If a file is already opened in non-CW mode, attempts to open
+ the file in CW mode will fail with EACCES. The following hint is
+ used to enable concurrent write mode.
+
+ panfs_concurrent_write - If set to 1 at file open time, the file
+ is opened using the PanFS concurrent write
+ mode flag. Concurrent write mode is not a
+ persistent attribute of the file.
+
+ Below is an example PanFS layout using the following parameters:
+
+ - panfs_layout_type = 3
+ - panfs_layout_total_num_comps = 100
+ - panfs_layout_parity_stripe_width = 10
+ - panfs_layout_parity_stripe_depth = 8
+ - panfs_layout_visit_policy = 1
+
+ Parity Stripe Group 1 Parity Stripe Group 2 . . . Parity Stripe Group 10
+ ---------------------- ---------------------- --------------------
+ SB1 SB2 ... SB10 SB11 SB12 ... SB20 ... SB91 SB92 ... SB100
+ ----------------------- ----------------------- ---------------------
+ D1 D2 ... D10 D91 D92 ... D100 D181 D182 ... D190
+ D11 D12 D20 D101 D102 D110 D191 D192 D193
+ D21 D22 D30 . . . . . .
+ D31 D32 D40
+ D41 D42 D50
+ D51 D52 D60
+ D61 D62 D70
+ D71 D72 D80
+ D81 D82 D90 D171 D172 D180 D261 D262 D270
+ D271 D272 D273 . . . . . .
+ ...
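+
+ For illustration, the layout above could be requested at
+ file-creation time roughly as follows (a sketch; the file name is
+ arbitrary and error checking is omitted):
+
+ MPI_Info info;
+ MPI_File fh;
+ MPI_Info_create(&info);
+ MPI_Info_set(info, "panfs_layout_type", "3");
+ MPI_Info_set(info, "panfs_layout_total_num_comps", "100");
+ MPI_Info_set(info, "panfs_layout_parity_stripe_width", "10");
+ MPI_Info_set(info, "panfs_layout_parity_stripe_depth", "8");
+ MPI_Info_set(info, "panfs_layout_visit_policy", "1");
+ MPI_File_open(MPI_COMM_WORLD, "/panfs/home/user/testfile",
+ MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh);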
+
+* Initial support for the Globus GridFTP filesystem. Work contributed by Troy
+ Baer (troy@osc.edu).
+
+Major Changes in Version 1.2.5:
+------------------------------
+
+* Initial support for MPICH-2
+
+* fix for a bug in which ROMIO would get confused for some permutations
+ of the aggregator list
+
+* direct io on IRIX's XFS should work now
+
+* fixed an issue with the Fortran bindings that would cause them to fail
+ when some compilers tried to build them.
+
+* Initial support for deferred opens
+
+Major Changes in Version 1.2.4:
+------------------------------
+* Added section describing ROMIO MPI_FILE_SYNC and MPI_FILE_CLOSE behavior to
+ User's Guide
+
+* Bug removed from PVFS ADIO implementation regarding resize operations
+
+* Added support for PVFS listio operations, including hints to control use
+
+
+Major Changes in Version 1.2.3:
+-------------------------------
+* Enhanced aggregation control via cb_config_list, romio_cb_read,
+ and romio_cb_write hints
+
+* Asynchronous IO can be enabled under Linux with the --enable-aio argument
+ to configure
+
+* Additional PVFS support
+
+* Additional control over data sieving with romio_ds_read hint
+
+* NTFS ADIO implementation integrated into source tree
+
+* testfs ADIO implementation added for debugging purposes
+
+
+Major Changes in Version 1.0.3:
+-------------------------------
+
+* When used with MPICH 1.2.1, the MPI-IO functions return proper error codes
+ and classes, and the status object is filled in.
+
+* On SGI's XFS file system, ROMIO can use direct I/O even if the
+ user's request does not meet the various restrictions needed to use
+ direct I/O. ROMIO does this by doing part of the request with
+ buffered I/O (until all the restrictions are met) and doing the rest
+ with direct I/O. (This feature hasn't been tested rigorously. Please
+ check for errors.)
+
+ By default, ROMIO will use only buffered I/O. Direct I/O can be
+ enabled either by setting the environment variables MPIO_DIRECT_READ
+ and/or MPIO_DIRECT_WRITE to TRUE, or on a per-file basis by using
+ the info keys "direct_read" and "direct_write".
+
+ Direct I/O will result in higher performance only if you are
+ accessing a high-bandwidth disk system. Otherwise, buffered I/O is
+ better and is therefore used as the default.
+
+* Miscellaneous bug fixes.
+
+
+Major Changes Version 1.0.2:
+---------------------------
+
+* Implemented the shared file pointer functions and
+ split collective I/O functions. Therefore, the main
+ components of the MPI I/O chapter not yet implemented are
+ file interoperability and error handling.
+
+* Added support for using "direct I/O" on SGI's XFS file system.
+ Direct I/O is an optional feature of XFS in which data is moved
+ directly between the user's buffer and the storage devices, bypassing
+ the file-system cache. This can improve performance significantly on
+ systems with high disk bandwidth. Without high disk bandwidth,
+ regular I/O (that uses the file-system cache) performs better.
+ ROMIO, therefore, does not use direct I/O by default. The user can
+ turn on direct I/O (separately for reading and writing) either by
+ using environment variables or by using MPI's hints mechanism (info).
+ To use the environment-variables method, do
+ setenv MPIO_DIRECT_READ TRUE
+ setenv MPIO_DIRECT_WRITE TRUE
+ To use the hints method, the two keys are "direct_read" and "direct_write".
+ By default their values are "false". To turn on direct I/O, set the values
+ to "true". The environment variables have priority over the info keys.
+ In other words, if the environment variables are set to TRUE, direct I/O
+ will be used even if the info keys say "false", and vice versa.
+ Note that direct I/O must be turned on separately for reading
+ and writing.
+ The environment-variables method assumes that the environment
+ variables can be read by each process in the MPI job. This is
+ not guaranteed by the MPI Standard, but it works with SGI's MPI
+ and the ch_shmem device of MPICH.
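+
+ For illustration, the hints method might be used from C roughly as
+ follows (a sketch; error checking omitted):
+
+ MPI_Info info;
+ MPI_File fh;
+ MPI_Info_create(&info);
+ MPI_Info_set(info, "direct_read", "true");
+ MPI_Info_set(info, "direct_write", "true");
+ MPI_File_open(MPI_COMM_WORLD, "testfile",
+ MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh);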
+
+* Added support (new ADIO device, ad_pvfs) for the PVFS parallel
+ file system for Linux clusters, developed at Clemson University
+ (see http://www.parl.clemson.edu/pvfs ). To use it, you must first install
+ PVFS and then when configuring ROMIO, specify "-file_system=pvfs" in
+ addition to any other options to "configure". (As usual, you can configure
+ for multiple file systems by using "+"; for example,
+ "-file_system=pvfs+ufs+nfs".) You will need to specify the path
+ to the PVFS include files via the "-cflags" option to configure,
+ for example, "configure -cflags=-I/usr/pvfs/include". You
+ will also need to specify the full path name of the PVFS library.
+ The best way to do this is via the "-lib" option to MPICH's
+ configure script (assuming you are using ROMIO from within MPICH).
+
+* Uses weak symbols (where available) for building the profiling version,
+ i.e., the PMPI routines. As a result, the size of the library is reduced
+ considerably.
+
+* The Makefiles use "virtual paths" if supported by the make utility. GNU make
+ supports it, for example. This feature allows you to untar the
+ distribution in some directory, say a slow NFS directory,
+ and compile the library (the .o files) in another
+ directory, say on a faster local disk. For example, if the tar file
+ has been untarred in an NFS directory called /home/thakur/romio,
+ one can compile it in a different directory, say /tmp/thakur, as follows:
+ cd /tmp/thakur
+ /home/thakur/romio/configure
+ make
+ The .o files will be created in /tmp/thakur; the library will be created in
+ /home/thakur/romio/lib/$ARCH/libmpio.a .
+ This method works only if the make utility supports virtual paths.
+ If the default make does not, you can install GNU make which does,
+ and specify it to configure as
+ /home/thakur/romio/configure -make=/usr/gnu/bin/gmake (or whatever)
+
+* Lots of miscellaneous bug fixes and other enhancements.
+
+* This version is included in MPICH 1.2.0. If you are using MPICH, you
+ need not download ROMIO separately; it gets built as part of MPICH.
+ The previous version of ROMIO is included in LAM, HP MPI, SGI MPI, and
+ NEC MPI. NEC has also implemented the MPI-IO functions missing
+ in ROMIO, and therefore NEC MPI has a complete implementation
+ of MPI-IO.
+
+
+Major Changes in Version 1.0.1:
+------------------------------
+
+* This version is included in MPICH 1.1.1 and HP MPI 1.4.
+
+* Added support for NEC SX-4 and created a new device ad_sfs for
+ NEC SFS file system.
+
+* New devices ad_hfs for HP/Convex HFS file system and ad_xfs for
+ SGI XFS file system.
+
+* Users no longer need to prefix the filename with the type of
+ file system; ROMIO determines the file-system type on its own.
+
+* Added support for 64-bit file sizes on IBM PIOFS, SGI XFS,
+ HP/Convex HFS, and NEC SFS file systems.
+
+* MPI_Offset is an 8-byte integer on machines that support 8-byte integers.
+ It is of type "long long" in C and "integer*8" in Fortran.
+ With a Fortran 90 compiler, you can use either integer*8 or
+ integer(kind=MPI_OFFSET_KIND).
+ If you printf an MPI_Offset in C, remember to use %lld
+ or %ld as required by your compiler. (See what is used in the test
+ program romio/test/misc.c.)
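+
+ For example, with a compiler that supports %lld:
+
+ MPI_Offset offset; /* fh is an already-open MPI_File */
+ MPI_File_get_position(fh, &offset);
+ printf("offset = %lld\n", (long long) offset);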
+
+* On some machines, ROMIO detects at configure time that "long long" is
+ either not supported by the C compiler or it doesn't work properly.
+ In such cases, configure sets MPI_Offset to long in C and integer in
+ Fortran. This happens on Intel Paragon, Sun4, and FreeBSD.
+
+* Added support for passing hints to the implementation via the MPI_Info
+ parameter. ROMIO understands the following hints (keys in MPI_Info object):
+
+ /* on all file systems */
+ cb_buffer_size - buffer size for collective I/O
+ cb_nodes - no. of processes that actually perform I/O in collective I/O
+ ind_rd_buffer_size - buffer size for data sieving in independent reads
+
+ /* on all file systems except IBM PIOFS */
+ ind_wr_buffer_size - buffer size for data sieving in independent writes
+ /* ind_wr_buffer_size is ignored on PIOFS because data sieving
+ cannot be done for writes since PIOFS doesn't support file locking */
+
+ /* on Intel PFS and IBM PIOFS only. These hints are understood only if
+ supplied at file-creation time. */
+ striping_factor - no. of I/O devices to stripe the file across
+ striping_unit - the striping unit in bytes
+ start_iodevice - the number of the I/O device from which to start
+ striping (between 0 and (striping_factor-1))
+
+ /* on Intel PFS only. */
+ pfs_svr_buf - turn on or off PFS server buffering by setting the value
+ to "true" or "false", case-sensitive.
+
+ If ROMIO doesn't understand a hint, or if the value is invalid, the hint
+ will be ignored. The values of hints being used by ROMIO at any time
+ can be obtained via MPI_File_get_info.
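+
+ For illustration, a program could supply some of these hints roughly
+ as follows (illustrative values; error checking omitted):
+
+ MPI_Info info;
+ MPI_File fh;
+ MPI_Info_create(&info);
+ MPI_Info_set(info, "cb_buffer_size", "4194304");
+ MPI_Info_set(info, "cb_nodes", "8");
+ MPI_File_open(MPI_COMM_WORLD, "testfile",
+ MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh);
+ /* the hints actually in effect can be retrieved afterwards
+ with MPI_File_get_info */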
+
+
+
+General Information
+-------------------
+
+ROMIO is a high-performance, portable implementation of MPI-IO (the
+I/O chapter in MPI). ROMIO's home page is at
+http://www.mcs.anl.gov/romio . The MPI standard is available at
+http://www.mpi-forum.org/docs/docs.html .
+
+This version of ROMIO includes everything defined in the MPI I/O
+chapter except support for file interoperability and
+user-defined error handlers for files. The subarray and
+distributed array datatype constructor functions from Chapter 4
+(Sec. 4.14.4 & 4.14.5) have been implemented. They are useful for
+accessing arrays stored in files. The functions MPI_File_f2c and
+MPI_File_c2f (Sec. 4.12.4) are also implemented.
+
+C, Fortran, and profiling interfaces are provided for all functions
+that have been implemented.
+
+Please read the limitations of this version of ROMIO that are listed
+below (e.g., MPIO_Request object, restriction to homogeneous
+environments).
+
+This version of ROMIO runs on at least the following machines: IBM SP;
+Intel Paragon; HP Exemplar; SGI Origin2000; Cray T3E; NEC SX-4; other
+symmetric multiprocessors from HP, SGI, DEC, Sun, and IBM; and networks of
+workstations (Sun, SGI, HP, IBM, DEC, Linux, and FreeBSD). Supported
+file systems are IBM PIOFS, Intel PFS, HP/Convex HFS, SGI XFS, NEC
+SFS, PVFS, NFS, and any Unix file system (UFS).
+
+This version of ROMIO is included in MPICH 1.2.3; an earlier version
+is included in at least the following MPI implementations: LAM, HP
+MPI, SGI MPI, and NEC MPI.
+
+Note that proper I/O error codes and classes are returned and the
+status variable is filled only when used with MPICH 1.2.1 or later.
+
+You can open files on multiple file systems in the same program. The
+only restriction is that the directory where the file is to be opened
+must be accessible from the process opening the file. For example, a
+process running on one workstation may not be able to access a
+directory on the local disk of another workstation, and therefore
+ROMIO will not be able to open a file in such a directory. NFS-mounted
+files can be accessed.
+
+An MPI-IO file created by ROMIO is no different than any other file
+created by the underlying file system. Therefore, you may use any of
+the commands provided by the file system to access the file, e.g., ls,
+mv, cp, rm, ftp.
+
+
+Using ROMIO on NFS
+------------------
+
+To use ROMIO on NFS, file locking with fcntl must work correctly on
+the NFS installation. On some installations, fcntl locks don't work.
+To get them to work, you need to use Version 3 of NFS, ensure that the
+lockd daemon is running on all the machines, and have the system
+administrator mount the NFS file system with the "noac" option (no
+attribute caching). Turning off attribute caching may reduce
+performance, but it is necessary for correct behavior.
+
+The following are some instructions we received from Ian Wells of HP
+for setting the noac option on NFS. We have not tried them
+ourselves. We are including them here because you may find
+them useful. Note that some of the steps may be specific to HP
+systems, and you may need root permission to execute some of the
+commands.
+
+ >1. first confirm you are running nfs version 3
+ >
+ >rpcinfo -p `hostname` | grep nfs
+ >
+ >ie
+ > goedel >rpcinfo -p goedel | grep nfs
+ > 100003 2 udp 2049 nfs
+ > 100003 3 udp 2049 nfs
+ >
+ >
+ >2. then edit /etc/fstab for each nfs directory read/written by MPIO
+ > on each machine used for multihost MPIO.
+ >
+ > Here is an example of a correct fstab entry for /epm1:
+ >
+ > ie grep epm1 /etc/fstab
+ >
+ > ROOOOT 11>grep epm1 /etc/fstab
+ > gershwin:/epm1 /rmt/gershwin/epm1 nfs bg,intr,noac 0 0
+ >
+ > if the noac option is not present, add it
+ > and then remount this directory
+ > on each of the machines that will be used to share MPIO files
+ >
+ >ie
+ >
+ >ROOOOT >umount /rmt/gershwin/epm1
+ >ROOOOT >mount /rmt/gershwin/epm1
+ >
+ >3. Confirm that the directory is mounted noac:
+ >
+ >ROOOOT >grep gershwin /etc/mnttab
+ >gershwin:/epm1 /rmt/gershwin/epm1 nfs
+ >noac,acregmin=0,acregmax=0,acdirmin=0,acdirmax=0 0 0 899911504
+
+
+
+
+ROMIO Installation Instructions
+-------------------------------
+
+Since ROMIO is included in MPICH, LAM, HP MPI, SGI MPI, and NEC MPI,
+you don't need to install it separately if you are using any of these
+MPI implementations. If you are using some other MPI, you can
+configure and build ROMIO as follows:
+
+Untar the tar file as
+
+ gunzip -c romio.tar.gz | tar xvf -
+
+OR
+
+ zcat romio.tar.Z | tar xvf -
+
+THEN
+
+ cd romio
+ ./configure
+ make
+
+Some example programs and a Makefile are provided in the romio/test directory.
+Run the examples the way you would run any MPI program. Each program takes
+the filename as a command-line argument "-fname filename".
+
+The configure script by default configures ROMIO for the file systems
+most likely to be used on the given machine. If you wish, you can
+explicitly specify the file systems by using the "-file_system" option
+to configure. Multiple file systems can be specified by using "+" as a
+separator. For example,
+
+ ./configure -file_system=xfs+nfs
+
+For the entire list of options to configure do
+
+ ./configure -h | more
+
+After building a specific version as above, you can install it in a
+particular directory with
+
+ make install PREFIX=/usr/local/romio (or whatever directory you like)
+
+or just
+
+ make install (if you used -prefix at configure time)
+
+If you intend to leave ROMIO where you built it, you should NOT install it
+(install is used only to move the necessary parts of a built ROMIO to
+another location). The installed copy will have the include files,
+libraries, man pages, and a few other odds and ends, but not the whole
+source tree. It will have a test directory for testing the
+installation and a location-independent Makefile built during
+installation, which users can copy and modify to compile and link
+against the installed copy.
+
+To rebuild ROMIO with a different set of configure options, do
+
+ make distclean
+
+to clean everything including the Makefiles created by configure.
+Then run configure again with the new options, followed by make.
+
+
+
+Testing ROMIO
+-------------
+
+To test if the installation works, do
+
+ make testing
+
+in the romio/test directory. This calls a script that runs the test
+programs and compares the results with what they should be. By
+default, "make testing" causes the test programs to create files in
+the current directory and use whatever file system that corresponds
+to. To test with other file systems, you need to specify a filename in
+a directory corresponding to that file system as follows:
+
+ make testing TESTARGS="-fname=/foo/piofs/test"
+
+
+
+Compiling and Running MPI-IO Programs
+-------------------------------------
+
+If ROMIO is not already included in the MPI implementation, you need
+to include the file mpio.h for C or mpiof.h for Fortran in your MPI-IO
+program.
+
+Note that on HP machines running HPUX and on NEC SX-4, you need to
+compile Fortran programs with mpifort, because the f77 compilers on
+these machines don't support 8-byte integers.
+
+With MPICH, HP MPI, or NEC MPI, you can compile MPI-IO programs as
+ mpicc foo.c
+or
+ mpif77 foo.f
+or
+ mpifort foo.f
+
+As mentioned above, mpifort is preferred over mpif77 on HPUX and NEC
+because the f77 compilers on those machines do not support 8-byte integers.
+
+With SGI MPI, you can compile MPI-IO programs as
+ cc foo.c -lmpi
+or
+ f77 foo.f -lmpi
+or
+ f90 foo.f -lmpi
+
+With LAM, you can compile MPI-IO programs as
+ hcc foo.c -lmpi
+or
+ hf77 foo.f -lmpi
+
+If you have built ROMIO with some other MPI implementation, you can
+compile MPI-IO programs by explicitly giving the path to the include
+file mpio.h or mpiof.h and explicitly specifying the path to the
+library libmpio.a, which is located in $(ROMIO_HOME)/lib/$(ARCH)/libmpio.a .
+
+
+Run the program as you would run any MPI program on the machine. If
+you use mpirun, make sure you use the correct mpirun for the MPI
+implementation you are using. For example, if you are using MPICH on
+an SGI machine, make sure that you use MPICH's mpirun and not SGI's
+mpirun.
+
+The Makefile in the romio/test directory illustrates how to compile
+and link MPI-IO programs.
+
+
+
+Limitations of this version of ROMIO
+------------------------------------
+
+* When used with any MPI implementation other than MPICH 1.2.1 (or later),
+the "status" argument is not filled in any MPI-IO function. Consequently,
+MPI_Get_count and MPI_Get_elements will not work when passed the status
+object from an MPI-IO operation.
+
+* All nonblocking I/O functions use a ROMIO-defined "MPIO_Request"
+object instead of the usual "MPI_Request" object. Accordingly, two
+functions, MPIO_Test and MPIO_Wait, are provided to wait and test on
+these MPIO_Request objects. They have the same semantics as MPI_Test
+and MPI_Wait.
+
+int MPIO_Test(MPIO_Request *request, int *flag, MPI_Status *status);
+int MPIO_Wait(MPIO_Request *request, MPI_Status *status);
+
+The usual functions MPI_Test, MPI_Wait, MPI_Testany, etc., will not
+work for nonblocking I/O.
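+
+For example, a nonblocking read with this interface looks roughly like
+the following sketch (fd setup and error checking omitted):
+
+MPIO_Request request;
+MPI_Status status;
+/* fh, buf, and count are set up as for MPI_File_read */
+MPI_File_iread(fh, buf, count, MPI_INT, &request);
+/* ... computation to overlap with the read ... */
+MPIO_Wait(&request, &status);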
+
+* This version works only on a homogeneous cluster of machines,
+and only the "native" file data representation is supported.
+
+* When used with any MPI implementation other than MPICH 1.2.1 (or later),
+all MPI-IO functions return only two possible error codes---MPI_SUCCESS
+on success and MPI_ERR_UNKNOWN on failure.
+
+* Shared file pointers are not supported on PVFS and IBM PIOFS file
+systems because they don't support fcntl file locks, and ROMIO uses
+that feature to implement shared file pointers.
+
+* On HP machines running HPUX and on NEC SX-4, you need to compile
+Fortran programs with mpifort instead of mpif77, because the f77
+compilers on these machines don't support 8-byte integers.
+
+* The file-open mode MPI_MODE_EXCL does not work on Intel PFS file system,
+due to a bug in PFS.
+
+
+
+Usage Tips
+----------
+
+* When using ROMIO with SGI MPI, you may sometimes get an error
+message from SGI MPI: ``MPI has run out of internal datatype
+entries. Please set the environment variable MPI_TYPE_MAX for
+additional space.'' If you get this error message, add this line to
+your .cshrc file:
+ setenv MPI_TYPE_MAX 65536
+Use a larger number if you still get the error message.
+
+* If a Fortran program uses a file handle created using ROMIO's C
+interface, or vice-versa, you must use the functions MPI_File_c2f
+or MPI_File_f2c. Such a situation occurs,
+for example, if a Fortran program uses an I/O library written in C
+with MPI-IO calls. Similar functions MPIO_Request_f2c and
+MPIO_Request_c2f are also provided.
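+
+For illustration, a C I/O routine called from Fortran might convert the
+handle like this (the routine name is hypothetical):
+
+void my_library_read(MPI_Fint *fh_f, void *buf, MPI_Fint *count)
+{
+    MPI_File fh = MPI_File_f2c(*fh_f);
+    MPI_Status status;
+    MPI_File_read(fh, buf, (int) *count, MPI_BYTE, &status);
+}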
+
+* For Fortran programs on the Intel Paragon, you may need
+to provide the complete path to mpif.h in the include statement, e.g.,
+ include '/usr/local/mpich/include/mpif.h'
+instead of
+ include 'mpif.h'
+This is because the -I option to the Paragon Fortran compiler if77
+doesn't work correctly. It always looks in the default directories first
+and, therefore, picks up Intel's mpif.h, which is actually the
+mpif.h of an older version of MPICH.
+
+
+
+ROMIO Users Mailing List
+------------------------
+
+Please register your copy of ROMIO with us by sending email
+to majordomo@mcs.anl.gov with the message
+
+subscribe romio-users
+
+This will enable us to notify you of new releases of ROMIO as well as
+bug fixes.
+
+
+
+Reporting Bugs
+--------------
+
+If you have trouble, first check the users guide (in
+romio/doc/users-guide.ps.gz). Then check the on-line list of known
+bugs and patches at http://www.mcs.anl.gov/romio .
+Finally, if you still have problems, send a detailed message containing:
+
+ The type of system (often, uname -a)
+ The output of configure
+ The output of make
+ Any programs or tests
+
+to romio-maint@mcs.anl.gov .
+
+
+
+ROMIO Internals
+---------------
+
+A key component of ROMIO that enables such a portable MPI-IO
+implementation is an internal abstract I/O device layer called
+ADIO. Most users of ROMIO will not need to deal with the ADIO layer at
+all. However, ADIO is useful to those who want to port ROMIO to some
+other file system. The ROMIO source code and the ADIO paper
+(see doc/README) will help you get started.
+
+MPI-IO implementation issues are discussed in our IOPADS '99 paper,
+"On Implementing MPI-IO Portably and with High Performance."
+All ROMIO-related papers are available online from
+http://www.mcs.anl.gov/romio.
+
+
+Learning MPI-IO
+---------------
+
+The book "Using MPI-2: Advanced Features of the Message-Passing
+Interface," published by MIT Press, provides a tutorial introduction to
+all aspects of MPI-2, including parallel I/O. It has lots of example
+programs. See http://www.mcs.anl.gov/mpi/usingmpi2 for further
+information about the book.
diff --git a/3rd-party/romio341/adio/Makefile.mk b/3rd-party/romio341/adio/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..408e113c0ed923b31377e7ba1a3995852856f735
--- /dev/null
+++ b/3rd-party/romio341/adio/Makefile.mk
@@ -0,0 +1,42 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+AM_CPPFLAGS += -I$(top_builddir)/adio/include -I$(top_srcdir)/adio/include
+
+noinst_HEADERS += \
+ adio/include/adio.h \
+ adio/include/adio_cb_config_list.h \
+ adio/include/adio_extern.h \
+ adio/include/adioi.h \
+ adio/include/adioi_errmsg.h \
+ adio/include/adioi_error.h \
+ adio/include/adioi_fs_proto.h \
+ adio/include/ad_tuning.h \
+ adio/include/heap_sort.h \
+ adio/include/lock_internal.h \
+ adio/include/mpio_error.h \
+ adio/include/mpipr.h \
+ adio/include/mpiu_greq.h \
+ adio/include/nopackage.h \
+ adio/include/romioconf-undefs.h \
+ adio/include/mpiu_external32.h \
+ adio/include/hint_fns.h
+
+include $(top_srcdir)/adio/ad_daos/Makefile.mk
+include $(top_srcdir)/adio/ad_gpfs/Makefile.mk
+include $(top_srcdir)/adio/ad_gpfs/bg/Makefile.mk
+include $(top_srcdir)/adio/ad_gpfs/pe/Makefile.mk
+include $(top_srcdir)/adio/ad_lustre/Makefile.mk
+include $(top_srcdir)/adio/ad_nfs/Makefile.mk
+## NTFS builds are handled entirely by the separate Windows build system
+##include $(top_srcdir)/adio/ad_ntfs/Makefile.mk
+include $(top_srcdir)/adio/ad_panfs/Makefile.mk
+include $(top_srcdir)/adio/ad_pvfs2/Makefile.mk
+include $(top_srcdir)/adio/ad_testfs/Makefile.mk
+include $(top_srcdir)/adio/ad_ufs/Makefile.mk
+include $(top_srcdir)/adio/ad_xfs/Makefile.mk
+include $(top_srcdir)/adio/ad_ime/Makefile.mk
+include $(top_srcdir)/adio/ad_quobytefs/Makefile.mk
+include $(top_srcdir)/adio/common/Makefile.mk
diff --git a/3rd-party/romio341/adio/ad_daos/Makefile.mk b/3rd-party/romio341/adio/ad_daos/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..a97cb6bd3b7c364ada7c4db976f122f050cf2a4a
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/Makefile.mk
@@ -0,0 +1,24 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_DAOS
+
+noinst_HEADERS += adio/ad_daos/ad_daos.h
+
+romio_other_sources += \
+ adio/ad_daos/ad_daos.c \
+ adio/ad_daos/ad_daos_close.c \
+ adio/ad_daos/ad_daos_common.c \
+ adio/ad_daos/ad_daos_fcntl.c \
+ adio/ad_daos/ad_daos_features.c \
+ adio/ad_daos/ad_daos_hhash.c \
+ adio/ad_daos/ad_daos_hints.c \
+ adio/ad_daos/ad_daos_io.c \
+ adio/ad_daos/ad_daos_io_str.c \
+ adio/ad_daos/ad_daos_open.c \
+ adio/ad_daos/ad_daos_resize.c
+
+endif BUILD_AD_DAOS
+
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos.c b/3rd-party/romio341/adio/ad_daos/ad_daos.c
new file mode 100644
index 0000000000000000000000000000000000000000..00d0d965c50c3615db8cf842b5b11a60afc8bcfa
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+
+/* adioi.h has the ADIOI_Fns_struct define */
+#include "adioi.h"
+
+struct ADIOI_Fns_struct ADIO_DAOS_operations = {
+ ADIOI_DAOS_Open, /* Open */
+ ADIOI_DAOS_OpenColl, /* OpenColl */
+ ADIOI_DAOS_ReadContig, /* ReadContig */
+ ADIOI_DAOS_WriteContig, /* WriteContig */
+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+ ADIOI_DAOS_Fcntl, /* Fcntl */
+ ADIOI_DAOS_SetInfo, /* SetInfo */
+ ADIOI_DAOS_ReadStrided, /* ReadStrided */
+ ADIOI_DAOS_WriteStrided, /* WriteStrided */
+ ADIOI_DAOS_Close, /* Close */
+ ADIOI_DAOS_IReadContig, /* IreadContig */
+ ADIOI_DAOS_IWriteContig, /* IwriteContig */
+ ADIOI_FAKE_IODone, /* ReadDone */
+ ADIOI_FAKE_IODone, /* WriteDone */
+ ADIOI_FAKE_IOComplete, /* ReadComplete */
+ ADIOI_FAKE_IOComplete, /* WriteComplete */
+ ADIOI_DAOS_IreadStrided, /* IreadStrided */
+ ADIOI_DAOS_IwriteStrided, /* IwriteStrided */
+ ADIOI_DAOS_Flush, /* Flush */
+ ADIOI_DAOS_Resize, /* Resize */
+ ADIOI_DAOS_Delete, /* Delete */
+ ADIOI_DAOS_Feature, /* Features */
+ "DAOS: ROMIO driver for DAOS",
+ ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+ ADIOI_GEN_IwriteStridedColl, /* IwriteStridedColl */
+#if defined(F_SETLKW64)
+ ADIOI_GEN_SetLock /* SetLock */
+#else
+ ADIOI_GEN_SetLock64 /* SetLock */
+#endif
+};
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos.h b/3rd-party/romio341/adio/ad_daos/ad_daos.h
new file mode 100644
index 0000000000000000000000000000000000000000..fed1f829595898069eaf978378d80db3e802ab5c
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_DAOS_H_INCLUDED
+#define AD_DAOS_H_INCLUDED
+
+#include "adio.h"
+#include <daos.h>
+#include <daos_fs.h>
+#include <daos_uns.h>
+#include <gurt/list.h>
+#include <gurt/hash.h>
+
+/* #define D_PRINT_IO */
+/* #define D_PRINT_IO_MEM */
+
+#define PRINT_MSG(str, fmt, ...) \
+ do { \
+ fprintf(str, "%s:%d %s() - " fmt"\n" , \
+ __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
+ } while (0)
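+/* Example usage (illustrative): PRINT_MSG(stderr, "open failed: rc = %d", rc); */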
+
+struct adio_daos_hdl {
+ d_list_t entry;
+ uuid_t uuid;
+ daos_handle_t open_hdl;
+ dfs_t *dfs;
+ int ref;
+ int type;
+};
+
+struct ADIO_DAOS_cont {
+ /** pool, container uuids + other attributes */
+ struct duns_attr_t attr;
+ /** Container name (Path to the file opened) */
+ char *cont_name;
+ /** Object name (File name) */
+ char *obj_name;
+ /** pool open handle */
+ daos_handle_t poh;
+ /** container open handle */
+ daos_handle_t coh;
+ /** flat namespace mount */
+ dfs_t *dfs;
+ /** dfs object for file */
+ dfs_obj_t *obj;
+ /** Array Object ID for the MPI file */
+ daos_obj_id_t oid;
+ /** file open mode */
+ unsigned int amode;
+ /** Event queue to store all async requests on file */
+ daos_handle_t eqh;
+ /** pool handle for directory holding the file object */
+ struct adio_daos_hdl *p;
+ /** container handle for directory holding the file object */
+ struct adio_daos_hdl *c;
+};
+
+struct ADIO_DAOS_req {
+ MPI_Request req;
+ daos_size_t nbytes;
+ daos_event_t daos_event;
+ dfs_iod_t iod;
+ daos_range_t rg;
+ d_sg_list_t sgl;
+ d_iov_t iov;
+ daos_range_t *rgs;
+ d_iov_t *iovs;
+};
+
+/** initialize the DAOS library and hashtables for handles */
+void ADIOI_DAOS_Init(int *error_code);
+
+/** Container/Pool Handle Hash functions */
+int adio_daos_hash_init(void);
+void adio_daos_hash_finalize(void);
+struct adio_daos_hdl *adio_daos_poh_lookup(const uuid_t uuid);
+int adio_daos_poh_insert(uuid_t uuid, daos_handle_t poh, struct adio_daos_hdl **hdl);
+int adio_daos_poh_lookup_connect(uuid_t uuid, struct adio_daos_hdl **hdl);
+void adio_daos_poh_release(struct adio_daos_hdl *hdl);
+struct adio_daos_hdl *adio_daos_coh_lookup(const uuid_t uuid);
+int adio_daos_coh_insert(uuid_t uuid, daos_handle_t coh, struct adio_daos_hdl **hdl);
+int adio_daos_coh_lookup_create(daos_handle_t poh, uuid_t uuid, int amode,
+ bool create, struct adio_daos_hdl **hdl);
+void adio_daos_coh_release(struct adio_daos_hdl *hdl);
+
+int ADIOI_DAOS_aio_free_fn(void *extra_state);
+int ADIOI_DAOS_aio_poll_fn(void *extra_state, MPI_Status * status);
+int ADIOI_DAOS_aio_wait_fn(int count, void **array_of_states, double timeout, MPI_Status * status);
+int ADIOI_DAOS_err(const char *myname, const char *filename, int line, int rc);
+
+void ADIOI_DAOS_Open(ADIO_File fd, int *error_code);
+void ADIOI_DAOS_OpenColl(ADIO_File fd, int rank, int access_mode, int *error_code);
+int ADIOI_DAOS_Feature(ADIO_File fd, int flag);
+void ADIOI_DAOS_Flush(ADIO_File fd, int *error_code);
+void ADIOI_DAOS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
+void ADIOI_DAOS_Close(ADIO_File fd, int *error_code);
+void ADIOI_DAOS_Delete(const char *filename, int *error_code);
+void ADIOI_DAOS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code);
+void ADIOI_DAOS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+void ADIOI_DAOS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_DAOS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_DAOS_IReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code);
+void ADIOI_DAOS_IWriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code);
+void ADIOI_DAOS_ReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_DAOS_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_DAOS_IreadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Request * request, int *error_code);
+void ADIOI_DAOS_IwriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code);
+#endif /* AD_DAOS_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_close.c b/3rd-party/romio341/adio/ad_daos/ad_daos_close.c
new file mode 100644
index 0000000000000000000000000000000000000000..c32a872943bea08659bd7955e0a48790776a498f
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_close.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+
+void ADIOI_DAOS_Close(ADIO_File fd, int *error_code)
+{
+ int rank;
+ struct ADIO_DAOS_cont *cont = (struct ADIO_DAOS_cont *) fd->fs_ptr;
+ static char myname[] = "ADIOI_DAOS_CLOSE";
+ int rc;
+
+ MPI_Barrier(fd->comm);
+ MPI_Comm_rank(fd->comm, &rank);
+
+ /* release the dfs object handle for the file. */
+ rc = dfs_release(cont->obj);
+ if (rc != 0) {
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, rc);
+ return;
+ }
+
+ /* decrement ref count on the container and pool in the hashtable. */
+ adio_daos_coh_release(cont->c);
+ cont->c = NULL;
+ adio_daos_poh_release(cont->p);
+ cont->p = NULL;
+
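+    /* Only rank 0 allocated the name strings (it is the rank that runs
+     * ADIOI_DAOS_Open() from ADIOI_DAOS_OpenColl()); they are unset on all
+     * other ranks. */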
+ if (rank == 0) {
+ ADIOI_Free(cont->obj_name);
+ ADIOI_Free(cont->cont_name);
+ }
+ ADIOI_Free(fd->fs_ptr);
+ fd->fs_ptr = NULL;
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_common.c b/3rd-party/romio341/adio/ad_daos/ad_daos_common.c
new file mode 100644
index 0000000000000000000000000000000000000000..9e0e3b3ab5c01e0324664f536bcceea36a4f510f
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_common.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+#include <daos_errno.h>
+
+int ADIOI_DAOS_Initialized = MPI_KEYVAL_INVALID;
+
+static int ad_daos_end(MPI_Comm comm, int keyval, void *attribute_val, void *extra_state)
+{
+ int error_code = MPI_SUCCESS;
+ int rc;
+
+ adio_daos_hash_finalize();
+ rc = daos_fini();
+
+ if (rc != 0) {
+ error_code = ADIOI_DAOS_err("ad_daos_end", "DAOS Finalize Error", __LINE__, rc);
+ return error_code;
+ }
+
+ MPI_Keyval_free(&keyval);
+ return error_code;
+}
+
+void ADIOI_DAOS_Init(int *error_code)
+{
+ static char myname[] = "ADIOI_DAOS_INIT";
+ int rc;
+
+ *error_code = MPI_SUCCESS;
+
+ /** nothing to do if already initialized */
+ if (ADIOI_DAOS_Initialized != MPI_KEYVAL_INVALID)
+ return;
+
+ rc = daos_init();
+ if (rc) {
+ *error_code = ADIOI_DAOS_err(myname, "DAOS Init Error", __LINE__, rc);
+ fprintf(stderr, "daos_init() failed with %d\n", rc);
+ return;
+ }
+
+ rc = adio_daos_hash_init();
+ if (rc < 0) {
+ *error_code = ADIOI_DAOS_err(myname, "DAOS Init Error", __LINE__, rc);
+ fprintf(stderr, "Failed to init daos handle hash table\n");
+ return;
+ }
+
+    /** attach a delete callback to MPI_COMM_SELF; it fires when the
+     * communicator is destroyed during MPI_Finalize and finalizes DAOS */
+ MPI_Keyval_create(MPI_NULL_COPY_FN, ad_daos_end, &ADIOI_DAOS_Initialized, (void *) 0);
+ MPI_Attr_put(MPI_COMM_SELF, ADIOI_DAOS_Initialized, (void *) 0);
+}
+
+int ADIOI_DAOS_err(const char *myname, const char *filename, int line, int rc)
+{
+ int error_code = MPI_SUCCESS;
+
+ if (rc == 0)
+ return MPI_SUCCESS;
+
+ switch (rc) {
+ case -DER_NO_PERM:
+ case EPERM:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ line, MPI_ERR_ACCESS,
+ "**fileaccess", "**fileaccess %s", filename);
+ break;
+ case -DER_ENOENT:
+ case -DER_NONEXIST:
+ case -DER_NO_HDL:
+ case ENOENT:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ line, MPI_ERR_NO_SUCH_FILE,
+ "**filenoexist", "**filenoexist %s", filename);
+ break;
+ case -DER_IO:
+ case EIO:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, line, MPI_ERR_IO, "**io",
+ "**io %s", filename);
+ break;
+ case -DER_EXIST:
+ case EEXIST:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ line, MPI_ERR_FILE_EXISTS, "**fileexist", 0);
+ break;
+ case -DER_NOTDIR:
+ case ENOTDIR:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, line,
+ MPI_ERR_BAD_FILE,
+ "**filenamedir", "**filenamedir %s", filename);
+ break;
+ case -DER_NOSPACE:
+ case ENOSPC:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname, line,
+ MPI_ERR_NO_SPACE, "**filenospace", 0);
+ break;
+ case -DER_INVAL:
+ case EINVAL:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname, line,
+ MPI_ERR_ARG, "**arg", 0);
+ break;
+ case -DER_NOSYS:
+ case ENOSYS:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, line,
+ MPI_ERR_UNSUPPORTED_OPERATION,
+ "**fileopunsupported", 0);
+ break;
+ case -DER_NOMEM:
+ case ENOMEM:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, line, MPI_ERR_NO_MEM, "**allocmem", 0);
+ break;
+ default:
+ error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, line, MPI_ERR_IO, "**io",
+ "**io %s", filename);
+ break;
+ }
+
+ return error_code;
+}
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_fcntl.c b/3rd-party/romio341/adio/ad_daos/ad_daos_fcntl.c
new file mode 100644
index 0000000000000000000000000000000000000000..00b45de506d9b34f638acd869ec887fd95b4dec9
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_fcntl.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+
+void ADIOI_DAOS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code)
+{
+ int rc;
+ struct ADIO_DAOS_cont *cont = fd->fs_ptr;
+ static char myname[] = "ADIOI_DAOS_FCNTL";
+
+ switch (flag) {
+ case ADIO_FCNTL_GET_FSIZE:
+ {
+ daos_size_t fsize;
+
+ rc = dfs_get_size(cont->dfs, cont->obj, &fsize);
+ if (rc != 0) {
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, rc);
+ break;
+ }
+ *error_code = MPI_SUCCESS;
+ fcntl_struct->fsize = (ADIO_Offset) fsize;
+ break;
+ }
+ case ADIO_FCNTL_SET_DISKSPACE:
+ case ADIO_FCNTL_SET_ATOMICITY:
+ default:
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_ARG, "**flag", "**flag %d", flag);
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_features.c b/3rd-party/romio341/adio/ad_daos/ad_daos_features.c
new file mode 100644
index 0000000000000000000000000000000000000000..5c415e450b7e6edcc992005ef97abb936d1e0b4c
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_features.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "ad_daos.h"
+
+int ADIOI_DAOS_Feature(ADIO_File fd, int flag)
+{
+ switch (flag) {
+ case ADIO_SCALABLE_OPEN:
+ case ADIO_SCALABLE_RESIZE:
+ return 1;
+ case ADIO_TWO_PHASE:
+ case ADIO_SHARED_FP:
+ case ADIO_LOCKS:
+ case ADIO_SEQUENTIAL:
+ case ADIO_DATA_SIEVING_WRITES:
+ case ADIO_ATOMIC_MODE:
+ case ADIO_UNLINK_AFTER_CLOSE:
+ default:
+ return 0;
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_hhash.c b/3rd-party/romio341/adio/ad_daos/ad_daos_hhash.c
new file mode 100644
index 0000000000000000000000000000000000000000..f3c02dc0ec188ebc68ffb61f53481a27f4be9d2e
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_hhash.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+#include "gurt/hash.h"
+#include <assert.h>
+
+static struct d_hash_table *coh_hash;
+static struct d_hash_table *poh_hash;
+
+enum {
+ DAOS_POOL,
+ DAOS_CONT,
+};
+
+static inline struct adio_daos_hdl *hdl_obj(d_list_t * rlink)
+{
+ return container_of(rlink, struct adio_daos_hdl, entry);
+}
+
+static bool
+key_cmp(struct d_hash_table *htable, d_list_t * rlink, const void *key, unsigned int ksize)
+{
+ struct adio_daos_hdl *hdl = hdl_obj(rlink);
+
+ return (uuid_compare(hdl->uuid, key) == 0);
+}
+
+static void rec_addref(struct d_hash_table *htable, d_list_t * rlink)
+{
+ hdl_obj(rlink)->ref++;
+}
+
+static bool rec_decref(struct d_hash_table *htable, d_list_t * rlink)
+{
+ struct adio_daos_hdl *hdl = hdl_obj(rlink);
+
+ assert(hdl->ref > 0);
+ hdl->ref--;
+ return (hdl->ref == 0);
+}
+
+static void rec_free(struct d_hash_table *htable, d_list_t * rlink)
+{
+ struct adio_daos_hdl *hdl = hdl_obj(rlink);
+
+ assert(d_hash_rec_unlinked(&hdl->entry));
+ assert(hdl->ref == 0);
+
+ if (hdl->type == DAOS_POOL)
+ daos_pool_disconnect(hdl->open_hdl, NULL);
+ else if (hdl->type == DAOS_CONT) {
+ dfs_umount(hdl->dfs);
+ daos_cont_close(hdl->open_hdl, NULL);
+ } else
+ assert(0);
+ ADIOI_Free(hdl);
+}
+
+static d_hash_table_ops_t hdl_hash_ops = {
+ .hop_key_cmp = key_cmp,
+ .hop_rec_addref = rec_addref,
+ .hop_rec_decref = rec_decref,
+ .hop_rec_free = rec_free
+};
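+
+/*
+ * Reference-counting sketch (explanatory note, not part of the original
+ * source): insertion takes the initial reference, each successful lookup
+ * adds one, and each release drops one; rec_free() runs only once the
+ * count reaches zero and the record is evicted, at which point the pool
+ * handle is disconnected or the DFS mount and container handle are closed.
+ */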
+
+int adio_daos_hash_init(void)
+{
+ int rc;
+
+ rc = d_hash_table_create(0, 16, NULL, &hdl_hash_ops, &poh_hash);
+ if (rc)
+ return rc;
+
+ return d_hash_table_create(0, 16, NULL, &hdl_hash_ops, &coh_hash);
+}
+
+void adio_daos_hash_finalize(void)
+{
+ d_hash_table_destroy(coh_hash, true /* force */);
+ d_hash_table_destroy(poh_hash, true /* force */);
+}
+
+struct adio_daos_hdl *adio_daos_poh_lookup(const uuid_t uuid)
+{
+ d_list_t *rlink;
+
+ rlink = d_hash_rec_find(poh_hash, uuid, sizeof(uuid_t));
+ if (rlink == NULL)
+ return NULL;
+
+ return hdl_obj(rlink);
+}
+
+void adio_daos_poh_release(struct adio_daos_hdl *hdl)
+{
+ d_hash_rec_decref(poh_hash, &hdl->entry);
+}
+
+int adio_daos_poh_insert(uuid_t uuid, daos_handle_t poh, struct adio_daos_hdl **hdl)
+{
+ struct adio_daos_hdl *phdl;
+ int rc;
+
+ phdl = (struct adio_daos_hdl *) ADIOI_Calloc(1, sizeof(struct adio_daos_hdl));
+ if (phdl == NULL)
+ return -1;
+
+ phdl->type = DAOS_POOL;
+ uuid_copy(phdl->uuid, uuid);
+ phdl->open_hdl.cookie = poh.cookie;
+
+ rc = d_hash_rec_insert(poh_hash, phdl->uuid, sizeof(uuid_t), &phdl->entry, true);
+ if (rc) {
+ PRINT_MSG(stderr, "Failed to add phdl to hashtable (%d)\n", rc);
+ goto free_hdl;
+ }
+
+ d_hash_rec_addref(poh_hash, &phdl->entry);
+ *hdl = phdl;
+
+ return 0;
+
+ free_hdl:
+ ADIOI_Free(phdl);
+ return rc;
+}
+
+int adio_daos_poh_lookup_connect(uuid_t uuid, struct adio_daos_hdl **hdl)
+{
+ struct adio_daos_hdl *phdl;
+ int rc;
+
+ phdl = adio_daos_poh_lookup(uuid);
+ if (phdl != NULL) {
+ *hdl = phdl;
+ return 0;
+ }
+
+ phdl = (struct adio_daos_hdl *) ADIOI_Calloc(1, sizeof(struct adio_daos_hdl));
+ if (phdl == NULL)
+ return -1;
+
+ phdl->type = DAOS_POOL;
+ uuid_copy(phdl->uuid, uuid);
+
+    /** Get the SVCL and server group from env variables. This is temporary,
+     * as these won't be needed later. */
+ char *svcl_str = NULL;
+ char *group = NULL;
+ daos_pool_info_t pool_info;
+ d_rank_list_t *svcl = NULL;
+
+ svcl_str = getenv("DAOS_SVCL");
+ if (svcl_str != NULL) {
+ svcl = daos_rank_list_parse(svcl_str, ":");
+ if (svcl == NULL) {
+ PRINT_MSG(stderr, "Failed to parse SVC list env\n");
+ rc = -1;
+ goto free_hdl;
+ }
+ }
+ group = getenv("DAOS_GROUP");
+
+ rc = daos_pool_connect(uuid, group, svcl, DAOS_PC_RW, &phdl->open_hdl, &pool_info, NULL);
+ d_rank_list_free(svcl);
+ if (rc < 0) {
+ PRINT_MSG(stderr, "Failed to connect to pool (%d)\n", rc);
+ goto free_hdl;
+ }
+
+ rc = d_hash_rec_insert(poh_hash, phdl->uuid, sizeof(uuid_t), &phdl->entry, true);
+ if (rc) {
+ PRINT_MSG(stderr, "Failed to add phdl to hashtable (%d)\n", rc);
+ goto err_pool;
+ }
+
+ d_hash_rec_addref(poh_hash, &phdl->entry);
+ *hdl = phdl;
+
+ return 0;
+
+ err_pool:
+ daos_pool_disconnect(phdl->open_hdl, NULL);
+ free_hdl:
+ ADIOI_Free(phdl);
+ return rc;
+}
+
+struct adio_daos_hdl *adio_daos_coh_lookup(const uuid_t uuid)
+{
+ d_list_t *rlink;
+
+ rlink = d_hash_rec_find(coh_hash, uuid, sizeof(uuid_t));
+ if (rlink == NULL)
+ return NULL;
+
+ return hdl_obj(rlink);
+}
+
+void adio_daos_coh_release(struct adio_daos_hdl *hdl)
+{
+ d_hash_rec_decref(coh_hash, &hdl->entry);
+}
+
+int adio_daos_coh_insert(uuid_t uuid, daos_handle_t coh, struct adio_daos_hdl **hdl)
+{
+ struct adio_daos_hdl *co_hdl;
+ int rc;
+
+ co_hdl = (struct adio_daos_hdl *) ADIOI_Calloc(1, sizeof(struct adio_daos_hdl));
+ if (co_hdl == NULL)
+ return -1;
+
+ co_hdl->type = DAOS_CONT;
+ uuid_copy(co_hdl->uuid, uuid);
+ co_hdl->open_hdl.cookie = coh.cookie;
+
+ rc = d_hash_rec_insert(coh_hash, co_hdl->uuid, sizeof(uuid_t), &co_hdl->entry, true);
+ if (rc) {
+ PRINT_MSG(stderr, "Failed to add co_hdl to hashtable (%d)\n", rc);
+ goto err_coh;
+ }
+
+ d_hash_rec_addref(coh_hash, &co_hdl->entry);
+ *hdl = co_hdl;
+
+ return 0;
+
+ err_coh:
+ ADIOI_Free(co_hdl);
+ return rc;
+}
+
+int
+adio_daos_coh_lookup_create(daos_handle_t poh, uuid_t uuid, int amode,
+ bool create, struct adio_daos_hdl **hdl)
+{
+ struct adio_daos_hdl *co_hdl;
+ int rc;
+
+ co_hdl = adio_daos_coh_lookup(uuid);
+ if (co_hdl != NULL) {
+ *hdl = co_hdl;
+ return 0;
+ }
+
+ co_hdl = (struct adio_daos_hdl *) ADIOI_Calloc(1, sizeof(struct adio_daos_hdl));
+ if (co_hdl == NULL)
+ return -1;
+
+ co_hdl->type = DAOS_CONT;
+ uuid_copy(co_hdl->uuid, uuid);
+
+ /* Try to open the DAOS container first (the parent directory) */
+ rc = daos_cont_open(poh, uuid, DAOS_COO_RW, &co_hdl->open_hdl, NULL, NULL);
+    /* If the open fails with -DER_NONEXIST and create mode was requested,
+     * create the container (and reopen it if another process won the race). */
+ if (rc == -DER_NONEXIST && create) {
+ rc = dfs_cont_create(poh, uuid, NULL, &co_hdl->open_hdl, &co_hdl->dfs);
+ /** if someone got there first, re-open*/
+ if (rc == EEXIST) {
+ rc = daos_cont_open(poh, uuid, DAOS_COO_RW, &co_hdl->open_hdl, NULL, NULL);
+ if (rc) {
+ PRINT_MSG(stderr, "Failed to create DFS container (%d)\n", rc);
+ goto free_coh;
+ }
+ rc = dfs_mount(poh, co_hdl->open_hdl, amode, &co_hdl->dfs);
+ if (rc) {
+                PRINT_MSG(stderr, "Failed to mount DFS namespace (%d)\n", rc);
+ goto err_cont;
+ }
+ } else if (rc) {
+ PRINT_MSG(stderr, "Failed to create DFS container (%d)\n", rc);
+ goto free_coh;
+ }
+ } else if (rc == 0) {
+ /* Mount a DFS namespace on the container */
+ rc = dfs_mount(poh, co_hdl->open_hdl, amode, &co_hdl->dfs);
+ if (rc) {
+ PRINT_MSG(stderr, "Failed to mount DFS namespace (%d)\n", rc);
+ goto err_cont;
+ }
+ } else {
+ goto free_coh;
+ }
+
+ rc = d_hash_rec_insert(coh_hash, co_hdl->uuid, sizeof(uuid_t), &co_hdl->entry, true);
+ if (rc) {
+ PRINT_MSG(stderr, "Failed to add co_hdl to hashtable (%d)\n", rc);
+ goto err_dfs;
+ }
+
+ d_hash_rec_addref(coh_hash, &co_hdl->entry);
+ *hdl = co_hdl;
+
+ return 0;
+
+ err_dfs:
+ dfs_umount(co_hdl->dfs);
+ err_cont:
+ daos_cont_close(co_hdl->open_hdl, NULL);
+ free_coh:
+ ADIOI_Free(co_hdl);
+ return rc;
+}
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_hints.c b/3rd-party/romio341/adio/ad_daos/ad_daos_hints.c
new file mode 100644
index 0000000000000000000000000000000000000000..bb2c5544c36aac1e1d46e79062843899dd6448d6
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_hints.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+
+#include "hint_fns.h"
+
+void ADIOI_DAOS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
+{
+ static char myname[] = "ADIOI_DAOS_SETINFO";
+
+ if ((fd->info) == MPI_INFO_NULL) {
+ /* part of the open call */
+ MPI_Info_create(&(fd->info));
+
+ ADIOI_Info_set(fd->info, "romio_daos_chunk_size", "0");
+ fd->hints->fs_hints.daos.chunk_size = 0;
+
+ ADIOI_Info_set(fd->info, "romio_daos_obj_class", "OC_UNKNOWN");
+ fd->hints->fs_hints.daos.obj_class = OC_UNKNOWN;
+
+ if (users_info != MPI_INFO_NULL) {
+ char *oclass = NULL;
+
+ /* Chunk size in each dkey */
+ ADIOI_Info_check_and_install_int(fd, users_info, "romio_daos_chunk_size",
+ &(fd->hints->fs_hints.daos.chunk_size), myname,
+ error_code);
+
+ /* object class for each file */
+ ADIOI_Info_check_and_install_str(fd, users_info, "romio_daos_obj_class",
+ &oclass, myname, error_code);
+
+ if (oclass) {
+ fd->hints->fs_hints.daos.obj_class = daos_oclass_name2id(oclass);
+ ADIOI_Free(oclass);
+ }
+ }
+ }
+
+ /* set the values for collective I/O and data sieving parameters */
+ ADIOI_GEN_SetInfo(fd, users_info, error_code);
+
+ *error_code = MPI_SUCCESS;
+}
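+
+/*
+ * Illustrative user-side sketch (assumed usage, not part of this file):
+ * passing the two DAOS hints consumed above through an MPI_Info object.
+ * The hint keys come from this function; the file path and the object
+ * class value are placeholders.
+ *
+ *   MPI_Info info;
+ *   MPI_File fh;
+ *   MPI_Info_create(&info);
+ *   MPI_Info_set(info, "romio_daos_chunk_size", "1048576");
+ *   MPI_Info_set(info, "romio_daos_obj_class", "OC_SX");
+ *   MPI_File_open(MPI_COMM_WORLD, "daos:/path/to/file",
+ *                 MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh);
+ */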
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_io.c b/3rd-party/romio341/adio/ad_daos/ad_daos_io.c
new file mode 100644
index 0000000000000000000000000000000000000000..91ed8588f8729f5cbf460dbd596656eaab00b567
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_io.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+
+#include "../../mpi-io/mpioimpl.h"
+#ifdef MPIO_BUILD_PROFILING
+#include "../../mpi-io/mpioprof.h"
+#endif
+#include "mpiu_greq.h"
+
+enum {
+ DAOS_WRITE,
+ DAOS_READ
+};
+
+static MPIX_Grequest_class ADIOI_DAOS_greq_class = 0;
+
+static void DAOS_IOContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status,
+ MPI_Request * request, int flag, int *error_code)
+{
+ MPI_Count datatype_size;
+ uint64_t len;
+ d_sg_list_t *sgl, loc_sgl;
+ d_iov_t *iov, loc_iov;
+ daos_size_t *nbytes, loc_nbytes;
+ int ret;
+ struct ADIO_DAOS_cont *cont = fd->fs_ptr;
+ struct ADIO_DAOS_req *aio_req;
+ static char myname[] = "ADIOI_DAOS_IOCONTIG";
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = (ADIO_Offset) datatype_size *(ADIO_Offset) count;
+
+ if (request) {
+ aio_req = (struct ADIO_DAOS_req *) ADIOI_Calloc(sizeof(struct ADIO_DAOS_req), 1);
+ daos_event_init(&aio_req->daos_event, DAOS_HDL_INVAL, NULL);
+
+ sgl = &aio_req->sgl;
+ iov = &aio_req->iov;
+ nbytes = &aio_req->nbytes;
+
+ if (ADIOI_DAOS_greq_class == 0) {
+ MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn,
+ ADIOI_DAOS_aio_free_fn, MPIU_Greq_cancel_fn,
+ ADIOI_DAOS_aio_poll_fn, ADIOI_DAOS_aio_wait_fn,
+ &ADIOI_DAOS_greq_class);
+ }
+ MPIX_Grequest_class_allocate(ADIOI_DAOS_greq_class, aio_req, request);
+ memcpy(&(aio_req->req), request, sizeof(MPI_Request));
+ } else {
+ sgl = &loc_sgl;
+ iov = &loc_iov;
+ nbytes = &loc_nbytes;
+ }
+
+ if (len == 0) {
+ *nbytes = 0;
+ goto done;
+ }
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind;
+ }
+
+ /** set memory location */
+ sgl->sg_nr = 1;
+ sgl->sg_nr_out = 0;
+ d_iov_set(iov, buf, len);
+ sgl->sg_iovs = iov;
+#ifdef D_PRINT_IO_MEM
+    printf("MEM : buf %p len %llu\n", buf, (unsigned long long) len);
+#endif
+
+#ifdef D_PRINT_IO
+ int mpi_rank;
+
+ MPI_Comm_rank(fd->comm, &mpi_rank);
+ printf("(%d) CONTIG IO OP %d, Off %llu, Len %zu\n", mpi_rank, flag, offset, len);
+#endif
+
+ if (flag == DAOS_WRITE) {
+ ret = dfs_write(cont->dfs, cont->obj, sgl, offset, (request ? &aio_req->daos_event : NULL));
+ if (ret != 0) {
+ PRINT_MSG(stderr, "dfs_write() failed with %d\n", ret);
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, ret);
+ return;
+ }
+ *nbytes = len;
+ } else if (flag == DAOS_READ) {
+ ret = dfs_read(cont->dfs, cont->obj, sgl, offset, nbytes,
+ (request ? &aio_req->daos_event : NULL));
+ if (ret != 0) {
+ PRINT_MSG(stderr, "dfs_read() failed with %d\n", ret);
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, ret);
+ return;
+ }
+ }
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += len;
+ }
+
+ fd->fp_sys_posn = offset + len;
+
+ done:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (request == NULL && status)
+ MPIR_Status_set_bytes(status, datatype, *nbytes);
+#endif
+
+ *error_code = MPI_SUCCESS;
+}
+
+void ADIOI_DAOS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ DAOS_IOContig(fd, buf, count, datatype, file_ptr_type,
+ offset, status, NULL, DAOS_READ, error_code);
+}
+
+void ADIOI_DAOS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ DAOS_IOContig(fd, (void *) buf, count, datatype, file_ptr_type,
+ offset, status, NULL, DAOS_WRITE, error_code);
+}
+
+void ADIOI_DAOS_IReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code)
+{
+ DAOS_IOContig(fd, buf, count, datatype, file_ptr_type,
+ offset, NULL, request, DAOS_READ, error_code);
+}
+
+void ADIOI_DAOS_IWriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code)
+{
+ DAOS_IOContig(fd, (void *) buf, count, datatype, file_ptr_type,
+ offset, NULL, request, DAOS_WRITE, error_code);
+}
+
+int ADIOI_DAOS_aio_free_fn(void *extra_state)
+{
+ struct ADIO_DAOS_req *aio_req = (struct ADIO_DAOS_req *) extra_state;
+
+ if (aio_req->iovs)
+ ADIOI_Free(aio_req->iovs);
+
+ if (aio_req->rgs)
+ ADIOI_Free(aio_req->rgs);
+
+ ADIOI_Free(aio_req);
+
+ return MPI_SUCCESS;
+}
+
+int ADIOI_DAOS_aio_poll_fn(void *extra_state, MPI_Status * status)
+{
+    struct ADIO_DAOS_req *aio_req = (struct ADIO_DAOS_req *) extra_state;
+ int ret;
+ bool flag;
+
+    /* MSC - MPICH hangs if we just test with NOWAIT. */
+ ret = daos_event_test(&aio_req->daos_event, DAOS_EQ_WAIT, &flag);
+ if (ret != 0)
+ return MPI_UNDEFINED;
+
+ if (flag)
+ MPI_Grequest_complete(aio_req->req);
+ else
+ return MPI_UNDEFINED;
+
+ if (aio_req->daos_event.ev_error != 0)
+        ret = ADIOI_DAOS_err("ADIOI_DAOS_aio_poll_fn", "DAOS Event Error", __LINE__,
+                             aio_req->daos_event.ev_error);
+ else
+ ret = MPI_SUCCESS;
+
+ return ret;
+}
+
+/* wait for multiple requests to complete */
+int ADIOI_DAOS_aio_wait_fn(int count, void **array_of_states, double timeout, MPI_Status * status)
+{
+
+ struct ADIO_DAOS_req **aio_reqlist;
+ int i, nr_complete, ret;
+
+ aio_reqlist = (struct ADIO_DAOS_req **) array_of_states;
+
+ nr_complete = 0;
+ while (nr_complete < count) {
+ for (i = 0; i < count; i++) {
+ bool flag;
+
+ ret = daos_event_test(&aio_reqlist[i]->daos_event,
+ (timeout > 0) ? (int64_t) timeout : DAOS_EQ_WAIT, &flag);
+ if (ret != 0)
+ return MPI_UNDEFINED;
+
+ if (flag) {
+ MPI_Grequest_complete(aio_reqlist[i]->req);
+ nr_complete++;
+ }
+ }
+ }
+ return MPI_SUCCESS; /* TODO: no idea how to deal with errors */
+}
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_io_str.c b/3rd-party/romio341/adio/ad_daos/ad_daos_io_str.c
new file mode 100644
index 0000000000000000000000000000000000000000..a763abf8abe0382a29fcbbe65432f3adb31b331a
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_io_str.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+#include "adio_extern.h"
+#include <assert.h>
+
+#include "../../mpi-io/mpioimpl.h"
+#ifdef MPIO_BUILD_PROFILING
+#include "../../mpi-io/mpioprof.h"
+#endif
+#include "mpiu_greq.h"
+
+enum {
+ DAOS_WRITE,
+ DAOS_READ
+};
+
+static void
+ADIOI_DAOS_StridedListIO(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status,
+ MPI_Request * request, int rw_type, int *error_code);
+
+static MPIX_Grequest_class ADIOI_DAOS_greq_class = 0;
+int ADIOI_DAOS_aio_free_fn(void *extra_state);
+int ADIOI_DAOS_aio_poll_fn(void *extra_state, MPI_Status * status);
+int ADIOI_DAOS_aio_wait_fn(int count, void **array_of_states, double timeout, MPI_Status * status);
+
+void ADIOI_DAOS_ReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ ADIOI_DAOS_StridedListIO(fd, buf, count, datatype, file_ptr_type,
+ offset, status, NULL, DAOS_READ, error_code);
+ return;
+}
+
+void ADIOI_DAOS_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ ADIOI_DAOS_StridedListIO(fd, (void *) buf, count, datatype, file_ptr_type,
+ offset, status, NULL, DAOS_WRITE, error_code);
+ return;
+}
+
+void ADIOI_DAOS_IreadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Request * request, int *error_code)
+{
+ ADIOI_DAOS_StridedListIO(fd, buf, count, datatype, file_ptr_type,
+ offset, NULL, request, DAOS_READ, error_code);
+ return;
+}
+
+void ADIOI_DAOS_IwriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code)
+{
+ ADIOI_DAOS_StridedListIO(fd, (void *) buf, count, datatype, file_ptr_type,
+ offset, NULL, request, DAOS_WRITE, error_code);
+ return;
+}
+
+
+static void
+ADIOI_DAOS_StridedListIO(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status,
+ MPI_Request * request, int rw_type, int *error_code)
+{
+ ADIOI_Flatlist_node *flat_buf, *flat_file;
+ int i, j, k, fwr_size = 0, st_index = 0;
+ int sum, n_etypes_in_filetype, size_in_filetype;
+ MPI_Count bufsize;
+ int n_filetypes, etype_in_filetype;
+ ADIO_Offset abs_off_in_filetype = 0;
+ MPI_Count filetype_size, etype_size, buftype_size;
+ MPI_Aint filetype_extent, buftype_extent;
+ int buftype_is_contig, filetype_is_contig;
+ ADIO_Offset off, disp, start_off;
+ int flag, st_fwr_size, st_n_filetypes;
+ int mem_list_count;
+ int64_t file_length;
+ int total_blks_to_write;
+ int f_data_wrote;
+ int n_write_lists;
+ struct ADIO_DAOS_cont *cont = fd->fs_ptr;
+ struct ADIO_DAOS_req *aio_req = NULL;
+ static char myname[] = "ADIOI_DAOS_StridedListIO";
+ int err_flag = 0, ret;
+ int mpi_rank;
+
+ MPI_Comm_rank(fd->comm, &mpi_rank);
+ *error_code = MPI_SUCCESS;
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ MPI_Type_size_x(fd->filetype, &filetype_size);
+
+ MPI_Type_extent(fd->filetype, &filetype_extent);
+ MPI_Type_size_x(datatype, &buftype_size);
+ MPI_Type_extent(datatype, &buftype_extent);
+ etype_size = fd->etype_size;
+
+ bufsize = buftype_size * count;
+
+
+ d_sg_list_t *sgl, loc_sgl;
+ d_iov_t *iovs;
+ dfs_iod_t *iod, loc_iod;
+ daos_range_t *rgs;
+ daos_size_t *nbytes, loc_nbytes;
+
+ if (request) {
+ aio_req = (struct ADIO_DAOS_req *) ADIOI_Calloc(sizeof(struct ADIO_DAOS_req), 1);
+ daos_event_init(&aio_req->daos_event, DAOS_HDL_INVAL, NULL);
+ iod = &aio_req->iod;
+ sgl = &aio_req->sgl;
+ nbytes = &aio_req->nbytes;
+
+ if (ADIOI_DAOS_greq_class == 0) {
+ MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn,
+ ADIOI_DAOS_aio_free_fn, MPIU_Greq_cancel_fn,
+ ADIOI_DAOS_aio_poll_fn, ADIOI_DAOS_aio_wait_fn,
+ &ADIOI_DAOS_greq_class);
+ }
+ MPIX_Grequest_class_allocate(ADIOI_DAOS_greq_class, aio_req, request);
+ memcpy(&(aio_req->req), request, sizeof(MPI_Request));
+ } else {
+ iod = &loc_iod;
+ sgl = &loc_sgl;
+ nbytes = &loc_nbytes;
+ }
+
+ if (filetype_size == 0) {
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status)
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ if (bufsize == 0) {
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status)
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ /* Create Memory SGL */
+ file_length = 0;
+ if (!buftype_is_contig) {
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+ mem_list_count = count * flat_buf->count;
+
+ iovs = (d_iov_t *) ADIOI_Malloc(mem_list_count * sizeof(d_iov_t));
+
+ k = 0;
+ for (j = 0; j < count; j++) {
+ for (i = 0; i < flat_buf->count; i++) {
+ ADIO_Offset tmp_off;
+
+ if (flat_buf->blocklens[i] == 0) {
+ continue;
+ }
+ if (file_length + flat_buf->blocklens[i] > bufsize)
+ break;
+
+ tmp_off = ((size_t) buf + j * buftype_extent + flat_buf->indices[i]);
+ file_length += flat_buf->blocklens[i];
+ d_iov_set(&iovs[k++], (char *) tmp_off, flat_buf->blocklens[i]);
+
+#ifdef D_PRINT_IO_MEM
+ printf("(MEM %d) %d: off %lld len %zu\n", mpi_rank, k,
+ tmp_off, flat_buf->blocklens[i]);
+#endif
+ }
+ }
+ } else {
+ k = 1;
+ iovs = (d_iov_t *) ADIOI_Malloc(sizeof(d_iov_t));
+ file_length = bufsize;
+ d_iov_set(iovs, (void *) buf, bufsize);
+#ifdef D_PRINT_IO_MEM
+        printf("(MEM SINGLE) buf %p len %lld\n", buf, (long long) bufsize);
+#endif
+ }
+ sgl->sg_nr = k;
+ sgl->sg_nr_out = 0;
+ sgl->sg_iovs = iovs;
+ if (request)
+ aio_req->iovs = iovs;
+
+ if (filetype_is_contig) {
+ n_write_lists = 1;
+
+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET)
+ off = fd->disp + etype_size * offset;
+ else
+ off = fd->fp_ind;
+
+ rgs = (daos_range_t *) ADIOI_Malloc(sizeof(daos_range_t));
+ rgs->rg_idx = off;
+ rgs->rg_len = bufsize;
+#ifdef D_PRINT_IO
+ printf("(%d) Single: idx %lld len %zu\n", mpi_rank, rgs->rg_idx, rgs->rg_len);
+#endif
+ } else {
+ flat_file = ADIOI_Flatten_and_find(fd->filetype);
+ disp = fd->disp;
+
+ /* for each case - ADIO_Individual pointer or explicit, find offset
+ * (file offset in bytes), n_filetypes (how many filetypes into file to
+ * start), fwr_size (remaining amount of data in present file block),
+ * and st_index (start point in terms of blocks in starting filetype) */
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ start_off = fd->fp_ind; /* in bytes */
+ n_filetypes = -1;
+ flag = 0;
+ while (!flag) {
+ n_filetypes++;
+ for (i = 0; i < flat_file->count; i++) {
+ if (disp + flat_file->indices[i] +
+ ((ADIO_Offset) n_filetypes) * filetype_extent +
+ flat_file->blocklens[i] >= start_off) {
+ st_index = i;
+ fwr_size = disp + flat_file->indices[i] +
+ ((ADIO_Offset) n_filetypes) * filetype_extent
+ + flat_file->blocklens[i] - start_off;
+ flag = 1;
+ break;
+ }
+ }
+ } /* while (!flag) */
+ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */
+ else {
+ n_etypes_in_filetype = filetype_size / etype_size;
+ n_filetypes = (int) (offset / n_etypes_in_filetype);
+ etype_in_filetype = (int) (offset % n_etypes_in_filetype);
+ size_in_filetype = etype_in_filetype * etype_size;
+
+ sum = 0;
+ for (i = 0; i < flat_file->count; i++) {
+ sum += flat_file->blocklens[i];
+ if (sum > size_in_filetype) {
+ st_index = i;
+ fwr_size = sum - size_in_filetype;
+ abs_off_in_filetype = flat_file->indices[i] +
+ size_in_filetype - (sum - flat_file->blocklens[i]);
+ break;
+ }
+ }
+ /* abs. offset in bytes in the file */
+ start_off = disp + ((ADIO_Offset) n_filetypes) * filetype_extent + abs_off_in_filetype;
+ } /* else [file_ptr_type != ADIO_INDIVIDUAL] */
+
+ st_fwr_size = fwr_size;
+ st_n_filetypes = n_filetypes;
+
+ i = 0;
+ j = st_index;
+ f_data_wrote = MPL_MIN(st_fwr_size, bufsize);
+ n_filetypes = st_n_filetypes;
+
+ /* determine how many blocks in file to write */
+ total_blks_to_write = 1;
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+
+ while (f_data_wrote < bufsize) {
+ f_data_wrote += flat_file->blocklens[j];
+ if (flat_file->blocklens[j])
+ total_blks_to_write++;
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ }
+
+ j = st_index;
+ n_filetypes = st_n_filetypes;
+ n_write_lists = total_blks_to_write;
+
+ rgs = (daos_range_t *) ADIOI_Malloc(sizeof(daos_range_t) * n_write_lists);
+
+#if 0
+ for (i = 0; i < flat_file->count; i++)
+ fprintf(stderr, "(%d) FF: %d: off %lld, len %zu\n", mpi_rank, i,
+ flat_file->indices[i], flat_file->blocklens[i]);
+ fprintf(stderr, "NUM IO lists = %d\n", n_write_lists);
+#endif
+
+ for (i = 0; i < n_write_lists; i++) {
+ if (!i) {
+ rgs[i].rg_idx = start_off;
+ rgs[i].rg_len = MPL_MIN(f_data_wrote, st_fwr_size);
+#ifdef D_PRINT_IO
+ printf("(%d) %d: idx %lld len %zu\n", mpi_rank, i, rgs[i].rg_idx, rgs[i].rg_len);
+#endif
+ } else {
+ if (flat_file->blocklens[j]) {
+ rgs[i].rg_idx = disp +
+ ((ADIO_Offset) n_filetypes) * filetype_extent + flat_file->indices[j];
+ rgs[i].rg_len = flat_file->blocklens[j];
+#ifdef D_PRINT_IO
+ printf("(%d) %d: idx %lld len %zu\n",
+ mpi_rank, i, rgs[i].rg_idx, rgs[i].rg_len);
+#endif
+ } else
+ i--;
+ }
+
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ }
+ }
+
+ /** set array location */
+ iod->iod_nr = n_write_lists;
+ iod->iod_rgs = rgs;
+ if (request)
+ aio_req->rgs = rgs;
+
+ if (rw_type == DAOS_WRITE) {
+ ret = dfs_writex(cont->dfs, cont->obj, iod, sgl, (request ? &aio_req->daos_event : NULL));
+ if (ret != 0) {
+ PRINT_MSG(stderr, "dfs_writex() failed with %d\n", ret);
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, ret);
+ return;
+ }
+ *nbytes = bufsize;
+ } else if (rw_type == DAOS_READ) {
+ ret = dfs_readx(cont->dfs, cont->obj, iod, sgl, nbytes,
+ (request ? &aio_req->daos_event : NULL));
+ if (ret != 0) {
+ PRINT_MSG(stderr, "dfs_readx() failed with %d\n", ret);
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, ret);
+ return;
+ }
+ }
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ if (filetype_is_contig)
+ fd->fp_ind += bufsize;
+ else
+ fd->fp_ind = rgs[n_write_lists - 1].rg_idx + rgs[n_write_lists - 1].rg_len;
+ }
+
+ if (!err_flag)
+ *error_code = MPI_SUCCESS;
+
+ fd->fp_sys_posn = -1; /* clear this. */
+
+#ifdef HAVE_STATUS_SET_BYTES
+ if (request == NULL && status) {
+ MPIR_Status_set_bytes(status, datatype, *nbytes);
+ }
+#endif
+
+ if (!request) {
+ ADIOI_Free(iovs);
+ ADIOI_Free(rgs);
+ }
+
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_open.c b/3rd-party/romio341/adio/ad_daos/ad_daos_open.c
new file mode 100644
index 0000000000000000000000000000000000000000..17dcb881b305846abbcf0c22c22533849cef6de0
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_open.c
@@ -0,0 +1,574 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+#include <libgen.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+static int parse_filename(const char *path, char **_obj_name, char **_cont_name)
+{
+ char *f1;
+ char *f2;
+ char *fname;
+ char *cont_name;
+ int rc = 0;
+
+ f1 = ADIOI_Strdup(path);
+ if (f1 == NULL)
+ return ENOMEM;
+
+ f2 = ADIOI_Strdup(path);
+ if (f2 == NULL) {
+ ADIOI_Free(f1);
+ return ENOMEM;
+ }
+
+ fname = basename(f1);
+ cont_name = dirname(f2);
+
+ if (cont_name[0] == '.') {
+ char *ptr;
+ char cwd[PATH_MAX];
+
+ ptr = getcwd(cwd, PATH_MAX);
+ if (ptr == NULL) {
+ rc = errno;
+ goto out;
+ }
+
+        if (strcmp(cont_name, ".") == 0) {
+            cont_name = ADIOI_Strdup(cwd);
+        } else {
+            /* cont_name points into f2, which is freed below; keep a copy */
+            cont_name = ADIOI_Strdup(cont_name);
+        }
+        if (cont_name == NULL) {
+            rc = ENOMEM;
+            goto out;
+        }
+        *_cont_name = cont_name;
+ } else {
+ *_cont_name = ADIOI_Strdup(cont_name);
+ if (*_cont_name == NULL) {
+ rc = ENOMEM;
+ goto out;
+ }
+ }
+
+ *_obj_name = ADIOI_Strdup(fname);
+ if (*_obj_name == NULL) {
+ rc = ENOMEM;
+ goto out;
+ }
+
+ out:
+ ADIOI_Free(f1);
+ ADIOI_Free(f2);
+ return rc;
+}
+
+
+static int cache_handles(struct ADIO_DAOS_cont *cont)
+{
+ int rc;
+
+ cont->c = adio_daos_coh_lookup(cont->attr.da_cuuid);
+ if (cont->c == NULL) {
+ /** insert handle into container hashtable */
+ rc = adio_daos_coh_insert(cont->attr.da_cuuid, cont->coh, &cont->c);
+ } else {
+ /** g2l handle not needed, already cached */
+ rc = daos_cont_close(cont->coh, NULL);
+ cont->coh = cont->c->open_hdl;
+ }
+ if (rc)
+ return rc;
+
+ cont->p = adio_daos_poh_lookup(cont->attr.da_puuid);
+ if (cont->p == NULL) {
+ /** insert handle into pool hashtable */
+ rc = adio_daos_poh_insert(cont->attr.da_puuid, cont->poh, &cont->p);
+ } else {
+ /** g2l handle not needed, already cached */
+ rc = daos_pool_disconnect(cont->poh, NULL);
+ cont->poh = cont->p->open_hdl;
+ }
+
+ return rc;
+}
+
+static int share_cont_info(struct ADIO_DAOS_cont *cont, int rank, MPI_Comm comm)
+{
+ char uuid_buf[74];
+ d_iov_t pool_hdl = { NULL, 0, 0 };
+ d_iov_t cont_hdl = { NULL, 0, 0 };
+ d_iov_t dfs_hdl = { NULL, 0, 0 };
+ d_iov_t file_hdl = { NULL, 0, 0 };
+ char *buf = NULL;
+ uint64_t total_size = 0;
+ int rc = 0;
+
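+    /* Buffer layout used below (sketch): two unparsed uuid strings (37 bytes
+     * each, including the terminating NUL), followed by four (size, blob)
+     * pairs for the pool, container, DFS mount, and file handles, in that
+     * order. Rank 0 packs the buffer; all other ranks unpack it after the
+     * broadcast. */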
+ if (rank == 0) {
+ rc = daos_pool_local2global(cont->poh, &pool_hdl);
+ if (rc)
+ return rc;
+ rc = daos_cont_local2global(cont->coh, &cont_hdl);
+ if (rc)
+ return rc;
+ rc = dfs_local2global(cont->dfs, &dfs_hdl);
+ if (rc)
+ return rc;
+ rc = dfs_obj_local2global(cont->dfs, cont->obj, &file_hdl);
+ if (rc)
+ return rc;
+
+ total_size = sizeof(uuid_buf) + pool_hdl.iov_buf_len + cont_hdl.iov_buf_len +
+ dfs_hdl.iov_buf_len + file_hdl.iov_buf_len + sizeof(daos_size_t) * 4;
+ }
+
+ /** broadcast size to all peers */
+ rc = MPI_Bcast(&total_size, 1, MPI_UINT64_T, 0, comm);
+ if (rc != MPI_SUCCESS)
+ return -1;
+
+ /** allocate buffers */
+ buf = ADIOI_Malloc(total_size);
+ if (buf == NULL)
+ return -1;
+
+ if (rank == 0) {
+ char *ptr = buf;
+
+ uuid_unparse(cont->attr.da_puuid, ptr);
+ ptr += 37;
+ uuid_unparse(cont->attr.da_cuuid, ptr);
+ ptr += 37;
+
+ *((daos_size_t *) ptr) = pool_hdl.iov_buf_len;
+ ptr += sizeof(daos_size_t);
+ pool_hdl.iov_buf = ptr;
+ pool_hdl.iov_len = pool_hdl.iov_buf_len;
+ rc = daos_pool_local2global(cont->poh, &pool_hdl);
+ if (rc)
+ goto out;
+ ptr += pool_hdl.iov_buf_len;
+
+ *((daos_size_t *) ptr) = cont_hdl.iov_buf_len;
+ ptr += sizeof(daos_size_t);
+ cont_hdl.iov_buf = ptr;
+ cont_hdl.iov_len = cont_hdl.iov_buf_len;
+ rc = daos_cont_local2global(cont->coh, &cont_hdl);
+ if (rc)
+ goto out;
+ ptr += cont_hdl.iov_buf_len;
+
+ *((daos_size_t *) ptr) = dfs_hdl.iov_buf_len;
+ ptr += sizeof(daos_size_t);
+ dfs_hdl.iov_buf = ptr;
+ dfs_hdl.iov_len = dfs_hdl.iov_buf_len;
+ rc = dfs_local2global(cont->dfs, &dfs_hdl);
+ if (rc)
+ goto out;
+ ptr += dfs_hdl.iov_buf_len;
+
+ *((daos_size_t *) ptr) = file_hdl.iov_buf_len;
+ ptr += sizeof(daos_size_t);
+ file_hdl.iov_buf = ptr;
+ file_hdl.iov_len = file_hdl.iov_buf_len;
+ rc = dfs_obj_local2global(cont->dfs, cont->obj, &file_hdl);
+ if (rc)
+ goto out;
+ }
+
+ rc = MPI_Bcast(buf, total_size, MPI_BYTE, 0, comm);
+ if (rc != MPI_SUCCESS)
+ goto out;
+
+ if (rank != 0) {
+ char *ptr = buf;
+
+ rc = uuid_parse(ptr, cont->attr.da_puuid);
+ if (rc)
+ goto out;
+ ptr += 37;
+
+ rc = uuid_parse(ptr, cont->attr.da_cuuid);
+ if (rc)
+ goto out;
+ ptr += 37;
+
+ pool_hdl.iov_buf_len = *((daos_size_t *) ptr);
+ ptr += sizeof(daos_size_t);
+ pool_hdl.iov_buf = ptr;
+ pool_hdl.iov_len = pool_hdl.iov_buf_len;
+ rc = daos_pool_global2local(pool_hdl, &cont->poh);
+ if (rc)
+ goto out;
+ ptr += pool_hdl.iov_buf_len;
+
+ cont_hdl.iov_buf_len = *((daos_size_t *) ptr);
+ ptr += sizeof(daos_size_t);
+ cont_hdl.iov_buf = ptr;
+ cont_hdl.iov_len = cont_hdl.iov_buf_len;
+ rc = daos_cont_global2local(cont->poh, cont_hdl, &cont->coh);
+ if (rc)
+ goto out;
+ ptr += cont_hdl.iov_buf_len;
+
+ rc = cache_handles(cont);
+ if (rc)
+ goto out;
+
+ dfs_hdl.iov_buf_len = *((daos_size_t *) ptr);
+ ptr += sizeof(daos_size_t);
+ dfs_hdl.iov_buf = ptr;
+ dfs_hdl.iov_len = dfs_hdl.iov_buf_len;
+ rc = dfs_global2local(cont->poh, cont->coh, O_RDWR, dfs_hdl, &cont->dfs);
+ if (rc)
+ goto out;
+ ptr += dfs_hdl.iov_buf_len;
+
+ if (rank != 0) {
+ if (cont->c->dfs == NULL) {
+ cont->c->dfs = cont->dfs;
+ } else {
+ dfs_umount(cont->dfs);
+ cont->dfs = cont->c->dfs;
+ }
+ }
+
+ file_hdl.iov_buf_len = *((daos_size_t *) ptr);
+ ptr += sizeof(daos_size_t);
+ file_hdl.iov_buf = ptr;
+ file_hdl.iov_len = file_hdl.iov_buf_len;
+ rc = dfs_obj_global2local(cont->dfs, 0, file_hdl, &cont->obj);
+ if (rc)
+ goto out;
+ }
+
+ out:
+ ADIOI_Free(buf);
+ return rc;
+}
+
+static int get_pool_cont_uuids(const char *path, struct duns_attr_t *attr)
+{
+ bool bypass_duns = false;
+ char *uuid_str;
+ int rc;
+
+ d_getenv_bool("DAOS_BYPASS_DUNS", &bypass_duns);
+
+ if (!bypass_duns) {
+ attr->da_no_prefix = true;
+ rc = duns_resolve_path(path, attr);
+ if (rc) {
+ PRINT_MSG(stderr, "duns_resolve_path() failed on path %s (%d)\n", path, rc);
+ return rc;
+ }
+ return 0;
+ }
+
+ /* use the env variables to retrieve the pool and container */
+ uuid_str = getenv("DAOS_POOL");
+ if (uuid_str == NULL) {
+ PRINT_MSG(stderr, "Can't retrieve DAOS pool uuid\n");
+ return EINVAL;
+ }
+ if (uuid_parse(uuid_str, attr->da_puuid) < 0) {
+ PRINT_MSG(stderr, "Failed to parse pool uuid\n");
+ return EINVAL;
+ }
+
+ uuid_str = getenv("DAOS_CONT");
+ if (uuid_str == NULL) {
+ PRINT_MSG(stderr, "Can't retrieve DAOS cont uuid\n");
+ return EINVAL;
+ }
+ if (uuid_parse(uuid_str, attr->da_cuuid) < 0) {
+ PRINT_MSG(stderr, "Failed to parse container uuid\n");
+ return EINVAL;
+ }
+
+ attr->da_oclass_id = OC_UNKNOWN;
+ attr->da_chunk_size = 0;
+
+ return 0;
+}
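+
+/* Illustrative environment setup for the bypass path above (assumed usage,
+ * not part of the source):
+ *
+ *   export DAOS_BYPASS_DUNS=1
+ *   export DAOS_POOL=<pool uuid>
+ *   export DAOS_CONT=<container uuid>
+ */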
+
+void ADIOI_DAOS_Open(ADIO_File fd, int *error_code)
+{
+ struct ADIO_DAOS_cont *cont = fd->fs_ptr;
+ static char myname[] = "ADIOI_DAOS_OPEN";
+ dfs_obj_t *parent = NULL;
+ int perm, old_mask, amode;
+ int rc;
+
+ *error_code = MPI_SUCCESS;
+
+ rc = parse_filename(fd->filename, &cont->obj_name, &cont->cont_name);
+ if (rc) {
+ *error_code = ADIOI_DAOS_err(myname, cont->cont_name, __LINE__, rc);
+ return;
+ }
+
+ rc = get_pool_cont_uuids(cont->cont_name, &cont->attr);
+ if (rc) {
+ *error_code = ADIOI_DAOS_err(myname, cont->cont_name, __LINE__, rc);
+ return;
+ }
+
+ /** Info object setting should override */
+ if (fd->hints->fs_hints.daos.obj_class != OC_UNKNOWN)
+ cont->attr.da_oclass_id = fd->hints->fs_hints.daos.obj_class;
+ if (fd->hints->fs_hints.daos.chunk_size != 0)
+ cont->attr.da_chunk_size = fd->hints->fs_hints.daos.chunk_size;
+
+#if 0
+ {
+ char uuid_str[37];
+ uuid_unparse(cont->attr.da_cuuid, uuid_str);
+
+ fprintf(stderr, "Container Open %s %s\n", cont->cont_name, uuid_str);
+ fprintf(stderr, "File %s\n", cont->obj_name);
+ }
+ fprintf(stderr, "chunk_size = %d\n", cont->attr.da_chunk_size);
+ fprintf(stderr, "OCLASS = %d\n", cont->attr.da_oclass_id);
+#endif
+
+ rc = adio_daos_poh_lookup_connect(cont->attr.da_puuid, &cont->p);
+ if (rc) {
+ PRINT_MSG(stderr, "Failed to connect to DAOS Pool (%d)\n", rc);
+ *error_code = ADIOI_DAOS_err(myname, cont->cont_name, __LINE__, rc);
+ return;
+ }
+
+ cont->poh = cont->p->open_hdl;
+
+ rc = adio_daos_coh_lookup_create(cont->poh, cont->attr.da_cuuid, O_RDWR,
+ (fd->access_mode & ADIO_CREATE), &cont->c);
+ if (rc) {
+ *error_code = ADIOI_DAOS_err(myname, cont->cont_name, __LINE__, rc);
+ goto err_pool;
+ }
+
+ cont->coh = cont->c->open_hdl;
+
+ assert(cont->c->dfs);
+ cont->dfs = cont->c->dfs;
+
+ /* Set file access flags */
+ amode = 0;
+ if (fd->access_mode & ADIO_CREATE)
+ amode = amode | O_CREAT;
+ if (fd->access_mode & ADIO_RDONLY)
+ amode = amode | O_RDONLY;
+ if (fd->access_mode & ADIO_WRONLY)
+ amode = amode | O_WRONLY;
+ if (fd->access_mode & ADIO_RDWR)
+ amode = amode | O_RDWR;
+ if (fd->access_mode & ADIO_EXCL)
+ amode = amode | O_EXCL;
+
+ /* Set DFS permission mode + object type */
+ if (fd->perm == ADIO_PERM_NULL) {
+ old_mask = umask(022);
+ umask(old_mask);
+ perm = old_mask ^ 0666;
+ } else {
+ perm = fd->perm;
+ }
+ perm = S_IFREG | perm;
+
+ /* Lookup the parent directory. this will be NULL in case of root */
+ if (cont->attr.da_rel_path) {
+ rc = dfs_lookup(cont->dfs, cont->attr.da_rel_path, amode, &parent, NULL, NULL);
+ if (rc) {
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, rc);
+ goto err_cont;
+ }
+ }
+
+ rc = dfs_open(cont->dfs, parent, cont->obj_name, perm, amode,
+ cont->attr.da_oclass_id, cont->attr.da_chunk_size, NULL, &cont->obj);
+
+ if (parent)
+ dfs_release(parent);
+
+ if (rc) {
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, rc);
+ goto err_cont;
+ }
+
+ out:
+ return;
+ err_obj:
+ dfs_release(cont->obj);
+ if (fd->access_mode & ADIO_CREATE)
+ dfs_remove(cont->dfs, NULL, cont->obj_name, true, NULL);
+ err_cont:
+ adio_daos_coh_release(cont->c);
+ cont->c = NULL;
+ err_pool:
+ adio_daos_poh_release(cont->p);
+ cont->p = NULL;
+ err_free:
+ ADIOI_Free(cont->obj_name);
+ ADIOI_Free(cont->cont_name);
+ goto out;
+}
+
+void ADIOI_DAOS_OpenColl(ADIO_File fd, int rank, int access_mode, int *error_code)
+{
+ struct ADIO_DAOS_cont *cont;
+ int amode, orig_amode_wronly;
+ MPI_Comm comm = fd->comm;
+ int mpi_size;
+ int rc;
+ static char myname[] = "ADIOI_DAOS_OPENCOLL";
+
+ ADIOI_DAOS_Init(error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+
+ MPI_Comm_size(comm, &mpi_size);
+
+ orig_amode_wronly = access_mode;
+ if (access_mode & ADIO_WRONLY) {
+ access_mode = access_mode ^ ADIO_WRONLY;
+ access_mode = access_mode | ADIO_RDWR;
+ }
+ fd->access_mode = access_mode;
+
+ amode = 0;
+ if (access_mode & ADIO_RDONLY)
+ amode = DAOS_COO_RO;
+ else
+ amode = DAOS_COO_RW;
+
+ cont = (struct ADIO_DAOS_cont *) ADIOI_Calloc(1, sizeof(struct ADIO_DAOS_cont));
+ if (cont == NULL) {
+ *error_code = MPI_ERR_NO_MEM;
+ return;
+ }
+
+ fd->access_mode = access_mode;
+ cont->amode = amode;
+ fd->fs_ptr = cont;
+
+ if (rank == 0) {
+ (*(fd->fns->ADIOI_xxx_Open)) (fd, error_code);
+ MPI_Error_class(*error_code, &rc);
+ }
+
+ if (mpi_size > 1) {
+ MPI_Bcast(&rc, 1, MPI_INT, 0, comm);
+
+ if (rank != 0) {
+ if (rc)
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, rc, "File Open error", 0);
+ else
+ *error_code = MPI_SUCCESS;
+ }
+ }
+ if (*error_code != MPI_SUCCESS)
+ goto err_free;
+
+ if (mpi_size > 1) {
+ rc = share_cont_info(cont, rank, comm);
+ if (rc) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, rc, "File Open error", 0);
+ goto err_free;
+ }
+ }
+
+ fd->is_open = 1;
+ fd->access_mode = orig_amode_wronly;
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
+#endif
+
+ return;
+
+ err_free:
+ ADIOI_Free(cont);
+ return;
+}
+
+void ADIOI_DAOS_Flush(ADIO_File fd, int *error_code)
+{
+ MPI_Barrier(fd->comm);
+ *error_code = MPI_SUCCESS;
+}
+
+void ADIOI_DAOS_Delete(const char *filename, int *error_code)
+{
+ struct adio_daos_hdl *p, *c;
+ dfs_t *dfs;
+ char *obj_name, *cont_name;
+ struct duns_attr_t attr = { };
+ static char myname[] = "ADIOI_DAOS_DELETE";
+ int rc;
+
+ ADIOI_DAOS_Init(error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+
+ rc = parse_filename(filename, &obj_name, &cont_name);
+ if (rc) {
+ *error_code = MPI_ERR_NO_MEM;
+ return;
+ }
+
+ rc = get_pool_cont_uuids(cont_name, &attr);
+ if (rc) {
+ *error_code = ADIOI_DAOS_err(myname, cont_name, __LINE__, rc);
+ return;
+ }
+
+ rc = adio_daos_poh_lookup_connect(attr.da_puuid, &p);
+ if (rc || p == NULL) {
+ PRINT_MSG(stderr, "Failed to connect to pool\n");
+ *error_code = ADIOI_DAOS_err(myname, cont_name, __LINE__, rc);
+ goto out_free;
+ }
+
+ rc = adio_daos_coh_lookup_create(p->open_hdl, attr.da_cuuid, O_RDWR, false, &c);
+ if (rc || c == NULL) {
+ *error_code = ADIOI_DAOS_err(myname, cont_name, __LINE__, rc);
+ goto out_pool;
+ }
+
+ if (c->dfs == NULL) {
+ /* Mount a flat namespace on the container */
+ rc = dfs_mount(p->open_hdl, c->open_hdl, O_RDWR, &dfs);
+ if (rc) {
+ PRINT_MSG(stderr, "Failed to mount flat namespace (%d)\n", rc);
+ *error_code = ADIOI_DAOS_err(myname, obj_name, __LINE__, rc);
+ goto out_cont;
+ }
+ c->dfs = dfs;
+ }
+
+ /* Remove the file from the flat namespace */
+ rc = dfs_remove(c->dfs, NULL, obj_name, true, NULL);
+ if (rc) {
+ *error_code = ADIOI_DAOS_err(myname, obj_name, __LINE__, rc);
+ goto out_cont;
+ }
+
+ *error_code = MPI_SUCCESS;
+
+ out_cont:
+ adio_daos_coh_release(c);
+ out_pool:
+ adio_daos_poh_release(p);
+ out_free:
+ ADIOI_Free(obj_name);
+ ADIOI_Free(cont_name);
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_daos/ad_daos_resize.c b/3rd-party/romio341/adio/ad_daos/ad_daos_resize.c
new file mode 100644
index 0000000000000000000000000000000000000000..f95d5f71168492ac7bd8f5f8eb696af07a9f2013
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_daos/ad_daos_resize.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_daos.h"
+
+void ADIOI_DAOS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
+{
+ int ret, rank;
+ struct ADIO_DAOS_cont *cont = fd->fs_ptr;
+ static char myname[] = "ADIOI_DAOS_RESIZE";
+
+ *error_code = MPI_SUCCESS;
+ MPI_Comm_rank(fd->comm, &rank);
+ MPI_Barrier(fd->comm);
+
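+    /* Only the first rank in the aggregator list issues the punch; its
+     * return code is then broadcast so every rank reports the same
+     * outcome. */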
+ if (rank == fd->hints->ranklist[0])
+ ret = dfs_punch(cont->dfs, cont->obj, size, DFS_MAX_FSIZE);
+
+ MPI_Bcast(&ret, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
+ if (ret != 0)
+ *error_code = ADIOI_DAOS_err(myname, cont->obj_name, __LINE__, ret);
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/.gitignore b/3rd-party/romio341/adio/ad_gpfs/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..509a693e9276eedddbcb63f1ab295064e7557649
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/.gitignore
@@ -0,0 +1,11 @@
+/Makefile
+/.deps
+/*.bb
+/*.bbg
+/*.gcda
+/*.gcno
+/.libs
+/.libstamp*
+/*.lo
+/.*-cache
+/.state-cache
diff --git a/3rd-party/romio341/adio/ad_gpfs/Makefile.mk b/3rd-party/romio341/adio/ad_gpfs/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..9f71ca2b1f3df0b9063ea46d9d1c44d9647f588f
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/Makefile.mk
@@ -0,0 +1,24 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_GPFS
+
+noinst_HEADERS += \
+ adio/ad_gpfs/ad_gpfs_aggrs.h \
+ adio/ad_gpfs/ad_gpfs.h \
+ adio/ad_gpfs/ad_gpfs_tuning.h
+
+romio_other_sources += \
+ adio/ad_gpfs/ad_gpfs_aggrs.c \
+ adio/ad_gpfs/ad_gpfs_close.c \
+ adio/ad_gpfs/ad_gpfs_flush.c \
+ adio/ad_gpfs/ad_gpfs_tuning.c \
+ adio/ad_gpfs/ad_gpfs.c \
+ adio/ad_gpfs/ad_gpfs_open.c \
+ adio/ad_gpfs/ad_gpfs_hints.c \
+ adio/ad_gpfs/ad_gpfs_rdcoll.c \
+ adio/ad_gpfs/ad_gpfs_wrcoll.c
+
+endif BUILD_AD_GPFS
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..203fbdedeff859ed66d99d387e40e2c384443a01
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs.c
+ * \brief ???
+ */
+
+#include "ad_gpfs.h"
+
+/* adioi.h has the ADIOI_Fns_struct define */
+#include "adioi.h"
+
+struct ADIOI_Fns_struct ADIO_GPFS_operations = {
+ ADIOI_GPFS_Open, /* Open */
+ ADIOI_GEN_OpenColl, /* Collective open */
+ ADIOI_GEN_ReadContig, /* ReadContig */
+ ADIOI_GEN_WriteContig, /* WriteContig */
+ ADIOI_GPFS_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_GPFS_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+ ADIOI_GEN_Fcntl, /* Fcntl */
+#if defined(BGQPLATFORM) || defined(PEPLATFORM)
+ ADIOI_GPFS_SetInfo, /* SetInfo for BlueGene or PE */
+#else
+ ADIOI_GEN_SetInfo, /* SetInfo for any platform besides BlueGene or PE */
+#endif
+ ADIOI_GEN_ReadStrided, /* ReadStrided */
+ ADIOI_GEN_WriteStrided, /* WriteStrided */
+ ADIOI_GPFS_Close, /* Close */
+#ifdef ROMIO_HAVE_WORKING_AIO
+#warning Consider BG support for NFS before enabling this.
+ ADIOI_GEN_IreadContig, /* IreadContig */
+ ADIOI_GEN_IwriteContig, /* IwriteContig */
+#else
+ ADIOI_FAKE_IreadContig, /* IreadContig */
+ ADIOI_FAKE_IwriteContig, /* IwriteContig */
+#endif
+ ADIOI_GEN_IODone, /* ReadDone */
+ ADIOI_GEN_IODone, /* WriteDone */
+ ADIOI_GEN_IOComplete, /* ReadComplete */
+ ADIOI_GEN_IOComplete, /* WriteComplete */
+ ADIOI_GEN_IreadStrided, /* IreadStrided */
+ ADIOI_GEN_IwriteStrided, /* IwriteStrided */
+ ADIOI_GPFS_Flush, /* Flush */
+ ADIOI_GEN_Resize, /* Resize */
+ ADIOI_GEN_Delete, /* Delete */
+ ADIOI_GEN_Feature, /* Features */
+#ifdef BGQPLATFORM
+ "GPFS+BGQ: IBM GPFS for Blue Gene",
+#elif PEPLATFORM
+ "GPFS+PE: IBM GPFS for PE",
+#else
+ "GPFS: IBM GPFS",
+#endif
+ ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+ ADIOI_GEN_IwriteStridedColl, /* IwriteStridedColl */
+#if defined(F_SETLKW64)
+ ADIOI_GEN_SetLock /* SetLock */
+#else
+ ADIOI_GEN_SetLock64 /* SetLock */
+#endif
+};
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs.h b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..2523b82746aaf17c8c744b5ecfac34c3cffd99e8
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs.h
+ * \brief ???
+ */
+
+#ifndef AD_GPFS_H_INCLUDED
+#define AD_GPFS_H_INCLUDED
+
+#include "adio.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+#ifdef HAVE_AIO_LITE_H
+#include <aio-lite.h>
+#elif defined HAVE_AIO_H
+#include <aio.h>
+#endif
+
+
+void ADIOI_GPFS_Open(ADIO_File fd, int *error_code);
+
+void ADIOI_GPFS_Close(ADIO_File fd, int *error_code);
+
+void ADIOI_GPFS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_GPFS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+
+void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+
+void ADIOI_GPFS_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_GPFS_ReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+
+void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+
+void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+
+void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code);
+
+#include "ad_tuning.h"
+#include "ad_gpfs_tuning.h"
+
+
+#endif /* AD_GPFS_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_aggrs.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_aggrs.c
new file mode 100644
index 0000000000000000000000000000000000000000..c261e22d6b0463c97e644fd35177d769fb4feea6
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_aggrs.c
@@ -0,0 +1,771 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_aggrs.c
+ * \brief The externally used functions from this file are declared in ad_gpfs_aggrs.h
+ */
+
+#include "adio.h"
+#include "adio_cb_config_list.h"
+#include "ad_gpfs.h"
+#include "ad_gpfs_aggrs.h"
+
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+
+
+#ifdef MPL_USE_DBG_LOGGING
+#define AGG_DEBUG 1
+#endif
+
+#ifndef TRACE_ERR
+#define TRACE_ERR(format...)
+#endif
+
+/* Comments copied from common:
+ * This file contains four functions:
+ *
+ * ADIOI_Calc_aggregator()
+ * ADIOI_Calc_file_domains()
+ * ADIOI_Calc_my_req()
+ * ADIOI_Calc_others_req()
+ *
+ * The last three of these were originally in ad_read_coll.c, but they are
+ * also shared with ad_write_coll.c. I felt that they were better kept with
+ * the rest of the shared aggregation code.
+ */
+
+/* Discussion of values available from above:
+ *
+ * ADIO_Offset st_offsets[0..nprocs-1]
+ * ADIO_Offset end_offsets[0..nprocs-1]
+ * These contain a list of start and end offsets for each process in
+ * the communicator. For example, an access at loc 10, size 10 would
+ * have a start offset of 10 and end offset of 19.
+ * int nprocs
+ * number of processors in the collective I/O communicator
+ * ADIO_Offset min_st_offset
+ * ADIO_Offset fd_start[0..nprocs_for_coll-1]
+ * starting location of "file domain"; region that a given process will
+ * perform aggregation for (i.e. actually do I/O)
+ * ADIO_Offset fd_end[0..nprocs_for_coll-1]
+ * start + size - 1 roughly, but it can be less, or 0, in the case of
+ * uneven distributions
+ */
+
+/* Description from common/ad_aggregate.c. (Does it completely apply to bg?)
+ * ADIOI_Calc_aggregator()
+ *
+ * The intention here is to implement a function which provides basically
+ * the same functionality as in Rajeev's original version of
+ * ADIOI_Calc_my_req(). He used a ceiling division approach to assign the
+ * file domains, and we use the same approach here when calculating the
+ * location of an offset/len in a specific file domain. Further we assume
+ * this same distribution when calculating the rank_index, which is later
+ * used to map to a specific process rank in charge of the file domain.
+ *
+ * A better (i.e. more general) approach would be to use the list of file
+ * domains only. This would be slower in the case where the
+ * original ceiling division was used, but it would allow for arbitrary
+ * distributions of regions to aggregators. We'd need to know the
+ * nprocs_for_coll in that case though, which we don't have now.
+ *
+ * Note a significant difference between this function and Rajeev's old code:
+ * this code doesn't necessarily return a rank in the range
+ * 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a
+ * result of the rank mapping; any set of ranks in the communicator could be
+ * used now.
+ *
+ * Returns an integer representing a rank in the collective I/O communicator.
+ *
+ * The "len" parameter is also modified to indicate the amount of data
+ * actually available in this file domain.
+ */
+/*
+ * This is a more general aggregator search function that does not rely on the
+ * assumption that each aggregator hosts a file domain of the same size.
+ */
+int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
+ ADIO_Offset off,
+ ADIO_Offset min_off,
+ ADIO_Offset * len,
+ ADIO_Offset fd_size, ADIO_Offset * fd_start, ADIO_Offset * fd_end)
+{
+ int rank_index, rank;
+ ADIO_Offset avail_bytes;
+ TRACE_ERR("Entering ADIOI_GPFS_Calc_aggregator\n");
+
+ ADIOI_Assert((off <= fd_end[fd->hints->cb_nodes - 1] && off >= min_off &&
+ fd_start[0] >= min_off));
+
+ /* binary search --> rank_index is returned */
+ int ub = fd->hints->cb_nodes;
+ int lb = 0;
+ /* get an index into our array of aggregators */
+ /* Common code for striping - bg doesn't use it but it's
+ * here to make diff'ing easier.
+ * rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);
+ *
+ * if (fd->hints->striping_unit > 0) {
+ * * wkliao: implementation for file domain alignment
+ * fd_start[] and fd_end[] have been aligned with file lock
+ * boundaries when returned from ADIOI_Calc_file_domains() so cannot
+     * just use simple arithmetic as above *
+ * rank_index = 0;
+ * while (off > fd_end[rank_index]) rank_index++;
+ * }
+     * bg does its own striping below
+ */
+ rank_index = fd->hints->cb_nodes / 2;
+ while (off < fd_start[rank_index] || off > fd_end[rank_index]) {
+ if (off > fd_end[rank_index]) {
+ lb = rank_index;
+ rank_index = (rank_index + ub) / 2;
+ } else if (off < fd_start[rank_index]) {
+ ub = rank_index;
+ rank_index = (rank_index + lb) / 2;
+ }
+ }
+ /* we index into fd_end with rank_index, and fd_end was allocated to be no
+     * bigger than fd->hints->cb_nodes.  If we ever violate that, we're
+ * overrunning arrays. Obviously, we should never ever hit this abort */
+ if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
+ FPRINTF(stderr,
+ "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
+ rank_index, fd->hints->cb_nodes, (long long) fd_size, (long long) off);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ /* DBG_FPRINTF ("ADIOI_GPFS_Calc_aggregator: rank_index = %d\n",
+ * rank_index); */
+
+ /*
+ * remember here that even in Rajeev's original code it was the case that
+ * different aggregators could end up with different amounts of data to
+ * aggregate. here we use fd_end[] to make sure that we know how much
+ * data this aggregator is working with.
+ *
+ * the +1 is to take into account the end vs. length issue.
+ */
+ avail_bytes = fd_end[rank_index] + 1 - off;
+ if (avail_bytes < *len && avail_bytes > 0) {
+ /* this file domain only has part of the requested contig. region */
+
+ *len = avail_bytes;
+ }
+
+ /* map our index to a rank */
+ /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
+ rank = fd->hints->ranklist[rank_index];
+ TRACE_ERR("Leaving ADIOI_GPFS_Calc_aggregator\n");
+
+ return rank;
+}
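+
+/* Worked example (hypothetical values): with cb_nodes = 4,
+ * fd_start[] = {0, 100, 200, 300} and fd_end[] = {99, 199, 299, 399},
+ * a request at off = 250 with *len = 80 starts the binary search at
+ * rank_index = 4/2 = 2; 250 already lies within [200, 299], so the loop
+ * exits at once.  avail_bytes = 299 + 1 - 250 = 50 < 80, so *len is
+ * trimmed to 50 and the caller re-invokes the function at off = 300 for
+ * the remaining 30 bytes, which fall to rank_index 3. */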
+
+/*
+ * Compute a dynamic access range based file domain partition among I/O aggregators,
+ * which align to the GPFS block size
+ * Divide the I/O workload among "nprocs_for_coll" processes. This is
+ * done by (logically) dividing the file into file domains (FDs); each
+ * process may directly access only its own file domain.
+ * Additional effort is to make sure that each I/O aggregator get
+ * a file domain that aligns to the GPFS block size. So, there will
+ * not be any false sharing of GPFS file blocks among multiple I/O nodes.
+ *
+ * The common version of this now accepts a min_fd_size and striping_unit.
+ * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
+ * (e.g. we could pass striping unit instead of using fs_ptr->blksize).
+ */
+void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
+ ADIO_Offset * st_offsets,
+ ADIO_Offset * end_offsets,
+ int nprocs,
+ int nprocs_for_coll,
+ ADIO_Offset * min_st_offset_ptr,
+ ADIO_Offset ** fd_start_ptr,
+ ADIO_Offset ** fd_end_ptr,
+ ADIO_Offset * fd_size_ptr, void *fs_ptr)
+{
+ ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
+ int i, aggr;
+ TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");
+ blksize_t blksize;
+
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5004, 0, NULL);
+#endif
+
+#if AGG_DEBUG
+ static char myname[] = "ADIOI_GPFS_Calc_file_domains";
+ DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n", myname, __LINE__, nprocs_for_coll);
+#endif
+ if (fd->blksize <= 0)
+ /* default to 1M if blksize unset */
+ fd->blksize = 1048576;
+ blksize = fd->blksize;
+
+#if AGG_DEBUG
+ DBG_FPRINTF(stderr, "%s(%d): Blocksize=%ld\n", myname, __LINE__, blksize);
+#endif
+/* find min of start offsets and max of end offsets of all processes */
+ min_st_offset = st_offsets[0];
+ max_end_offset = end_offsets[0];
+ for (i = 1; i < nprocs; i++) {
+ min_st_offset = MPL_MIN(min_st_offset, st_offsets[i]);
+ max_end_offset = MPL_MAX(max_end_offset, end_offsets[i]);
+ }
+
+ /* DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_
+ * = %qd, %qd\n", min_st_offset, max_end_offset); */
+
+ /* determine the "file domain (FD)" of each process, i.e., the portion of
+ * the file that will be "owned" by each process */
+
+ ADIO_Offset gpfs_ub = (max_end_offset + blksize - 1) / blksize * blksize - 1;
+ ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
+ ADIO_Offset gpfs_ub_rdoff =
+ (max_end_offset + blksize - 1) / blksize * blksize - 1 - max_end_offset;
+ ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
+ ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
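+
+    /* Worked example (hypothetical numbers): with blksize = 1048576 (1 MiB),
+     * min_st_offset = 100 and max_end_offset = 3146239:
+     *   gpfs_lb       = 100 / 1048576 * 1048576 = 0
+     *   gpfs_lb_rdoff = 100 - 0 = 100
+     *   gpfs_ub       = (3146239 + 1048575) / 1048576 * 1048576 - 1 = 4194303
+     *   gpfs_ub_rdoff = 4194303 - 3146239 = 1048064
+     *   fd_gpfs_range = 4194304, i.e. exactly 4 GPFS blocks
+     * The two rdoff corrections are subtracted from the first and last file
+     * domains further below, so fd_start[0] lands on min_st_offset and
+     * fd_end[naggs-1] on max_end_offset while the interior file-domain
+     * boundaries stay block-aligned. */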
+
+ int naggs = nprocs_for_coll;
+
+ /* Tweak the file domains so that no fd is smaller than a threshold. We
+     * have to strike a balance between efficiency and parallelism: somewhere
+ * between 10k processes sending 32-byte requests and one process sending a
+ * 320k request is a (system-dependent) sweet spot
+
+ This is from the common code - the new min_fd_size parm that we didn't implement.
+ (And common code uses a different declaration of fd_size so beware)
+
+ if (fd_size < min_fd_size)
+ fd_size = min_fd_size;
+ */
+ fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
+ *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * 2 * sizeof(ADIO_Offset));
+ *fd_end_ptr = *fd_start_ptr + nprocs_for_coll;
+ fd_start = *fd_start_ptr;
+ fd_end = *fd_end_ptr;
+
+ /* each process will have a file domain of some number of gpfs blocks, but
+ * the division of blocks is not likely to be even. Some file domains will
+ * be "large" and others "small"
+ *
+ * Example: consider 17 blocks distributed over 3 aggregators.
+ * nb_cn_small = 17/3 = 5
+ * naggs_large = 17 - 3*(17/3) = 17 - 15 = 2
+ * naggs_small = 3 - 2 = 1
+ *
+ * and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
+ *
+ * what about (relatively) small files? say, a file of 1000 blocks
+ * distributed over 2064 aggregators:
+ * nb_cn_small = 1000/2064 = 0
+ * naggs_large = 1000 - 2064*(1000/2064) = 1000
+ * naggs_small = 2064 - 1000 = 1064
+ * and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
+ *
+ * it might be a good idea instead of having all the zeros up front, to
+ * "mix" those zeros into the fd_size array. that way, no pset/bridge-set
+ * is left with zero work. In fact, even if the small file domains aren't
+ * zero, it's probably still a good idea to mix the "small" file domains
+ * across the fd_size array to keep the io nodes in balance */
+
+
+ ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
+ ADIO_Offset nb_cn_small = n_gpfs_blk / naggs;
+ ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk / naggs);
+ ADIO_Offset naggs_small = naggs - naggs_large;
+
+#ifdef BGQPLATFORM
+ if (gpfsmpio_balancecontig == 1) {
+ /* File domains blocks are assigned to aggregators in a breadth-first
+ * fashion relative to the ions - additionally, file domains on the
+ * aggregators sharing the same bridgeset and ion have contiguous
+ * offsets. */
+
+ // initialize everything to small
+ for (i = 0; i < naggs; i++)
+ fd_size[i] = nb_cn_small * blksize;
+
+ // go thru and distribute the large across the bridges
+
+ /* bridelistoffset: agg rank list offsets using the bridgelist - each
+ * entry is created by adding up the indexes for the aggs from all
+ * previous bridges */
+ int *bridgelistoffset =
+ (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges * sizeof(int));
+ /* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
+ * decremented to keep track of bridge assignments during the actual
+ * large block assignments to the agg rank list*/
+ int *tmpbridgelistnum =
+ (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges * sizeof(int));
+
+ int j;
+ for (j = 0; j < fd->hints->fs_hints.bg.numbridges; j++) {
+ int k, bridgerankoffset = 0;
+ for (k = 0; k < j; k++) {
+ bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
+ }
+ bridgelistoffset[j] = bridgerankoffset;
+ }
+
+ for (j = 0; j < fd->hints->fs_hints.bg.numbridges; j++)
+ tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
+ int bridgeiter = 0;
+
+ /* distribute the large blocks across the aggs going breadth-first
+ * across the bridgelist - this distributes the fd sizes across the
+ * ions, so later in the file domain assignment when it iterates thru
+ * the ranklist the offsets will be contiguous within the bridge and
+ * ion as well */
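+        /* For illustration (hypothetical values): with 2 bridges,
+         * bridgelistnum[] = {2, 2} (hence bridgelistoffset[] = {0, 2}) and
+         * naggs_large = 3, the loop below marks fd_size[0] as large
+         * (bridge 0, agg 0), then fd_size[2] (bridge 1, agg 0), then wraps
+         * around and marks fd_size[1] (bridge 0, agg 1) -- breadth-first
+         * across the bridges. */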
+ for (j = 0; j < naggs_large; j++) {
+ int foundbridge = 0;
+ int numbridgelistpasses = 0;
+ while (!foundbridge) {
+ if (tmpbridgelistnum[bridgeiter] > 0) {
+ foundbridge = 1;
+ /*
+ * printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
+ * printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
+ * printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
+ */
+ int currentbridgelistnum =
+ (fd->hints->fs_hints.bg.bridgelistnum[bridgeiter] -
+ tmpbridgelistnum[bridgeiter]);
+ int currentfdsizeindex = bridgelistoffset[bridgeiter] + currentbridgelistnum;
+ fd_size[currentfdsizeindex] = (nb_cn_small + 1) * blksize;
+ tmpbridgelistnum[bridgeiter]--;
+ }
+ if (bridgeiter == (fd->hints->fs_hints.bg.numbridges - 1)) {
+ /* guard against infinite loop - should only ever make 1 pass
+ * thru bridgelist */
+ ADIOI_Assert(numbridgelistpasses == 0);
+ numbridgelistpasses++;
+ bridgeiter = 0;
+ } else
+ bridgeiter++;
+ }
+ }
+ ADIOI_Free(tmpbridgelistnum);
+ ADIOI_Free(bridgelistoffset);
+
+ } else {
+        /* BG/L- and BG/P-style distribution of file domains: simple allocation of
+         * file domains to each aggregator */
+ for (i = 0; i < naggs; i++) {
+ if (i < naggs_large) {
+ fd_size[i] = (nb_cn_small + 1) * blksize;
+ } else {
+ fd_size[i] = nb_cn_small * blksize;
+ }
+ }
+ }
+#ifdef balancecontigtrace
+ int myrank;
+ MPI_Comm_rank(fd->comm, &myrank);
+ if (myrank == 0) {
+ fprintf(stderr, "naggs_small is %d nb_cn_small is %d\n", naggs_small, nb_cn_small);
+ for (i = 0; i < naggs; i++) {
+ fprintf(stderr, "fd_size[%d] set to %d agg rank is %d\n", i, fd_size[i],
+ fd->hints->ranklist[i]);
+ }
+ }
+#endif
+
+#else // not BGQ platform
+ for (i = 0; i < naggs; i++) {
+ if (i < naggs_large) {
+ fd_size[i] = (nb_cn_small + 1) * blksize;
+ } else {
+ fd_size[i] = nb_cn_small * blksize;
+ }
+ }
+
+#endif
+
+
+#if AGG_DEBUG
+ DBG_FPRINTF(stderr, "%s(%d): "
+ "gpfs_ub %llu, "
+ "gpfs_lb %llu, "
+ "gpfs_ub_rdoff %llu, "
+ "gpfs_lb_rdoff %llu, "
+ "fd_gpfs_range %llu, "
+ "n_gpfs_blk %llu, "
+ "nb_cn_small %llu, "
+ "naggs_large %llu, "
+ "naggs_small %llu, "
+ "\n",
+ myname, __LINE__,
+ gpfs_ub,
+ gpfs_lb,
+ gpfs_ub_rdoff,
+ gpfs_lb_rdoff, fd_gpfs_range, n_gpfs_blk, nb_cn_small, naggs_large, naggs_small);
+#endif
+
+ fd_size[0] -= gpfs_lb_rdoff;
+ fd_size[naggs - 1] -= gpfs_ub_rdoff;
+
+ /* compute the file domain for each aggr */
+ ADIO_Offset offset = min_st_offset;
+ for (aggr = 0; aggr < naggs; aggr++) {
+ fd_start[aggr] = offset;
+ fd_end[aggr] = offset + fd_size[aggr] - 1;
+ offset += fd_size[aggr];
+ }
+
+ *fd_size_ptr = fd_size[0];
+ *min_st_offset_ptr = min_st_offset;
+
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5005, 0, NULL);
+#endif
+ ADIOI_Free(fd_size);
+ TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
+}
+
+/*
+ * ADIOI_GPFS_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation
+ * is specific for static file domain partitioning.
+ *
+ * ADIOI_Calc_my_req() - calculate what portions of the access requests
+ * of this process are located in the file domains of various processes
+ * (including this one)
+ */
+void ADIOI_GPFS_Calc_my_req(ADIO_File fd, ADIO_Offset * offset_list, ADIO_Offset * len_list,
+ int contig_access_count, ADIO_Offset
+ min_st_offset, ADIO_Offset * fd_start,
+ ADIO_Offset * fd_end, ADIO_Offset fd_size,
+ int nprocs,
+ int *count_my_req_procs_ptr,
+ int **count_my_req_per_proc_ptr,
+ ADIOI_Access ** my_req_ptr, MPI_Aint ** buf_idx_ptr)
+/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets?
+ They are used as memory buffer indices so it seems like the 2G limit is in effect */
+{
+ int *count_my_req_per_proc, count_my_req_procs;
+ MPI_Aint *buf_idx;
+ int i, l, proc;
+ ADIO_Offset fd_len, rem_len, curr_idx, off;
+ ADIOI_Access *my_req;
+ TRACE_ERR("Entering ADIOI_GPFS_Calc_my_req\n");
+
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5024, 0, NULL);
+#endif
+ *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ count_my_req_per_proc = *count_my_req_per_proc_ptr;
+/* count_my_req_per_proc[i] gives the no. of contig. requests of this
+ process in process i's file domain. calloc initializes to zero.
+ I'm allocating memory of size nprocs, so that I can do an
+ MPI_Alltoall later on.*/
+
+ buf_idx = (MPI_Aint *) ADIOI_Malloc(nprocs * sizeof(MPI_Aint));
+/* buf_idx is relevant only if buftype_is_contig.
+ buf_idx[i] gives the index into user_buf where data received
+ from proc. i should be placed. This allows receives to be done
+ without extra buffer. This can't be done if buftype is not contig. */
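+/* Example (hypothetical): for a contiguous buffer whose first 100 bytes come
+   from the file domain of aggregator 2 and whose next 60 bytes come from
+   aggregator 5, the loops below end up with buf_idx[2] = 0 and
+   buf_idx[5] = 100, so data received from rank p can be unpacked straight
+   into (char *) buf + buf_idx[p]. */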
+
+ /* initialize buf_idx to -1 */
+ for (i = 0; i < nprocs; i++)
+ buf_idx[i] = -1;
+
+ /* one pass just to calculate how much space to allocate for my_req;
+ * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
+ */
+ for (i = 0; i < contig_access_count; i++) {
+ /* short circuit offset/len processing if len == 0
+         * (zero-byte read/write) */
+ if (len_list[i] == 0)
+ continue;
+ off = offset_list[i];
+ fd_len = len_list[i];
+ /* note: we set fd_len to be the total size of the access. then
+ * ADIOI_Calc_aggregator() will modify the value to return the
+ * amount that was available from the file domain that holds the
+ * first part of the access.
+ */
+ /* BES */
+ proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
+ fd_start, fd_end);
+ count_my_req_per_proc[proc]++;
+
+ /* figure out how much data is remaining in the access (i.e. wasn't
+ * part of the file domain that had the starting byte); we'll take
+ * care of this data (if there is any) in the while loop below.
+ */
+ rem_len = len_list[i] - fd_len;
+
+ while (rem_len > 0) {
+ off += fd_len; /* point to first remaining byte */
+ fd_len = rem_len; /* save remaining size, pass to calc */
+ proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
+ fd_size, fd_start, fd_end);
+
+ count_my_req_per_proc[proc]++;
+ rem_len -= fd_len; /* reduce remaining length by amount from fd */
+ }
+ }
+
+/* now allocate space for my_req, offset, and len */
+
+ *my_req_ptr = (ADIOI_Access *)
+ ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
+ my_req = *my_req_ptr;
+
+ count_my_req_procs = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (count_my_req_per_proc[i]) {
+ my_req[i].offsets = (ADIO_Offset *)
+ ADIOI_Malloc(count_my_req_per_proc[i] * 2 * sizeof(ADIO_Offset));
+ my_req[i].lens = my_req[i].offsets + count_my_req_per_proc[i];
+ count_my_req_procs++;
+ }
+ my_req[i].count = 0; /* will be incremented where needed
+ * later */
+ }
+
+/* now fill in my_req */
+ curr_idx = 0;
+ for (i = 0; i < contig_access_count; i++) {
+ /* short circuit offset/len processing if len == 0
+         * (zero-byte read/write) */
+ if (len_list[i] == 0)
+ continue;
+ off = offset_list[i];
+ fd_len = len_list[i];
+ proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
+ fd_start, fd_end);
+
+ /* for each separate contiguous access from this process */
+ if (buf_idx[proc] == -1) {
+ ADIOI_Assert(curr_idx == (MPI_Aint) curr_idx);
+ buf_idx[proc] = (MPI_Aint) curr_idx;
+ }
+
+ l = my_req[proc].count;
+ curr_idx += fd_len;
+
+ rem_len = len_list[i] - fd_len;
+
+ /* store the proc, offset, and len information in an array
+ * of structures, my_req. Each structure contains the
+ * offsets and lengths located in that process's FD,
+ * and the associated count.
+ */
+ my_req[proc].offsets[l] = off;
+ my_req[proc].lens[l] = fd_len;
+ my_req[proc].count++;
+
+ while (rem_len > 0) {
+ off += fd_len;
+ fd_len = rem_len;
+ proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
+ fd_size, fd_start, fd_end);
+
+ if (buf_idx[proc] == -1) {
+ ADIOI_Assert(curr_idx == (MPI_Aint) curr_idx);
+ buf_idx[proc] = (MPI_Aint) curr_idx;
+ }
+
+ l = my_req[proc].count;
+ curr_idx += fd_len;
+ rem_len -= fd_len;
+
+ my_req[proc].offsets[l] = off;
+ my_req[proc].lens[l] = fd_len;
+ my_req[proc].count++;
+ }
+ }
+
+
+
+#ifdef AGG_DEBUG
+ for (i = 0; i < nprocs; i++) {
+ if (count_my_req_per_proc[i] > 0) {
+ DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i, my_req[i].count);
+ for (l = 0; l < my_req[i].count; l++) {
+ DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %lld\n", l,
+ (long long) my_req[i].offsets[l], l, (long long) my_req[i].lens[l]);
+ }
+ }
+ DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
+ }
+#endif
+
+ *count_my_req_procs_ptr = count_my_req_procs;
+ *buf_idx_ptr = buf_idx;
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5025, 0, NULL);
+#endif
+ TRACE_ERR("Leaving ADIOI_GPFS_Calc_my_req\n");
+}
+
+/*
+ * ADIOI_Calc_others_req (copied to bg and switched to all to all for performance)
+ *
+ * param[in] count_my_req_procs Number of processes whose file domain my
+ * request touches.
+ * param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
+ * contig. requests of this process in
+ * process i's file domain.
+ * param[in] my_req A structure defining my request
+ * param[in] nprocs Number of nodes in the block
+ * param[in] myrank Rank of this node
+ * param[out] count_others_req_procs_ptr Number of processes whose requests lie in
+ * my process's file domain (including my
+ * process itself)
+ * param[out] others_req_ptr Array of other process' requests that lie
+ * in my process's file domain
+ */
+void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
+ int *count_my_req_per_proc,
+ ADIOI_Access * my_req,
+ int nprocs, int myrank,
+ int *count_others_req_procs_ptr, ADIOI_Access ** others_req_ptr)
+{
+ TRACE_ERR("Entering ADIOI_GPFS_Calc_others_req\n");
+/* determine what requests of other processes lie in this process's
+ file domain */
+
+/* count_others_req_procs = number of processes whose requests lie in
+ this process's file domain (including this process itself)
+ count_others_req_per_proc[i] indicates how many separate contiguous
+ requests of proc. i lie in this process's file domain. */
+
+ int *count_others_req_per_proc, count_others_req_procs;
+ int i;
+ ADIOI_Access *others_req;
+
+ /* Parameters for MPI_Alltoallv */
+ int *scounts, *sdispls, *rcounts, *rdispls;
+
+ /* Parameters for MPI_Alltoallv. These are the buffers, which
+ * are later computed to be the lowest address of all buffers
+ * to be sent/received for offsets and lengths. Initialize to
+ * the highest possible address which is the current minimum.
+ */
+ void *sendBuf = (void *) 0xFFFFFFFFFFFFFFFF, *recvBuf = (void *) 0xFFFFFFFFFFFFFFFF;
+
+/* first find out how much to send/recv and from/to whom */
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5026, 0, NULL);
+#endif
+ /* Send 1 int to each process. count_my_req_per_proc[i] is the number of
+ * requests that my process will do to the file domain owned by process[i].
+ * Receive 1 int from each process. count_others_req_per_proc[i] is the number of
+ * requests that process[i] will do to the file domain owned by my process.
+ */
+ count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+/* cora2a1=timebase(); */
+/*for(i=0;i<nprocs;i++) */
+    MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
+                 count_others_req_per_proc, 1, MPI_INT, fd->comm);
+
+/* total_cora2a+=timebase()-cora2a1; */
+
+ /* Allocate storage for an array of other nodes' accesses of our
+ * node's file domain. Also allocate storage for the alltoallv
+ * parameters.
+ */
+ *others_req_ptr = (ADIOI_Access *)
+ ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
+ others_req = *others_req_ptr;
+
+ scounts = ADIOI_Malloc(nprocs * sizeof(int));
+ sdispls = ADIOI_Malloc(nprocs * sizeof(int));
+ rcounts = ADIOI_Malloc(nprocs * sizeof(int));
+ rdispls = ADIOI_Malloc(nprocs * sizeof(int));
+
+ /* If process[i] has any requests in my file domain,
+ * initialize an ADIOI_Access structure that will describe each request
+ * from process[i]. The offsets, lengths, and buffer pointers still need
+ * to be obtained to complete the setting of this structure.
+ */
+ count_others_req_procs = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (count_others_req_per_proc[i]) {
+ others_req[i].count = count_others_req_per_proc[i];
+
+ others_req[i].offsets = (ADIO_Offset *)
+ ADIOI_Malloc(count_others_req_per_proc[i] * 2 * sizeof(ADIO_Offset));
+ others_req[i].lens = others_req[i].offsets + count_others_req_per_proc[i];
+
+ if ((uintptr_t) others_req[i].offsets < (uintptr_t) recvBuf)
+ recvBuf = others_req[i].offsets;
+
+ others_req[i].mem_ptrs = (MPI_Aint *)
+ ADIOI_Malloc(count_others_req_per_proc[i] * sizeof(MPI_Aint));
+
+ count_others_req_procs++;
+ } else {
+ others_req[i].count = 0;
+ others_req[i].offsets = NULL;
+ others_req[i].mem_ptrs = NULL;
+ others_req[i].lens = NULL;
+ }
+ }
+ /* If no recv buffer was allocated in the loop above, make it NULL */
+ if (recvBuf == (void *) 0xFFFFFFFFFFFFFFFF)
+ recvBuf = NULL;
+
+ /* Now send the calculated offsets and lengths to respective processes */
+
+ /************************/
+ /* Exchange the offsets */
+ /************************/
+
+ /* Determine the lowest sendBuf */
+ for (i = 0; i < nprocs; i++) {
+ if ((my_req[i].count) && ((uintptr_t) my_req[i].offsets <= (uintptr_t) sendBuf)) {
+ sendBuf = my_req[i].offsets;
+ }
+ /* my_req[i].offsets and my_req[i].lens have been malloc-ed together */
+ }
+
+ /* If no send buffer was found in the loop above, make it NULL */
+ if (sendBuf == (void *) 0xFFFFFFFFFFFFFFFF)
+ sendBuf = NULL;
+
+ /* Calculate the displacements from the sendBuf */
+ for (i = 0; i < nprocs; i++) {
+ /* Send these offsets and lengths to process i. */
+ scounts[i] = count_my_req_per_proc[i] * 2;
+ if (scounts[i] == 0)
+ sdispls[i] = 0;
+ else
+ sdispls[i] = (int)
+ (((uintptr_t) my_req[i].offsets -
+ (uintptr_t) sendBuf) / (uintptr_t) sizeof(ADIO_Offset));
+
+ /* Receive these offsets and lengths from process i. */
+ rcounts[i] = count_others_req_per_proc[i] * 2;
+ if (rcounts[i] == 0)
+ rdispls[i] = 0;
+ else
+ rdispls[i] = (int)
+ (((uintptr_t) others_req[i].offsets -
+ (uintptr_t) recvBuf) / (uintptr_t) sizeof(ADIO_Offset));
+ }
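+
+    /* Example of the displacement scheme (hypothetical addresses): for two
+     * processes with non-zero counts, if my_req[0].offsets was allocated at
+     * the address held in sendBuf and my_req[1].offsets 32 bytes above it,
+     * then with an 8-byte ADIO_Offset the loop above yields
+     * sdispls[] = {0, 4}: displacements are counted in ADIO_Offset elements
+     * from the lowest buffer address, which lets the single MPI_Alltoallv
+     * below cover all of the separately allocated buffers. */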
+
+ /* Exchange the offsets and lengths */
+ MPI_Alltoallv(sendBuf, scounts, sdispls, ADIO_OFFSET,
+ recvBuf, rcounts, rdispls, ADIO_OFFSET, fd->comm);
+
+ /* Clean up */
+ ADIOI_Free(count_others_req_per_proc);
+ ADIOI_Free(scounts);
+ ADIOI_Free(sdispls);
+ ADIOI_Free(rcounts);
+ ADIOI_Free(rdispls);
+
+ *count_others_req_procs_ptr = count_others_req_procs;
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5027, 0, NULL);
+#endif
+ TRACE_ERR("Leaving ADIOI_GPFS_Calc_others_req\n");
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_aggrs.h b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_aggrs.h
new file mode 100644
index 0000000000000000000000000000000000000000..234e5c5b1d80a092b213ef3d76954b61d226dd32
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_aggrs.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_aggrs.h
+ * \brief ???
+ */
+
+/*
+ * File: ad_gpfs_aggrs.h
+ *
+ * Declares functions optimized specifically for GPFS parallel I/O solution.
+ *
+ */
+
+#ifndef AD_GPFS_AGGRS_H_INCLUDED
+#define AD_GPFS_AGGRS_H_INCLUDED
+
+#include "adio.h"
+#include <sys/stat.h>
+
+#ifdef HAVE_GPFS_H
+#include <gpfs.h>
+#endif
+
+
+ /* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
+void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
+ ADIO_Offset * st_offsets,
+ ADIO_Offset * end_offsets,
+ int nprocs,
+ int nprocs_for_coll,
+ ADIO_Offset * min_st_offset_ptr,
+ ADIO_Offset ** fd_start_ptr,
+ ADIO_Offset ** fd_end_ptr,
+ ADIO_Offset * fd_size_ptr, void *fs_ptr);
+
+ /* overriding ADIOI_Calc_aggregator() for the default implementation is specific for
+ * static file domain partitioning */
+int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
+ ADIO_Offset off,
+ ADIO_Offset min_off,
+ ADIO_Offset * len,
+ ADIO_Offset fd_size, ADIO_Offset * fd_start, ADIO_Offset * fd_end);
+
+ /* overriding ADIOI_Calc_my_req for the default implementation is specific for
+ * static file domain partitioning */
+void ADIOI_GPFS_Calc_my_req(ADIO_File fd, ADIO_Offset * offset_list, ADIO_Offset * len_list,
+ int contig_access_count, ADIO_Offset
+ min_st_offset, ADIO_Offset * fd_start,
+ ADIO_Offset * fd_end, ADIO_Offset fd_size,
+ int nprocs,
+ int *count_my_req_procs_ptr,
+ int **count_my_req_per_proc_ptr,
+ ADIOI_Access ** my_req_ptr, MPI_Aint ** buf_idx_ptr);
+
+ /*
+ * ADIOI_Calc_others_req
+ *
+ * param[in] count_my_req_procs Number of processes whose file domain my
+ * request touches.
+ * param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
+ * contig. requests of this process in
+ * process i's file domain.
+ * param[in] my_req A structure defining my request
+ * param[in] nprocs Number of nodes in the block
+ * param[in] myrank Rank of this node
+ * param[out] count_others_req_procs_ptr Number of processes whose requests lie in
+ * my process's file domain (including my
+ * process itself)
+ * param[out] others_req_ptr Array of other process' requests that lie
+ * in my process's file domain
+ */
+void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
+ int *count_my_req_per_proc,
+ ADIOI_Access * my_req,
+ int nprocs, int myrank,
+ int *count_others_req_procs_ptr, ADIOI_Access ** others_req_ptr);
+
+
+#endif /* AD_GPFS_AGGRS_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_close.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_close.c
new file mode 100644
index 0000000000000000000000000000000000000000..f96112ed7802613d9dfe1fcf024547d4a12e5bdd
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_close.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_close.c
+ * \brief ???
+ */
+
+#include "ad_gpfs.h"
+#include "ad_gpfs_tuning.h"
+#include <unistd.h>
+
+void ADIOI_GPFS_Close(ADIO_File fd, int *error_code)
+{
+ int err, derr = 0;
+ static char myname[] = "ADIOI_GPFS_CLOSE";
+
+#ifdef PROFILE
+ MPE_Log_event(9, 0, "start close");
+#endif
+
+ if (fd->null_fd >= 0)
+ close(fd->null_fd);
+
+ err = close(fd->fd_sys);
+ if (fd->fd_direct >= 0) {
+ derr = close(fd->fd_direct);
+ }
+#ifdef PROFILE
+ MPE_Log_event(10, 0, "end close");
+#endif
+
+/* FPRINTF(stderr,"%s(%d):'%s'. Free %#X\n",myname,__LINE__,fd->filename,(int)fd->fs_ptr);*/
+ if (fd->fs_ptr != NULL) {
+ ADIOI_Free(fd->fs_ptr);
+ fd->fs_ptr = NULL;
+ }
+ fd->fd_sys = -1;
+ fd->fd_direct = -1;
+
+ if (err == -1 || derr == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO,
+ "**io", "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_flush.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_flush.c
new file mode 100644
index 0000000000000000000000000000000000000000..02dde79515e8322f23c9d581a9fc4f33e50f1b7e
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_flush.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_flush.c
+ * \brief Scalable flush for GPFS
+ */
+
+#include "ad_gpfs.h"
+
+void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code)
+{
+ int err = 0;
+ static char myname[] = "ADIOI_GPFS_FLUSH";
+
+ int rank;
+
+ MPI_Comm_rank(fd->comm, &rank);
+
+ /* the old logic about who is an fsync aggregator and who is not fell down
+ * when deferred open was enabled. Instead, make this look more like
+ * ad_pvfs2_flush. If one day the I/O aggregators have something they need
+ * to flush, we can consult the 'fd->hints->ranklist[]' array. For now, a
+ * flush from one process should suffice */
+
+    /* ensure all other processes are done writing. On many platforms MPI_Reduce
+ * is fastest because it has the lightest constraints. On Blue Gene, BARRIER
+ * is optimized */
+ MPI_Barrier(fd->comm);
+
+ if (rank == fd->hints->ranklist[0]) {
+ err = fsync(fd->fd_sys);
+ DBG_FPRINTF(stderr, "aggregation:fsync %s, err=%#X, errno=%#X\n", fd->filename, err, errno);
+ /* We want errno, not the return code if it failed */
+ if (err == -1)
+ err = errno;
+ else
+ err = 0;
+ }
+ MPI_Bcast(&err, 1, MPI_UNSIGNED, fd->hints->ranklist[0], fd->comm);
+ DBGV_FPRINTF(stderr, "aggregation result:fsync %s, errno %#X,\n", fd->filename, err);
+
+ if (err) { /* if it's non-zero, it must be an errno */
+ errno = err;
+ err = -1;
+ }
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO,
+ "**io", "**io %s", strerror(errno));
+ DBGT_FPRINTF(stderr, "fsync %s, err=%#X, errno=%#X\n", fd->filename, err, errno);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_hints.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_hints.c
new file mode 100644
index 0000000000000000000000000000000000000000..4b41a680eb9767fb9f88b4d7eb92bf4c5de47718
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_hints.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_hints.c
+ * \brief GPFS hint processing - for now, only used for BlueGene and PE platforms
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "hint_fns.h"
+
+#include "ad_gpfs.h"
+
+#define ADIOI_GPFS_CB_BUFFER_SIZE_DFLT "16777216"
+#define ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT "4194304"
+#define ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT "4194304"
+
+#ifdef BGQPLATFORM
+#define ADIOI_BG_NAGG_IN_PSET_HINT_NAME "bg_nodes_pset"
+#endif
+
+/** \page mpiio_vars MPIIO Configuration
+ *
+ * GPFS MPIIO configuration and performance tuning. Used by ad_gpfs ADIO.
+ *
+ * Used for BlueGene and PE platforms, which each have their own aggregator selection
+ * algorithms that ignore user provided cb_config_list.
+ *
+ * \section hint_sec Hints
+ * - bg_nodes_pset - BlueGene only - specify how many aggregators to use per pset.
+ * This hint will override the cb_nodes hint based on BlueGene psets.
+ * - N - Use N nodes per pset as aggregators.
+ * - Default is based on partition configuration and cb_nodes.
+ *
+ * The following default key/value pairs may differ from other platform defaults.
+ *
+ * - key = cb_buffer_size value = 16777216
+ * - key = romio_cb_read value = enable
+ * - key = romio_cb_write value = enable
+ * - key = ind_rd_buffer_size value = 4194304
+ * - key = ind_wr_buffer_size value = 4194304
+ */
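+
+/* A minimal usage sketch (illustrative only; the file name and access flags
+ * are placeholders) showing how an application would select these hints:
+ *
+ *   MPI_Info info;
+ *   MPI_File fh;
+ *   MPI_Info_create(&info);
+ *   MPI_Info_set(info, "cb_buffer_size", "16777216");
+ *   MPI_Info_set(info, "romio_cb_write", "enable");
+ *   MPI_File_open(MPI_COMM_WORLD, "out.dat",
+ *                 MPI_MODE_CREATE | MPI_MODE_WRONLY, info, &fh);
+ *   MPI_Info_free(&info);
+ */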
+
+#ifdef BGQPLATFORM
+/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
+extern int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
+#elif PEPLATFORM
+extern int ADIOI_PE_gen_agg_ranklist(ADIO_File fd);
+#endif
+
+void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
+{
+/* if fd->info is null, create a new info object.
+ Initialize fd->info to default values.
+ Initialize fd->hints to default values.
+ Examine the info object passed by the user. If it contains values that
+ ROMIO understands, override the default. */
+
+ MPI_Info info;
+ char *value;
+ int flag, intval, nprocs = 0, nprocs_is_valid = 0;
+ static char myname[] = "ADIOI_GPFS_SETINFO";
+
+ int did_anything = 0;
+
+ if (fd->info == MPI_INFO_NULL)
+ MPI_Info_create(&(fd->info));
+ info = fd->info;
+
+ /* Note that fd->hints is allocated at file open time; thus it is
+ * not necessary to allocate it, or check for allocation, here.
+ */
+
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ ADIOI_Assert((value != NULL));
+
+ /* initialize info and hints to default values if they haven't been
+ * previously initialized
+ */
+ if (!fd->hints->initialized) {
+
+ ad_get_env_vars();
+ ad_gpfs_get_env_vars();
+ did_anything = 1;
+
+ /* buffer size for collective I/O */
+ ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
+ fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
+
+        /* default for GPFS is to enable collective buffering for both
+         * reads and writes
+         */
+ ADIOI_Info_set(info, "romio_cb_read", "enable");
+ fd->hints->cb_read = ADIOI_HINT_ENABLE;
+ ADIOI_Info_set(info, "romio_cb_write", "enable");
+ fd->hints->cb_write = ADIOI_HINT_ENABLE;
+
+ if (fd->hints->cb_config_list != NULL)
+ ADIOI_Free(fd->hints->cb_config_list);
+ fd->hints->cb_config_list = NULL;
+
+ /* number of processes that perform I/O in collective I/O */
+ MPI_Comm_size(fd->comm, &nprocs);
+ nprocs_is_valid = 1;
+ MPL_snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", nprocs);
+ ADIOI_Info_set(info, "cb_nodes", value);
+ fd->hints->cb_nodes = -1;
+
+ /* hint indicating that no indep. I/O will be performed on this file */
+ ADIOI_Info_set(info, "romio_no_indep_rw", "false");
+ fd->hints->no_indep_rw = 0;
+
+        /* gpfs does not implement file realms (ADIOI_IOStridedColl),
+         * so initialize it to disabled. */
+ /* hint instructing the use of persistent file realms */
+ ADIOI_Info_set(info, "romio_cb_pfr", "disable");
+ fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
+
+ /* hint guiding the assignment of persistent file realms */
+ ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
+ fd->hints->cb_fr_type = ADIOI_FR_AAR;
+
+ /* hint to align file realms with a certain byte value */
+ ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
+ fd->hints->cb_fr_alignment = 1;
+
+ /* hint to set a threshold percentage for a datatype's size/extent at
+ * which data sieving should be done in collective I/O */
+ ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
+ fd->hints->cb_ds_threshold = 0;
+
+ /* hint to switch between point-to-point or all-to-all for two-phase */
+ ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
+ fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
+
+ /* deferred_open derived from no_indep_rw and cb_{read,write} */
+ fd->hints->deferred_open = 0;
+
+ /* buffer size for data sieving in independent reads */
+ ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
+ fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
+
+ /* buffer size for data sieving in independent writes */
+ ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
+ fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
+
+
+ ADIOI_Info_set(info, "romio_ds_read", "automatic");
+ fd->hints->ds_read = ADIOI_HINT_AUTO;
+ ADIOI_Info_set(info, "romio_ds_write", "automatic");
+ fd->hints->ds_write = ADIOI_HINT_AUTO;
+
+ /* still to do: tune this a bit for a variety of file systems. there's
+ * no good default value so just leave it unset */
+ fd->hints->min_fdomain_size = 0;
+ fd->hints->striping_unit = 0;
+
+ fd->hints->initialized = 1;
+ }
+
+ /* add in user's info if supplied */
+ if (users_info != MPI_INFO_NULL) {
+ ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
+ &(fd->hints->cb_buffer_size), myname, error_code);
+ /* new hints for enabling/disabling coll. buffering on
+ * reads/writes
+ */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
+ &(fd->hints->cb_read), myname, error_code);
+ if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
+ /* romio_cb_read overrides no_indep_rw */
+ ADIOI_Info_set(info, "romio_no_indep_rw", "false");
+ fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
+ }
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
+ &(fd->hints->cb_write), myname, error_code);
+ if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
+ /* romio_cb_write overrides no_indep_rw */
+ ADIOI_Info_set(info, "romio_no_indep_rw", "false");
+ fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
+ }
+ /* Has the user indicated all I/O will be done collectively? */
+ ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
+ &(fd->hints->no_indep_rw), myname, error_code);
+ if (fd->hints->no_indep_rw == 1) {
+ /* if 'no_indep_rw' set, also hint that we will do
+ * collective buffering: if we aren't doing independent io,
+ * then we have to do collective */
+ ADIOI_Info_set(info, "romio_cb_write", "enable");
+ ADIOI_Info_set(info, "romio_cb_read", "enable");
+ fd->hints->cb_read = 1;
+ fd->hints->cb_write = 1;
+ }
+
+ /* new hints for enabling/disabling data sieving on
+ * reads/writes
+ */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
+ &(fd->hints->ds_read), myname, error_code);
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
+ &(fd->hints->ds_write), myname, error_code);
+
+ ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
+ &(fd->hints->ind_wr_buffer_size), myname, error_code);
+ ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
+ &(fd->hints->ind_rd_buffer_size), myname, error_code);
+
+ memset(value, 0, MPI_MAX_INFO_VAL + 1);
+ ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag && ((intval = atoi(value)) > 0)) {
+ ADIOI_Info_set(info, "romio_min_fdomain_size", value);
+ fd->hints->min_fdomain_size = intval;
+ }
+ /* Now we use striping unit in common code so we should
+ * process hints for it. */
+ ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
+ &(fd->hints->striping_unit), myname, error_code);
+
+#ifdef BGQPLATFORM
+ memset(value, 0, MPI_MAX_INFO_VAL + 1);
+ ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL, value, &flag);
+ if (flag && ((intval = atoi(value)) > 0)) {
+
+ did_anything = 1;
+ ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
+ fd->hints->cb_nodes = intval;
+ }
+#endif
+ }
+
+ /* special CB aggregator assignment */
+ if (did_anything) {
+#ifdef BGQPLATFORM
+ ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
+#elif PEPLATFORM
+ ADIOI_PE_gen_agg_ranklist(fd);
+#endif
+ }
+
+    /* deferred_open won't be set by callers, but if the user doesn't
+     * explicitly disable collective buffering (two-phase) and does hint that
+     * I/O will be done without independent I/O, we'll set this internal hint
+     * as a convenience */
+ if (((fd->hints->cb_read != ADIOI_HINT_DISABLE)
+ && (fd->hints->cb_write != ADIOI_HINT_DISABLE)
+ && fd->hints->no_indep_rw)) {
+ fd->hints->deferred_open = 1;
+ } else {
+ /* setting romio_no_indep_rw enable and romio_cb_{read,write}
+ * disable at the same time doesn't make sense. honor
+ * romio_cb_{read,write} and force the no_indep_rw hint to
+ * 'disable' */
+ ADIOI_Info_set(info, "romio_no_indep_rw", "false");
+ fd->hints->no_indep_rw = 0;
+ fd->hints->deferred_open = 0;
+ }
+
+ /* BobC commented this out, but since hint processing runs on both bg and
+ * bglockless, we need to keep DS writes enabled on gpfs and disabled on
+ * PVFS */
+ if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
+ /* disable data sieving for fs that do not
+ * support file locking */
+ ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ /* get rid of this value if it is set */
+ ADIOI_Info_delete(info, "ind_wr_buffer_size");
+ }
+ /* note: leave ind_wr_buffer_size alone; used for other cases
+ * as well. -- Rob Ross, 04/22/2003
+ */
+ ADIOI_Info_set(info, "romio_ds_write", "disable");
+ fd->hints->ds_write = ADIOI_HINT_DISABLE;
+ }
+
+ ADIOI_Free(value);
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_open.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_open.c
new file mode 100644
index 0000000000000000000000000000000000000000..46bf0d478796e75296d1e38ee7ae081e53118e6c
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_open.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_open.c
+ * \brief ???
+ */
+
+#include "ad_gpfs.h"
+#include "ad_gpfs_tuning.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+
+#ifdef HAVE_GPFS_H
+#include <gpfs.h>
+#endif
+#ifdef HAVE_GPFS_FCNTL_H
+#include <gpfs_fcntl.h>
+#endif
+
+#ifdef HAVE_GPFS_FCNTL_H
+static void gpfs_free_all_locks(int fd)
+{
+ int rc;
+ struct {
+ gpfsFcntlHeader_t header;
+ gpfsFreeRange_t release;
+ } release_all;
+
+ release_all.header.totalLength = sizeof(release_all);
+ release_all.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
+ release_all.header.fcntlReserved = 0;
+
+ release_all.release.structLen = sizeof(release_all.release);
+ release_all.release.structType = GPFS_FREE_RANGE;
+ release_all.release.start = 0;
+ release_all.release.length = 0;
+
+ rc = gpfs_fcntl(fd, &release_all);
+ if (rc != 0) {
+ DBGV_FPRINTF(stderr, "GPFS fcntl release failed with rc=%d, errno=%d\n", rc, errno);
+ }
+}
+#endif
+
+
+void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
+{
+ int perm, old_mask, amode, rank, rc;
+ static char myname[] = "ADIOI_GPFS_OPEN";
+
+ /* set internal variables for tuning environment variables */
+ ad_gpfs_get_env_vars();
+
+ if (fd->perm == ADIO_PERM_NULL) {
+ old_mask = umask(022);
+ umask(old_mask);
+ perm = old_mask ^ 0666;
+ } else
+ perm = fd->perm;
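+    /* e.g. with the common umask 022, perm = 022 ^ 0666 = 0644 (rw-r--r--);
+     * the XOR behaves like clearing the umask bits from 0666 for typical
+     * umasks whose bits are a subset of 0666 */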
+
+ amode = 0;
+ if (fd->access_mode & ADIO_CREATE)
+ amode = amode | O_CREAT;
+ if (fd->access_mode & ADIO_RDONLY)
+ amode = amode | O_RDONLY;
+ if (fd->access_mode & ADIO_WRONLY)
+ amode = amode | O_WRONLY;
+ if (fd->access_mode & ADIO_RDWR)
+ amode = amode | O_RDWR;
+ if (fd->access_mode & ADIO_EXCL)
+ amode = amode | O_EXCL;
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
+#endif
+ fd->fd_sys = open(fd->filename, amode, perm);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
+#endif
+ DBG_FPRINTF(stderr, "open('%s',%#X,%#X) rc=%d, errno=%d\n", fd->filename, amode, perm,
+ fd->fd_sys, errno);
+ fd->fd_direct = -1;
+
+ if (gpfsmpio_devnullio == 1) {
+ fd->null_fd = open("/dev/null", O_RDWR);
+ } else {
+ fd->null_fd = -1;
+ }
+
+ if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
+
+ if (fd->fd_sys != -1) {
+
+ fd->blksize = 1048576; /* default to 1M */
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
+#endif
+ /* in this fs-specific routine, we might not be called over entire
+ * communicator (deferred open). Collect statistics on one process.
+ * ADIOI_GEN_Opencoll (common-code caller) will take care of the
+ * broadcast */
+
+ MPI_Comm_rank(fd->comm, &rank);
+ if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
+ struct stat gpfs_statbuf;
+ /* Get the (real) underlying file system block size */
+ rc = stat(fd->filename, &gpfs_statbuf);
+ if (rc >= 0) {
+ fd->blksize = gpfs_statbuf.st_blksize;
+ DBGV_FPRINTF(stderr, "Successful stat '%s'. Blocksize=%ld\n",
+ fd->filename, gpfs_statbuf.st_blksize);
+ } else {
+ DBGV_FPRINTF(stderr, "Stat '%s' failed with rc=%d, errno=%d\n",
+ fd->filename, rc, errno);
+ }
+ }
+        /* all other ranks have an incorrect fd->blksize, but ADIOI_GEN_Opencoll
+         * will take care of that in both the standard and deferred-open cases */
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
+#endif
+
+#ifdef HAVE_GPFS_FCNTL_H
+ /* in parallel workload, might be helpful to immediately release block
+ * tokens. Or, system call overhead will outweigh any benefits... */
+ if (getenv("ROMIO_GPFS_FREE_LOCKS") != NULL)
+ gpfs_free_all_locks(fd->fd_sys);
+
+#endif
+ }
+
+ if (fd->fd_sys == -1) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
+ } else
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_rdcoll.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_rdcoll.c
new file mode 100644
index 0000000000000000000000000000000000000000..8135dc163c540392847ff76fd949a3149181bbc6
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_rdcoll.c
@@ -0,0 +1,1231 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_rdcoll.c
+ * \brief ???
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "ad_gpfs.h"
+#include "ad_gpfs_aggrs.h"
+
+#ifdef PROFILE
+#include "mpe.h"
+#endif
+
+#ifdef MPL_USE_DBG_LOGGING
+#define RDCOLL_DEBUG 1
+#endif
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+
+/* prototypes of functions used for collective reads only. */
+static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
+ datatype, int nprocs,
+ int myrank, ADIOI_Access
+ * others_req, ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int contig_access_count,
+ ADIO_Offset
+ min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ MPI_Aint * buf_idx, int *error_code);
+static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+ * flat_buf, ADIO_Offset * offset_list, ADIO_Offset
+ * len_list, int *send_size, int *recv_size,
+ int *count, int *start_pos,
+ int *partial_send,
+ int *recd_from_proc, int nprocs,
+ int myrank, int
+ buftype_is_contig, int contig_access_count,
+ ADIO_Offset min_st_offset,
+ ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ ADIOI_Access * others_req,
+ int iter, MPI_Aint buftype_extent, MPI_Aint * buf_idx);
+static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+ * flat_buf, ADIO_Offset * offset_list, ADIO_Offset
+ * len_list, int *send_size, int *recv_size,
+ int *count, int *start_pos,
+ int *partial_send,
+ int *recd_from_proc, int nprocs,
+ int myrank, int
+ buftype_is_contig, int contig_access_count,
+ ADIO_Offset min_st_offset,
+ ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ ADIOI_Access * others_req,
+ int iter, MPI_Aint buftype_extent, MPI_Aint * buf_idx);
+static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+ * flat_buf, char **recv_buf, ADIO_Offset
+ * offset_list, ADIO_Offset * len_list,
+ unsigned *recv_size,
+ MPI_Request * requests, MPI_Status * statuses,
+ int *recd_from_proc, int nprocs,
+ int contig_access_count,
+ ADIO_Offset min_st_offset,
+ ADIO_Offset fd_size, ADIO_Offset * fd_start,
+ ADIO_Offset * fd_end, MPI_Aint buftype_extent);
+
+extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
+ datatype, int file_ptr_type, ADIO_Offset
+ offset, ADIO_Offset ** offset_list_ptr, ADIO_Offset
+ ** len_list_ptr, ADIO_Offset * start_offset_ptr,
+ ADIO_Offset * end_offset_ptr, int
+ *contig_access_count_ptr);
+
+
+
+void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code)
+{
+/* Uses a generalized version of the extended two-phase method described
+ in "An Extended Two-Phase Method for Accessing Sections of
+ Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
+ Scientific Programming, (5)4:301--317, Winter 1996.
+ http://www.mcs.anl.gov/home/thakur/ext2ph.ps */
+
+ ADIOI_Access *my_req;
+ /* array of nprocs structures, one for each other process in
+ * whose file domain this process's request lies */
+
+ ADIOI_Access *others_req;
+ /* array of nprocs structures, one for each other process
+ * whose request lies in this process's file domain. */
+
+ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
+ int contig_access_count = 0, interleave_count = 0, buftype_is_contig;
+ int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
+ ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
+ ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
+ *fd_end = NULL, *end_offsets = NULL;
+ ADIO_Offset *gpfs_offsets0 = NULL, *gpfs_offsets = NULL;
+ ADIO_Offset *count_sizes;
+ int ii;
+ ADIO_Offset *len_list = NULL;
+ MPI_Aint *buf_idx = NULL;
+
+ GPFSMPIO_T_CIO_RESET(r);
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPI_Count bufsize, size;
+#endif
+
+#if 0
+ /* From common code - not implemented for bg. */
+ if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
+ ADIOI_IOStridedColl(fd, buf, count, ADIOI_READ, datatype,
+ file_ptr_type, offset, status, error_code);
+ return;
+ }
+#endif
+#ifdef PROFILE
+ MPE_Log_event(13, 0, "start computation");
+#endif
+
+ MPI_Comm_size(fd->comm, &nprocs);
+ MPI_Comm_rank(fd->comm, &myrank);
+
+ /* number of aggregators, cb_nodes, is stored in the hints */
+ nprocs_for_coll = fd->hints->cb_nodes;
+ orig_fp = fd->fp_ind;
+
+ GPFSMPIO_T_CIO_SET_GET(r, 1, 0, GPFSMPIO_CIO_T_MPIO_CRW, GPFSMPIO_CIO_LAST);
+ GPFSMPIO_T_CIO_SET_GET(r, 1, 0, GPFSMPIO_CIO_T_LCOMP, GPFSMPIO_CIO_LAST);
+
+ /* only check for interleaving if cb_read isn't disabled */
+ if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
+ /* For this process's request, calculate the list of offsets and
+ * lengths in the file and determine the start and end offsets.
+ * Note: end_offset points to the last byte-offset to be accessed.
+ * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
+ */
+ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
+ &offset_list, &len_list, &start_offset,
+ &end_offset, &contig_access_count);
+
+ GPFSMPIO_T_CIO_SET_GET(r, 1, 1, GPFSMPIO_CIO_T_GATHER, GPFSMPIO_CIO_T_LCOMP);
+#ifdef RDCOLL_DEBUG
+ for (i = 0; i < contig_access_count; i++) {
+ DBG_FPRINTF(stderr, "rank %d off %lld len %lld\n",
+ myrank, (long long) offset_list[i], (long long) len_list[i]);
+ }
+#endif
+
+ /* each process communicates its start and end offsets to other
+ * processes. The result is an array each of start and end offsets
+ * stored in order of process rank. */
+
+ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * 2 * sizeof(ADIO_Offset));
+ end_offsets = st_offsets + nprocs;
+
+ ADIO_Offset my_count_size = 0;
+        /* One-sided aggregation needs the amount of data per rank as well,
+         * because the difference between the starting and ending offsets of a
+         * 1-byte access is 0, the same as for a 0-byte access, so the two
+         * cannot be distinguished by offsets alone.
+         */
+ if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
+ count_sizes = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
+ MPI_Count buftype_size;
+ MPI_Type_size_x(datatype, &buftype_size);
+            my_count_size = (ADIO_Offset) count * (ADIO_Offset) buftype_size;
+ }
+ if (romio_tunegather) {
+ if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
+ gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(6 * nprocs * sizeof(ADIO_Offset));
+ gpfs_offsets = gpfs_offsets0 + 3 * nprocs;
+ for (ii = 0; ii < nprocs; ii++) {
+ gpfs_offsets0[ii * 3] = 0;
+ gpfs_offsets0[ii * 3 + 1] = 0;
+ gpfs_offsets0[ii * 3 + 2] = 0;
+ }
+ gpfs_offsets0[myrank * 3] = start_offset;
+ gpfs_offsets0[myrank * 3 + 1] = end_offset;
+ gpfs_offsets0[myrank * 3 + 2] = my_count_size;
+ MPI_Allreduce(gpfs_offsets0, gpfs_offsets, nprocs * 3, ADIO_OFFSET, MPI_MAX,
+ fd->comm);
+ for (ii = 0; ii < nprocs; ii++) {
+ st_offsets[ii] = gpfs_offsets[ii * 3];
+ end_offsets[ii] = gpfs_offsets[ii * 3 + 1];
+ count_sizes[ii] = gpfs_offsets[ii * 3 + 2];
+ }
+ } else {
+ gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(4 * nprocs * sizeof(ADIO_Offset));
+ gpfs_offsets = gpfs_offsets0 + 2 * nprocs;
+ for (ii = 0; ii < nprocs; ii++) {
+ gpfs_offsets0[ii * 2] = 0;
+ gpfs_offsets0[ii * 2 + 1] = 0;
+ }
+ gpfs_offsets0[myrank * 2] = start_offset;
+ gpfs_offsets0[myrank * 2 + 1] = end_offset;
+
+ MPI_Allreduce(gpfs_offsets0, gpfs_offsets, nprocs * 2, ADIO_OFFSET, MPI_MAX,
+ fd->comm);
+
+ for (ii = 0; ii < nprocs; ii++) {
+ st_offsets[ii] = gpfs_offsets[ii * 2];
+ end_offsets[ii] = gpfs_offsets[ii * 2 + 1];
+ }
+ }
+ ADIOI_Free(gpfs_offsets0);
+ } else {
+ MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm);
+ MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm);
+ if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
+                MPI_Allgather(&my_count_size, 1, ADIO_OFFSET, count_sizes, 1,
+                              ADIO_OFFSET, fd->comm);
+ }
+ }
+
+ GPFSMPIO_T_CIO_SET_GET(r, 1, 1, GPFSMPIO_CIO_T_PATANA, GPFSMPIO_CIO_T_GATHER);
+
+ /* are the accesses of different processes interleaved? */
+ for (i = 1; i < nprocs; i++)
+ if ((st_offsets[i] < end_offsets[i - 1]) && (st_offsets[i] <= end_offsets[i]))
+ interleave_count++;
+ /* This is a rudimentary check for interleaving, but should suffice
+ * for the moment. */
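+        /* e.g. (hypothetical offsets) rank 0 covering bytes [0, 99] and
+         * rank 1 covering [50, 149]: st_offsets[1] = 50 < end_offsets[0] = 99,
+         * so interleave_count becomes 1 and aggregation is considered
+         * worthwhile. */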
+ }
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+
+ if (fd->hints->cb_read == ADIOI_HINT_DISABLE
+ || (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO))) {
+ /* don't do aggregation */
+ if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ }
+
+ fd->fp_ind = orig_fp;
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ if (buftype_is_contig && filetype_is_contig) {
+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+ off = fd->disp + (ADIO_Offset) (fd->etype_size) * offset;
+ ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
+ off, status, error_code);
+ } else
+ ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code);
+ } else
+ ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code);
+
+ return;
+ }
+
+ GPFSMPIO_T_CIO_SET_GET(r, 1, 1, GPFSMPIO_CIO_T_FD_PART, GPFSMPIO_CIO_T_PATANA);
+
+ /* We're going to perform aggregation of I/O. Here we call
+ * ADIOI_Calc_file_domains() to determine what processes will handle I/O
+ * to what regions. We pass nprocs_for_coll into this function; it is
+ * used to determine how many processes will perform I/O, which is also
+ * the number of regions into which the range of bytes must be divided.
+ * These regions are called "file domains", or FDs.
+ *
+ * When this function returns, fd_start, fd_end, fd_size, and
+ * min_st_offset will be filled in. fd_start holds the starting byte
+ * location for each file domain. fd_end holds the ending byte location.
+ * min_st_offset holds the minimum byte location that will be accessed.
+ *
+ * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
+ * needs to be mapped to an actual rank in the communicator later.
+ *
+ */
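+    /* e.g. (illustrative, ignoring alignment tuning): accesses spanning bytes
+     * 0-1023 divided among nprocs_for_coll = 4 aggregators give 256-byte file
+     * domains: fd_start = {0, 256, 512, 768}, fd_end = {255, 511, 767, 1023}. */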
+ int currentNonZeroDataIndex = 0;
+ if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
+ /* Take out the 0-data offsets by shifting the indexes with data to the
+ * front and keeping track of the non-zero data index for use as the
+ * length. By doing this we will optimally use all available aggs
+ * and spread the actual data across them instead of having offsets
+ * with empty data potentially dilute the file domains and create
+ * problems for the one-sided aggregation.
+ */
+ for (i = 0; i < nprocs; i++) {
+ if (count_sizes[i] > 0) {
+ st_offsets[currentNonZeroDataIndex] = st_offsets[i];
+ end_offsets[currentNonZeroDataIndex] = end_offsets[i];
+ currentNonZeroDataIndex++;
+ }
+ }
+ }
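+    /* illustrative example: with per-rank byte counts {0, 5, 0, 3} on four
+     * ranks, the entries of ranks 1 and 3 are shifted into slots 0 and 1 and
+     * currentNonZeroDataIndex ends up 2, so only two entries describe the
+     * file domains below. */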
+ if (gpfsmpio_tuneblocking) {
+ if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
+ ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, currentNonZeroDataIndex,
+ nprocs_for_coll, &min_st_offset,
+ &fd_start, &fd_end, &fd_size, fd->fs_ptr);
+ } else {
+ ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
+ nprocs_for_coll, &min_st_offset,
+ &fd_start, &fd_end, &fd_size, fd->fs_ptr);
+ }
+ } else {
+ if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
+ ADIOI_Calc_file_domains(st_offsets, end_offsets, currentNonZeroDataIndex,
+ nprocs_for_coll, &min_st_offset,
+ &fd_start, &fd_end,
+ fd->hints->min_fdomain_size, &fd_size,
+ fd->hints->striping_unit);
+ } else {
+ ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
+ nprocs_for_coll, &min_st_offset,
+ &fd_start, &fd_end,
+ fd->hints->min_fdomain_size, &fd_size,
+ fd->hints->striping_unit);
+ }
+ }
+
+ GPFSMPIO_T_CIO_SET_GET(r, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART);
+ if ((romio_read_aggmethod == 1) || (romio_read_aggmethod == 2)) {
+ /* If the user has specified to use a one-sided aggregation method then
+ * do that at this point instead of the two-phase I/O.
+ */
+ ADIOI_OneSidedReadAggregation(fd, offset_list, len_list, contig_access_count, buf,
+ datatype, error_code, st_offsets, end_offsets,
+ currentNonZeroDataIndex, fd_start, fd_end);
+ GPFSMPIO_T_CIO_REPORT(0, fd, myrank, nprocs);
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ ADIOI_Free(fd_start);
+ ADIOI_Free(count_sizes);
+ goto fn_exit;
+ }
+ if (gpfsmpio_p2pcontig == 1) {
+ /* For some simple yet common(?) workloads, full-on two-phase I/O is
+ * overkill. We can establish sub-groups of processes and their
+ * aggregator, and then these sub-groups will carry out a simplified
+ * two-phase over that sub-group.
+ *
+ * First verify that the filetype is contig and the offsets are
+ * increasing in rank order*/
+ int x, inOrderAndNoGaps = 1;
+ for (x = 0; x < (nprocs - 1); x++) {
+ if (end_offsets[x] != (st_offsets[x + 1] - 1))
+ inOrderAndNoGaps = 0;
+ }
+ if (inOrderAndNoGaps && buftype_is_contig) {
+ /* if these conditions exist then execute the P2PContig code else
+ * execute the original code */
+ ADIOI_P2PContigReadAggregation(fd, buf,
+ error_code, st_offsets, end_offsets, fd_start, fd_end);
+
+ /* NOTE: we are skipping the rest of two-phase in this path */
+ GPFSMPIO_T_CIO_REPORT(0, fd, myrank, nprocs);
+
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ ADIOI_Free(fd_start);
+ goto fn_exit;
+ }
+ }
+
+ /* calculate where the portions of the access requests of this process
+ * are located in terms of the file domains. this could be on the same
+ * process or on other processes. this function fills in:
+ * count_my_req_procs - number of processes (including this one) for which
+ * this process has requests in their file domain
+ * count_my_req_per_proc - count of requests for each process, indexed
+ * by rank of the process
+ * my_req[] - array of data structures describing the requests to be
+ * performed by each process (including self). indexed by rank.
+ * buf_idx[] - array of locations into which data can be directly moved;
+ * this is only valid for contiguous buffer case
+ */
+ if (gpfsmpio_tuneblocking)
+ ADIOI_GPFS_Calc_my_req(fd, offset_list, len_list, contig_access_count,
+ min_st_offset, fd_start, fd_end, fd_size,
+ nprocs, &count_my_req_procs,
+ &count_my_req_per_proc, &my_req, &buf_idx);
+ else
+ ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
+ min_st_offset, fd_start, fd_end, fd_size,
+ nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx);
+
+ GPFSMPIO_T_CIO_SET_GET(r, 1, 1, GPFSMPIO_CIO_T_OTHREQ, GPFSMPIO_CIO_T_MYREQ);
+
+ /* perform a collective communication in order to distribute the
+ * data calculated above. fills in the following:
+ * count_others_req_procs - number of processes (including this
+ * one) which have requests in this process's file domain.
+ * count_others_req_per_proc[] - number of separate contiguous
+ * requests from proc i lie in this process's file domain.
+ */
+ if (gpfsmpio_tuneblocking)
+ ADIOI_GPFS_Calc_others_req(fd, count_my_req_procs,
+ count_my_req_per_proc, my_req,
+ nprocs, myrank, &count_others_req_procs, &others_req);
+ else
+ ADIOI_Calc_others_req(fd, count_my_req_procs,
+ count_my_req_per_proc, my_req,
+ nprocs, myrank, &count_others_req_procs, &others_req);
+
+ GPFSMPIO_T_CIO_SET_GET(r, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ);
+
+ /* my_req[] and count_my_req_per_proc aren't needed at this point, so
+ * let's free the memory
+ */
+ ADIOI_Free(count_my_req_per_proc);
+ ADIOI_Free(my_req[0].offsets);
+ ADIOI_Free(my_req);
+
+ /* read data in sizes of no more than ADIOI_Coll_bufsize,
+ * communicate, and fill user buf.
+ */
+ ADIOI_Read_and_exch(fd, buf, datatype, nprocs, myrank,
+ others_req, offset_list,
+ len_list, contig_access_count, min_st_offset,
+ fd_size, fd_start, fd_end, buf_idx, error_code);
+
+ GPFSMPIO_T_CIO_SET_GET(r, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_DEXCH);
+ GPFSMPIO_T_CIO_SET_GET(r, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MPIO_CRW);
+ GPFSMPIO_T_CIO_REPORT(0, fd, myrank, nprocs);
+
+ /* free all memory allocated for collective I/O */
+ if (others_req[0].offsets) {
+ ADIOI_Free(others_req[0].offsets);
+ }
+ if (others_req[0].mem_ptrs) {
+ ADIOI_Free(others_req[0].mem_ptrs);
+ }
+ ADIOI_Free(others_req);
+
+ ADIOI_Free(buf_idx);
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ ADIOI_Free(fd_start);
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ MPI_Type_size_x(datatype, &size);
+ bufsize = size * count;
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+/* This is a temporary way of filling in status. The right way is to
+ keep track of how much data was actually read and placed in buf
+ during collective I/O. */
+#endif
+
+ fd->fp_sys_posn = -1; /* set it to null. */
+}
+
+static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
+ datatype, int nprocs,
+ int myrank, ADIOI_Access
+ * others_req, ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int contig_access_count, ADIO_Offset
+ min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ MPI_Aint * buf_idx, int *error_code)
+{
+/* Read in sizes of no more than coll_bufsize, an info parameter.
+ Send data to appropriate processes.
+ Place recd. data in user buf.
+ The idea is to reduce the amount of extra memory required for
+ collective I/O. If all data were read all at once, which is much
+ easier, it would require temp space more than the size of user_buf,
+ which is often unacceptable. For example, to read a distributed
+ array from a file, where each local array is 8Mbytes, requiring
+ at least another 8Mbytes of temp space is unacceptable. */
+
+ int i, j, m, ntimes, max_ntimes, buftype_is_contig;
+ ADIO_Offset st_loc = -1, end_loc = -1, off, done, real_off, req_off;
+ char *read_buf = NULL, *tmp_buf;
+ int *curr_offlen_ptr, *count, *send_size, *recv_size;
+ int *partial_send, *recd_from_proc, *start_pos;
+ /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */
+ ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
+ int req_len, flag, rank;
+ MPI_Status status;
+ ADIOI_Flatlist_node *flat_buf = NULL;
+ MPI_Aint lb, buftype_extent;
+ int coll_bufsize;
+#ifdef RDCOLL_DEBUG
+ int iii;
+#endif
+ *error_code = MPI_SUCCESS; /* changed below if error */
+ /* only I/O errors are currently reported */
+
+/* calculate the number of reads of size coll_bufsize
+ to be done by each process and the max among all processes.
+ That gives the no. of communication phases as well.
+ coll_bufsize is obtained from the hints object. */
+
+ coll_bufsize = fd->hints->cb_buffer_size;
+
+ /* grab some initial values for st_loc and end_loc */
+ for (i = 0; i < nprocs; i++) {
+ if (others_req[i].count) {
+ st_loc = others_req[i].offsets[0];
+ end_loc = others_req[i].offsets[0];
+ break;
+ }
+ }
+
+ /* now find the real values */
+ for (i = 0; i < nprocs; i++)
+ for (j = 0; j < others_req[i].count; j++) {
+ st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
+ end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j]
+ + others_req[i].lens[j] - 1));
+ }
+
+ /* calculate ntimes, the number of times this process must perform I/O
+ * operations in order to complete all the requests it has received.
+ * the need for multiple I/O operations comes from the restriction that
+ * we only use coll_bufsize bytes of memory for internal buffering.
+ */
+ if ((st_loc == -1) && (end_loc == -1)) {
+ /* this process does no I/O. */
+ ntimes = 0;
+ } else {
+ /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize) */
+ ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize);
+ }
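+    /* e.g. (illustrative numbers): coll_bufsize = 4 MiB and a 10 MiB span
+     * (end_loc - st_loc + 1) give ntimes = ceil(10/4) = 3 rounds of
+     * read-and-exchange. */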
+
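+    /* every rank takes part in every communication round, even rounds in
+     * which it performs no local read (see the "nothing to send, but check
+     * for recv" loop below), so all ranks must agree on the global maximum. */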
+ MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);
+
+ read_buf = fd->io_buf;
+
+ curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ /* its use is explained below. calloc initializes to 0. */
+
+ count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* to store count of how many off-len pairs per proc are satisfied
+ * in an iteration. */
+
+ partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ /* if only a portion of the last off-len pair is sent to a process
+ * in a particular iteration, the length sent is stored here.
+ * calloc initializes to 0. */
+
+ send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* total size of data to be sent to each proc. in an iteration */
+
+ recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* total size of data to be recd. from each proc. in an iteration.
+ * Of size nprocs so that I can use MPI_Alltoall later. */
+
+ recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ /* amount of data recd. so far from each proc. Used in
+ * ADIOI_Fill_user_buffer. initialized to 0 here. */
+
+ start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* used to store the starting value of curr_offlen_ptr[i] in
+ * this iteration */
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ if (!buftype_is_contig) {
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+ }
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+
+ done = 0;
+ off = st_loc;
+ for_curr_iter = for_next_iter = 0;
+
+ MPI_Comm_rank(fd->comm, &rank);
+
+#ifdef PROFILE
+ MPE_Log_event(14, 0, "end computation");
+#endif
+
+ for (m = 0; m < ntimes; m++) {
+ /* read buf of size coll_bufsize (or less) */
+ /* go through all others_req and check if any are satisfied
+ * by the current read */
+
+ /* since MPI guarantees that displacements in filetypes are in
+ * monotonically nondecreasing order, I can maintain a pointer
+ * (curr_offlen_ptr) to
+ * current off-len pair for each process in others_req and scan
+ * further only from there. There is still a problem of filetypes
+ * such as: (1, 2, 3 are not process nos. They are just numbers for
+ * three chunks of data, specified by a filetype.)
+ *
+ * 1 -------!--
+ * 2 -----!----
+ * 3 --!-----
+ *
+ * where ! indicates where the current read_size limitation cuts
+ * through the filetype. I resolve this by reading up to !, but
+ * filling the communication buffer only for 1. I copy the portion
+ * left over for 2 into a tmp_buf for use in the next
+ * iteration. i.e., 2 and 3 will be satisfied in the next
+ * iteration. This simplifies filling in the user's buf at the
+ * other end, as only one off-len pair with incomplete data
+ * will be sent. I also don't need to send the individual
+ * offsets and lens along with the data, as the data is being
+ * sent in a particular order. */
+
+ /* off = start offset in the file for the data actually read in
+ * this iteration
+ * size = size of data read corresponding to off
+ * real_off = off minus whatever data was retained in memory from
+ * previous iteration for cases like 2, 3 illustrated above
+ * real_size = size plus the extra corresponding to real_off
+ * req_off = off in file for a particular contiguous request
+ * minus what was satisfied in previous iteration
+ * req_size = size corresponding to req_off */
+
+#ifdef PROFILE
+ MPE_Log_event(13, 0, "start computation");
+#endif
+ size = MPL_MIN((unsigned) coll_bufsize, end_loc - st_loc + 1 - done);
+ real_off = off - for_curr_iter;
+ real_size = size + for_curr_iter;
+
+ for (i = 0; i < nprocs; i++)
+ count[i] = send_size[i] = 0;
+ for_next_iter = 0;
+
+ for (i = 0; i < nprocs; i++) {
+#ifdef RDCOLL_DEBUG
+ DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count);
+#endif
+ if (others_req[i].count) {
+ start_pos[i] = curr_offlen_ptr[i];
+ for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
+ if (partial_send[i]) {
+ /* this request may have been partially
+ * satisfied in the previous iteration. */
+ req_off = others_req[i].offsets[j] + partial_send[i];
+ req_len = others_req[i].lens[j] - partial_send[i];
+ partial_send[i] = 0;
+ /* modify the off-len pair to reflect this change */
+ others_req[i].offsets[j] = req_off;
+ others_req[i].lens[j] = req_len;
+ } else {
+ req_off = others_req[i].offsets[j];
+ req_len = others_req[i].lens[j];
+ }
+ if (req_off < real_off + real_size) {
+ count[i]++;
+ ADIOI_Assert((((ADIO_Offset) (uintptr_t) read_buf) + req_off - real_off) ==
+ (ADIO_Offset) (uintptr_t) (read_buf + req_off - real_off));
+ MPI_Get_address(read_buf + req_off - real_off, &(others_req[i].mem_ptrs[j]));
+ ADIOI_Assert((real_off + real_size - req_off) ==
+ (int) (real_off + real_size - req_off));
+ send_size[i] +=
+ (int) (MPL_MIN
+ (real_off + real_size - req_off,
+ (ADIO_Offset) (unsigned) req_len));
+
+ if (real_off + real_size - req_off < (ADIO_Offset) (unsigned) req_len) {
+ partial_send[i] = (int) (real_off + real_size - req_off);
+ if ((j + 1 < others_req[i].count) &&
+ (others_req[i].offsets[j + 1] < real_off + real_size)) {
+ /* this is the case illustrated in the
+ * figure above. */
+ for_next_iter = MPL_MAX(for_next_iter,
+ real_off + real_size -
+ others_req[i].offsets[j + 1]);
+ /* max because it must cover requests
+ * from different processes */
+ }
+ break;
+ }
+ } else
+ break;
+ }
+ curr_offlen_ptr[i] = j;
+ }
+ }
+
+ flag = 0;
+ for (i = 0; i < nprocs; i++)
+ if (count[i])
+ flag = 1;
+
+#ifdef PROFILE
+ MPE_Log_event(14, 0, "end computation");
+#endif
+ if (flag) {
+ char round[50];
+ MPL_snprintf(round, sizeof(round), "two-phase-round=%d", m);
+ setenv("LIBIOLOG_EXTRA_INFO", round, 1);
+ ADIOI_Assert(size == (int) size);
+ ADIO_ReadContig(fd, read_buf + for_curr_iter, (int) size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status, error_code);
+#ifdef RDCOLL_DEBUG
+ DBG_FPRINTF(stderr, "\tread_coll: 700, data read [%lld] = ", (long long) size);
+ for (iii = 0; iii < size && iii < 80; iii++) {
+ DBGV_FPRINTF(stderr, "%3d,", *((unsigned char *) read_buf + for_curr_iter + iii));
+ }
+ DBG_FPRINTF(stderr, "\n");
+#endif
+
+ if (*error_code != MPI_SUCCESS)
+ return;
+ }
+
+ for_curr_iter = for_next_iter;
+
+#ifdef PROFILE
+ MPE_Log_event(7, 0, "start communication");
+#endif
+ if (gpfsmpio_comm == 1)
+ ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
+ send_size, recv_size, count,
+ start_pos, partial_send, recd_from_proc, nprocs,
+ myrank,
+ buftype_is_contig, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, m, buftype_extent, buf_idx);
+ else if (gpfsmpio_comm == 0) {
+ ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
+ send_size, recv_size, count,
+ start_pos, partial_send, recd_from_proc, nprocs,
+ myrank,
+ buftype_is_contig, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, m, buftype_extent, buf_idx);
+ }
+#ifdef PROFILE
+ MPE_Log_event(8, 0, "end communication");
+#endif
+
+ if (for_next_iter) {
+ tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
+ ADIOI_Assert((((ADIO_Offset) (uintptr_t) read_buf) + real_size - for_next_iter) ==
+ (ADIO_Offset) (uintptr_t) (read_buf + real_size - for_next_iter));
+ ADIOI_Assert((for_next_iter + coll_bufsize) == (size_t) (for_next_iter + coll_bufsize));
+ memcpy(tmp_buf, read_buf + real_size - for_next_iter, for_next_iter);
+ ADIOI_Free(fd->io_buf);
+ fd->io_buf = (char *) ADIOI_Malloc(for_next_iter + coll_bufsize);
+ memcpy(fd->io_buf, tmp_buf, for_next_iter);
+ read_buf = fd->io_buf;
+ ADIOI_Free(tmp_buf);
+ }
+
+ off += size;
+ done += size;
+ }
+
+ for (i = 0; i < nprocs; i++)
+ count[i] = send_size[i] = 0;
+#ifdef PROFILE
+ MPE_Log_event(7, 0, "start communication");
+#endif
+ for (m = ntimes; m < max_ntimes; m++)
+/* nothing to send, but check for recv. */
+
+ if (gpfsmpio_comm == 1)
+ ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
+ send_size, recv_size, count,
+ start_pos, partial_send, recd_from_proc, nprocs,
+ myrank,
+ buftype_is_contig, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, m, buftype_extent, buf_idx);
+        else if (gpfsmpio_comm == 0)    /* MPI_Alltoallv-based exchange */
+ ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
+ send_size, recv_size, count,
+ start_pos, partial_send, recd_from_proc, nprocs,
+ myrank,
+ buftype_is_contig, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, m, buftype_extent, buf_idx);
+
+#ifdef PROFILE
+ MPE_Log_event(8, 0, "end communication");
+#endif
+
+ ADIOI_Free(curr_offlen_ptr);
+ ADIOI_Free(count);
+ ADIOI_Free(partial_send);
+ ADIOI_Free(send_size);
+ ADIOI_Free(recv_size);
+ ADIOI_Free(recd_from_proc);
+ ADIOI_Free(start_pos);
+
+ unsetenv("LIBIOLOG_EXTRA_INFO");
+}
+
+static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+ * flat_buf, ADIO_Offset * offset_list, ADIO_Offset
+ * len_list, int *send_size, int *recv_size,
+ int *count, int *start_pos, int *partial_send,
+ int *recd_from_proc, int nprocs,
+ int myrank, int
+ buftype_is_contig, int contig_access_count,
+ ADIO_Offset min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ ADIOI_Access * others_req,
+ int iter, MPI_Aint buftype_extent, MPI_Aint * buf_idx)
+{
+ int i, j, k = 0, tmp = 0, nprocs_recv, nprocs_send;
+ char **recv_buf = NULL;
+ MPI_Request *requests;
+ MPI_Datatype send_type;
+ MPI_Status *statuses;
+
+/* exchange send_size info so that each process knows how much to
+ receive from whom and how much memory to allocate. */
+
+ MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm);
+
+ nprocs_recv = 0;
+ for (i = 0; i < nprocs; i++)
+ if (recv_size[i])
+ nprocs_recv++;
+
+ nprocs_send = 0;
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i])
+ nprocs_send++;
+
+ requests = (MPI_Request *)
+ ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request));
+/* +1 to avoid a 0-size malloc */
+
+/* post recvs. if buftype_is_contig, data can be directly recd. into
+ user buf at location given by buf_idx. else use recv_buf. */
+
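+/* tag note: a message from rank s to rank r in round `iter` is posted on both
+   sides with the same value s + r + 100 * iter, so sends and receives pair up
+   without shipping any per-message metadata. */
+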
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5032, 0, NULL);
+#endif
+
+ if (buftype_is_contig) {
+ j = 0;
+ for (i = 0; i < nprocs; i++)
+ if (recv_size[i]) {
+ MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i],
+ MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, requests + j);
+ j++;
+ buf_idx[i] += recv_size[i];
+ }
+ } else {
+/* allocate memory for recv_buf and post receives */
+ recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i])
+ recv_buf[i] = (char *) ADIOI_Malloc(recv_size[i]);
+ }
+
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+ MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
+ myrank + i + 100 * iter, fd->comm, requests + j);
+ j++;
+#ifdef RDCOLL_DEBUG
+ DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
+ myrank, recv_size[i], myrank + i + 100 * iter);
+#endif
+ }
+ }
+ }
+
+/* create derived datatypes and send data */
+
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (send_size[i]) {
+/* take care if the last off-len pair is a partial send */
+ if (partial_send[i]) {
+ k = start_pos[i] + count[i] - 1;
+ tmp = others_req[i].lens[k];
+ others_req[i].lens[k] = partial_send[i];
+ }
+ ADIOI_Type_create_hindexed_x(count[i],
+ &(others_req[i].lens[start_pos[i]]),
+ &(others_req[i].mem_ptrs[start_pos[i]]),
+ MPI_BYTE, &send_type);
+ /* absolute displacement; use MPI_BOTTOM in send */
+ MPI_Type_commit(&send_type);
+ MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank + i + 100 * iter,
+ fd->comm, requests + nprocs_recv + j);
+ MPI_Type_free(&send_type);
+ if (partial_send[i])
+ others_req[i].lens[k] = tmp;
+ j++;
+ }
+ }
+
+#ifdef MPI_STATUSES_IGNORE
+ statuses = MPI_STATUSES_IGNORE;
+#else
+ statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status));
+ /* +1 to avoid a 0-size malloc */
+#endif
+
+ /* wait on the receives */
+ if (nprocs_recv) {
+#ifdef NEEDS_MPI_TEST
+ j = 0;
+ while (!j)
+ MPI_Testall(nprocs_recv, requests, &j, statuses);
+#else
+ MPI_Waitall(nprocs_recv, requests, statuses);
+#endif
+
+        /* if noncontiguous, do the copies from the recv buffers */
+ if (!buftype_is_contig)
+ ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
+ offset_list, len_list, (unsigned *) recv_size,
+ requests, statuses, recd_from_proc,
+ nprocs, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end, buftype_extent);
+ }
+
+ /* wait on the sends */
+ MPI_Waitall(nprocs_send, requests + nprocs_recv, statuses + nprocs_recv);
+
+#ifndef MPI_STATUSES_IGNORE
+ ADIOI_Free(statuses);
+#endif
+ ADIOI_Free(requests);
+
+ if (!buftype_is_contig) {
+ for (i = 0; i < nprocs; i++)
+ if (recv_size[i])
+ ADIOI_Free(recv_buf[i]);
+ ADIOI_Free(recv_buf);
+ }
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5033, 0, NULL);
+#endif
+}
+
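+/* ADIOI_BUF_INCR advances the cursor into the (possibly noncontiguous) user
+   buffer by buf_incr bytes without copying anything: it walks the flattened
+   datatype's (indices, blocklens) pairs and wraps to the next datatype
+   instance (n_buftypes) when the final block is consumed. */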
+#define ADIOI_BUF_INCR \
+ { \
+ while (buf_incr) { \
+ size_in_buf = MPL_MIN(buf_incr, flat_buf_sz); \
+ user_buf_idx += size_in_buf; \
+ flat_buf_sz -= size_in_buf; \
+ if (!flat_buf_sz) { \
+ if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
+ else { \
+ flat_buf_idx = 0; \
+ n_buftypes++; \
+ } \
+ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
+ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
+ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
+ } \
+ buf_incr -= size_in_buf; \
+ } \
+ }
+
+
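+/* ADIOI_BUF_COPY copies `size` bytes from recv_buf[p] into the user buffer
+   using the same flattened-datatype walk, then invokes ADIOI_BUF_INCR to skip
+   whatever remains of buf_incr beyond the copied region. */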
+#define ADIOI_BUF_COPY \
+ { \
+ while (size) { \
+ size_in_buf = MPL_MIN(size, flat_buf_sz); \
+ ADIOI_Assert((((ADIO_Offset)(uintptr_t)buf) + user_buf_idx) == (ADIO_Offset)(uintptr_t)(buf + user_buf_idx)); \
+ ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
+ memcpy(((char *) buf) + user_buf_idx, \
+ &(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \
+ recv_buf_idx[p] += size_in_buf; /* already tested (size_t)size_in_buf*/ \
+ user_buf_idx += size_in_buf; \
+ flat_buf_sz -= size_in_buf; \
+ if (!flat_buf_sz) { \
+ if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
+ else { \
+ flat_buf_idx = 0; \
+ n_buftypes++; \
+ } \
+ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
+ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
+ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
+ } \
+ size -= size_in_buf; \
+ buf_incr -= size_in_buf; \
+ } \
+ ADIOI_BUF_INCR \
+ }
+
+static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+ * flat_buf, char **recv_buf, ADIO_Offset
+ * offset_list, ADIO_Offset * len_list,
+ unsigned *recv_size,
+ MPI_Request * requests, MPI_Status * statuses,
+ int *recd_from_proc, int nprocs,
+ int contig_access_count,
+ ADIO_Offset min_st_offset,
+ ADIO_Offset fd_size, ADIO_Offset * fd_start,
+ ADIO_Offset * fd_end, MPI_Aint buftype_extent)
+{
+
+/* this function is only called if buftype is not contig */
+
+ int i, p, flat_buf_idx;
+ ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
+ int n_buftypes;
+ ADIO_Offset off, len, rem_len, user_buf_idx;
+ /* Not sure unsigned is necessary, but it makes the math safer */
+ unsigned *curr_from_proc, *done_from_proc, *recv_buf_idx;
+
+ MPL_UNREFERENCED_ARG(requests);
+ MPL_UNREFERENCED_ARG(statuses);
+
+/* curr_from_proc[p] = amount of data recd from proc. p that has already
+ been accounted for so far
+ done_from_proc[p] = amount of data already recd from proc. p and
+ filled into user buffer in previous iterations
+ user_buf_idx = current location in user buffer
+ recv_buf_idx[p] = current location in recv_buf of proc. p */
+ curr_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
+ done_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
+ recv_buf_idx = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
+
+ for (i = 0; i < nprocs; i++) {
+ recv_buf_idx[i] = curr_from_proc[i] = 0;
+ done_from_proc[i] = recd_from_proc[i];
+ }
+
+ user_buf_idx = flat_buf->indices[0];
+ flat_buf_idx = 0;
+ n_buftypes = 0;
+ flat_buf_sz = flat_buf->blocklens[0];
+
+ /* flat_buf_idx = current index into flattened buftype
+ * flat_buf_sz = size of current contiguous component in
+ * flattened buf */
+
+ for (i = 0; i < contig_access_count; i++) {
+ off = offset_list[i];
+ rem_len = len_list[i];
+
+ /* this request may span the file domains of more than one process */
+ while (rem_len > 0) {
+ len = rem_len;
+ /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
+ * longer than the single region that processor "p" is responsible
+ * for.
+ */
+ p = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end);
+
+            if (recv_buf_idx[p] < recv_size[p]) {
+                if (curr_from_proc[p] + len > done_from_proc[p]) {
+                    if (done_from_proc[p] > curr_from_proc[p]) {
+                        size = MPL_MIN(curr_from_proc[p] + len -
+                                       done_from_proc[p], recv_size[p] - recv_buf_idx[p]);
+                        buf_incr = done_from_proc[p] - curr_from_proc[p];
+                        ADIOI_BUF_INCR
+                        buf_incr = curr_from_proc[p] + len - done_from_proc[p];
+                        ADIOI_Assert((done_from_proc[p] + size) ==
+                                     (unsigned) ((ADIO_Offset) done_from_proc[p] + size));
+                        curr_from_proc[p] = done_from_proc[p] + size;
+                        ADIOI_BUF_COPY
+                    } else {
+                        size = MPL_MIN(len, recv_size[p] - recv_buf_idx[p]);
+                        buf_incr = len;
+                        ADIOI_Assert((curr_from_proc[p] + size) ==
+                                     (unsigned) ((ADIO_Offset) curr_from_proc[p] + size));
+                        curr_from_proc[p] += (unsigned) size;
+                        ADIOI_BUF_COPY
+                    }
+                } else {
+                    ADIOI_Assert((curr_from_proc[p] + len) ==
+                                 (unsigned) ((ADIO_Offset) curr_from_proc[p] + len));
+                    curr_from_proc[p] += (unsigned) len;
+                    buf_incr = len;
+                    ADIOI_BUF_INCR
+                }
+            } else {
+                buf_incr = len;
+                ADIOI_BUF_INCR
+            }
+ off += len;
+ rem_len -= len;
+ }
+ }
+ for (i = 0; i < nprocs; i++)
+ if (recv_size[i])
+ recd_from_proc[i] = curr_from_proc[i];
+
+ ADIOI_Free(curr_from_proc);
+ ADIOI_Free(done_from_proc);
+ ADIOI_Free(recv_buf_idx);
+}
+
+static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatlist_node
+ * flat_buf, ADIO_Offset * offset_list, ADIO_Offset
+ * len_list, int *send_size, int *recv_size,
+ int *count, int *start_pos, int *partial_send,
+ int *recd_from_proc, int nprocs,
+ int myrank, int
+ buftype_is_contig, int contig_access_count,
+ ADIO_Offset min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ ADIOI_Access * others_req,
+ int iter, MPI_Aint buftype_extent, MPI_Aint * buf_idx)
+{
+ int i, j, k = 0, tmp = 0, nprocs_recv, nprocs_send;
+ char **recv_buf = NULL;
+ MPI_Request *requests = NULL;
+ MPI_Status *statuses = NULL;
+ int rtail, stail;
+ char *sbuf_ptr, *from_ptr;
+ int len;
+ int *sdispls, *rdispls;
+ char *all_recv_buf, *all_send_buf;
+
+ /* exchange send_size info so that each process knows how much to
+ * receive from whom and how much memory to allocate. */
+ MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm);
+
+ nprocs_recv = 0;
+ for (i = 0; i < nprocs; i++)
+ if (recv_size[i]) {
+ nprocs_recv++;
+ break;
+ }
+
+ nprocs_send = 0;
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i]) {
+ nprocs_send++;
+ break;
+ }
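+    /* note: unlike the Isend/Irecv exchange, nprocs_recv and nprocs_send are
+     * used here only as boolean flags ("anything to move at all?"), so the
+     * loops above stop at the first nonzero entry. */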
+
+ /* receiver side data structures */
+ rdispls = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ rtail = 0;
+ for (i = 0; i < nprocs; i++) {
+ rdispls[i] = rtail;
+ rtail += recv_size[i];
+ }
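+    /* rdispls is the exclusive prefix sum of recv_size: the byte offset of
+     * each rank's incoming data within all_recv_buf, which is exactly the
+     * displacement array MPI_Alltoallv expects (sdispls below is built the
+     * same way for the send side). */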
+
+ /* data buffer */
+ all_recv_buf = (char *) ADIOI_Malloc(rtail);
+ recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
+ for (i = 0; i < nprocs; i++) {
+ recv_buf[i] = all_recv_buf + rdispls[i];
+ }
+
+ /* sender side data structures */
+ sdispls = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ stail = 0;
+ for (i = 0; i < nprocs; i++) {
+ sdispls[i] = stail;
+ stail += send_size[i];
+ }
+
+ /* data buffer */
+ all_send_buf = (char *) ADIOI_Malloc(stail);
+ for (i = 0; i < nprocs; i++) {
+ if (send_size[i]) {
+ if (partial_send[i]) {
+ k = start_pos[i] + count[i] - 1;
+ tmp = others_req[i].lens[k];
+ others_req[i].lens[k] = partial_send[i];
+ }
+ sbuf_ptr = all_send_buf + sdispls[i];
+ for (j = 0; j < count[i]; j++) {
+ from_ptr =
+ (char *) ADIOI_AINT_CAST_TO_VOID_PTR(others_req[i].mem_ptrs[start_pos[i] + j]);
+ len = others_req[i].lens[start_pos[i] + j];
+ memcpy(sbuf_ptr, from_ptr, len);
+ sbuf_ptr += len;
+ }
+ if (partial_send[i])
+ others_req[i].lens[k] = tmp;
+ }
+ }
+
+#ifdef RDCOLL_DEBUG
+ DBG_FPRINTF(stderr, "\tsend_size = [%d]%2d,", 0, send_size[0]);
+ for (i = 1; i < nprocs; i++)
+ if (send_size[i - 1] != send_size[i]) {
+ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, send_size[i]);
+ }
+ DBG_FPRINTF(stderr, "\trecv_size = [%d]%2d,", 0, recv_size[0]);
+ for (i = 1; i < nprocs; i++)
+ if (recv_size[i - 1] != recv_size[i]) {
+ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, recv_size[i]);
+ }
+ DBG_FPRINTF(stderr, "\tsdispls = [%d]%2d,", 0, sdispls[0]);
+ for (i = 1; i < nprocs; i++)
+ if (sdispls[i - 1] != sdispls[i]) {
+ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, sdispls[i]);
+ }
+ DBG_FPRINTF(stderr, "\trdispls = [%d]%2d,", 0, rdispls[0]);
+ for (i = 1; i < nprocs; i++)
+ if (rdispls[i - 1] != rdispls[i]) {
+ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, rdispls[i]);
+ }
+ DBG_FPRINTF(stderr, "\ttails = %4d, %4d\n", stail, rtail);
+    if (nprocs_send) {
+        DBG_FPRINTF(stderr, "\tall_send_buf = [%d]%2d,", 0, all_send_buf[0]);
+        /* someone at some point found it useful to look at the 128th kilobyte
+         * of data from each processor, but this segfaults in many situations
+         * if "all debugging" is enabled */
+        /* for (i = 1; i < nprocs; i++)
+         *     if (all_send_buf[i * 131072] != all_send_buf[(i - 1) * 131072])
+         *         DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf[i * 131072]); */
+        DBG_FPRINTF(stderr, "\n");
+    }
+#endif
+
+    /* exchange the data in a single collective call */
+    MPI_Alltoallv(all_send_buf, send_size, sdispls, MPI_BYTE,
+                  all_recv_buf, recv_size, rdispls, MPI_BYTE, fd->comm);
+
+#if 0
+ DBG_FPRINTF(stderr, "\tall_recv_buf = ");
+ for (i = 131072; i < 131073; i++) {
+ DBG_FPRINTF(stderr, "%2d,", all_recv_buf[i]);
+ }
+ DBG_FPRINTF(stderr, "\n");
+#endif
+
+ /* unpack at the receiver side */
+ if (nprocs_recv) {
+ if (!buftype_is_contig)
+ ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf, offset_list, len_list, (unsigned *) recv_size, requests, statuses, /* never used inside */
+ recd_from_proc,
+ nprocs, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end, buftype_extent);
+ else {
+ rtail = 0;
+ for (i = 0; i < nprocs; i++)
+ if (recv_size[i]) {
+ memcpy((char *) buf + buf_idx[i], all_recv_buf + rtail, recv_size[i]);
+ buf_idx[i] += recv_size[i];
+ rtail += recv_size[i];
+ }
+ }
+ }
+
+ ADIOI_Free(all_send_buf);
+ ADIOI_Free(all_recv_buf);
+ ADIOI_Free(recv_buf);
+ ADIOI_Free(sdispls);
+ ADIOI_Free(rdispls);
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_tuning.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_tuning.c
new file mode 100644
index 0000000000000000000000000000000000000000..6158784d974672a43299b16805c00d1fa6410108
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_tuning.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_tuning.c
+ * \brief Defines ad_gpfs performance tuning
+ */
+
+/*---------------------------------------------------------------------
+ * ad_gpfs_tuning.c
+ *
+ * defines global variables and functions for performance tuning and
+ * functional debugging.
+ *---------------------------------------------------------------------*/
+
+#include "ad_gpfs_tuning.h"
+#include "mpi.h"
+
+#if !defined(PVFS2_SUPER_MAGIC)
+#define PVFS2_SUPER_MAGIC (0x20030528)
+#endif
+
+
+int gpfsmpio_timing;
+int gpfsmpio_timing2;
+int gpfsmpio_timing_cw_level;
+int gpfsmpio_comm;
+int gpfsmpio_tuneblocking;
+long bglocklessmpio_f_type;
+int gpfsmpio_bg_nagg_pset;
+int gpfsmpio_pthreadio;
+int gpfsmpio_p2pcontig;
+int gpfsmpio_balancecontig;
+int gpfsmpio_devnullio;
+int gpfsmpio_bridgeringagg;
+
+double gpfsmpio_prof_cw[GPFSMPIO_CIO_LAST + 1];
+double gpfsmpio_prof_cr[GPFSMPIO_CIO_LAST + 1];
+
+/* set internal variables for tuning environment variables */
+/** \page mpiio_vars MPIIO Configuration
+ \section env_sec Environment Variables
+ * - GPFSMPIO_COMM - Define how data is exchanged on collective
+ * reads and writes. Possible values:
+ * - 0 - Use MPI_Alltoallv.
+ * - 1 - Use MPI_Isend/MPI_Irecv.
+ * - Default is 0.
+ *
+ * - GPFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
+ * Possible values:
+ * - 0 - Do not collect/report timing.
+ * - 1 - Collect/report timing.
+ * - Default is 0.
+ *
+ * - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
+ * calculated (block size). Possible values:
+ * - 0 - Evenly calculate file domains across aggregators. Also use
+ * MPI_Isend/MPI_Irecv to exchange domain information.
+ * - 1 - Align file domains with the underlying file system's block size. Also use
+ * MPI_Alltoallv to exchange domain information.
+ * - Default is 1.
+ *
+ * - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
+ * the ad_bglockless driver. NOTE: Using romio prefixes (such as
+ * "bg:" or "bglockless:") on a file name will override this environment
+ * variable. Possible values:
+ * - 0xnnnnnnnn - Any valid file system type (or "magic number") from
+ * statfs() field f_type.
+ * - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
+ *
+ * - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
+ * compute group (compute nodes + i/o nodes). Possible values:
+ * - any integer
+ * - Default is 8
+ *
+ * - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous I/O where a
+ *   pthread is spawned to do the POSIX writes while the main thread does the
+ *   data aggregation - useful for large files where multiple rounds are
+ *   required (more than cb_buffer_size of data per aggregator). The user
+ *   must ensure there are hardware resources available for the thread to run.
+ *   I am sure there is a better way to do this involving comm threads - this
+ *   is just a start. NOTE: For some reason the stats collected when this is
+ *   enabled miss some of the data, so the data sizes are off a bit - this is
+ *   a statistical issue only; the data is still accurately written out.
+ *
+ * - GPFSMPIO_P2PCONTIG - Does simple point-to-point communication between the
+ * aggregator and the procs that feed it. Performance could be enhanced by a
+ * one-sided put algorithm. Current implementation allows only 1 round of
+ * data. Useful/allowed only when:
+ * 1.) The datatype is contiguous.
+ * 2.) The offsets are increasing in rank-order.
+ * 3.) There are no gaps between the offsets.
+ * 4.) No single rank has a data size which spans multiple file domains.
+ *
+ * - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ. File domain blocks are assigned
+ * to aggregators in a breadth-first fashion relative to the ions - additionally,
+ * file domains on the aggregators sharing the same bridgeset and ion have contiguous
+ * offsets. The breadth-first assignment improves performance in the case of
+ * a relatively small file of size less than the gpfs block size multiplied
+ *   by the number of ions. Files: ad_gpfs_aggrs.c, ad_bg_aggrs.c. Possible values:
+ *   - 0 - assign file domain blocks in the traditional manner
+ *   - 1 - if there are variable-sized file domain blocks, spread them out
+ *     (balance) across bridge nodes
+ *
+ * - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
+ * system. When experimenting with different two-phase I/O strategies, it's
+ * helpful to remove the highly variable file system from the experiment.
+ * - 0 (disabled) or 1 (enabled)
+ * - Default is 0
+ *
+ * - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ. Aggregator placement
+ *   optimization which forms a 5-d ring around the bridge node starting at
+ *   GPFSMPIO_BRIDGERINGAGG hops away. Experimental performance results
+ *   suggest the best value is 1, and only in conjunction with GPFSMPIO_P2PCONTIG
+ *   and GPFSMPIO_BALANCECONTIG. The number of aggregators selected is still
+ *   GPFSMPIO_NAGG_PSET; however, the bridge node itself is NOT selected.
+ *
+ */
+
+void ad_gpfs_get_env_vars(void)
+{
+ char *x, *dummy;
+
+ gpfsmpio_comm = 0;
+ x = getenv("GPFSMPIO_COMM");
+ if (x)
+ gpfsmpio_comm = atoi(x);
+ gpfsmpio_timing = 0;
+ x = getenv("GPFSMPIO_TIMING");
+ if (x)
+ gpfsmpio_timing = atoi(x);
+ gpfsmpio_tuneblocking = 1;
+ x = getenv("GPFSMPIO_TUNEBLOCKING");
+ if (x)
+ gpfsmpio_tuneblocking = atoi(x);
+ bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
+ x = getenv("BGLOCKLESSMPIO_F_TYPE");
+ if (x)
+ bglocklessmpio_f_type = strtol(x, &dummy, 0);
+ DBG_FPRINTF(stderr, "BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
+ bglocklessmpio_f_type, bglocklessmpio_f_type);
+ /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
+ * when we know a bit more about what "largest possible value" and
+ * "smallest possible value" should be */
+ gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
+ x = getenv("GPFSMPIO_NAGG_PSET");
+ if (x)
+ gpfsmpio_bg_nagg_pset = atoi(x);
+
+ gpfsmpio_p2pcontig = 0;
+ x = getenv("GPFSMPIO_P2PCONTIG");
+ if (x)
+ gpfsmpio_p2pcontig = atoi(x);
+
+ gpfsmpio_balancecontig = 0;
+ x = getenv("GPFSMPIO_BALANCECONTIG");
+ if (x)
+ gpfsmpio_balancecontig = atoi(x);
+
+ gpfsmpio_devnullio = 0;
+ x = getenv("GPFSMPIO_DEVNULLIO");
+ if (x)
+ gpfsmpio_devnullio = atoi(x);
+
+ gpfsmpio_bridgeringagg = 0;
+ x = getenv("GPFSMPIO_BRIDGERINGAGG");
+ if (x)
+ gpfsmpio_bridgeringagg = atoi(x);
+
+}
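+
+/* usage sketch (illustrative): these are ordinary environment variables read
+ * with getenv(), so a run selecting the Isend/Irecv exchange path with timing
+ * reports enabled might look like:
+ *
+ *     GPFSMPIO_COMM=1 GPFSMPIO_TIMING=1 mpiexec -n 64 ./my_app
+ */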
+
+/* report timing breakdown for MPI I/O collective call */
+void ad_gpfs_timing_crw_report(int rw, ADIO_File fd, int myrank, int nprocs)
+{
+ int i;
+
+ if (gpfsmpio_timing) {
+ /* Timing across the whole communicator is a little bit interesting,
+ * but what is *more* interesting is if we single out the aggregators
+ * themselves. non-aggregators spend a lot of time in "exchange" not
+ * exchanging data, but blocked because they are waiting for
+ * aggregators to finish writing. If we focus on just the aggregator
+         * processes we will get a clearer picture of the data exchange
+         * vs. I/O time breakdown */
+
+ /* if deferred open enabled, we could use the aggregator communicator */
+ MPI_Comm agg_comm;
+ int nr_aggs, agg_rank;
+ MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
+ if (agg_comm != MPI_COMM_NULL) {
+ MPI_Comm_size(agg_comm, &nr_aggs);
+ MPI_Comm_rank(agg_comm, &agg_rank);
+ }
+
+ double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
+ if (rw)
+ gpfsmpio_prof_org = gpfsmpio_prof_cw;
+
+ double gpfsmpio_prof_avg[GPFSMPIO_CIO_LAST];
+ double gpfsmpio_prof_max[GPFSMPIO_CIO_LAST];
+
+ if (agg_comm != MPI_COMM_NULL) {
+ MPI_Reduce(gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM,
+ 0, agg_comm);
+ MPI_Reduce(gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX,
+ 0, agg_comm);
+ }
+ if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {
+
+ for (i = 0; i < GPFSMPIO_CIO_LAST; i++)
+ gpfsmpio_prof_avg[i] /= nr_aggs;
+
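+            /* bandwidth figures: total bytes moved by all aggregators
+             * (avg * nr_aggs) divided by the slowest aggregator's time,
+             * since the collective only completes when the last one does */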
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_B_POSI_RW] =
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_DATA_SIZE] * nr_aggs /
+ gpfsmpio_prof_max[GPFSMPIO_CIO_T_POSI_RW];
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_B_MPIO_RW] =
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_DATA_SIZE] * nr_aggs /
+ gpfsmpio_prof_max[GPFSMPIO_CIO_T_MPIO_RW];
+
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_B_MPIO_CRW] =
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_DATA_SIZE] * nr_aggs /
+ gpfsmpio_prof_max[GPFSMPIO_CIO_T_MPIO_CRW];
+
+ fprintf(stderr, "TIMING-%1s,", (rw ? "W" : "R"));
+ fprintf(stderr, "SIZE: %12.4lld , ",
+ (long long int) (gpfsmpio_prof_avg[GPFSMPIO_CIO_DATA_SIZE] * nr_aggs));
+ fprintf(stderr, "SEEK-avg: %10.3f , ", gpfsmpio_prof_avg[GPFSMPIO_CIO_T_SEEK]);
+ fprintf(stderr, "SEEK-max: %10.3f , ", gpfsmpio_prof_max[GPFSMPIO_CIO_T_SEEK]);
+ fprintf(stderr, "LOCAL-avg: %10.3f , ", gpfsmpio_prof_avg[GPFSMPIO_CIO_T_LCOMP]);
+ fprintf(stderr, "GATHER-max: %10.3f , ", gpfsmpio_prof_max[GPFSMPIO_CIO_T_GATHER]);
+ fprintf(stderr, "PATTERN-avg: %10.3f , ", gpfsmpio_prof_avg[GPFSMPIO_CIO_T_PATANA]);
+ fprintf(stderr, "FILEDOMAIN-avg: %10.3f , ", gpfsmpio_prof_avg[GPFSMPIO_CIO_T_FD_PART]);
+ fprintf(stderr, "MYREQ-avg: %10.3f , ", gpfsmpio_prof_avg[GPFSMPIO_CIO_T_MYREQ]);
+ fprintf(stderr, "OTHERREQ-max: %10.3f , ", gpfsmpio_prof_max[GPFSMPIO_CIO_T_OTHREQ]);
+ fprintf(stderr, "EXCHANGE-max: %10.3f , ", gpfsmpio_prof_max[GPFSMPIO_CIO_T_DEXCH]);
+ fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
+ gpfsmpio_prof_max[GPFSMPIO_CIO_T_DEXCH_RECV_EXCH]);
+ fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
+ gpfsmpio_prof_max[GPFSMPIO_CIO_T_DEXCH_SETUP]);
+ fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
+ gpfsmpio_prof_max[GPFSMPIO_CIO_T_DEXCH_NET]);
+ fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
+ gpfsmpio_prof_max[GPFSMPIO_CIO_T_DEXCH_SORT]);
+ fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
+ gpfsmpio_prof_max[GPFSMPIO_CIO_T_DEXCH_SIEVE]);
+ fprintf(stderr, "POSIX-TIME-avg: %10.3f , ", gpfsmpio_prof_avg[GPFSMPIO_CIO_T_POSI_RW]);
+ fprintf(stderr, "POSIX-TIME-max: %10.3f , ", gpfsmpio_prof_max[GPFSMPIO_CIO_T_POSI_RW]);
+ fprintf(stderr, "MPIIO-CONTIG-TIME-avg: %10.3f , ",
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_T_MPIO_RW]);
+ fprintf(stderr, "MPIIO-STRIDED-TIME-avg: %10.3f , ",
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_T_MPIO_CRW]);
+ fprintf(stderr, "POSIX-BW-avg: %10.3f , ", gpfsmpio_prof_avg[GPFSMPIO_CIO_B_POSI_RW]);
+ fprintf(stderr, "MPI-BW-avg: %10.3f , ", gpfsmpio_prof_avg[GPFSMPIO_CIO_B_MPIO_RW]);
+ fprintf(stderr, "MPI-BW-collective-avg: %10.3f\n ",
+ gpfsmpio_prof_avg[GPFSMPIO_CIO_B_MPIO_CRW]);
+ }
+ if (agg_comm != MPI_COMM_NULL)
+ MPI_Comm_free(&agg_comm);
+ }
+
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_tuning.h b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_tuning.h
new file mode 100644
index 0000000000000000000000000000000000000000..d34b304249bd3db52968d7df92cf8c32f1f746c5
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_tuning.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_tuning.h
+ * \brief Declares globals and macros for ad_gpfs performance tuning
+ */
+
+/*---------------------------------------------------------------------
+ * ad_gpfs_tuning.h
+ *
+ * declares global variables and macros for performance tuning and
+ * functional debugging.
+ *---------------------------------------------------------------------*/
+
+#ifndef AD_GPFS_TUNING_H_INCLUDED
+#define AD_GPFS_TUNING_H_INCLUDED
+
+#include "adio.h"
+
+
+/*-----------------------------------------
+ * Global variables for the control of
+ * 1. timing
+ * 2. select specific optimizations
+ * 3. global flags for certain optimizations
+ *-----------------------------------------*/
+
+/* timing fields */
+enum {
+ GPFSMPIO_CIO_DATA_SIZE = 0,
+ GPFSMPIO_CIO_T_SEEK,
+ GPFSMPIO_CIO_T_LCOMP, /* time for ADIOI_Calc_my_off_len(), local */
+ GPFSMPIO_CIO_T_GATHER, /* time for previous MPI_Allgather, now Allreduce */
+ GPFSMPIO_CIO_T_PATANA, /* time for a quick test if access is contiguous or not, local */
+ GPFSMPIO_CIO_T_FD_PART, /* time for file domain partitioning, local */
+ GPFSMPIO_CIO_T_MYREQ, /* time for ADIOI_Calc_my_req(), local */
+ GPFSMPIO_CIO_T_OTHREQ, /* time for ADIOI_Calc_others_req(), short Alltoall */
+ GPFSMPIO_CIO_T_DEXCH, /* time for I/O data exchange */
+ /* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
+    GPFSMPIO_CIO_T_DEXCH_RECV_EXCH, /* time for each process to exchange receive
+ * size info with everyone else */
+ GPFSMPIO_CIO_T_DEXCH_SETUP, /* time for setup portion of I/O data exchange */
+ GPFSMPIO_CIO_T_DEXCH_NET, /* time for network portion of I/O data exchange */
+    GPFSMPIO_CIO_T_DEXCH_SORT,  /* time to sort requests in I/O data exchange */
+ GPFSMPIO_CIO_T_DEXCH_SIEVE, /* time for read portion of RMW in two phase */
+ GPFSMPIO_CIO_T_POSI_RW,
+ GPFSMPIO_CIO_B_POSI_RW,
+ GPFSMPIO_CIO_T_MPIO_RW, /* time for ADIOI_WriteContig() */
+ GPFSMPIO_CIO_B_MPIO_RW,
+ GPFSMPIO_CIO_T_MPIO_CRW, /* time for ADIOI_GPFS_WriteStridedColl() */
+ GPFSMPIO_CIO_B_MPIO_CRW,
+ GPFSMPIO_CIO_LAST
+};
+
+/* +1 because GPFSMPIO_CIO_LAST is actually used to say "zero this counter" */
+extern double gpfsmpio_prof_cw[GPFSMPIO_CIO_LAST + 1];
+extern double gpfsmpio_prof_cr[GPFSMPIO_CIO_LAST + 1];
+
+/* corresponds to environment variables to select optimizations and timing level */
+extern int gpfsmpio_timing;
+extern int gpfsmpio_timing_cw_level;
+extern int gpfsmpio_comm;
+extern int gpfsmpio_tuneblocking;
+extern long bglocklessmpio_f_type;
+extern int gpfsmpio_pthreadio;
+extern int gpfsmpio_p2pcontig;
+extern int gpfsmpio_balancecontig;
+extern int gpfsmpio_devnullio;
+extern int gpfsmpio_bridgeringagg;
+
+/* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
+ * i/o node and all compute nodes wired to it. On Blue Gene /Q that
+ * relationship is a lot more fluid. There are still I/O nodes, and compute
+ * nodes are assigned to an i/o node, but there are two routes to the i/o node,
+ * via compute nodes designated as "bridge nodes". In this code, what we used
+ * to call a "pset" is actually "compute nodes associated with and including a
+ * bridge node". So, "nAgg" is roughly "number of aggregators per bridge", but
+ * look closely at ADIOI_BG_persInfo_init() for the details */
+
+#define ADIOI_BG_NAGG_PSET_DFLT 16
+
+extern int gpfsmpio_bg_nagg_pset;
+
+
+/* set internal variables for tuning environment variables */
+void ad_gpfs_get_env_vars(void);
+
+/* report timing breakdown for MPI I/O collective call */
+void ad_gpfs_timing_crw_report(int rw, ADIO_File fd, int myrank, int nprocs);
+
+/* note:
+ * T := timing;
+ * CIO := collective I/O
+ */
+#define GPFSMPIO_T_CIO_RESET(RW) \
+ { \
+ int _i; \
+ for (_i = 0; _i < GPFSMPIO_CIO_LAST; _i ++) \
+ gpfsmpio_prof_c##RW [ _i ] = 0; \
+ }
+
+#define GPFSMPIO_T_CIO_REPORT(RW, FD, MYRANK, NPROCS) \
+    ad_gpfs_timing_crw_report (RW, FD, MYRANK, NPROCS);
+
+#define GPFSMPIO_T_CIO_SET_GET(RW, ISSET, ISGET, VAR1, VAR2) \
+ { \
+ double temp = MPI_Wtime(); \
+ if (ISSET) gpfsmpio_prof_c##RW [ VAR1 ] = temp; \
+ if (ISGET) gpfsmpio_prof_c##RW [ VAR2 ] = temp - gpfsmpio_prof_c##RW [ VAR2 ] ; \
+ }
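+
+/* usage sketch: the call below (taken from the collective read path) starts
+   the data-exchange timer while closing out the "others req" timer:
+
+       GPFSMPIO_T_CIO_SET_GET(r, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ);
+
+   i.e. gpfsmpio_prof_cr[T_DEXCH] = now and
+   gpfsmpio_prof_cr[T_OTHREQ] = now - gpfsmpio_prof_cr[T_OTHREQ]. */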
+
+#endif /* AD_GPFS_TUNING_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_wrcoll.c b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_wrcoll.c
new file mode 100644
index 0000000000000000000000000000000000000000..13d31282204152e0f4cacb34beeda0cdcb8aea69
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/ad_gpfs_wrcoll.c
@@ -0,0 +1,1756 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_gpfs_wrcoll.c
+ * \brief Implements the GPFS two-phase collective write path
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "ad_gpfs.h"
+#include "ad_gpfs_aggrs.h"
+
+#ifdef BGQPLATFORM
+#include "bg/ad_bg_pset.h"
+#endif
+
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+#ifdef PROFILE
+#include "mpe.h"
+#endif
+
+#include <pthread.h>
+
+#ifdef HAVE_GPFS_H
+#include <gpfs.h>
+#endif
+#ifdef HAVE_GPFS_FCNTL_H
+#include <gpfs_fcntl.h>
+#endif
+
+#include <limits.h>
+/* prototypes of functions used for collective writes only. */
+static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
+ datatype, int nprocs, int myrank, ADIOI_Access
+ * others_req, ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int contig_access_count, ADIO_Offset
+ min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ MPI_Aint * buf_idx, int *error_code);
+static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf,
+ ADIOI_Flatlist_node * flat_buf, ADIO_Offset
+ * offset_list, ADIO_Offset * len_list, int *send_size,
+ int *recv_size, ADIO_Offset off, int size,
+ int *count, int *start_pos, int *partial_recv,
+ int *sent_to_proc, int nprocs,
+ int myrank, int
+ buftype_is_contig, int contig_access_count,
+ ADIO_Offset min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ ADIOI_Access * others_req,
+ int *send_buf_idx, int *curr_to_proc,
+ int *done_to_proc, int *hole, int iter,
+ MPI_Aint buftype_extent, MPI_Aint * buf_idx, int *error_code);
+static void ADIOI_W_Exchange_data_alltoallv(ADIO_File fd, const void *buf, char *write_buf, /* 1 */
+ ADIOI_Flatlist_node * flat_buf, ADIO_Offset * offset_list, ADIO_Offset * len_list, int *send_size, int *recv_size, ADIO_Offset off, int size, /* 2 */
+ int *count, int *start_pos, int *partial_recv, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset * fd_start, ADIO_Offset * fd_end, ADIOI_Access * others_req, int *send_buf_idx, int *curr_to_proc, /* 3 */
+ int *done_to_proc, int *hole, /* 4 */
+ int iter, MPI_Aint buftype_extent, MPI_Aint * buf_idx,
+ int *error_code);
+static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
+ * flat_buf, char **send_buf, ADIO_Offset
+ * offset_list, ADIO_Offset * len_list, int *send_size,
+ MPI_Request * requests, int *sent_to_proc,
+ int nprocs, int myrank,
+ int contig_access_count, ADIO_Offset
+ min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ int *send_buf_idx, int *curr_to_proc,
+ int *done_to_proc, int iter, MPI_Aint buftype_extent);
+static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
+ * flat_buf, char **send_buf, ADIO_Offset
+ * offset_list, ADIO_Offset * len_list, int *send_size,
+ MPI_Request * requests, int *sent_to_proc,
+ int nprocs, int myrank,
+ int contig_access_count, ADIO_Offset
+ min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ int *send_buf_idx, int *curr_to_proc,
+ int *done_to_proc, int iter, MPI_Aint buftype_extent);
+static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
+ ADIO_Offset * srt_off, int *srt_len, int *start_pos,
+ int nprocs, int nprocs_recv, int total_elements);
+
+
+void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code)
+{
+/* Uses a generalized version of the extended two-phase method described
+ in "An Extended Two-Phase Method for Accessing Sections of
+ Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
+ Scientific Programming, (5)4:301--317, Winter 1996.
+ http://www.mcs.anl.gov/home/thakur/ext2ph.ps */
+
+ ADIOI_Access *my_req;
+ /* array of nprocs access structures, one for each other process in
+ * whose file domain this process's request lies */
+
+ ADIOI_Access *others_req;
+ /* array of nprocs access structures, one for each other process
+ * whose request lies in this process's file domain. */
+
+ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
+ int contig_access_count = 0, interleave_count = 0, buftype_is_contig;
+ int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
+ ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off;
+ ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
+ *fd_end = NULL, *end_offsets = NULL;
+ ADIO_Offset *gpfs_offsets0 = NULL, *gpfs_offsets = NULL;
+ ADIO_Offset *count_sizes;
+ int ii;
+
+ MPI_Aint *buf_idx = NULL;
+ ADIO_Offset *len_list = NULL;
+ GPFSMPIO_T_CIO_RESET(w)
+#ifdef PROFILE
+ MPE_Log_event(13, 0, "start computation");
+#endif
+
+ MPI_Comm_size(fd->comm, &nprocs);
+ MPI_Comm_rank(fd->comm, &myrank);
+
+/* the number of processes that actually perform I/O, nprocs_for_coll,
+ * is stored in the hints off the ADIO_File structure
+ */
+ nprocs_for_coll = fd->hints->cb_nodes;
+ orig_fp = fd->fp_ind;
+
+ GPFSMPIO_T_CIO_SET_GET(w, 1, 0, GPFSMPIO_CIO_T_MPIO_CRW, GPFSMPIO_CIO_LAST);
+ GPFSMPIO_T_CIO_SET_GET(w, 1, 0, GPFSMPIO_CIO_T_LCOMP, GPFSMPIO_CIO_LAST);
+
+ /* only check for interleaving if cb_write isn't disabled */
+ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
+ /* For this process's request, calculate the list of offsets and
+ * lengths in the file and determine the start and end offsets.
+ * Note: end_offset points to the last byte-offset to be accessed.
+ * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
+ */
+ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
+ &offset_list, &len_list, &start_offset,
+ &end_offset, &contig_access_count);
+
+ GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_GATHER, GPFSMPIO_CIO_T_LCOMP);
+
+        /* each process communicates its start and end offsets to the other
+         * processes. The result is one array of start offsets and one of end
+         * offsets, each stored in order of process rank. */
+ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * 2 * sizeof(ADIO_Offset));
+ end_offsets = st_offsets + nprocs;
+
+ ADIO_Offset my_count_size = 0;
+        /* One-sided aggregation needs the amount of data per rank as well,
+         * because the difference between the starting and ending offsets of a
+         * 1-byte access is 0, the same as for 0 bytes, so the two cannot be
+         * distinguished from the offsets alone.
+         */
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ count_sizes = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
+ MPI_Count buftype_size;
+ MPI_Type_size_x(datatype, &buftype_size);
+ my_count_size = (ADIO_Offset) count *(ADIO_Offset) buftype_size;
+ }
+ if (romio_tunegather) {
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(6 * nprocs * sizeof(ADIO_Offset));
+ gpfs_offsets = gpfs_offsets0 + 3 * nprocs;
+ for (ii = 0; ii < nprocs; ii++) {
+ gpfs_offsets0[ii * 3] = 0;
+ gpfs_offsets0[ii * 3 + 1] = 0;
+ gpfs_offsets0[ii * 3 + 2] = 0;
+ }
+ gpfs_offsets0[myrank * 3] = start_offset;
+ gpfs_offsets0[myrank * 3 + 1] = end_offset;
+ gpfs_offsets0[myrank * 3 + 2] = my_count_size;
+ MPI_Allreduce(gpfs_offsets0, gpfs_offsets, nprocs * 3, ADIO_OFFSET, MPI_MAX,
+ fd->comm);
+ for (ii = 0; ii < nprocs; ii++) {
+ st_offsets[ii] = gpfs_offsets[ii * 3];
+ end_offsets[ii] = gpfs_offsets[ii * 3 + 1];
+ count_sizes[ii] = gpfs_offsets[ii * 3 + 2];
+ }
+ } else {
+                /* one allocation covers both halves; only gpfs_offsets0 is
+                 * freed below, so gpfs_offsets must alias its second half
+                 * rather than get its own malloc */
+                gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(4 * nprocs * sizeof(ADIO_Offset));
+                gpfs_offsets = gpfs_offsets0 + 2 * nprocs;
+ for (ii = 0; ii < nprocs; ii++) {
+ gpfs_offsets0[ii * 2] = 0;
+ gpfs_offsets0[ii * 2 + 1] = 0;
+ }
+ gpfs_offsets0[myrank * 2] = start_offset;
+ gpfs_offsets0[myrank * 2 + 1] = end_offset;
+
+ MPI_Allreduce(gpfs_offsets0, gpfs_offsets, nprocs * 2, ADIO_OFFSET, MPI_MAX,
+ fd->comm);
+
+ for (ii = 0; ii < nprocs; ii++) {
+ st_offsets[ii] = gpfs_offsets[ii * 2];
+ end_offsets[ii] = gpfs_offsets[ii * 2 + 1];
+ }
+ }
+ ADIOI_Free(gpfs_offsets0);
+ } else {
+ MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm);
+ MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm);
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ MPI_Allgather(&my_count_size, 1, ADIO_OFFSET, count_sizes, 1,
+ ADIO_OFFSET, fd->comm);
+ }
+ }
+
+ GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_PATANA, GPFSMPIO_CIO_T_GATHER);
+
+ /* are the accesses of different processes interleaved? */
+ for (i = 1; i < nprocs; i++)
+ if ((st_offsets[i] < end_offsets[i - 1]) && (st_offsets[i] <= end_offsets[i]))
+ interleave_count++;
+ /* This is a rudimentary check for interleaving, but should suffice
+ * for the moment. */
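+ /* e.g. if rank 0 accesses bytes [0, 99] and rank 1 accesses [50, 149],
+ * then st_offsets[1] (50) < end_offsets[0] (99), so the pattern counts
+ * as interleaved and collective buffering is worthwhile. */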
+ }
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+
+ if (fd->hints->cb_write == ADIOI_HINT_DISABLE ||
+ (!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO))) {
+ /* use independent accesses */
+ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ }
+
+ fd->fp_ind = orig_fp;
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ if (buftype_is_contig && filetype_is_contig) {
+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+ off = fd->disp + (ADIO_Offset) (fd->etype_size) * offset;
+ ADIO_WriteContig(fd, buf, count, datatype,
+ ADIO_EXPLICIT_OFFSET, off, status, error_code);
+ } else
+ ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code);
+ } else
+ ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code);
+
+ return;
+ }
+
+ GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_FD_PART, GPFSMPIO_CIO_T_PATANA)
+
+/* Divide the I/O workload among "nprocs_for_coll" processes. This is
+ done by (logically) dividing the file into file domains (FDs); each
+ process may directly access only its own file domain. */
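+/* Example: with an aggregate access range of [0, 4 MiB) and
+   nprocs_for_coll = 4 aggregators, each file domain covers roughly 1 MiB:
+   fd_start[0] = 0, fd_end[0] = 1 MiB - 1, fd_start[1] = 1 MiB, and so on
+   (the GPFS variant additionally aligns domain boundaries to file system
+   block boundaries). */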
+ ADIO_Offset lastFileOffset = 0, firstFileOffset = -1;
+ int currentValidDataIndex = 0;
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ /* Compact away the zero-data entries by shifting the entries with data
+ * to the front, keeping track of the number of valid entries for use as
+ * the length.
+ */
+ for (i = 0; i < nprocs; i++) {
+ if (count_sizes[i] > 0) {
+ st_offsets[currentValidDataIndex] = st_offsets[i];
+ end_offsets[currentValidDataIndex] = end_offsets[i];
+ lastFileOffset = MPL_MAX(lastFileOffset, end_offsets[currentValidDataIndex]);
+ if (firstFileOffset == -1)
+ firstFileOffset = st_offsets[currentValidDataIndex];
+ else
+ firstFileOffset = MPL_MIN(firstFileOffset, st_offsets[currentValidDataIndex]);
+
+ currentValidDataIndex++;
+ }
+ }
+ }
+
+ if (gpfsmpio_tuneblocking) {
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets,
+ currentValidDataIndex,
+ nprocs_for_coll, &min_st_offset,
+ &fd_start, &fd_end, &fd_size, fd->fs_ptr);
+ } else {
+ ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
+ nprocs_for_coll, &min_st_offset,
+ &fd_start, &fd_end, &fd_size, fd->fs_ptr);
+ }
+ } else {
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ ADIOI_Calc_file_domains(st_offsets, end_offsets, currentValidDataIndex,
+ nprocs_for_coll, &min_st_offset,
+ &fd_start, &fd_end,
+ fd->hints->min_fdomain_size, &fd_size,
+ fd->hints->striping_unit);
+ } else {
+ ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
+ nprocs_for_coll, &min_st_offset,
+ &fd_start, &fd_end,
+ fd->hints->min_fdomain_size, &fd_size,
+ fd->hints->striping_unit);
+ }
+ }
+
+ GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART);
+
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ /* The user has specified a one-sided aggregation method, so use it at
+ * this point instead of two-phase I/O.
+ */
+ /* Pass this data structure with stripeSize set to 0 to tell the
+ * one-sided algorithm that this is a non-striping file system.
+ */
+ ADIOI_OneSidedStripeParms noStripeParms;
+ noStripeParms.stripeSize = 0;
+ noStripeParms.segmentLen = 0;
+ noStripeParms.stripesPerAgg = 0;
+ noStripeParms.segmentIter = 0;
+ noStripeParms.flushCB = 1;
+ noStripeParms.stripedLastFileOffset = 0;
+ noStripeParms.firstStripedWriteCall = 0;
+ noStripeParms.lastStripedWriteCall = 0;
+ noStripeParms.iWasUsedStripingAgg = 0;
+ noStripeParms.numStripesUsed = 0;
+ noStripeParms.amountOfStripedDataExpected = 0;
+ noStripeParms.bufTypeExtent = 0;
+ noStripeParms.lastDataTypeExtent = 0;
+ noStripeParms.lastFlatBufIndice = 0;
+ noStripeParms.lastIndiceOffset = 0;
+
+ int holeFound = 0;
+ ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count,
+ buf, datatype, error_code, firstFileOffset, lastFileOffset,
+ currentValidDataIndex, fd_start, fd_end, &holeFound,
+ &noStripeParms);
+ int anyHolesFound = 0;
+ if (!romio_onesided_no_rmw)
+ MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, fd->comm);
+ if (anyHolesFound == 0) {
+ GPFSMPIO_T_CIO_REPORT(1, fd, myrank, nprocs);
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ ADIOI_Free(fd_start);
+ ADIOI_Free(count_sizes);
+ goto fn_exit;
+ } else {
+ /* Holes were found in the data and the user has not set
+ * romio_onesided_no_rmw --- set romio_onesided_always_rmw to 1,
+ * call ADIOI_OneSidedWriteAggregation again, and, if the user has
+ * set romio_onesided_inform_rmw, inform them of this condition
+ * and behavior.
+ */
+
+ if (romio_onesided_inform_rmw && (myrank == 0))
+ FPRINTF(stderr, "Information: Holes found during one-sided "
+ "write aggregation algorithm --- re-running one-sided "
+ "write aggregation with ROMIO_ONESIDED_ALWAYS_RMW set to 1.\n");
+ romio_onesided_always_rmw = 1;
+ int prev_romio_onesided_no_rmw = romio_onesided_no_rmw;
+ romio_onesided_no_rmw = 1;
+ ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf,
+ datatype, error_code, firstFileOffset, lastFileOffset,
+ currentValidDataIndex, fd_start, fd_end, &holeFound,
+ &noStripeParms);
+ romio_onesided_no_rmw = prev_romio_onesided_no_rmw;
+ GPFSMPIO_T_CIO_REPORT(1, fd, myrank, nprocs);
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ ADIOI_Free(fd_start);
+ ADIOI_Free(count_sizes);
+ goto fn_exit;
+ }
+ }
+ if (gpfsmpio_p2pcontig == 1) {
+ /* For some simple yet common(?) workloads, full-on two-phase I/O is
+ * overkill. We can establish sub-groups of processes and their
+ * aggregator, and then these sub-groups will carry out a simplified
+ * two-phase over that sub-group.
+ *
+ * First verify that the filetype is contig and the offsets are
+ * increasing in rank order
+ */
+ int inOrderAndNoGaps = 1;
+ for (i = 0; i < (nprocs - 1); i++) {
+ if (end_offsets[i] != (st_offsets[i + 1] - 1))
+ inOrderAndNoGaps = 0;
+ }
+ if (inOrderAndNoGaps && buftype_is_contig) {
+ /* if these conditions exist then execute the P2PContig code else
+ * execute the original code */
+ ADIOI_P2PContigWriteAggregation(fd, buf,
+ error_code, st_offsets, end_offsets, fd_start, fd_end);
+ /* NOTE: we are skipping the rest of two-phase in this path */
+ GPFSMPIO_T_CIO_REPORT(1, fd, myrank, nprocs);
+
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ ADIOI_Free(fd_start);
+ goto fn_exit;
+ }
+ }
+
+/* calculate what portions of the access requests of this process are
+ located in what file domains */
+
+ if (gpfsmpio_tuneblocking)
+ ADIOI_GPFS_Calc_my_req(fd, offset_list, len_list, contig_access_count,
+ min_st_offset, fd_start, fd_end, fd_size,
+ nprocs, &count_my_req_procs,
+ &count_my_req_per_proc, &my_req, &buf_idx);
+ else
+ ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
+ min_st_offset, fd_start, fd_end, fd_size,
+ nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx);
+
+ GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_OTHREQ, GPFSMPIO_CIO_T_MYREQ);
+
+ /* based on everyone's my_req, calculate what requests of other
+ * processes lie in this process's file domain.
+ * count_others_req_procs = number of processes whose requests lie in
+ * this process's file domain (including this process itself)
+ * count_others_req_per_proc[i] indicates how many separate contiguous
+ * requests of proc. i lie in this process's file domain.
+ */
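+ /* e.g. if this process's file domain is [100, 199] and rank 3 has two
+ * separate contiguous requests that intersect it, others_req[3].count
+ * will be 2 after the call below. */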
+ if (gpfsmpio_tuneblocking)
+ ADIOI_GPFS_Calc_others_req(fd, count_my_req_procs,
+ count_my_req_per_proc, my_req,
+ nprocs, myrank, &count_others_req_procs, &others_req);
+ else
+ ADIOI_Calc_others_req(fd, count_my_req_procs,
+ count_my_req_per_proc, my_req,
+ nprocs, myrank, &count_others_req_procs, &others_req);
+
+ GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ);
+
+ ADIOI_Free(count_my_req_per_proc);
+ ADIOI_Free(my_req[0].offsets);
+ ADIOI_Free(my_req);
+
+ /* exchange data and write in sizes of no more than coll_bufsize. */
+ ADIOI_Exch_and_write(fd, buf, datatype, nprocs, myrank,
+ others_req, offset_list,
+ len_list, contig_access_count, min_st_offset,
+ fd_size, fd_start, fd_end, buf_idx, error_code);
+
+ GPFSMPIO_T_CIO_SET_GET(w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_DEXCH);
+ GPFSMPIO_T_CIO_SET_GET(w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MPIO_CRW);
+ GPFSMPIO_T_CIO_REPORT(1, fd, myrank, nprocs);
+
+ /* free all memory allocated for collective I/O */
+ if (others_req[0].offsets) {
+ ADIOI_Free(others_req[0].offsets);
+ }
+ if (others_req[0].mem_ptrs) {
+ ADIOI_Free(others_req[0].mem_ptrs);
+ }
+ ADIOI_Free(others_req);
+
+ ADIOI_Free(buf_idx);
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ ADIOI_Free(fd_start);
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status) {
+ MPI_Count bufsize, size;
+ /* Don't set status if it isn't needed */
+ MPI_Type_size_x(datatype, &size);
+ bufsize = size * count;
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+ }
+/* This is a temporary way of filling in status. The right way is to
+ keep track of how much data was actually written during collective I/O. */
+#endif
+
+ fd->fp_sys_posn = -1; /* set it to null. */
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5013, 0, NULL);
+#endif
+}
+
+static void gpfs_wr_access_start(int fd, ADIO_Offset offset, ADIO_Offset length)
+{
+ int rc = 0;
+#ifdef HAVE_GPFS_FCNTL_H
+ struct {
+ gpfsFcntlHeader_t header;
+ gpfsAccessRange_t access;
+ } take_locks;
+
+ take_locks.header.totalLength = sizeof(take_locks);
+ take_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
+ take_locks.header.fcntlReserved = 0;
+
+ take_locks.access.structLen = sizeof(take_locks.access);
+ take_locks.access.structType = GPFS_ACCESS_RANGE;
+ take_locks.access.start = offset;
+ take_locks.access.length = length;
+ take_locks.access.isWrite = 1;
+
+ rc = gpfs_fcntl(fd, &take_locks);
+#endif
+ ADIOI_Assert(rc == 0);
+}
+
+static void gpfs_wr_access_end(int fd, ADIO_Offset offset, ADIO_Offset length)
+{
+ int rc = 0;
+#ifdef HAVE_GPFS_FCNTL_H
+ struct {
+ gpfsFcntlHeader_t header;
+ gpfsFreeRange_t free;
+ } free_locks;
+
+
+ free_locks.header.totalLength = sizeof(free_locks);
+ free_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
+ free_locks.header.fcntlReserved = 0;
+
+ free_locks.free.structLen = sizeof(free_locks.free);
+ free_locks.free.structType = GPFS_FREE_RANGE;
+ free_locks.free.start = offset;
+ free_locks.free.length = length;
+
+ rc = gpfs_fcntl(fd, &free_locks);
+#endif
+ ADIOI_Assert(rc == 0);
+}
+
+#ifdef BGQPLATFORM
+/* my_start, my_end: this process's file domain; could be -1,-1 for "no I/O"
+ * fd_start, fd_end: arrays of length fd->hints->cb_nodes specifying all file domains */
+static int gpfs_find_access_for_ion(ADIO_File fd,
+ ADIO_Offset my_start, ADIO_Offset my_end,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ ADIO_Offset * start, ADIO_Offset * end)
+{
+ int my_ionode = BGQ_IO_node_id();
+ int *rank_to_ionode;
+ int i, nprocs, rank;
+ ADIO_Offset group_start = LLONG_MAX, group_end = 0;
+
+ MPI_Comm_size(fd->comm, &nprocs);
+ MPI_Comm_rank(fd->comm, &rank);
+
+ rank_to_ionode = ADIOI_Calloc(nprocs, sizeof(int));
+ MPI_Allgather(&my_ionode, 1, MPI_INT, rank_to_ionode, 1, MPI_INT, fd->comm);
+
+ /* rank_to_ionode now contains a mapping from MPI rank to IO node */
+ /* fd->hints->ranklist[] contains a list of MPI ranks that are aggregators */
+ /* fd_start[] and fd_end[] contain a list of file domains. */
+
+ /* what we really want to do is take all the file domains associated
+ * with a given i/o node and find the begin/end of that range.
+ *
+ * Because gpfs_fcntl hints are expected to be released, we'll pass this
+ * start/end back to the caller, who will both declare and free this range
+ */
+ if (my_start == -1 || my_end == -1) {
+ ADIOI_Free(rank_to_ionode);
+ return 0; /* no work to do */
+ }
+
+ for (i = 0; i < fd->hints->cb_nodes; i++) {
+ if (my_ionode == rank_to_ionode[fd->hints->ranklist[i]]) {
+ group_start = MPL_MIN(fd_start[i], group_start);
+ group_end = MPL_MAX(fd_end[i], group_end);
+ }
+ }
+ *start = group_start;
+ *end = group_end;
+ ADIOI_Free(rank_to_ionode);
+ return 1;
+}
+#endif // BGQPLATFORM
+
+
+/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error
+ * code is created and returned in error_code.
+ */
+static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
+ datatype, int nprocs,
+ int myrank,
+ ADIOI_Access
+ * others_req, ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int contig_access_count,
+ ADIO_Offset min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ MPI_Aint * buf_idx, int *error_code)
+{
+/* Send data to appropriate processes and write in sizes of no more
+   than coll_bufsize.
+   The idea is to reduce the amount of extra memory required for
+   collective I/O. If all the data were written at once, which would be
+   much easier, it would require temporary space larger than the user
+   buffer, which is often unacceptable. For example, when writing a
+   distributed array to a file where each local array is 8 Mbytes,
+   requiring at least another 8 Mbytes of temporary space is
+   unacceptable. */
+
+ /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */
+ ADIO_Offset size = 0;
+ int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
+ ADIO_Offset st_loc = -1, end_loc = -1, off, done, req_off;
+ char *write_buf = NULL, *write_buf2 = NULL;
+ int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
+ int *partial_recv, *sent_to_proc, *start_pos, flag;
+ int *send_buf_idx, *curr_to_proc, *done_to_proc;
+ MPI_Status status;
+ ADIOI_Flatlist_node *flat_buf = NULL;
+ MPI_Aint lb, buftype_extent;
+ int info_flag, coll_bufsize;
+ char *value;
+ static char myname[] = "ADIOI_EXCH_AND_WRITE";
+ pthread_t io_thread;
+ void *thread_ret;
+ ADIOI_IO_ThreadFuncData io_thread_args;
+
+ *error_code = MPI_SUCCESS; /* changed below if error */
+ /* only I/O errors are currently reported */
+
+/* calculate the number of writes of size coll_bufsize
+ to be done by each process and the max among all processes.
+ That gives the no. of communication phases as well. */
+
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag);
+ coll_bufsize = atoi(value);
+ ADIOI_Free(value);
+
+ if (gpfsmpio_pthreadio == 1) {
+ /* ROMIO will spawn an additional thread. both threads use separate
+ * halves of the collective buffer*/
+ coll_bufsize = coll_bufsize / 2;
+ }
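+ /* Halving coll_bufsize lets the background I/O thread drain one half of
+ * fd->io_buf while the two-phase exchange fills the other half; see the
+ * write_buf/write_buf2 pointer swap in the main loop below. */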
+
+ for (i = 0; i < nprocs; i++) {
+ if (others_req[i].count) {
+ st_loc = others_req[i].offsets[0];
+ end_loc = others_req[i].offsets[0];
+ break;
+ }
+ }
+
+ for (i = 0; i < nprocs; i++)
+ for (j = 0; j < others_req[i].count; j++) {
+ st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
+ end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j]
+ + others_req[i].lens[j] - 1));
+ }
+
+/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
+
+ ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize);
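+ /* e.g. st_loc = 0, end_loc = 9, coll_bufsize = 4 gives
+ * ntimes = (9 - 0 + 4) / 4 = 3 rounds covering 4 + 4 + 2 bytes */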
+
+ if ((st_loc == -1) && (end_loc == -1)) {
+ ntimes = 0; /* this process does no writing. */
+ }
+ if (ntimes > 0) { /* only set the GPFS hint if we have I/O, i.e. this rank
+ * is an aggregator; otherwise it will fail under deferred open */
+ if (getenv("ROMIO_GPFS_DECLARE_ACCESS") != NULL) {
+ gpfs_wr_access_start(fd->fd_sys, st_loc, end_loc - st_loc);
+ }
+ }
+
+ ADIO_Offset st_loc_ion = 0, end_loc_ion = 0, needs_gpfs_access_cleanup = 0;
+#ifdef BGQPLATFORM
+ if (ntimes > 0) { /* only set the GPFS hint if we have I/O, i.e. this rank
+ * is an aggregator; otherwise it will fail under deferred open */
+
+ if (getenv("ROMIO_GPFS_DECLARE_ION_ACCESS") != NULL) {
+ if (gpfs_find_access_for_ion(fd, st_loc, end_loc, fd_start, fd_end,
+ &st_loc_ion, &end_loc_ion)) {
+ gpfs_wr_access_start(fd->fd_sys, st_loc_ion, end_loc_ion - st_loc_ion);
+ needs_gpfs_access_cleanup = 1;
+ }
+ }
+ }
+#endif
+
+ MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);
+
+ write_buf = fd->io_buf;
+ if (gpfsmpio_pthreadio == 1) {
+ write_buf2 = fd->io_buf + coll_bufsize;
+ }
+
+ curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ /* its use is explained below. calloc initializes to 0. */
+
+ count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* to store count of how many off-len pairs per proc are satisfied
+ * in an iteration. */
+
+ partial_recv = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ /* if only a portion of the last off-len pair is recd. from a process
+ * in a particular iteration, the length recd. is stored here.
+ * calloc initializes to 0. */
+
+ send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* total size of data to be sent to each proc. in an iteration.
+ * Of size nprocs so that I can use MPI_Alltoall later. */
+
+ recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* total size of data to be recd. from each proc. in an iteration. */
+
+ sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ /* amount of data sent to each proc so far. Used in
+ * ADIOI_Fill_send_buffer. initialized to 0 here. */
+
+ send_buf_idx = (int *) ADIOI_Malloc(nprocs * 3 * sizeof(int));
+ curr_to_proc = send_buf_idx + nprocs;
+ done_to_proc = curr_to_proc + nprocs;
+ /* Above three are used in ADIOI_Fill_send_buffer */
+
+ start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* used to store the starting value of curr_offlen_ptr[i] in
+ * this iteration */
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ if (!buftype_is_contig) {
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+ }
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+
+
+/* I need to check if there are any outstanding nonblocking writes to
+   the file, which could potentially interfere with the writes taking
+   place in this collective write call. Since this is not likely to be
+   common, let me do the simplest thing possible here: each process
+   completes all pending nonblocking operations before proceeding. */
+
+ /*ADIOI_Complete_async(error_code);
+ * if (*error_code != MPI_SUCCESS) return;
+ * MPI_Barrier(fd->comm);
+ */
+
+ done = 0;
+ off = st_loc;
+
+ if (gpfsmpio_pthreadio == 1)
+ io_thread = pthread_self();
+
+#ifdef PROFILE
+ MPE_Log_event(14, 0, "end computation");
+#endif
+
+ for (m = 0; m < ntimes; m++) {
+ /* go through all others_req and check which will be satisfied
+ * by the current write */
+
+ /* Note that MPI guarantees that displacements in filetypes are in
+ * monotonically nondecreasing order and that, for writes, the
+ * filetypes cannot specify overlapping regions in the file. This
+ * simplifies implementation a bit compared to reads. */
+
+ /* off = start offset in the file for the data to be written in
+ * this iteration
+ * size = size of data written (bytes) corresponding to off
+ * req_off = off in file for a particular contiguous request
+ * minus what was satisfied in previous iteration
+ * req_size = size corresponding to req_off */
+
+ /* first calculate what should be communicated */
+
+#ifdef PROFILE
+ MPE_Log_event(13, 0, "start computation");
+#endif
+ for (i = 0; i < nprocs; i++)
+ count[i] = recv_size[i] = 0;
+
+ size = MPL_MIN((unsigned) coll_bufsize, end_loc - st_loc + 1 - done);
+
+ for (i = 0; i < nprocs; i++) {
+ if (others_req[i].count) {
+ start_pos[i] = curr_offlen_ptr[i];
+ for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
+ if (partial_recv[i]) {
+ /* this request may have been partially
+ * satisfied in the previous iteration. */
+ req_off = others_req[i].offsets[j] + partial_recv[i];
+ req_len = others_req[i].lens[j] - partial_recv[i];
+ partial_recv[i] = 0;
+ /* modify the off-len pair to reflect this change */
+ others_req[i].offsets[j] = req_off;
+ others_req[i].lens[j] = req_len;
+ } else {
+ req_off = others_req[i].offsets[j];
+ req_len = others_req[i].lens[j];
+ }
+ if (req_off < off + size) {
+ count[i]++;
+ ADIOI_Assert((((ADIO_Offset) (uintptr_t) write_buf) + req_off - off) ==
+ (ADIO_Offset) (uintptr_t) (write_buf + req_off - off));
+ MPI_Get_address(write_buf + req_off - off, &(others_req[i].mem_ptrs[j]));
+ ADIOI_Assert((off + size - req_off) == (int) (off + size - req_off));
+ recv_size[i] += (int) (MPL_MIN(off + size - req_off, (unsigned) req_len));
+
+ if (off + size - req_off < (unsigned) req_len) {
+ partial_recv[i] = (int) (off + size - req_off);
+
+ /* --BEGIN ERROR HANDLING-- */
+ if ((j + 1 < others_req[i].count) &&
+ (others_req[i].offsets[j + 1] < off + size)) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname,
+ __LINE__,
+ MPI_ERR_ARG,
+ "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)",
+ 0);
+ /* allow to continue since additional
+ * communication might have to occur
+ */
+ }
+ /* --END ERROR HANDLING-- */
+ break;
+ }
+ } else
+ break;
+ }
+ curr_offlen_ptr[i] = j;
+ }
+ }
+
+#ifdef PROFILE
+ MPE_Log_event(14, 0, "end computation");
+ MPE_Log_event(7, 0, "start communication");
+#endif
+ if (gpfsmpio_comm == 1)
+ ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
+ len_list, send_size, recv_size, off, size, count,
+ start_pos, partial_recv,
+ sent_to_proc, nprocs, myrank,
+ buftype_is_contig, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, send_buf_idx, curr_to_proc,
+ done_to_proc, &hole, m, buftype_extent, buf_idx, error_code);
+ else if (gpfsmpio_comm == 0)
+ ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
+ len_list, send_size, recv_size, off, size, count,
+ start_pos, partial_recv,
+ sent_to_proc, nprocs, myrank,
+ buftype_is_contig, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, send_buf_idx, curr_to_proc,
+ done_to_proc, &hole, m, buftype_extent, buf_idx,
+ error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+#ifdef PROFILE
+ MPE_Log_event(8, 0, "end communication");
+#endif
+
+ flag = 0;
+ for (i = 0; i < nprocs; i++)
+ if (count[i])
+ flag = 1;
+
+ if (flag) {
+ char round[50];
+ MPL_snprintf(round, sizeof(round), "two-phase-round=%d", m);
+ setenv("LIBIOLOG_EXTRA_INFO", round, 1);
+ ADIOI_Assert(size == (int) size);
+ if (gpfsmpio_pthreadio == 1) {
+ /* there is no such thing as "invalid pthread identifier", so
+ * we'll use pthread_self() instead. Before we do I/O we want
+ * to complete I/O from any previous iteration -- but only a
+ * previous iteration that had I/O work to do (i.e. set 'flag')
+ */
+ if (!pthread_equal(io_thread, pthread_self())) {
+ pthread_join(io_thread, &thread_ret);
+ *error_code = *(int *) thread_ret;
+ if (*error_code != MPI_SUCCESS)
+ return;
+ io_thread = pthread_self();
+
+ }
+ io_thread_args.fd = fd;
+ /* do a little pointer shuffling: background I/O works from one
+ * buffer while two-phase machinery fills up another */
+ io_thread_args.buf = write_buf;
+ write_buf = write_buf2;
+ write_buf2 = io_thread_args.buf;
+ io_thread_args.io_kind = ADIOI_WRITE;
+ io_thread_args.size = size;
+ io_thread_args.offset = off;
+ io_thread_args.status = &status;
+ io_thread_args.error_code = *error_code;
+ if ((pthread_create(&io_thread, NULL,
+ ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
+ io_thread = pthread_self();
+ } else {
+ ADIO_WriteContig(fd, write_buf, (int) size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status, error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+ }
+ }
+
+ off += size;
+ done += size;
+ }
+ if (gpfsmpio_pthreadio == 1) {
+ if (!pthread_equal(io_thread, pthread_self())) {
+ pthread_join(io_thread, &thread_ret);
+ *error_code = *(int *) thread_ret;
+ }
+ }
+
+ for (i = 0; i < nprocs; i++)
+ count[i] = recv_size[i] = 0;
+#ifdef PROFILE
+ MPE_Log_event(7, 0, "start communication");
+#endif
+ for (m = ntimes; m < max_ntimes; m++)
+ /* nothing to recv, but check for send. */
+ if (gpfsmpio_comm == 1)
+ ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
+ len_list, send_size, recv_size, off, size, count,
+ start_pos, partial_recv,
+ sent_to_proc, nprocs, myrank,
+ buftype_is_contig, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, send_buf_idx,
+ curr_to_proc, done_to_proc, &hole, m,
+ buftype_extent, buf_idx, error_code);
+ else if (gpfsmpio_comm == 0)
+ ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
+ len_list, send_size, recv_size, off, size, count,
+ start_pos, partial_recv,
+ sent_to_proc, nprocs, myrank,
+ buftype_is_contig, contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, send_buf_idx,
+ curr_to_proc, done_to_proc, &hole, m,
+ buftype_extent, buf_idx, error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+#ifdef PROFILE
+ MPE_Log_event(8, 0, "end communication");
+#endif
+
+ ADIOI_Free(curr_offlen_ptr);
+ ADIOI_Free(count);
+ ADIOI_Free(partial_recv);
+ ADIOI_Free(send_size);
+ ADIOI_Free(recv_size);
+ ADIOI_Free(sent_to_proc);
+ ADIOI_Free(start_pos);
+ ADIOI_Free(send_buf_idx);
+
+ if (ntimes != 0 && getenv("ROMIO_GPFS_DECLARE_ACCESS") != NULL) {
+ gpfs_wr_access_end(fd->fd_sys, st_loc, end_loc - st_loc);
+ }
+
+ if (needs_gpfs_access_cleanup) {
+ gpfs_wr_access_end(fd->fd_sys, st_loc_ion, end_loc_ion - st_loc_ion);
+ needs_gpfs_access_cleanup = 0;
+ }
+
+ unsetenv("LIBIOLOG_EXTRA_INFO");
+}
+
+
+/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
+ * in the case of error.
+ */
+static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf,
+ ADIOI_Flatlist_node * flat_buf, ADIO_Offset
+ * offset_list, ADIO_Offset * len_list, int *send_size,
+ int *recv_size, ADIO_Offset off, int size,
+ int *count, int *start_pos,
+ int *partial_recv,
+ int *sent_to_proc, int nprocs,
+ int myrank, int
+ buftype_is_contig, int contig_access_count,
+ ADIO_Offset min_st_offset,
+ ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ ADIOI_Access * others_req,
+ int *send_buf_idx, int *curr_to_proc,
+ int *done_to_proc, int *hole, int iter,
+ MPI_Aint buftype_extent, MPI_Aint * buf_idx, int *error_code)
+{
+ int i, j, k, *tmp_len, nprocs_recv, nprocs_send, err;
+ char **send_buf = NULL;
+ MPI_Request *requests, *send_req;
+ MPI_Datatype *recv_types;
+ MPI_Status *statuses, status;
+ int *srt_len, sum;
+ ADIO_Offset *srt_off;
+ static char myname[] = "ADIOI_W_EXCHANGE_DATA";
+
+/* exchange recv_size info so that each process knows how much to
+ send to whom. */
+
+ MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
+
+ /* create derived datatypes for recv */
+
+ nprocs_recv = 0;
+ for (i = 0; i < nprocs; i++)
+ if (recv_size[i])
+ nprocs_recv++;
+
+ recv_types = (MPI_Datatype *)
+ ADIOI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype));
+/* +1 to avoid a 0-size malloc */
+
+ tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+/* take care if the last off-len pair is a partial recv */
+ if (partial_recv[i]) {
+ k = start_pos[i] + count[i] - 1;
+ tmp_len[i] = others_req[i].lens[k];
+ others_req[i].lens[k] = partial_recv[i];
+ }
+ ADIOI_Type_create_hindexed_x(count[i],
+ &(others_req[i].lens[start_pos[i]]),
+ &(others_req[i].mem_ptrs[start_pos[i]]),
+ MPI_BYTE, recv_types + j);
+ /* absolute displacements; use MPI_BOTTOM in recv */
+ MPI_Type_commit(recv_types + j);
+ j++;
+ }
+ }
+
+ /* To avoid a read-modify-write, check if there are holes in the
+ * data to be written. For this, merge the (sorted) offset lists
+ * others_req using a heap-merge. */
+
+ sum = 0;
+ for (i = 0; i < nprocs; i++)
+ sum += count[i];
+ srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
+ srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
+ /* +1 to avoid a 0-size malloc */
+
+ ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, nprocs, nprocs_recv, sum);
+
+/* for partial recvs, restore original lengths */
+ for (i = 0; i < nprocs; i++)
+ if (partial_recv[i]) {
+ k = start_pos[i] + count[i] - 1;
+ others_req[i].lens[k] = tmp_len[i];
+ }
+ ADIOI_Free(tmp_len);
+
+ /* check if there are any holes. If yes, we must do a read-modify-write.
+ * Holes can appear in three places. 'Middle' is what you'd expect: the
+ * processes are operating on noncontiguous data. But holes can also show
+ * up at the beginning or end of the file domain (see John Bent ROMIO REQ
+ * #835). Missing these holes would result in us writing more data than
+ * was received from everyone else. */
+ *hole = 0;
+ if (off != srt_off[0]) /* hole at the front */
+ *hole = 1;
+ else { /* coalesce the sorted offset-length pairs */
+ for (i = 1; i < sum; i++) {
+ if (srt_off[i] <= srt_off[0] + srt_len[0]) {
+ int new_len = srt_off[i] + srt_len[i] - srt_off[0];
+ if (new_len > srt_len[0])
+ srt_len[0] = new_len;
+ } else
+ break;
+ }
+ if (i < sum || size != srt_len[0]) /* hole in middle or end */
+ *hole = 1;
+ }
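+ /* e.g. with off = 0 and size = 100, sorted runs [0, 50) and [50, 100)
+ * coalesce into srt_len[0] == 100 and no hole is flagged; runs [0, 40)
+ * and [50, 100) leave srt_len[0] == 40 != size, so *hole = 1 and the
+ * read-modify-write below is required */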
+
+ ADIOI_Free(srt_off);
+ ADIOI_Free(srt_len);
+
+ if (nprocs_recv) {
+ if (*hole) {
+ const char *stuff = "data-sieve-in-two-phase";
+ setenv("LIBIOLOG_EXTRA_INFO", stuff, 1);
+ ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status, &err);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err != MPI_SUCCESS) {
+ *error_code = MPIO_Err_create_code(err,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+ unsetenv("LIBIOLOG_EXTRA_INFO");
+ }
+ }
+
+ nprocs_send = 0;
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i])
+ nprocs_send++;
+
+ if (fd->atomicity) {
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ requests = (MPI_Request *)
+ ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Request));
+ send_req = requests;
+ } else {
+ requests = (MPI_Request *)
+ ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request));
+ /* +1 to avoid a 0-size malloc */
+
+ /* post receives */
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+ MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter,
+ fd->comm, requests + j);
+ j++;
+ }
+ }
+ send_req = requests + nprocs_recv;
+ }
+
+/* post sends. if buftype_is_contig, data can be directly sent from
+ user buf at location given by buf_idx. else use send_buf. */
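+/* The tag myrank + i + 100*iter is symmetric in sender and receiver:
+   a receiver r posting tag r + s + 100*iter matches a sender s using tag
+   s + r + 100*iter, and the 100*iter term helps keep messages from
+   different two-phase rounds apart. */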
+
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5032, 0, NULL);
+#endif
+ if (buftype_is_contig) {
+ j = 0;
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i]) {
+ MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
+ MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, send_req + j);
+ j++;
+ buf_idx[i] += send_size[i];
+ }
+ } else if (nprocs_send) {
+ /* buftype is not contig */
+ size_t msgLen = 0;
+ for (i = 0; i < nprocs; i++)
+ msgLen += send_size[i];
+ send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
+ send_buf[0] = (char *) ADIOI_Malloc(msgLen * sizeof(char));
+ for (i = 1; i < nprocs; i++)
+ send_buf[i] = send_buf[i - 1] + send_size[i - 1];
+
+ ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf,
+ offset_list, len_list, send_size,
+ send_req,
+ sent_to_proc, nprocs, myrank,
+ contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ send_buf_idx, curr_to_proc, done_to_proc, iter, buftype_extent);
+ /* the send is done in ADIOI_Fill_send_buffer */
+ }
+
+ if (fd->atomicity) {
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ MPI_Status wkl_status;
+ if (recv_size[i]) {
+ MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter,
+ fd->comm, &wkl_status);
+ j++;
+ }
+ }
+ }
+
+ for (i = 0; i < nprocs_recv; i++)
+ MPI_Type_free(recv_types + i);
+ ADIOI_Free(recv_types);
+
+#ifdef MPI_STATUSES_IGNORE
+ statuses = MPI_STATUSES_IGNORE;
+#else
+ if (fd->atomicity) {
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Status));
+ /* +1 to avoid a 0-size malloc */
+ } else {
+ statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
+ sizeof(MPI_Status));
+ /* +1 to avoid a 0-size malloc */
+ }
+#endif
+
+#ifdef NEEDS_MPI_TEST
+ i = 0;
+ if (fd->atomicity) {
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ while (!i)
+ MPI_Testall(nprocs_send, send_req, &i, statuses);
+ } else {
+ while (!i)
+ MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
+ }
+#else
+ if (fd->atomicity)
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ MPI_Waitall(nprocs_send, send_req, statuses);
+ else
+ MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
+#endif
+
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5033, 0, NULL);
+#endif
+#ifndef MPI_STATUSES_IGNORE
+ ADIOI_Free(statuses);
+#endif
+ ADIOI_Free(requests);
+ if (!buftype_is_contig && nprocs_send) {
+ ADIOI_Free(send_buf[0]);
+ ADIOI_Free(send_buf);
+ }
+}
+
+
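+/* ADIOI_BUF_INCR advances user_buf_idx through the flattened view of the
+   user datatype by buf_incr bytes without copying anything; ADIOI_BUF_COPY
+   copies 'size' bytes into send_buf[p] while advancing the same cursor.
+   Both wrap from the last flattened block back to the first and bump
+   n_buftypes, so that
+   user_buf_idx = flat_buf->indices[flat_buf_idx] + n_buftypes * buftype_extent. */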
+#define ADIOI_BUF_INCR \
+{ \
+ while (buf_incr) { \
+ size_in_buf = MPL_MIN(buf_incr, flat_buf_sz); \
+ user_buf_idx += size_in_buf; \
+ flat_buf_sz -= size_in_buf; \
+ if (!flat_buf_sz) { \
+ if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
+ else { \
+ flat_buf_idx = 0; \
+ n_buftypes++; \
+ } \
+ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
+ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
+ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
+ } \
+ buf_incr -= size_in_buf; \
+ } \
+}
+
+
+#define ADIOI_BUF_COPY \
+{ \
+ while (size) { \
+ size_in_buf = MPL_MIN(size, flat_buf_sz); \
+ ADIOI_Assert((((ADIO_Offset)(uintptr_t)buf) + user_buf_idx) == (ADIO_Offset)(uintptr_t)((uintptr_t)buf + user_buf_idx)); \
+ ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
+ memcpy(&(send_buf[p][send_buf_idx[p]]), \
+ ((char *) buf) + user_buf_idx, size_in_buf); \
+ send_buf_idx[p] += size_in_buf; \
+ user_buf_idx += size_in_buf; \
+ flat_buf_sz -= size_in_buf; \
+ if (!flat_buf_sz) { \
+ if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
+ else { \
+ flat_buf_idx = 0; \
+ n_buftypes++; \
+ } \
+ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
+ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
+ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
+ } \
+ size -= size_in_buf; \
+ buf_incr -= size_in_buf; \
+ } \
+ ADIOI_BUF_INCR \
+}
+
+static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
+ * flat_buf, char **send_buf, ADIO_Offset
+ * offset_list, ADIO_Offset * len_list, int *send_size,
+ MPI_Request * requests, int *sent_to_proc,
+ int nprocs, int myrank,
+ int contig_access_count,
+ ADIO_Offset min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ int *send_buf_idx, int *curr_to_proc,
+ int *done_to_proc, int iter, MPI_Aint buftype_extent)
+{
+/* this function is only called if buftype is not contig */
+
+ int i, p, flat_buf_idx;
+ ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
+ int jj, n_buftypes;
+ ADIO_Offset off, len, rem_len, user_buf_idx;
+
+/* curr_to_proc[p] = amount of data sent to proc. p that has already
+ been accounted for so far
+ done_to_proc[p] = amount of data already sent to proc. p in
+ previous iterations
+ user_buf_idx = current location in user buffer
+ send_buf_idx[p] = current location in send_buf of proc. p */
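+/* A file-domain region can straddle two write rounds, so data that was
+   already copied out and sent in an earlier round must be skipped over
+   (ADIOI_BUF_INCR) rather than copied again (ADIOI_BUF_COPY); that is
+   what the done_to_proc/curr_to_proc comparisons below implement. */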
+
+ for (i = 0; i < nprocs; i++) {
+ send_buf_idx[i] = curr_to_proc[i] = 0;
+ done_to_proc[i] = sent_to_proc[i];
+ }
+ jj = 0;
+
+ user_buf_idx = flat_buf->indices[0];
+ flat_buf_idx = 0;
+ n_buftypes = 0;
+ flat_buf_sz = flat_buf->blocklens[0];
+
+ /* flat_buf_idx = current index into flattened buftype
+ * flat_buf_sz = size of current contiguous component in
+ * flattened buf */
+
+ for (i = 0; i < contig_access_count; i++) {
+ off = offset_list[i];
+ rem_len = len_list[i];
+
+ /*this request may span the file domains of more than one process */
+ while (rem_len != 0) {
+ len = rem_len;
+ /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
+ * longer than the single region that processor "p" is responsible
+ * for.
+ */
+ p = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end);
+
+ if (send_buf_idx[p] < send_size[p]) {
+ if (curr_to_proc[p] + len > done_to_proc[p]) {
+ if (done_to_proc[p] > curr_to_proc[p]) {
+ size = MPL_MIN(curr_to_proc[p] + len -
+ done_to_proc[p], send_size[p] - send_buf_idx[p]);
+ buf_incr = done_to_proc[p] - curr_to_proc[p];
+ ADIOI_BUF_INCR
+ ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) ==
+ (unsigned) (curr_to_proc[p] + len - done_to_proc[p]));
+ buf_incr = curr_to_proc[p] + len - done_to_proc[p];
+ ADIOI_Assert((done_to_proc[p] + size) ==
+ (unsigned) (done_to_proc[p] + size));
+ curr_to_proc[p] = done_to_proc[p] + size;
+ ADIOI_BUF_COPY} else {
+ size = MPL_MIN(len, send_size[p] - send_buf_idx[p]);
+ buf_incr = len;
+ ADIOI_Assert((curr_to_proc[p] + size) ==
+ (unsigned) ((ADIO_Offset) curr_to_proc[p] + size));
+ curr_to_proc[p] += size;
+ ADIOI_BUF_COPY}
+ if (send_buf_idx[p] == send_size[p]) {
+ MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
+ myrank + p + 100 * iter, fd->comm, requests + jj);
+ jj++;
+ }
+ } else {
+ ADIOI_Assert((curr_to_proc[p] + len) ==
+ (unsigned) ((ADIO_Offset) curr_to_proc[p] + len));
+ curr_to_proc[p] += len;
+ buf_incr = len;
+ ADIOI_BUF_INCR}
+ } else {
+ buf_incr = len;
+ ADIOI_BUF_INCR}
+ off += len;
+ rem_len -= len;
+ }
+ }
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i])
+ sent_to_proc[i] = curr_to_proc[i];
+}
+
+
+
+static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
+ ADIO_Offset * srt_off, int *srt_len, int *start_pos,
+ int nprocs, int nprocs_recv, int total_elements)
+{
+ typedef struct {
+ ADIO_Offset *off_list;
+ ADIO_Offset *len_list;
+ int nelem;
+ } heap_struct;
+
+ heap_struct *a, tmp;
+ int i, j, heapsize, l, r, k, smallest;
+
+ a = (heap_struct *) ADIOI_Malloc((nprocs_recv + 1) * sizeof(heap_struct));
+
+ j = 0;
+ for (i = 0; i < nprocs; i++)
+ if (count[i]) {
+ a[j].off_list = &(others_req[i].offsets[start_pos[i]]);
+ a[j].len_list = &(others_req[i].lens[start_pos[i]]);
+ a[j].nelem = count[i];
+ j++;
+ }
+
+ /* build a heap out of the first element from each list, with
+ * the smallest element of the heap at the root */
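+ /* 0-based heap index arithmetic: the children of node k are
+ * l = 2*(k+1) - 1 = 2k + 1 and r = 2*(k+1) = 2k + 2, which is what the
+ * sift-down loops below compute */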
+
+ heapsize = nprocs_recv;
+ for (i = heapsize / 2 - 1; i >= 0; i--) {
+ /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143
+ * modified for a heap with smallest element at root. I have
+ * removed the recursion so that there are no function calls.
+ * Function calls are too expensive. */
+ k = i;
+ while (1) {
+ l = 2 * (k + 1) - 1;
+ r = 2 * (k + 1);
+
+ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
+ smallest = l;
+ else
+ smallest = k;
+
+ if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list)))
+ smallest = r;
+
+ if (smallest != k) {
+ tmp.off_list = a[k].off_list;
+ tmp.len_list = a[k].len_list;
+ tmp.nelem = a[k].nelem;
+
+ a[k].off_list = a[smallest].off_list;
+ a[k].len_list = a[smallest].len_list;
+ a[k].nelem = a[smallest].nelem;
+
+ a[smallest].off_list = tmp.off_list;
+ a[smallest].len_list = tmp.len_list;
+ a[smallest].nelem = tmp.nelem;
+
+ k = smallest;
+ } else
+ break;
+ }
+ }
+
+ for (i = 0; i < total_elements; i++) {
+ /* extract smallest element from heap, i.e. the root */
+ srt_off[i] = *(a[0].off_list);
+ srt_len[i] = *(a[0].len_list);
+ (a[0].nelem)--;
+
+ if (!a[0].nelem) {
+ a[0].off_list = a[heapsize - 1].off_list;
+ a[0].len_list = a[heapsize - 1].len_list;
+ a[0].nelem = a[heapsize - 1].nelem;
+ heapsize--;
+ } else {
+ (a[0].off_list)++;
+ (a[0].len_list)++;
+ }
+
+ /* Heapify(a, 0, heapsize); */
+ k = 0;
+ while (1) {
+ l = 2 * (k + 1) - 1;
+ r = 2 * (k + 1);
+
+ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
+ smallest = l;
+ else
+ smallest = k;
+
+ if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list)))
+ smallest = r;
+
+ if (smallest != k) {
+ tmp.off_list = a[k].off_list;
+ tmp.len_list = a[k].len_list;
+ tmp.nelem = a[k].nelem;
+
+ a[k].off_list = a[smallest].off_list;
+ a[k].len_list = a[smallest].len_list;
+ a[k].nelem = a[smallest].nelem;
+
+ a[smallest].off_list = tmp.off_list;
+ a[smallest].len_list = tmp.len_list;
+ a[smallest].nelem = tmp.nelem;
+
+ k = smallest;
+ } else
+ break;
+ }
+ }
+
+ ADIOI_Free(a);
+}
+
+
+static void ADIOI_W_Exchange_data_alltoallv(ADIO_File fd, const void *buf, char *write_buf, /* 1 */
+ ADIOI_Flatlist_node * flat_buf, ADIO_Offset * offset_list, ADIO_Offset * len_list, int *send_size, int *recv_size, ADIO_Offset off, int size, /* 2 */
+ int *count, int *start_pos, int *partial_recv, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset * fd_start, ADIO_Offset * fd_end, ADIOI_Access * others_req, int *send_buf_idx, int *curr_to_proc, /* 3 */
+ int *done_to_proc, int *hole, /* 4 */
+ int iter, MPI_Aint buftype_extent, MPI_Aint * buf_idx,
+ int *error_code)
+{
+ int i, j, k = 0, nprocs_recv, nprocs_send, *tmp_len, err;
+ char **send_buf = NULL;
+ MPI_Request *send_req = NULL;
+ MPI_Status status;
+ int rtail, stail;
+ char *sbuf_ptr, *to_ptr;
+ int len;
+ int *sdispls, *rdispls;
+ char *all_recv_buf, *all_send_buf;
+ int *srt_len, sum;
+ ADIO_Offset *srt_off;
+ static char myname[] = "ADIOI_W_EXCHANGE_DATA";
+ double io_time;
+
+ io_time = MPI_Wtime();
+ /* exchange recv_size info so that each process knows how much to
+ * send to whom. */
+ MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
+
+ gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_RECV_EXCH] += MPI_Wtime() - io_time;
+ io_time = MPI_Wtime();
+
+ nprocs_recv = 0;
+ for (i = 0; i < nprocs; i++)
+ if (recv_size[i]) {
+ nprocs_recv++;
+ }
+ nprocs_send = 0;
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i]) {
+ nprocs_send++;
+ }
+
+ /* receiver side data structures */
+ rdispls = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ rtail = 0;
+ for (i = 0; i < nprocs; i++) {
+ rdispls[i] = rtail;
+ rtail += recv_size[i];
+ }
+
+ /* data buffer */
+ all_recv_buf = (char *) ADIOI_Malloc(rtail);
+
+ /* sender side data structures */
+ sdispls = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ stail = 0;
+ for (i = 0; i < nprocs; i++) {
+ sdispls[i] = stail;
+ stail += send_size[i];
+ }
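+ /* sdispls and rdispls now hold exclusive prefix sums of send_size and
+ * recv_size: the byte offset of each rank's region in the packed
+ * all_send_buf and all_recv_buf, exactly the layout MPI_Alltoallv
+ * expects */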
+
+ /* data buffer */
+ all_send_buf = (char *) ADIOI_Malloc(stail);
+ if (buftype_is_contig) {
+ for (i = 0; i < nprocs; i++) {
+ if (send_size[i]) {
+ sbuf_ptr = all_send_buf + sdispls[i];
+ memcpy(sbuf_ptr, (char *) buf + buf_idx[i], send_size[i]);
+ buf_idx[i] += send_size[i];
+ }
+ }
+ } else {
+ send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
+ for (i = 0; i < nprocs; i++)
+ send_buf[i] = all_send_buf + sdispls[i];
+ ADIOI_Fill_send_buffer_nosend(fd, buf, flat_buf, send_buf,
+ offset_list, len_list, send_size,
+ send_req,
+ sent_to_proc, nprocs, myrank,
+ contig_access_count,
+ min_st_offset, fd_size, fd_start, fd_end,
+ send_buf_idx, curr_to_proc, done_to_proc, iter,
+ buftype_extent);
+ ADIOI_Free(send_buf);
+ }
+
+ gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += MPI_Wtime() - io_time;
+
+ io_time = MPI_Wtime();
+ /* alltoallv */
+ MPI_Alltoallv(all_send_buf, send_size, sdispls, MPI_BYTE,
+ all_recv_buf, recv_size, rdispls, MPI_BYTE, fd->comm);
+
+ ADIOI_Free(all_send_buf);
+ ADIOI_Free(sdispls);
+
+ gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_NET] += MPI_Wtime() - io_time;
+ io_time = MPI_Wtime();
+ /* data sieving pre-read */
+ /* To avoid a read-modify-write, check if there are holes in the
+ * data to be written. For this, merge the (sorted) offset lists
+ * others_req using a heap-merge. */
+
+ sum = 0;
+ for (i = 0; i < nprocs; i++)
+ sum += count[i];
+ srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
+ srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
+
+ ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, nprocs, nprocs_recv, sum);
+
+ /* check if there are any holes */
+ *hole = 0;
+ /* See if there are holes before the first request or after the last request */
+ if ((srt_off[0] > off) || ((srt_off[sum - 1] + srt_len[sum - 1]) < (off + size))) {
+ *hole = 1;
+ } else /* See if there are holes between the requests, if there are more than one */
+ for (i = 0; i < sum - 1; i++)
+ if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
+ *hole = 1;
+ break;
+ }
+
+ ADIOI_Free(srt_off);
+ ADIOI_Free(srt_len);
+
+ gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SORT] += MPI_Wtime() - io_time;
+ io_time = MPI_Wtime();
+ if (nprocs_recv) {
+ if (*hole) {
+ ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status, &err);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err != MPI_SUCCESS) {
+ *error_code = MPIO_Err_create_code(err,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+ }
+ }
+ gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SIEVE] += MPI_Wtime() - io_time;
+
+ /* scatter all_recv_buf into the 4M cb_buffer */
+ tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+ if (partial_recv[i]) {
+ k = start_pos[i] + count[i] - 1;
+ tmp_len[i] = others_req[i].lens[k];
+ others_req[i].lens[k] = partial_recv[i];
+ }
+
+ sbuf_ptr = all_recv_buf + rdispls[i];
+ for (j = 0; j < count[i]; j++) {
+ to_ptr =
+ (char *) ADIOI_AINT_CAST_TO_VOID_PTR(others_req[i].mem_ptrs[start_pos[i] + j]);
+ len = others_req[i].lens[start_pos[i] + j];
+ memcpy(to_ptr, sbuf_ptr, len);
+ sbuf_ptr += len;
+ }
+
+ /* restore */
+ if (partial_recv[i]) {
+ k = start_pos[i] + count[i] - 1;
+ others_req[i].lens[k] = tmp_len[i];
+ }
+
+ }
+ }
+
+ ADIOI_Free(tmp_len);
+ ADIOI_Free(all_recv_buf);
+ ADIOI_Free(rdispls);
+ return;
+}
+
+static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
+ * flat_buf, char **send_buf, ADIO_Offset
+ * offset_list, ADIO_Offset * len_list, int *send_size,
+ MPI_Request * requests, int *sent_to_proc,
+ int nprocs, int myrank,
+ int contig_access_count,
+ ADIO_Offset min_st_offset, ADIO_Offset fd_size,
+ ADIO_Offset * fd_start, ADIO_Offset * fd_end,
+ int *send_buf_idx, int *curr_to_proc,
+ int *done_to_proc, int iter, MPI_Aint buftype_extent)
+{
+/* this function is only called if buftype is not contig */
+
+ int i, p, flat_buf_idx;
+ ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
+ int jj, n_buftypes;
+ ADIO_Offset off, len, rem_len, user_buf_idx;
+
+/* curr_to_proc[p] = amount of data sent to proc. p that has already
+ been accounted for so far
+ done_to_proc[p] = amount of data already sent to proc. p in
+ previous iterations
+ user_buf_idx = current location in user buffer
+ send_buf_idx[p] = current location in send_buf of proc. p */
+
+ for (i = 0; i < nprocs; i++) {
+ send_buf_idx[i] = curr_to_proc[i] = 0;
+ done_to_proc[i] = sent_to_proc[i];
+ }
+ jj = 0;
+
+ user_buf_idx = flat_buf->indices[0];
+ flat_buf_idx = 0;
+ n_buftypes = 0;
+ flat_buf_sz = flat_buf->blocklens[0];
+
+ /* flat_buf_idx = current index into flattened buftype
+ * flat_buf_sz = size of current contiguous component in
+ * flattened buf */
+
+ for (i = 0; i < contig_access_count; i++) {
+ off = offset_list[i];
+ rem_len = len_list[i];
+
+ /*this request may span the file domains of more than one process */
+ while (rem_len != 0) {
+ len = rem_len;
+ /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
+ * longer than the single region that processor "p" is responsible
+ * for.
+ */
+ p = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end);
+
+ if (send_buf_idx[p] < send_size[p]) {
+ if (curr_to_proc[p] + len > done_to_proc[p]) {
+ if (done_to_proc[p] > curr_to_proc[p]) {
+ size = MPL_MIN(curr_to_proc[p] + len -
+ done_to_proc[p], send_size[p] - send_buf_idx[p]);
+ buf_incr = done_to_proc[p] - curr_to_proc[p];
+ ADIOI_BUF_INCR
+ ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) ==
+ (unsigned) (curr_to_proc[p] + len - done_to_proc[p]));
+ buf_incr = curr_to_proc[p] + len - done_to_proc[p];
+ ADIOI_Assert((done_to_proc[p] + size) ==
+ (unsigned) (done_to_proc[p] + size));
+ curr_to_proc[p] = done_to_proc[p] + size;
+ ADIOI_BUF_COPY} else {
+ size = MPL_MIN(len, send_size[p] - send_buf_idx[p]);
+ buf_incr = len;
+ ADIOI_Assert((curr_to_proc[p] + size) ==
+ (unsigned) ((ADIO_Offset) curr_to_proc[p] + size));
+ curr_to_proc[p] += size;
+ ADIOI_BUF_COPY}
+ /* moved to alltoallv */
+ /*
+ * if (send_buf_idx[p] == send_size[p]) {
+ * MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
+ * myrank+p+100*iter, fd->comm, requests+jj);
+ * jj++;
+ * }
+ */
+ } else {
+ ADIOI_Assert((curr_to_proc[p] + len) ==
+ (unsigned) ((ADIO_Offset) curr_to_proc[p] + len));
+ curr_to_proc[p] += (int) len;
+ buf_incr = len;
+ ADIOI_BUF_INCR}
+ } else {
+ buf_incr = len;
+ ADIOI_BUF_INCR}
+ off += len;
+ rem_len -= len;
+ }
+ }
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i])
+ sent_to_proc[i] = curr_to_proc[i];
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/bg/Makefile.mk b/3rd-party/romio341/adio/ad_gpfs/bg/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..682a85c98eb998d8c46154057275a5293f2a8380
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/bg/Makefile.mk
@@ -0,0 +1,16 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_BG
+
+noinst_HEADERS += \
+ adio/ad_gpfs/bg/ad_bg_aggrs.h \
+ adio/ad_gpfs/bg/ad_bg_pset.h
+
+romio_other_sources += \
+ adio/ad_gpfs/bg/ad_bg_aggrs.c \
+ adio/ad_gpfs/bg/ad_bg_pset.c
+
+endif BUILD_AD_BG
diff --git a/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_aggrs.c b/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_aggrs.c
new file mode 100644
index 0000000000000000000000000000000000000000..d58c472265c4920ee83c20e8eab3fad342f7aa8b
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_aggrs.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_bg_aggrs.c
+ * \brief The externally used function from this file is declared in ad_bg_aggrs.h
+ */
+
+/*#define TRACE_ON */
+
+// Uncomment this line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
+// #define balancecontigtrace 1
+// #define bridgeringaggtrace 1
+
+#include "adio.h"
+#include "adio_cb_config_list.h"
+#include "../ad_gpfs.h"
+#include "ad_bg_pset.h"
+#include "ad_bg_aggrs.h"
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+
+
+#ifdef MPL_USE_DBG_LOGGING
+#define AGG_DEBUG 1
+#endif
+
+#ifndef TRACE_ERR
+#define TRACE_ERR(format...)
+#endif
+
+/* Comments copied from common:
+ * This file contains four functions:
+ *
+ * ADIOI_Calc_aggregator()
+ * ADIOI_Calc_file_domains()
+ * ADIOI_Calc_my_req()
+ * ADIOI_Calc_others_req()
+ *
+ * The last three of these were originally in ad_read_coll.c, but they are
+ * also shared with ad_write_coll.c. I felt that they were better kept with
+ * the rest of the shared aggregation code.
+ */
+
+/* Discussion of values available from above:
+ *
+ * ADIO_Offset st_offsets[0..nprocs-1]
+ * ADIO_Offset end_offsets[0..nprocs-1]
+ * These contain a list of start and end offsets for each process in
+ * the communicator. For example, an access at loc 10, size 10 would
+ * have a start offset of 10 and end offset of 19.
+ * int nprocs
+ * number of processors in the collective I/O communicator
+ * ADIO_Offset min_st_offset
+ * ADIO_Offset fd_start[0..nprocs_for_coll-1]
+ * starting location of "file domain"; region that a given process will
+ * perform aggregation for (i.e. actually do I/O)
+ * ADIO_Offset fd_end[0..nprocs_for_coll-1]
+ * start + size - 1 roughly, but it can be less, or 0, in the case of
+ * uneven distributions
+ */
+
+/* forward declaration */
+static void
+ADIOI_BG_compute_agg_ranklist_serial(ADIO_File fd,
+ const ADIOI_BG_ConfInfo_t * confInfo,
+ ADIOI_BG_ProcInfo_t * all_procInfo);
+
+/*
+ * Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
+ * The parameters are
+ * . the number of aggregators (proxies) : fd->hints->cb_nodes
+ * . the ranks of the aggregators : fd->hints->ranklist
+ * By computing these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of
+ * ADIO can work more efficiently.
+ */
+int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
+{
+ int r, s;
+ ADIOI_BG_ProcInfo_t *procInfo, *all_procInfo;
+ ADIOI_BG_ConfInfo_t *confInfo;
+ TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");
+
+ MPI_Comm_size(fd->comm, &s);
+ MPI_Comm_rank(fd->comm, &r);
+
+ /* Collect individual BG personality information */
+ confInfo = ADIOI_BG_ConfInfo_new();
+ procInfo = ADIOI_BG_ProcInfo_new();
+ ADIOI_BG_persInfo_init(confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);
+
+ /* Gather BG personality information onto process 0 */
+ /* if (r == 0) */
+ all_procInfo = ADIOI_BG_ProcInfo_new_n(s);
+
+ MPI_Gather((void *) procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
+ (void *) all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE, 0, fd->comm);
+
+ /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
+ if (r == 0) {
+ ADIOI_BG_compute_agg_ranklist_serial(fd, confInfo, all_procInfo);
+ /* ADIOI_BG_ProcInfo_free (all_procInfo); */
+ }
+ ADIOI_BG_ProcInfo_free(all_procInfo);
+
+ /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
+ * Declared in adio_cb_config_list.h */
+ ADIOI_cb_bcast_rank_map(fd);
+ if (gpfsmpio_balancecontig == 1) { /* additionally need to send bridgelist,
+ * bridgelistnum and numbridges to all
+ * ranks */
+ if (r != 0) {
+ fd->hints->fs_hints.bg.bridgelist = ADIOI_Malloc(fd->hints->cb_nodes * sizeof(int));
+ if (fd->hints->fs_hints.bg.bridgelist == NULL) {
+ /* NEED TO HANDLE ENOMEM */
+ }
+ }
+ MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0, fd->comm);
+
+ if (r != 0) {
+ fd->hints->fs_hints.bg.bridgelistnum = ADIOI_Malloc(fd->hints->cb_nodes * sizeof(int));
+ if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
+ /* NEED TO HANDLE ENOMEM */
+ }
+ }
+ MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes, MPI_INT, 0, fd->comm);
+
+ MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0, fd->comm);
+
+ }
+
+
+ ADIOI_BG_persInfo_free(confInfo, procInfo);
+ TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
+ return 0;
+}
+
+
+/* There are some number of bridge nodes (randomly) distributed through the job.
+ * We need to split the nodes among the bridge nodes. */
+/* Maybe find which bridge node is closer (manhattan distance) and try to
+ * distribute evenly.
+ */
+/*
+ * Pick IO aggregators based on the underlying PSET organization and store the ranks of the proxy CNs in tmp_ranklist.
+ * The primary order of tmp_ranklist is the PSET number;
+ * the secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and is thus adjustable.
+ */
+typedef struct {
+ int rank;
+ int bridge;
+} sortstruct;
+
+typedef struct {
+ int bridgeRank;
+ int numAggsAssigned;
+} bridgeAggAssignment;
+
+static int intsort(const void *p1, const void *p2)
+{
+ sortstruct *i1, *i2;
+ i1 = (sortstruct *) p1;
+ i2 = (sortstruct *) p2;
+ return (i1->bridge - i2->bridge);
+}
+
+static int
+ADIOI_BG_compute_agg_ranklist_serial_do(const ADIOI_BG_ConfInfo_t * confInfo,
+ ADIOI_BG_ProcInfo_t * all_procInfo, int *tmp_ranklist)
+{
+ TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
+ /* BES: This should be done in the init routines probably. */
+ int i, j;
+ int aggTotal;
+ int *aggList;
+
+ if (gpfsmpio_bridgeringagg > 0) {
+
+ int numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize */ ;
+ /* the number of aggregators is (numAggs per bridgenode) */
+ if (numAggs == 1)
+ aggTotal = 1;
+ else
+ aggTotal = confInfo->numBridgeRanks * numAggs;
+
+ aggList = (int *) ADIOI_Malloc(aggTotal * sizeof(int));
+ if (aggTotal == 1) { /* special case when we only have one bridge node */
+
+ sortstruct *bridgelist =
+ (sortstruct *) ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
+ for (i = 0; i < confInfo->nProcs; i++) {
+ bridgelist[i].bridge = all_procInfo[i].bridgeRank;
+ bridgelist[i].rank = i;
+ TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
+ }
+
+ /* This list contains rank->bridge info. Now, we need to sort this list. */
+ qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
+
+ aggList[0] = bridgelist[0].bridge;
+ ADIOI_Free(bridgelist);
+
+ } else { // aggTotal > 1
+
+ int currentAggListSize = 0;
+ int numBridgesWithAggAssignments = 0;
+ bridgeAggAssignment *aggAssignments =
+ (bridgeAggAssignment *) ADIOI_Malloc(confInfo->numBridgeRanks *
+ sizeof(bridgeAggAssignment));
+
+ int partitionSize = all_procInfo[0].numNodesInPartition;
+ int *nodesAssigned = (int *) ADIOI_Malloc(partitionSize * sizeof(int));
+ for (i = 0; i < partitionSize; i++)
+ nodesAssigned[i] = 0;
+
+ int currentNumHops = gpfsmpio_bridgeringagg;
+ int allAggsAssigned = 0;
+
+ /* Iterate thru the process infos and select aggregators starting at currentNumHops
+ * away. Increase the currentNumHops until all bridges have numAggs assigned to them.
+ */
+ while (!allAggsAssigned) {
+ /* track whether any aggs are selected during this round */
+ int startingCurrentAggListSize = currentAggListSize;
+ int numIterForHopsWithNoAggs = 0;
+ for (i = 0; i < confInfo->nProcs; i++) {
+ if (all_procInfo[i].manhattanDistanceToBridge == currentNumHops) {
+ if (nodesAssigned[all_procInfo[i].nodeRank] == 0) { // node is not assigned as an agg yet
+ int foundBridge = 0;
+ for (j = 0; (j < numBridgesWithAggAssignments && !foundBridge); j++) {
+ if (aggAssignments[j].bridgeRank == all_procInfo[i].bridgeRank) {
+ foundBridge = 1;
+ if (aggAssignments[j].numAggsAssigned < numAggs) {
+ aggAssignments[j].numAggsAssigned++;
+ nodesAssigned[all_procInfo[i].nodeRank] = 1;
+ aggList[currentAggListSize] = all_procInfo[i].rank;
+ currentAggListSize++;
+#ifdef bridgeringaggtrace
+ printf
+ ("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",
+ all_procInfo[i].rank, all_procInfo[i].nodeRank,
+ all_procInfo[i].bridgeRank, currentNumHops);
+#endif
+ }
+ }
+ }
+ if (!foundBridge) {
+ aggAssignments[numBridgesWithAggAssignments].bridgeRank =
+ all_procInfo[i].bridgeRank;
+ aggAssignments[numBridgesWithAggAssignments].numAggsAssigned = 1;
+ numBridgesWithAggAssignments++;
+ nodesAssigned[all_procInfo[i].nodeRank] = 1;
+ aggList[currentAggListSize] = all_procInfo[i].rank;
+ currentAggListSize++;
+#ifdef bridgeringaggtrace
+ printf
+ ("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",
+ all_procInfo[i].rank, all_procInfo[i].nodeRank,
+ all_procInfo[i].bridgeRank, currentNumHops);
+#endif
+ }
+ }
+ }
+ }
+
+ if (numBridgesWithAggAssignments == confInfo->numBridgeRanks) {
+ allAggsAssigned = 1;
+ for (i = 0; (i < numBridgesWithAggAssignments && allAggsAssigned); i++) {
+ if (aggAssignments[i].numAggsAssigned < numAggs)
+ allAggsAssigned = 0;
+ }
+ }
+ currentNumHops++;
+ /* Handle the case where the numAggs is more than exists starting
+ * at gpfsmpio_bridgeringagg hops, wrap back and restart at 0 to
+ * assign the overrun - it is up to the user to realize this
+ * situation and adjust numAggs and gpfsmpio_bridgeringagg
+ * accordingly.
+ */
+ if (currentNumHops > 16)
+ currentNumHops = 0;
+ /* If 3 rounds go by without selecting an agg, abort to avoid an
+ * infinite loop.
+ */
+ if (startingCurrentAggListSize == currentAggListSize)
+ numIterForHopsWithNoAggs++;
+ else
+ numIterForHopsWithNoAggs = 0;
+ ADIOI_Assert(numIterForHopsWithNoAggs <= 3);
+ }
+
+ ADIOI_Free(aggAssignments);
+ ADIOI_Free(nodesAssigned);
+
+ } // else aggTotal > 1
+
+ memcpy(tmp_ranklist, aggList, aggTotal * sizeof(int));
+ } // gpfsmpio_bridgeringagg > 0
+
+ else { // gpfsmpio_bridgeringagg unset - default code
+
+ int distance, numAggs;
+
+ /* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
+ * bridge node */
+
+ sortstruct *bridgelist = (sortstruct *) ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
+ for (i = 0; i < confInfo->nProcs; i++) {
+ bridgelist[i].bridge = all_procInfo[i].bridgeRank;
+ bridgelist[i].rank = i;
+ TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
+ }
+
+ /* This list contains rank->bridge info. Now, we need to sort this list. */
+ qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
+
+ /* In this array, we can pick an appropriate number of midpoints based on
+ * our bridgenode index and the number of aggregators */
+
+ numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize */ ;
+ if (numAggs == 1)
+ aggTotal = 1;
+ else
+ /* the number of aggregators is (numAggs per bridgenode) plus each
+ * bridge node is an aggregator */
+ aggTotal = confInfo->numBridgeRanks * (numAggs + 1);
+
+ if (aggTotal > confInfo->nProcs)
+ aggTotal = confInfo->nProcs;
+
+ TRACE_ERR
+ ("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d/%d numAggs: %d, aggTotal: %d\n",
+ confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks,
+ confInfo->ioMinSize, confInfo->ioMaxSize /*virtualPsetSize */ , numAggs, aggTotal);
+ aggList = (int *) ADIOI_Malloc(aggTotal * sizeof(int));
+
+
+ /* For each bridge node, determine who the aggregators will be */
+ /* basically, the n*distance and bridge node */
+ if (aggTotal == 1) /* special case when we only have one bridge node */
+ aggList[0] = bridgelist[0].bridge;
+ else {
+ int lastBridge = bridgelist[confInfo->nProcs - 1].bridge;
+ int nextBridge = 0, nextAggr = confInfo->numBridgeRanks;
+ int psetSize = 0;
+ int procIndex;
+ for (procIndex = confInfo->nProcs - 1; procIndex >= 0; procIndex--) {
+ TRACE_ERR("bridgelist[%d].bridge %u/rank %u\n", procIndex,
+ bridgelist[procIndex].bridge, bridgelist[procIndex].rank);
+ if (lastBridge == bridgelist[procIndex].bridge) {
+ psetSize++;
+ if (procIndex)
+ continue;
+ else
+ procIndex--; /* procIndex == 0 */
+ }
+ /* Sets up a list of nodes which will act as aggregators. numAggs
+ * per bridge node total. The list of aggregators is
+ * bridgeNode 0
+ * bridgeNode 1
+ * bridgeNode ...
+ * bridgeNode N
+ * bridgeNode[0]aggr[0]
+ * bridgeNode[0]aggr[1]...
+ * bridgeNode[0]aggr[N]...
+ * ...
+ * bridgeNode[N]aggr[0]..
+ * bridgeNode[N]aggr[N]
+ */
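+ /* Worked example (illustrative values only): with 2 bridge nodes and
+ * numAggs = 2, the finished aggList is laid out as
+ * { bridge0, bridge1, b0agg0, b0agg1, b1agg0, b1agg1 },
+ * i.e. all bridge ranks first, then numAggs aggregators per bridge. */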
+ aggList[nextBridge] = lastBridge;
+ distance = psetSize / numAggs;
+ TRACE_ERR("nextBridge %u is bridge %u, distance %u, size %u\n", nextBridge,
+ aggList[nextBridge], distance, psetSize);
+ if (numAggs > 1) {
+ for (j = 0; j < numAggs; j++) {
+ ADIOI_Assert(nextAggr < aggTotal);
+ aggList[nextAggr] = bridgelist[procIndex + j * distance + 1].rank;
+ TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr,
+ procIndex + j * distance + 1, aggList[nextAggr]);
+ if (aggList[nextAggr] == lastBridge) { /* can't have bridge in the list twice */
+ aggList[nextAggr] = bridgelist[procIndex + psetSize].rank; /* take the last one in the pset */
+ TRACE_ERR("replacement agglist[%d] -> bridgelist[%d] = %d\n", nextAggr,
+ procIndex + psetSize, aggList[nextAggr]);
+ }
+ nextAggr++;
+ }
+ }
+ if (procIndex < 0)
+ break;
+ lastBridge = bridgelist[procIndex].bridge;
+ psetSize = 1;
+ nextBridge++;
+ }
+ }
+
+ TRACE_ERR
+ ("memcpy(tmp_ranklist, aggList, (numAggs(%u)*confInfo->numBridgeRanks(%u)+numAggs(%u)) (%u) %u*sizeof(int))\n",
+ numAggs, confInfo->numBridgeRanks, numAggs,
+ (numAggs * confInfo->numBridgeRanks + numAggs), aggTotal);
+ memcpy(tmp_ranklist, aggList, aggTotal * sizeof(int));
+ for (i = 0; i < aggTotal; i++) {
+ TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
+ }
+
+
+ ADIOI_Free(bridgelist);
+
+ TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
+ }
+
+ ADIOI_Free(aggList);
+ return aggTotal;
+
+}
+
+/*
+ * compute aggregators ranklist and put it into fd->hints struct
+ */
+static void
+ADIOI_BG_compute_agg_ranklist_serial(ADIO_File fd,
+ const ADIOI_BG_ConfInfo_t * confInfo,
+ ADIOI_BG_ProcInfo_t * all_procInfo)
+{
+ TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
+ int i;
+ int naggs;
+ int size;
+ int *tmp_ranklist;
+
+ /* compute the ranklist of IO aggregators and put into tmp_ranklist */
+ tmp_ranklist = (int *) ADIOI_Malloc(confInfo->nProcs * sizeof(int));
+
+#if AGG_DEBUG
+ for (i = 0; i < confInfo->nProcs; i++) {
+ DBG_FPRINTF(stderr, "\trank = %6d\n", all_procInfo[i].rank);
+ }
+#endif
+
+ naggs = ADIOI_BG_compute_agg_ranklist_serial_do(confInfo, all_procInfo, tmp_ranklist);
+
+#define VERIFY 1
+#if VERIFY
+ DBG_FPRINTF(stderr,
+ "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, ratio: %.4f; naggs = %d\n",
+ confInfo->ioMinSize, confInfo->ioMaxSize, confInfo->nAggrs,
+ confInfo->numBridgeRanks, confInfo->nProcs,
+ confInfo->ioMaxSize /*virtualPsetSize */ ,
+ confInfo->aggRatio, naggs);
+#endif
+ MPI_Comm_size(fd->comm, &size);
+ /* This fix is for when the bridgenode rank is not part of the particular
+ * subcomm associated with this MPI File operation. I don't know if
+ * this is the best/right answer but it passes the test cases at least.
+ * I don't know how common file IO in subcomms is anyway... */
+ for (i = 0; i < naggs; i++) {
+ if (tmp_ranklist[i] > size) {
+ TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
+ i, tmp_ranklist[i], fd->comm);
+ tmp_ranklist[i] = 0;
+ }
+ }
+
+#if AGG_DEBUG
+ for (i = 0; i < naggs; i++) {
+ DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i]);
+ }
+#endif
+ if (gpfsmpio_balancecontig == 1) {
+ /* what comes out of this code block is the agg ranklist sorted by
+ * bridge set and ion id with associated bridge info stored in the
+ * hints structure for later access during file domain assignment */
+
+ // sort the agg ranklist by ions and bridges
+
+ int *interleavedbridgeranklist = (int *) ADIOI_Malloc(naggs * sizeof(int)); // resorted agg rank list
+ /* list of all bridge ranks */
+ int *bridgelist = (int *) ADIOI_Malloc(naggs * sizeof(int));
+
+ /* each entry here is the number of aggregators associated with the
+ * bridge rank of the same index in bridgelist */
+ int *bridgelistnum = (int *) ADIOI_Malloc(naggs * sizeof(int));
+ /* list of all ion IDs corresponding with bridgelist entries of same index */
+ int *ionlist = (int *) ADIOI_Malloc(naggs * sizeof(int));
+
+ int numbridges = 0;
+
+ for (i = 0; i < naggs; i++)
+ bridgelistnum[i] = 0;
+
+ /* Each entry in this list corresponds with the bridgelist and will contain the lowest bridge
+ * agg rank on that ion. */
+ int *summarybridgeminionaggrank = (int *) ADIOI_Malloc(naggs * sizeof(int));
+ for (i = 0; i < naggs; i++)
+ summarybridgeminionaggrank[i] = -1;
+
+ /* build the bridgelist, ionlist and bridgelistnum data by going thru each agg
+ * entry and finding the associated bridge list index - at the end we will
+ * know how many aggs belong to each bridge in each ion */
+ for (i = 0; i < naggs; i++) {
+ int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
+ int aggionid = all_procInfo[tmp_ranklist[i]].ionID;
+ int foundrank = 0;
+ int summaryranklistbridgeindex = 0;
+ int j;
+ for (j = 0; (j < numbridges && !foundrank); j++) {
+ if (bridgelist[j] == aggbridgerank) {
+ foundrank = 1;
+ summaryranklistbridgeindex = j;
+ } else
+ summaryranklistbridgeindex++;
+ }
+ if (!foundrank) {
+ bridgelist[summaryranklistbridgeindex] = aggbridgerank;
+ ionlist[summaryranklistbridgeindex] = aggionid;
+
+ if (summarybridgeminionaggrank[summaryranklistbridgeindex] == -1)
+ summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
+ else if (summarybridgeminionaggrank[summaryranklistbridgeindex] > aggbridgerank)
+ summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
+ numbridges++;
+ }
+
+ bridgelistnum[summaryranklistbridgeindex]++;
+ }
+
+ /* at this point summarybridgeminionaggrank has the agg rank of the bridge for each entry;
+ * we still need to make each entry the minimum bridge rank for the entire ion. */
+ for (i = 0; i < numbridges; i++) {
+ int aggIonId = ionlist[i];
+ int j;
+ for (j = 0; j < numbridges; j++) {
+ if (ionlist[j] == aggIonId) {
+ if (summarybridgeminionaggrank[j] < summarybridgeminionaggrank[i])
+ summarybridgeminionaggrank[i] = summarybridgeminionaggrank[j];
+ }
+ }
+ }
+
+ // resort by io node minimum bridge rank
+ int x;
+ for (x = 0; x < numbridges; x++) {
+ for (i = 0; i < (numbridges - 1); i++) {
+ if (summarybridgeminionaggrank[i] > summarybridgeminionaggrank[i + 1]) {
+ int tmpminionaggrank = summarybridgeminionaggrank[i];
+ summarybridgeminionaggrank[i] = summarybridgeminionaggrank[i + 1];
+ summarybridgeminionaggrank[i + 1] = tmpminionaggrank;
+ int tmpionid = ionlist[i];
+ ionlist[i] = ionlist[i + 1];
+ ionlist[i + 1] = tmpionid;
+ int tmpbridgerank = bridgelist[i];
+ bridgelist[i] = bridgelist[i + 1];
+ bridgelist[i + 1] = tmpbridgerank;
+ int tmpbridgeranknum = bridgelistnum[i];
+ bridgelistnum[i] = bridgelistnum[i + 1];
+ bridgelistnum[i + 1] = tmpbridgeranknum;
+ }
+ }
+ }
+
+ // for each io node make sure bridgelist is in rank order
+ int startSortIndex = -1;
+ int endSortIndex = -1;
+ int currentBridgeIndex = 0;
+
+ while (currentBridgeIndex < numbridges) {
+ int currentIonId = ionlist[currentBridgeIndex];
+ startSortIndex = currentBridgeIndex;
+ while (ionlist[currentBridgeIndex] == currentIonId)
+ currentBridgeIndex++;
+ endSortIndex = currentBridgeIndex - 1;
+ for (x = startSortIndex; x <= endSortIndex; x++) {
+ for (i = startSortIndex; i < endSortIndex; i++) {
+ if (bridgelist[i] > bridgelist[i + 1]) {
+ int tmpbridgerank = bridgelist[i];
+ bridgelist[i] = bridgelist[i + 1];
+ bridgelist[i + 1] = tmpbridgerank;
+ int tmpbridgeranknum = bridgelistnum[i];
+ bridgelistnum[i] = bridgelistnum[i + 1];
+ bridgelistnum[i + 1] = tmpbridgeranknum;
+ }
+ }
+ }
+ }
+
+
+ /* populate interleavedbridgeranklist - essentially the agg rank list
+ * is now sorted by the ion minimum bridge rank and bridge node */
+ int currentrankoffset = 0;
+ for (i = 0; i < numbridges; i++) {
+ int *thisBridgeAggList = (int *) ADIOI_Malloc(naggs * sizeof(int));
+ int numAggsForThisBridge = 0;
+
+ int k;
+ for (k = 0; k < naggs; k++) {
+ int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
+ if (aggbridgerank == bridgelist[i]) {
+ thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
+ numAggsForThisBridge++;
+ }
+ }
+
+ // sort thisBridgeAggList
+ for (x = 0; x < numAggsForThisBridge; x++) {
+ int n;
+ for (n = 0; n < (numAggsForThisBridge - 1); n++) {
+ if (thisBridgeAggList[n] > thisBridgeAggList[n + 1]) {
+ int tmpthisBridgeAggList = thisBridgeAggList[n];
+ thisBridgeAggList[n] = thisBridgeAggList[n + 1];
+ thisBridgeAggList[n + 1] = tmpthisBridgeAggList;
+ }
+ }
+ }
+ int n;
+ for (n = 0; n < numAggsForThisBridge; n++) {
+ interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
+ currentrankoffset++;
+ }
+ ADIOI_Free(thisBridgeAggList);
+ }
+
+#ifdef balancecontigtrace
+ fprintf(stderr, "Interleaved aggregator list:\n");
+ for (i = 0; i < naggs; i++) {
+ fprintf(stderr, "Agg: %d Agg rank: %d with bridge rank %d and ion ID %d\n", i,
+ interleavedbridgeranklist[i],
+ all_procInfo[interleavedbridgeranklist[i]].bridgeRank,
+ all_procInfo[interleavedbridgeranklist[i]].ionID);
+ }
+ fprintf(stderr, "Bridges list:\n");
+ for (i = 0; i < numbridges; i++) {
+ fprintf(stderr, "bridge %d ion min rank %d rank %d number of aggs %d ion id %d\n", i,
+ summarybridgeminionaggrank[i], bridgelist[i], bridgelistnum[i], ionlist[i]);
+ }
+
+#endif
+ /* copy the ranklist of IO aggregators to fd->hints */
+ if (fd->hints->ranklist != NULL)
+ ADIOI_Free(fd->hints->ranklist);
+ if (fd->hints->fs_hints.bg.bridgelist != NULL)
+ ADIOI_Free(fd->hints->fs_hints.bg.bridgelist);
+ if (fd->hints->fs_hints.bg.bridgelistnum != NULL)
+ ADIOI_Free(fd->hints->fs_hints.bg.bridgelistnum);
+
+ fd->hints->cb_nodes = naggs;
+ fd->hints->fs_hints.bg.numbridges = numbridges;
+ fd->hints->ranklist = (int *) ADIOI_Malloc(naggs * sizeof(int));
+ memcpy(fd->hints->ranklist, interleavedbridgeranklist, naggs * sizeof(int));
+
+ fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc(naggs * sizeof(int));
+ memcpy(fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs * sizeof(int));
+
+ fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc(naggs * sizeof(int));
+ memcpy(fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs * sizeof(int));
+
+ ADIOI_Free(summarybridgeminionaggrank);
+ ADIOI_Free(tmp_ranklist);
+ ADIOI_Free(bridgelistnum);
+ ADIOI_Free(bridgelist);
+ ADIOI_Free(interleavedbridgeranklist);
+ ADIOI_Free(ionlist);
+
+ } else {
+ /* classic topology-agnostic copy of the ranklist of IO aggregators to
+ * fd->hints */
+ if (fd->hints->ranklist != NULL)
+ ADIOI_Free(fd->hints->ranklist);
+
+ fd->hints->cb_nodes = naggs;
+ fd->hints->ranklist = (int *) ADIOI_Malloc(naggs * sizeof(int));
+ memcpy(fd->hints->ranklist, tmp_ranklist, naggs * sizeof(int));
+
+ ADIOI_Free(tmp_ranklist);
+ }
+ TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_aggrs.h b/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_aggrs.h
new file mode 100644
index 0000000000000000000000000000000000000000..8668ff394484ea2e0281081a805f1ae8f3bd74ff
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_aggrs.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_bg_aggrs.h
+ * \brief ???
+ */
+
+/*
+ *
+ * Declares functions specific for the BlueGene platform within the GPFS
+ * parallel I/O solution. Implements aligned file-domain partitioning
+ * (7/28/2005); persistent file domain work not implemented
+ *
+ */
+
+#ifndef AD_BG_AGGRS_H_INCLUDED
+#define AD_BG_AGGRS_H_INCLUDED
+
+#include "adio.h"
+#include <sys/uio.h> /* assumed */
+
+#ifdef HAVE_GPFS_H
+#include <gpfs.h>
+#endif
+#if !defined(GPFS_SUPER_MAGIC)
+#define GPFS_SUPER_MAGIC (0x47504653)
+#endif
+
+ /* generate a list of I/O aggregators that utilizes BG-PSET organization. */
+int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
+
+#endif /* AD_BG_AGGRS_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_pset.c b/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_pset.c
new file mode 100644
index 0000000000000000000000000000000000000000..1d3f00e7f5019bffde7d9444cef0b6bb8bb8eb3c
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_pset.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_bg_pset.c
+ * \brief Definition of functions associated to structs ADIOI_BG_ProcInfo_t and ADIOI_BG_ConfInfo_t
+ */
+
+/* #define TRACE_ON */
+// #define bridgeringaggtrace 1
+
+#include <stdlib.h>
+#include <stdbool.h> /* assumed; bool is used below */
+#include "../ad_gpfs.h"
+#include "ad_bg_pset.h"
+#include <spi/include/kernel/process.h> /* assumed BG/Q SPI header */
+#include <firmware/include/personality.h> /* assumed; provides Personality_t */
+
+#define BGQ_TORUS_MAX_DIMS 5
+#define BGQ_FULL_TORUS_SIZE 512
+
+#ifndef TRACE_ERR
+#define TRACE_ERR(fmt...)
+#endif
+
+ADIOI_BG_ProcInfo_t *ADIOI_BG_ProcInfo_new()
+{
+ ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc(sizeof(ADIOI_BG_ProcInfo_t));
+ ADIOI_Assert((p != NULL));
+ return p;
+}
+
+ADIOI_BG_ProcInfo_t *ADIOI_BG_ProcInfo_new_n(int n)
+{
+ ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc(n * sizeof(ADIOI_BG_ProcInfo_t));
+ ADIOI_Assert((p != NULL));
+ return p;
+}
+
+void ADIOI_BG_ProcInfo_free(ADIOI_BG_ProcInfo_t * info)
+{
+ if (info != NULL)
+ ADIOI_Free(info);
+}
+
+ADIOI_BG_ConfInfo_t *ADIOI_BG_ConfInfo_new()
+{
+ ADIOI_BG_ConfInfo_t *p = (ADIOI_BG_ConfInfo_t *) ADIOI_Malloc(sizeof(ADIOI_BG_ConfInfo_t));
+ ADIOI_Assert((p != NULL));
+ return p;
+}
+
+
+void ADIOI_BG_ConfInfo_free(ADIOI_BG_ConfInfo_t * info)
+{
+ if (info != NULL)
+ ADIOI_Free(info);
+}
+
+
+typedef struct {
+ int rank;
+ int bridgeCoord;
+} sortstruct;
+
+static int intsort(const void *p1, const void *p2)
+{
+ sortstruct *i1, *i2;
+ i1 = (sortstruct *) p1;
+ i2 = (sortstruct *) p2;
+ return (i1->bridgeCoord - i2->bridgeCoord);
+}
+
+unsigned torusSize[BGQ_TORUS_MAX_DIMS];
+bool dimTorus[BGQ_TORUS_MAX_DIMS];
+
+/* This function computes the number of hops between the torus coordinates of the
+ * aggCoords and bridgeCoords parameters.
+*/
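+/* Worked example (illustrative coordinates): aggCoords = {0,0,3,0,0} and
+ * bridgeCoords = {0,0,1,0,0} give dimDistance 2 in the C dimension and 0
+ * elsewhere, so the total Manhattan distance is 2. When a dimension is a
+ * torus, the wrap-around link is considered and may shorten that
+ * dimension's contribution. */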
+static unsigned procManhattanDistance(unsigned *aggCoords, unsigned *bridgeCoords)
+{
+
+ unsigned totalDistance = 0;
+ int i;
+ for (i = 0; i < BGQ_TORUS_MAX_DIMS; i++) {
+ unsigned dimDistance = abs((int) aggCoords[i] - (int) bridgeCoords[i]);
+ if (dimDistance > 0) { // could torus make it closer?
+ if (dimTorus[i]) {
+ if (aggCoords[i] == torusSize[i]) { // is wrap-around closer
+ if ((bridgeCoords[i] + 1) < dimDistance) // assume will use torus link
+ dimDistance = bridgeCoords[i] + 1;
+ } else if (bridgeCoords[i] == torusSize[i]) { // is wrap-around closer
+ if ((aggCoords[i] + 1) < dimDistance) // assume will use torus link
+ dimDistance = aggCoords[i] + 1;
+ }
+ }
+ } /* else: dimDistance == 0, meaning aggCoords[i] and bridgeCoords[i] are
+ * the same and there's no closer point to pick */
+ totalDistance += dimDistance;
+ }
+ return totalDistance;
+}
+
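+/* Sketch of the lookup below (format assumed from the parsing code): the
+ * /dev/bgpers text contains an entry such as "BG_UCI=ABCD0123456789AB";
+ * the I/O node id is the high 32 bits of that hex value, and the result
+ * is cached in IO_node_id so the device is read only once per process. */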
+int BGQ_IO_node_id()
+{
+ static unsigned long IO_node_id = ULONG_MAX;
+
+ if (IO_node_id != ULONG_MAX)
+ return (int) (IO_node_id >> 32);
+
+ int rc;
+ int fd;
+ char *uci_str;
+ char buffer[4096];
+
+ fd = open("/dev/bgpers", O_RDONLY, 0);
+ assert(fd >= 0);
+ rc = read(fd, buffer, sizeof(buffer));
+ assert(rc > 0);
+ close(fd);
+
+ uci_str = strstr(buffer, "BG_UCI=");
+ assert(uci_str);
+ uci_str += sizeof("BG_UCI=") - 1;
+
+ IO_node_id = strtoul(uci_str, NULL, 16);
+ return (int) (IO_node_id >> 32);
+}
+
+void
+ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t * conf,
+ ADIOI_BG_ProcInfo_t * proc, int size, int rank, int n_aggrs, MPI_Comm comm)
+{
+ int i, iambridge = 0, bridgerank = -1, bridgeIndex;
+ int countPset;
+ sortstruct *bridges;
+ int commsize;
+
+ TRACE_ERR("Entering BG_persInfo_init, size: %d, rank: %d, n_aggrs: %d, comm: %d\n", size, rank,
+ n_aggrs, (int) comm);
+
+ Personality_t pers;
+
+
+ Kernel_GetPersonality(&pers, sizeof(pers));
+ Personality_Networks_t *net = &pers.Network_Config;
+
+ TRACE_ERR("BG_persInfo_init, my coords{%u,%u,%u,%u,%u}\n", net->Acoord, net->Bcoord,
+ net->Ccoord, net->Dcoord, net->Ecoord);
+ proc->rank = rank;
+
+ if (gpfsmpio_bridgeringagg > 0) {
+#ifdef bridgeringaggtrace
+ if (rank == 0)
+ fprintf(stderr, "Block dimensions:\n");
+#endif
+
+ /* Set the numNodesInPartition and nodeRank for this proc
+ */
+ unsigned dimMaxArray[BGQ_TORUS_MAX_DIMS];
+ dimMaxArray[0] = net->Anodes;
+ dimMaxArray[1] = net->Bnodes;
+ dimMaxArray[2] = net->Cnodes;
+ dimMaxArray[3] = net->Dnodes;
+ dimMaxArray[4] = net->Enodes;
+
+ unsigned hwCoordsArray[BGQ_TORUS_MAX_DIMS];
+ hwCoordsArray[0] = net->Acoord;
+ hwCoordsArray[1] = net->Bcoord;
+ hwCoordsArray[2] = net->Ccoord;
+ hwCoordsArray[3] = net->Dcoord;
+ hwCoordsArray[4] = net->Ecoord;
+ proc->numNodesInPartition =
+ net->Anodes * net->Bnodes * net->Cnodes * net->Dnodes * net->Enodes;
+ proc->nodeRank = 0;
+ /* Set the indicator for whether each dimension in the partition is a torus.
+ */
+ dimTorus[0] = (bool) (ND_ENABLE_TORUS_DIM_A & net->NetFlags);
+ dimTorus[1] = (bool) (ND_ENABLE_TORUS_DIM_B & net->NetFlags);
+ dimTorus[2] = (bool) (ND_ENABLE_TORUS_DIM_C & net->NetFlags);
+ dimTorus[3] = (bool) (ND_ENABLE_TORUS_DIM_D & net->NetFlags);
+ dimTorus[4] = (bool) (ND_ENABLE_TORUS_DIM_E & net->NetFlags);
+ for (i = 0; i < BGQ_TORUS_MAX_DIMS; i++) {
+ torusSize[i] = dimMaxArray[i];
+ int baseNum = 1, j;
+ for (j = 0; j < i; j++)
+ baseNum *= dimMaxArray[j];
+ proc->nodeRank += (hwCoordsArray[i] * baseNum);
+#ifdef bridgeringaggtrace
+ if (rank == 0)
+ fprintf(stderr,
+ "numNodesInPartition is %d Dimension %d has %d elements wrap-around value is %d\n",
+ proc->numNodesInPartition, i, torusSize[i], dimTorus[i]);
+#endif
+ }
+ }
+
+ MPI_Comm_size(comm, &commsize);
+
+ proc->ionID = BGQ_IO_node_id();
+
+ if (size == 1) {
+ proc->iamBridge = 1;
+ proc->bridgeRank = rank;
+ if (gpfsmpio_bridgeringagg > 0) {
+ proc->manhattanDistanceToBridge = 0;
+ }
+
+ /* Set up the other parameters */
+ proc->myIOSize = size;
+ proc->ioNodeIndex = 0;
+ conf->ioMinSize = size;
+ conf->ioMaxSize = size;
+ conf->numBridgeRanks = 1;
+ conf->nProcs = size;
+ conf->nAggrs = 1;
+ conf->aggRatio = 1. * conf->nAggrs / conf->ioMinSize /*virtualPsetSize */ ;
+ if (conf->aggRatio > 1)
+ conf->aggRatio = 1.;
+ TRACE_ERR("I am (single) Bridge rank\n");
+ return;
+ }
+
+ /* Find the nearest bridge node coords. We don't know the
+ * rank in our comm so we will collectively find/pick a bridge
+ * rank later.
+ */
+ int32_t bridgeCoords;
+ bridgeCoords = pers.Network_Config.cnBridge_A << 24 |
+ pers.Network_Config.cnBridge_B << 18 |
+ pers.Network_Config.cnBridge_C << 12 |
+ pers.Network_Config.cnBridge_D << 6 | pers.Network_Config.cnBridge_E << 2;
+ ADIOI_Assert((bridgeCoords >= 0)); /* A dim is < 6 bits or sorting won't work */
+
+ if ((net->Acoord == pers.Network_Config.cnBridge_A) &&
+ (net->Bcoord == pers.Network_Config.cnBridge_B) &&
+ (net->Ccoord == pers.Network_Config.cnBridge_C) &&
+ (net->Dcoord == pers.Network_Config.cnBridge_D) &&
+ (net->Ecoord == pers.Network_Config.cnBridge_E)) {
+ iambridge = 1; /* I am bridge */
+ if (gpfsmpio_bridgeringagg > 0) {
+ proc->manhattanDistanceToBridge = 0;
+ }
+ } else { // calculate manhattan distance to bridge if gpfsmpio_bridgeringagg is set
+ if (gpfsmpio_bridgeringagg > 0) {
+ unsigned aggCoords[BGQ_TORUS_MAX_DIMS], manhattanBridgeCoords[BGQ_TORUS_MAX_DIMS];
+ aggCoords[0] = net->Acoord;
+ manhattanBridgeCoords[0] = pers.Network_Config.cnBridge_A;
+ aggCoords[1] = net->Bcoord;
+ manhattanBridgeCoords[1] = pers.Network_Config.cnBridge_B;
+ aggCoords[2] = net->Ccoord;
+ manhattanBridgeCoords[2] = pers.Network_Config.cnBridge_C;
+ aggCoords[3] = net->Dcoord;
+ manhattanBridgeCoords[3] = pers.Network_Config.cnBridge_D;
+ aggCoords[4] = net->Ecoord;
+ manhattanBridgeCoords[4] = pers.Network_Config.cnBridge_E;
+
+ proc->manhattanDistanceToBridge =
+ procManhattanDistance(aggCoords, manhattanBridgeCoords);
+#ifdef bridgeringaggtrace
+ fprintf(stderr,
+ "agg coords are %u %u %u %u %u bridge coords are %u %u %u %u %u distance is %u\n",
+ aggCoords[0], aggCoords[1], aggCoords[2], aggCoords[3], aggCoords[4],
+ manhattanBridgeCoords[0], manhattanBridgeCoords[1], manhattanBridgeCoords[2],
+ manhattanBridgeCoords[3], manhattanBridgeCoords[4],
+ proc->manhattanDistanceToBridge);
+#endif
+ }
+ }
+
+ TRACE_ERR("Bridge coords(%8.8X): %d %d %d %d %d, %d. iambridge %d\n", bridgeCoords,
+ pers.Network_Config.cnBridge_A, pers.Network_Config.cnBridge_B,
+ pers.Network_Config.cnBridge_C, pers.Network_Config.cnBridge_D,
+ pers.Network_Config.cnBridge_E, 0, iambridge);
+
+ /* Allgather the ranks and bridgeCoords to determine the bridge
+ * rank and how many ranks belong to each bridge rank */
+ bridges = (sortstruct *) ADIOI_Malloc(sizeof(sortstruct) * size);
+
+ /* We're going to sort this structure by bridgeCoord:
+ *
+ * typedef struct
+ * {
+ * int rank;
+ * int bridgeCoord;
+ * } sortstruct;
+ *
+ * and I want the rank that IS the bridge to sort first, so
+ * OR in '1' on non-bridge ranks that use a bridge coord.
+ */
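+ /* Example (illustrative): if several ranks share bridge coord 0x01000000,
+ * the bridge rank itself keeps 0x01000000 while every other rank stores
+ * 0x01000001, so after qsort() the bridge rank sorts first in its group. */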
+
+ /* My input to the collective */
+ bridges[rank].rank = rank;
+ bridges[rank].bridgeCoord = bridgeCoords;
+ if (!iambridge)
+ bridges[rank].bridgeCoord |= 1; /* I am not bridge, turn on bit */
+
+
+ MPI_Allgather(MPI_IN_PLACE, 2, MPI_INT, bridges, 2, MPI_INT, comm);
+
+ qsort(bridges, size, sizeof(sortstruct), intsort);
+
+ /* Once the list is sorted, walk through it to set up bridge
+ * info and find bridge ranks, etc. */
+
+ int tempCoords, tempRank, mincompute, maxcompute;
+ tempCoords = bridges[0].bridgeCoord & ~1;
+ tempRank = bridges[0].rank;
+
+ countPset = 1;
+ bridgeIndex = 0;
+ mincompute = size + 1;
+ maxcompute = 1;
+
+ for (i = 1; i < size; i++) {
+ if ((bridges[i].bridgeCoord & ~1) == tempCoords)
+ countPset++; /* same bridge (pset), count it */
+ else { /* new bridge found */
+
+#ifdef TRACE_ON
+ if (rank == 0)
+ TRACE_ERR("Bridge set %u, bridge rank %d (%#8.8X) has %d ranks\n",
+ bridgeIndex, tempRank, tempCoords, countPset);
+#endif
+ if (countPset > maxcompute)
+ maxcompute = countPset;
+ if (countPset < mincompute)
+ mincompute = countPset;
+
+ /* Was this my bridge we finished? */
+ if (tempCoords == bridgeCoords) {
+ /* Am I the bridge rank? */
+ if (tempRank == rank)
+ iambridge = 1;
+ else
+ iambridge = 0; /* Another rank on my node may have taken over */
+ TRACE_ERR
+ ("Rank %u, bridge set %u, bridge rank %d (%#8.8X) has %d ranks, iambridge %u\n",
+ rank, bridgeIndex, tempRank, tempCoords, countPset, iambridge);
+ bridgerank = tempRank;
+ proc->myIOSize = countPset;
+ proc->ioNodeIndex = bridgeIndex;
+ }
+ /* Setup next bridge */
+ tempCoords = bridges[i].bridgeCoord & ~1;
+ tempRank = bridges[i].rank;
+ bridgeIndex++;
+ countPset = 1;
+ }
+ }
+ /* Process last bridge */
+
+#ifdef TRACE_ON
+ if (rank == 0)
+ TRACE_ERR("Bridge set %u, bridge rank %d (%#8.8X) has %d ranks\n",
+ bridgeIndex, tempRank, tempCoords, countPset);
+#endif
+ if (countPset > maxcompute)
+ maxcompute = countPset;
+ if (countPset < mincompute)
+ mincompute = countPset;
+
+ /* Was this my bridge? */
+ if (tempCoords == bridgeCoords) {
+ /* Am I the bridge rank? */
+ if (tempRank == rank)
+ iambridge = 1;
+ else
+ iambridge = 0; /* Another rank on my node may have taken over */
+ bridgerank = tempRank;
+ proc->myIOSize = countPset;
+ proc->ioNodeIndex = bridgeIndex;
+ }
+
+
+ if (rank == 0) {
+ /* Only rank 0 has a conf structure, fill in stuff as appropriate */
+ conf->ioMinSize = mincompute;
+ conf->ioMaxSize = maxcompute; /* equivalent to pset size */
+ conf->numBridgeRanks = bridgeIndex + 1;
+ conf->nProcs = size;
+
+ conf->nAggrs = n_aggrs;
+ /* First pass gets nAggrs = -1 */
+ if (conf->nAggrs <= 0)
+ conf->nAggrs = gpfsmpio_bg_nagg_pset;
+ if (conf->ioMinSize <= conf->nAggrs)
+ conf->nAggrs = MPL_MAX(1, conf->ioMinSize - 1); /* not including bridge itself */
+/* if (conf->nAggrs > conf->numBridgeRanks)
+ conf->nAggrs = conf->numBridgeRanks;
+*/
+ conf->aggRatio = 1. * conf->nAggrs / conf->ioMinSize /*virtualPsetSize */ ;
+/* if (conf->aggRatio > 1) conf->aggRatio = 1.; */
+ TRACE_ERR
+ ("n_aggrs %zd, conf->nProcs %zu, conf->ioMaxSize %zu, ADIOI_BG_NAGG_PSET_DFLT %zu,conf->numBridgeRanks %zu,conf->nAggrs %zu\n",
+ (size_t) n_aggrs, (size_t) conf->nProcs, (size_t) conf->ioMaxSize,
+ (size_t) ADIOI_BG_NAGG_PSET_DFLT, (size_t) conf->numBridgeRanks,
+ (size_t) conf->nAggrs);
+ TRACE_ERR
+ ("Maximum ranks under a bridge rank: %d, minimum: %d, nAggrs: %d, numBridgeRanks: %d pset dflt: %d naggrs: %d ratio: %f\n",
+ maxcompute, mincompute, conf->nAggrs, conf->numBridgeRanks, ADIOI_BG_NAGG_PSET_DFLT,
+ conf->nAggrs, conf->aggRatio);
+ }
+
+ ADIOI_Assert((bridgerank != -1));
+ proc->bridgeRank = bridgerank;
+ proc->iamBridge = iambridge;
+ TRACE_ERR
+ ("Rank %d has bridge set index %d (bridge rank: %d) with %d other ranks, ioNodeIndex: %d\n",
+ rank, proc->ioNodeIndex, bridgerank, proc->myIOSize, proc->ioNodeIndex);
+
+ ADIOI_Free(bridges);
+
+}
+
+void ADIOI_BG_persInfo_free(ADIOI_BG_ConfInfo_t * conf, ADIOI_BG_ProcInfo_t * proc)
+{
+ ADIOI_BG_ConfInfo_free(conf);
+ ADIOI_BG_ProcInfo_free(proc);
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_pset.h b/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_pset.h
new file mode 100644
index 0000000000000000000000000000000000000000..e76bdcf0bde5e714fb7b9cee924e5b5751a7a2a7
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/bg/ad_bg_pset.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_bg_pset.h
+ * \brief ???
+ */
+
+/* File: ad_bg_pset.h
+ *
+ * Defines two structures that keep BlueGene PSET specific information and their public interfaces:
+ * . ADIOI_BG_ProcInfo_t object keeps information specific to each process
+ * . ADIOI_BG_ConfInfo_t object keeps general information for the whole communicator, only kept
+ * on process 0.
+ */
+
+#ifndef AD_BG_PSET_H_INCLUDED
+#define AD_BG_PSET_H_INCLUDED
+
+
+/* Keeps information specific to each process; it will be exchanged among processes */
+typedef struct {
+ int ioNodeIndex; /* similar to psetNum on BGL/BGP */
+ int rank; /* my rank */
+ int ionID; /* ion id this cn is using */
+/* int myCoords[5]; */
+ int bridgeRank; /* my bridge node (or proxy) rank */
+ unsigned char threadID; /* unlikely to be useful but better than just padding */
+ unsigned char __cpad[2];
+ int myIOSize; /* number of ranks sharing my bridge/IO
+ * node, i.e. psetsize */
+ int iamBridge; /* am *I* the bridge rank? */
+ int __ipad[2];
+ unsigned nodeRank; /* torus coords converted to an integer for use with gpfsmpio_bridgeringagg */
+ unsigned numNodesInPartition; /* number of physical nodes in the job partition */
+ unsigned manhattanDistanceToBridge; /* number of hops between this rank and the bridge node */
+} ADIOI_BG_ProcInfo_t __attribute__ ((aligned(16)));
+
+/* Keeps general information for the whole communicator, only on process 0 */
+typedef struct {
+ int ioMinSize; /* Smallest number of ranks sharing 1 bridge node */
+ int ioMaxSize; /* Largest number of ranks sharing 1 bridge node */
+ /* ioMaxSize will be the "psetsize" */
+ int nAggrs;
+ int numBridgeRanks;
+ /*int virtualPsetSize; ppn * pset size */
+ int nProcs;
+ int cpuIDsize; /* num ppn */
+ float aggRatio;
+
+} ADIOI_BG_ConfInfo_t __attribute__ ((aligned(16)));
+
+
+
+
+
+/* public funcs for ADIOI_BG_ProcInfo_t objects */
+ADIOI_BG_ProcInfo_t *ADIOI_BG_ProcInfo_new();
+ADIOI_BG_ProcInfo_t *ADIOI_BG_ProcInfo_new_n(int n);
+void ADIOI_BG_ProcInfo_free(ADIOI_BG_ProcInfo_t * info);
+
+
+/* public funcs for ADIOI_BG_ConfInfo_t objects */
+ADIOI_BG_ConfInfo_t *ADIOI_BG_ConfInfo_new();
+void ADIOI_BG_ConfInfo_free(ADIOI_BG_ConfInfo_t * info);
+
+
+/* public funcs for a pair of ADIOI_BG_ConfInfo_t and ADIOI_BG_ProcInfo_t objects */
+int BGQ_IO_node_id();
+void ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t * conf,
+ ADIOI_BG_ProcInfo_t * proc, int s, int r, int n_aggrs, MPI_Comm comm);
+void ADIOI_BG_persInfo_free(ADIOI_BG_ConfInfo_t * conf, ADIOI_BG_ProcInfo_t * proc);
+
+
+#endif /* AD_BG_PSET_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_gpfs/pe/Makefile.mk b/3rd-party/romio341/adio/ad_gpfs/pe/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..cc5d406b4dcf5588d7071ec6f8ea5cabac712d4f
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/pe/Makefile.mk
@@ -0,0 +1,14 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_PE
+
+noinst_HEADERS += \
+ adio/ad_gpfs/pe/ad_pe_aggrs.h
+
+romio_other_sources += \
+ adio/ad_gpfs/pe/ad_pe_aggrs.c
+
+endif BUILD_AD_PE
diff --git a/3rd-party/romio341/adio/ad_gpfs/pe/ad_pe_aggrs.c b/3rd-party/romio341/adio/ad_gpfs/pe/ad_pe_aggrs.c
new file mode 100644
index 0000000000000000000000000000000000000000..b6564c74151c579b47fb2164935428b9065e8238
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/pe/ad_pe_aggrs.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_pe_aggrs.c
+ * \brief The externally used function from this file is declared in ad_pe_aggrs.h
+ */
+
+/*#define TRACE_ON */
+
+#include "adio.h"
+#include "adio_cb_config_list.h"
+#include "../ad_gpfs.h"
+#include "ad_pe_aggrs.h"
+#include "mpiimpl.h"
+
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+
+#ifdef MPL_USE_DBG_LOGGING
+#define AGG_DEBUG 1
+#endif
+
+#ifndef TRACE_ERR
+#define TRACE_ERR(format...)
+#endif
+
+/*
+ * Compute the aggregator-related parameters that are required in 2-phase
+ * collective IO of ADIO.
+ * The parameters are
+ * . the number of aggregators (proxies) : fd->hints->cb_nodes
+ * . the ranks of the aggregators : fd->hints->ranklist
+ * If MP_IONODEFILE is defined, POE determines all tasks on every node listed
+ * in the node file and defines MP_IOTASKLIST with them, making them all
+ * aggregators. Alternatively, the user can explicitly set MP_IOTASKLIST
+ * themselves. The format of the MP_IOTASKLIST is a colon-delimited list of
+ * task ids, the first entry being the total number of aggregators, for example
+ * to specify 4 aggregators on task ids 0,8,16,24 the value would be:
+ * 4:0:8:16:24. If there is no MP_IONODEFILE, or MP_IOTASKLIST, then the
+ * default aggregator selection is 1 task per node for every node of the job -
+ * additionally, an environment variable MP_IOAGGR_CNT can be specified, which
+ * defines the total number of aggregators, spread evenly across all the nodes.
+ * The romio_cb_nodes and romio_cb_config_list hint user settings are ignored.
+ */
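+/* For example (illustrative), to run with 4 aggregators on tasks 0, 8, 16
+ * and 24 one would set, before launch:
+ *
+ * export MP_IOTASKLIST=4:0:8:16:24
+ *
+ * The parser below reads the leading count, then consumes one
+ * colon-delimited task id at a time, ignoring duplicates and ids outside
+ * the communicator's rank range. */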
+int ADIOI_PE_gen_agg_ranklist(ADIO_File fd)
+{
+
+ int numAggs = 0;
+ char *ioTaskList = getenv("MP_IOTASKLIST");
+ char *ioAggrCount = getenv("MP_IOAGGR_CNT");
+ int i, j;
+ int inTERcommFlag = 0;
+
+ int myRank, commSize;
+ MPI_Comm_rank(fd->comm, &myRank);
+ MPI_Comm_size(fd->comm, &commSize);
+
+ MPI_Comm_test_inter(fd->comm, &inTERcommFlag);
+ if (inTERcommFlag) {
+ FPRINTF(stderr,
+ "ERROR: ATTENTION: inTERcomms are not supported in MPI-IO - aborting....\n");
+ perror("ADIOI_PE_gen_agg_ranklist:");
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+
+ if (ioTaskList) {
+ int ioTaskListLen = strlen(ioTaskList);
+ int ioTaskListPos = 0;
+ char tmpBuf[8]; /* Big enough for 1M tasks (7 digits task ID). */
+ tmpBuf[7] = '\0';
+ for (i = 0; i < 7; i++) {
+ tmpBuf[i] = *ioTaskList++; /* Maximum is 7 digits for 1 million. */
+ ioTaskListPos++;
+ if (*ioTaskList == ':') { /* If the next char is a ':' ends it. */
+ tmpBuf[i + 1] = '\0';
+ break;
+ }
+ }
+ numAggs = atoi(tmpBuf);
+ if (numAggs == 0)
+ FPRINTF(stderr,
+ "ERROR: ATTENTION: Number of aggregators specified in MP_IOTASKLIST set at 0 - default aggregator selection will be used.\n");
+ else if (!((numAggs > 0) && (numAggs <= commSize))) {
+ FPRINTF(stderr,
+ "ERROR: ATTENTION: The number of aggregators (%s) specified in MP_IOTASKLIST is outside the communicator task range of %d.\n",
+ tmpBuf, commSize);
+ numAggs = commSize;
+ }
+ fd->hints->ranklist = (int *) ADIOI_Malloc(numAggs * sizeof(int));
+
+ int aggIndex = 0;
+ while (aggIndex < numAggs) {
+ ioTaskList++; /* Advance past the ':' */
+ ioTaskListPos++;
+ int allDigits = 1;
+ for (i = 0; i < 7; i++) {
+ if (*ioTaskList < '0' || *ioTaskList > '9')
+ allDigits = 0;
+ tmpBuf[i] = *ioTaskList++;
+ ioTaskListPos++;
+ if ((*ioTaskList == ':') || (*ioTaskList == '\0')) {
+ tmpBuf[i + 1] = '\0';
+ break;
+ }
+ }
+ if (allDigits) {
+ int newAggRank = atoi(tmpBuf);
+ if (!((newAggRank >= 0) && (newAggRank < commSize))) {
+ FPRINTF(stderr,
+ "ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not within the communicator task range of 0 to %d - it will be ignored.\n",
+ tmpBuf, commSize - 1);
+ } else {
+ int aggAlreadyAdded = 0;
+ for (i = 0; i < aggIndex; i++)
+ if (fd->hints->ranklist[i] == newAggRank) {
+ aggAlreadyAdded = 1;
+ break;
+ }
+ if (!aggAlreadyAdded)
+ fd->hints->ranklist[aggIndex++] = newAggRank;
+ else
+ FPRINTF(stderr,
+ "ERROR: ATTENTION: The aggregator '%d' is specified multiple times in MP_IOTASKLIST - duplicates are ignored.\n",
+ newAggRank);
+ }
+ } else {
+ FPRINTF(stderr,
+ "ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not a valid integer task id - it will be ignored.\n",
+ tmpBuf);
+ }
+
+ /* At the end check whether the list is shorter than specified. */
+ if (ioTaskListPos == ioTaskListLen) {
+ if (aggIndex == 0) {
+ FPRINTF(stderr,
+ "ERROR: ATTENTION: No aggregators were correctly specified in MP_IOTASKLIST - default aggregator selection will be used.\n");
+ ADIOI_Free(fd->hints->ranklist);
+ } else if (aggIndex < numAggs)
+ FPRINTF(stderr,
+ "ERROR: ATTENTION: %d aggregators were specified in MP_IOTASKLIST but only %d were correctly specified - setting the number of aggregators to %d.\n",
+ numAggs, aggIndex, aggIndex);
+ numAggs = aggIndex;
+ }
+ }
+ }
+ if (numAggs == 0) {
+ MPIR_Comm *mpidCommData;
+
+ MPIR_Comm_get_ptr(fd->comm, mpidCommData);
+ int localSize = mpidCommData->local_size;
+
+ // get my node rank
+ int myNodeRank = mpidCommData->intranode_table[mpidCommData->rank];
+
+ int *allNodeRanks = (int *) ADIOI_Malloc(localSize * sizeof(int));
+
+ allNodeRanks[myRank] = myNodeRank;
+ MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, allNodeRanks, 1, MPI_INT, fd->comm);
+
+#ifdef AGG_DEBUG
+ printf("MPIR_Comm data: local_size is %d\nintranode_table entries:\n",
+ mpidCommData->local_size);
+ for (i = 0; i < localSize; i++) {
+ printf("%d ", mpidCommData->intranode_table[i]);
+ }
+ printf("\ninternode_table entries:\n");
+ for (i = 0; i < localSize; i++) {
+ printf("%d ", mpidCommData->internode_table[i]);
+ }
+ printf("\n");
+
+ printf("\nallNodeRanks entries:\n");
+ for (i = 0; i < localSize; i++) {
+ printf("%d ", allNodeRanks[i]);
+ }
+ printf("\n");
+
+#endif
+
+ if (ioAggrCount) {
+ int cntType = -1;
+
+ if (strcasecmp(ioAggrCount, "ALL")) {
+ if ((cntType = atoi(ioAggrCount)) <= 0) {
+ /* Input is non-numeric or less than 1, so assume */
+ /* 1 aggregator per node. Note: atoi("-1") returns -1. */
+ /* No warning message given here -- done earlier. */
+ cntType = -1;
+ }
+ } else {
+ /* ALL is specified; set aggr count to localSize */
+ cntType = -2;
+ }
+ switch (cntType) {
+ case -1:
+ /* 1 aggr/node case */
+ {
+ int rankListIndex = 0;
+ fd->hints->ranklist = (int *) ADIOI_Malloc(localSize * sizeof(int));
+ for (i = 0; i < localSize; i++) {
+ if (allNodeRanks[i] == 0) {
+ fd->hints->ranklist[rankListIndex++] = i;
+ numAggs++;
+ }
+ }
+ }
+ break;
+ case -2:
+ /* ALL tasks case */
+ fd->hints->ranklist = (int *) ADIOI_Malloc(localSize * sizeof(int));
+ for (i = 0; i < localSize; i++) {
+ fd->hints->ranklist[i] = i;
+ numAggs++;
+ }
+ break;
+ default:
+ /* Specific aggr count case -- must not exceed localSize, otherwise it is clamped to localSize */
+ if (cntType > localSize)
+ cntType = localSize;
+
+ numAggs = cntType;
+ // Round-robin thru allNodeRanks - pick the 0's, then the 1's, etc
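+ // Example (illustrative): with 2 nodes of 4 tasks each, so that
+ // allNodeRanks = {0,1,2,3,0,1,2,3}, and numAggs = 4, the scan below
+ // selects ranks 0 and 4 (node rank 0) and then ranks 1 and 5 (node rank 1).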
+ int currentNodeRank = 0; // node rank currently being selected as aggregator
+ int rankListIndex = 0;
+ int currentAllNodeIndex = 0;
+
+ fd->hints->ranklist = (int *) ADIOI_Malloc(numAggs * sizeof(int));
+
+ while (rankListIndex < numAggs) {
+ int foundEntry = 0;
+ while (!foundEntry && (currentAllNodeIndex < localSize)) {
+ if (allNodeRanks[currentAllNodeIndex] == currentNodeRank) {
+ fd->hints->ranklist[rankListIndex++] = currentAllNodeIndex;
+ foundEntry = 1;
+ }
+ currentAllNodeIndex++;
+ }
+ if (!foundEntry) {
+ currentNodeRank++;
+ currentAllNodeIndex = 0;
+ }
+ } // while
+ break;
+ } // switch(cntType)
+ } // if (ioAggrCount)
+
+ else { // default is 1 aggregator per node
+ // take the 0 entries from allNodeRanks
+ int rankListIndex = 0;
+ fd->hints->ranklist = (int *) ADIOI_Malloc(localSize * sizeof(int));
+ for (i = 0; i < localSize; i++) {
+ if (allNodeRanks[i] == 0) {
+ fd->hints->ranklist[rankListIndex++] = i;
+ numAggs++;
+ }
+ }
+ }
+
+ ADIOI_Free(allNodeRanks);
+
+ }
+
+ if (getenv("MP_I_SHOW_AGGRS")) {
+ if (myRank == 0) {
+ printf("Agg rank list of %d generated:\n", numAggs);
+ for (i = 0; i < numAggs; i++) {
+ printf("%d ", fd->hints->ranklist[i]);
+ }
+ printf("\n");
+ }
+ }
+
+ fd->hints->cb_nodes = numAggs;
+
+ return 0;
+}
diff --git a/3rd-party/romio341/adio/ad_gpfs/pe/ad_pe_aggrs.h b/3rd-party/romio341/adio/ad_gpfs/pe/ad_pe_aggrs.h
new file mode 100644
index 0000000000000000000000000000000000000000..38df48e9673b7e558eaac730bac5969770d36e8b
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_gpfs/pe/ad_pe_aggrs.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+/**
+ * \file ad_pe_aggrs.h
+ * \brief ???
+ */
+
+/*
+ *
+ * Declares functions specific for the PE platform within the GPFS
+ * parallel I/O solution. For now simply processes the MP_IOTASKLIST
+ * env var.
+ *
+ */
+
+#ifndef AD_PE_AGGRS_H_INCLUDED
+#define AD_PE_AGGRS_H_INCLUDED
+
+#include "adio.h"
+#include <sys/uio.h> /* assumed */
+
+#if !defined(GPFS_SUPER_MAGIC)
+#define GPFS_SUPER_MAGIC (0x47504653)
+#endif
+
+ /* generate a list of I/O aggregators following a methodology specific for PE */
+int ADIOI_PE_gen_agg_ranklist(ADIO_File fd);
+
+#endif /* AD_PE_AGGRS_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_ime/Makefile.mk b/3rd-party/romio341/adio/ad_ime/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..0d25a9878b4ee8c60d2e82620a0ce9ea866da657
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/Makefile.mk
@@ -0,0 +1,22 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_IME
+
+noinst_HEADERS += adio/ad_ime/ad_ime.h adio/ad_ime/ad_ime_common.h
+
+romio_other_sources += \
+ adio/ad_ime/ad_ime.c \
+ adio/ad_ime/ad_ime_close.c \
+ adio/ad_ime/ad_ime_common.c \
+ adio/ad_ime/ad_ime_delete.c \
+ adio/ad_ime/ad_ime_fcntl.c \
+ adio/ad_ime/ad_ime_flush.c \
+ adio/ad_ime/ad_ime_io.c \
+ adio/ad_ime/ad_ime_open.c \
+ adio/ad_ime/ad_ime_resize.c \
+ adio/ad_ime/ad_ime_features.c
+
+endif BUILD_AD_IME
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime.c b/3rd-party/romio341/adio/ad_ime/ad_ime.c
new file mode 100644
index 0000000000000000000000000000000000000000..bb9f9a4f1385d127e4adf8f40b87a4f529846852
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_ime.h"
+
+/* adioi.h has the ADIOI_Fns_struct define */
+#include "adioi.h"
+
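+/* Dispatch table for the IME driver. The ADIOI_FAKE_* entries implement
+ * the nonblocking operations in terms of their blocking counterparts, and
+ * the ADIOI_GEN_* entries are ROMIO's generic fallbacks; only open, close,
+ * contiguous I/O and a few metadata calls go through IME-native code. */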
+struct ADIOI_Fns_struct ADIO_IME_operations = {
+ ADIOI_IME_Open, /* Open */
+ ADIOI_SCALEABLE_OpenColl, /* OpenColl */ /*XXX*/
+ ADIOI_IME_ReadContig, /* ReadContig */
+ ADIOI_IME_WriteContig, /* WriteContig */
+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+ ADIOI_IME_Fcntl, /* Fcntl */
+ ADIOI_GEN_SetInfo, /* SetInfo */
+ ADIOI_GEN_ReadStrided, /* ReadStrided */
+ ADIOI_GEN_WriteStrided, /* WriteStrided */
+ ADIOI_IME_Close, /* Close */
+ ADIOI_FAKE_IreadContig, /* IreadContig */
+ ADIOI_FAKE_IwriteContig, /* IwriteContig */
+ ADIOI_FAKE_IODone, /* ReadDone */
+ ADIOI_FAKE_IODone, /* WriteDone */
+ ADIOI_FAKE_IOComplete, /* ReadComplete */
+ ADIOI_FAKE_IOComplete, /* WriteComplete */
+ ADIOI_FAKE_IreadStrided, /* IreadStrided */
+ ADIOI_FAKE_IwriteStrided, /* IwriteStrided */
+ ADIOI_IME_Flush, /* Flush */
+ ADIOI_IME_Resize, /* Resize */
+ ADIOI_IME_Delete, /* Delete */
+ ADIOI_IME_Feature,
+ ADIOI_IME_PREFIX,
+#if defined(F_SETLKW64)
+ ADIOI_GEN_SetLock /* SetLock */
+#else
+ ADIOI_GEN_SetLock64 /* SetLock */
+#endif
+};
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime.h b/3rd-party/romio341/adio/ad_ime/ad_ime.h
new file mode 100644
index 0000000000000000000000000000000000000000..8db4d2cb5fa2ced1034b75356286de0852f0fbc6
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_IME_H_INCLUDED
+#define AD_IME_H_INCLUDED
+
+#include "adio.h"
+#ifdef HAVE_IME_NATIVE_H
+#include "ime_native.h"
+#endif
+
+#define ADIOI_IME_PREFIX "ime:"
+#define ADIOI_IME_PREFIX_LEN (sizeof(ADIOI_IME_PREFIX) - 1)
+
+void ADIOI_IME_Open(ADIO_File fd, int *error_code);
+
+void ADIOI_IME_Close(ADIO_File fd, int *error_code);
+
+void ADIOI_IME_ReadContig(ADIO_File fd,
+ void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+
+void ADIOI_IME_WriteContig(ADIO_File fd,
+ const void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+
+void ADIOI_IME_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code);
+
+void ADIOI_IME_Flush(ADIO_File fd, int *error_code);
+
+void ADIOI_IME_Delete(const char *filename, int *error_code);
+
+void ADIOI_IME_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
+
+void ADIOI_IME_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+
+int ADIOI_IME_Feature(ADIO_File fd, int flag);
+#endif /* AD_IME_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_close.c b/3rd-party/romio341/adio/ad_ime/ad_ime_close.c
new file mode 100644
index 0000000000000000000000000000000000000000..2aabde8d21b07af6325d201a64b2a3de2da5d1c6
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_close.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_ime.h"
+#include "ad_ime_common.h"
+#include <assert.h>
+
+void ADIOI_IME_Close(ADIO_File fd, int *error_code)
+{
+ static char myname[] = "ADIOI_IME_CLOSE";
+ int ret;
+ struct ADIOI_IME_fs_s *ime_fs;
+ int tmp_error_code;
+
+ ret = ime_native_close(fd->fd_sys);
+ if (ret != 0) {
+ tmp_error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_UNKNOWN, "Error in ime_native_close", 0);
+ } else {
+ tmp_error_code = MPI_SUCCESS;
+ }
+
+ if (error_code) {
+ *error_code = tmp_error_code;
+ }
+
+ ime_fs = (ADIOI_IME_fs *) fd->fs_ptr;
+ assert(ime_fs);
+ ADIOI_Free(ime_fs->ime_filename);
+ ime_fs->ime_filename = NULL;
+ ADIOI_Free(ime_fs);
+
+ /* reset fds */
+ fd->fd_direct = -1;
+ fd->fd_sys = -1;
+}
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_common.c b/3rd-party/romio341/adio/ad_ime/ad_ime_common.c
new file mode 100644
index 0000000000000000000000000000000000000000..7851b6dfd47a63be45b9402059f2f70a0ceb3e71
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_common.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_ime.h"
+#include "ad_ime_common.h"
+#include <string.h>
+#include <stdlib.h> /* assumed */
+
+/* keyval hack to both tell us if we've already initialized IME and also
+ * close it down when MPI exits */
+int ADIOI_IME_Initialized = MPI_KEYVAL_INVALID;
+
+void ADIOI_IME_End(int *error_code)
+{
+ int ret;
+ static char myname[] = "ADIOI_IME_END";
+
+ ret = ime_native_finalize();
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_FILE, "Error in ime_native_finalize", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ *error_code = MPI_SUCCESS;
+}
+
+int ADIOI_IME_End_call(MPI_Comm comm, int keyval, void *attribute_val, void *extra_state)
+{
+ int error_code;
+ ADIOI_IME_End(&error_code);
+ MPI_Keyval_free(&keyval);
+ return error_code;
+}
+
+void ADIOI_IME_Init(int rank, int *error_code)
+{
+ /* do nothing if we've already fired up the IME interface */
+ if (ADIOI_IME_Initialized != MPI_KEYVAL_INVALID) {
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ ime_native_init();
+
+ *error_code = MPI_SUCCESS;
+
+ MPI_Keyval_create(MPI_NULL_COPY_FN, ADIOI_IME_End_call, &ADIOI_IME_Initialized, (void *) 0);
+ /* just like romio does, we make a dummy attribute so we
+ * get cleaned up */
+ MPI_Attr_put(MPI_COMM_SELF, ADIOI_IME_Initialized, (void *) 0);
+}
+
+/* Return an IME-compatible filename (add 'ime:' prefix).
+ * The new filename must be freed by the caller. */
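+/* Usage sketch (illustrative): ADIOI_IME_Add_prefix("/scratch/out.dat")
+ * returns a freshly allocated "ime:/scratch/out.dat"; callers must release
+ * it with ADIOI_Free(), as ADIOI_IME_Delete() does. */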
+char *ADIOI_IME_Add_prefix(const char *filename)
+{
+ static char myname[] = "ADIOI_IME_ADD_PREFIX";
+ size_t f_len = strlen(filename) + 1;
+ char *ime_filename = ADIOI_Malloc(f_len + ADIOI_IME_PREFIX_LEN);
+
+ if (!ime_filename) {
+
+ MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_FATAL,
+ myname, __LINE__, MPI_ERR_UNKNOWN, "Error allocating memory", 0);
+
+ return NULL;
+ }
+
+ ADIOI_Strncpy(ime_filename, ADIOI_IME_PREFIX, ADIOI_IME_PREFIX_LEN);
+ ADIOI_Strncpy((ime_filename + ADIOI_IME_PREFIX_LEN), filename, f_len);
+ return ime_filename;
+}
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_common.h b/3rd-party/romio341/adio/ad_ime/ad_ime_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..f090009f0c2e5c0b5462786fd4f9fb38de1c14f3
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_common.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_IME_COMMON_H_INCLUDED
+#define AD_IME_COMMON_H_INCLUDED
+#include "ad_ime.h"
+
+struct ADIOI_IME_fs_s {
+ char *ime_filename;
+};
+
+typedef struct ADIOI_IME_fs_s ADIOI_IME_fs;
+
+void ADIOI_IME_Init(int rank, int *error_code);
+void ADIOI_IME_End(int *error_code);
+int ADIOI_IME_End_call(MPI_Comm comm, int keyval, void *attribute_val, void *extra_state);
+
+char *ADIOI_IME_Add_prefix(const char *filename);
+#endif /* AD_IME_COMMON_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_delete.c b/3rd-party/romio341/adio/ad_ime/ad_ime_delete.c
new file mode 100644
index 0000000000000000000000000000000000000000..188456f4fca3ccb21def824f434d5129b5971bbf
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_delete.c
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_ime.h"
+#include "adio.h"
+
+#include "ad_ime_common.h"
+
+void ADIOI_IME_Delete(const char *filename, int *error_code)
+{
+ int ret;
+ static char myname[] = "ADIOI_IME_DELETE";
+
+ char *ime_filename = ADIOI_IME_Add_prefix(filename);
+ ret = ime_native_unlink(ime_filename);
+ ADIOI_Free(ime_filename);
+ if (ret)
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_FILE, "Error in ime_native_unlink", 0);
+ else
+ *error_code = MPI_SUCCESS;
+
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_fcntl.c b/3rd-party/romio341/adio/ad_ime/ad_ime_fcntl.c
new file mode 100644
index 0000000000000000000000000000000000000000..b2b5d2cd202639bd0afdb005bf09877697d34190
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_fcntl.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_ime.h"
+#include "adio_extern.h"
+#include "ad_ime_common.h"
+#include <assert.h>
+
+void ADIOI_IME_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code)
+{
+ int ret;
+ static char myname[] = "ADIOI_IME_FCNTL";
+
+ switch (flag) {
+ case ADIO_FCNTL_GET_FSIZE:
+ {
+ struct stat stbuf;
+
+ stbuf.st_size = 0;
+ struct ADIOI_IME_fs_s *ime_fs = (ADIOI_IME_fs *) fd->fs_ptr;
+ assert(ime_fs);
+ ret = ime_native_stat(ime_fs->ime_filename, &stbuf);
+
+ if (ret) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_FILE, "Error in ime_native_stat", 0);
+ return;
+ }
+
+ fcntl_struct->fsize = stbuf.st_size;
+ *error_code = MPI_SUCCESS;
+ break;
+ }
+ case ADIO_FCNTL_SET_DISKSPACE:
+ ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
+ break;
+
+ case ADIO_FCNTL_SET_ATOMICITY:
+ default:
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_ARG, "**flag", "**flag %d", flag);
+ break;
+ };
+}
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_features.c b/3rd-party/romio341/adio/ad_ime/ad_ime_features.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc26f68fb135bf2faf2ba7f796b29ae8e8f519b3
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_features.c
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "ad_ime.h"
+
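+/* Feature probe for the IME driver: returning 0 for every queried
+ * capability steers callers toward ROMIO's generic fallback paths, since
+ * none of the listed features are advertised as natively supported. */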
+int ADIOI_IME_Feature(ADIO_File fd, int flag)
+{
+ switch (flag) {
+ case ADIO_SCALABLE_OPEN:
+ case ADIO_SHARED_FP:
+ case ADIO_LOCKS:
+ case ADIO_SEQUENTIAL:
+ case ADIO_DATA_SIEVING_WRITES:
+ default:
+ return 0;
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_flush.c b/3rd-party/romio341/adio/ad_ime/ad_ime_flush.c
new file mode 100644
index 0000000000000000000000000000000000000000..3f7d39e290e8b651b9626beb39ddfdbfcf05ec55
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_flush.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_ime.h"
+#include "ad_ime_common.h"
+
+#include <unistd.h> /* assumed */
+
+void ADIOI_IME_Flush(ADIO_File fd, int *error_code)
+{
+ int ret;
+ static char myname[] = "ADIOI_IME_FLUSH";
+
+ if (!error_code) {
+ return;
+ }
+
+ if (!fd) {
+ *error_code = MPI_ERR_FILE;
+ return;
+ }
+
+ ret = ime_native_fsync(fd->fd_sys);
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_FILE, "Error in ime_native_fsync", 0);
+ return;
+ }
+
+ *error_code = MPI_SUCCESS;
+
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_io.c b/3rd-party/romio341/adio/ad_ime/ad_ime_io.c
new file mode 100644
index 0000000000000000000000000000000000000000..052022983cf241b496f241e7e7b539b579bedf00
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_io.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "ad_ime.h"
+
+#include "ad_ime_common.h"
+
+#include <ime_native.h>
+
+#define IME_READ 0
+#define IME_WRITE 1
+
+static void IME_IOContig(ADIO_File fd,
+ void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int io_flag, int *error_code)
+{
+ ssize_t ret;
+ MPI_Count datatype_size;
+ size_t mem_len;
+ uint64_t file_offset = offset;
+ static char myname[] = "ADIOI_IME_IOCONTIG";
+
+ if (count == 0) {
+ ret = 0;
+ goto fn_exit;
+ }
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ mem_len = datatype_size * count;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ file_offset = fd->fp_ind;
+
+ switch (io_flag) {
+ case IME_READ:
+ ret = ime_native_pread(fd->fd_sys, buf, mem_len, file_offset);
+ break;
+ case IME_WRITE:
+ ret = ime_native_pwrite(fd->fd_sys, buf, mem_len, file_offset);
+ break;
+ default:
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO, "Unknown flag", 0);
+ goto exit;
+
+ break;
+ }
+
+ /* the native call failed: report an error and let the application
+ * decide how to fail */
+ if (ret < 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_IO, "Error in ime_native_pread/pwrite", 0);
+ goto exit;
+ }
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind += ret;
+ fd->fp_sys_posn = file_offset + ret;
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status)
+ MPIR_Status_set_bytes(status, datatype, ret);
+#endif
+
+ *error_code = MPI_SUCCESS;
+
+ exit:
+ return;
+}
+
+void ADIOI_IME_ReadContig(ADIO_File fd,
+ void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ IME_IOContig(fd, buf, count, datatype, file_ptr_type, offset, status, IME_READ, error_code);
+}
+
+void ADIOI_IME_WriteContig(ADIO_File fd,
+ const void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ IME_IOContig(fd,
+ (void *) buf,
+ count, datatype, file_ptr_type, offset, status, IME_WRITE, error_code);
+}
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_open.c b/3rd-party/romio341/adio/ad_ime/ad_ime_open.c
new file mode 100644
index 0000000000000000000000000000000000000000..6a9b07c648e86c0c41c6a80948a1bb50d8c70e40
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_open.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_ime.h"
+#include "ad_ime_common.h"
+
+#include <ime_native.h>
+
+void ADIOI_IME_Open(ADIO_File fd, int *error_code)
+{
+ static char myname[] = "ADIOI_IME_OPEN";
+ struct ADIOI_IME_fs_s *ime_fs;
+ int perm;
+ int amode = 0;
+ int ret;
+ int rank = 0;
+ mode_t old_mask;
+
+ /* validate input args: check error_code first, since we cannot
+ * report anything through a NULL pointer */
+ if (!error_code)
+ return;
+ if (!fd) {
+ *error_code = MPI_ERR_FILE;
+ return;
+ }
+
+ /* setup file permissions */
+ if (fd->perm == ADIO_PERM_NULL) {
+ old_mask = umask(022);
+ umask(old_mask);
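+ /* read the process umask without changing it; e.g. a umask of 022
+ * gives perm = 0666 ^ 022 = 0644 (rw-r--r--) */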
+ perm = old_mask ^ 0666;
+ } else
+ perm = fd->perm;
+
+ /* setup the file access mode */
+ if (fd->access_mode & ADIO_CREATE)
+ amode = amode | O_CREAT;
+ if (fd->access_mode & ADIO_RDONLY)
+ amode = amode | O_RDONLY;
+ if (fd->access_mode & ADIO_WRONLY)
+ amode = amode | O_WRONLY;
+ if (fd->access_mode & ADIO_RDWR)
+ amode = amode | O_RDWR;
+ if (fd->access_mode & ADIO_EXCL)
+ amode = amode | O_EXCL;
+
+ /* XXX no O_APPEND support */
+ assert((fd->access_mode & ADIO_APPEND) == 0);
+
+ /* init IME */
+ MPI_Comm_rank(fd->comm, &rank);
+ ADIOI_IME_Init(rank, error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+
+ ime_fs = (ADIOI_IME_fs *) ADIOI_Malloc(sizeof(ADIOI_IME_fs));
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (ime_fs == NULL) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_UNKNOWN, "Error allocating memory", 0);
+ return;
+ }
+
+ ime_fs->ime_filename = ADIOI_IME_Add_prefix(fd->filename);
+
+ /* all processes open the file */
+ ret = ime_native_open(ime_fs->ime_filename, amode, perm);
+ if (ret < 0) {
+ *error_code = MPI_ERR_FILE;
+ ADIOI_Free(ime_fs->ime_filename);
+ ADIOI_Free(ime_fs);
+ return;
+ }
+
+ fd->fd_sys = ret;
+ fd->fd_direct = -1;
+ fd->fs_ptr = ime_fs;
+
+ *error_code = MPI_SUCCESS;
+
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_ime/ad_ime_resize.c b/3rd-party/romio341/adio/ad_ime/ad_ime_resize.c
new file mode 100644
index 0000000000000000000000000000000000000000..abe41875fb0400b4f730354740f70d7edab9f998
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_ime/ad_ime_resize.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_ime.h"
+#include "ad_ime_common.h"
+#include <ime_native.h>
+
+void ADIOI_IME_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
+{
+ int ret;
+ static char myname[] = "ADIOI_IME_RESIZE";
+
+ if (!error_code)
+ return;
+ if (!fd) {
+ *error_code = MPI_ERR_FILE;
+ return;
+ }
+
+ ret = ime_native_ftruncate(fd->fd_sys, size);
+
+ if (ret != 0)
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_FILE, "Error in ime_native_ftruncate", 0);
+ else
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_lustre/Makefile.mk b/3rd-party/romio341/adio/ad_lustre/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..2ce2b58648ecc00961e3a632ec5f46761dad9f70
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/Makefile.mk
@@ -0,0 +1,24 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_LUSTRE
+
+noinst_HEADERS += adio/ad_lustre/ad_lustre.h
+
+romio_other_sources += \
+ adio/ad_lustre/ad_lustre.c \
+ adio/ad_lustre/ad_lustre_open.c \
+ adio/ad_lustre/ad_lustre_rwcontig.c \
+ adio/ad_lustre/ad_lustre_wrcoll.c \
+ adio/ad_lustre/ad_lustre_wrstr.c \
+ adio/ad_lustre/ad_lustre_hints.c \
+ adio/ad_lustre/ad_lustre_aggregate.c
+
+if LUSTRE_LOCKAHEAD
+romio_other_sources += \
+ adio/ad_lustre/ad_lustre_lock.c
+endif LUSTRE_LOCKAHEAD
+
+endif BUILD_AD_LUSTRE
diff --git a/3rd-party/romio341/adio/ad_lustre/README b/3rd-party/romio341/adio/ad_lustre/README
new file mode 100644
index 0000000000000000000000000000000000000000..a217c0f8fe5e39e82a2b6c07c81d573f7e815b18
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/README
@@ -0,0 +1,55 @@
+Coming soon:
+ o Hierarchical striping as described in the paper from CCGrid2007
+ http://ft.ornl.gov/projects/io/pubs/CCGrid-2007-file-joining.pdf
+Further out:
+ o Post the code for ParColl (partitioned collective I/O)
+
+-----------------------------------------------------
+V05:
+-----------------------------------------------------
+Improved data redistribution
+ o Improve I/O pattern identification. Besides checking interleaving,
+ collective I/O will be performed if the request I/O size is small.
+ The hint bigsize can be used to define the request-size threshold
+ (see the example below).
+ o Provide the hint CO for load balancing, to control the number of
+ I/O clients for each OST
+ o Produce the stripe-contiguous I/O pattern that Lustre prefers
+ o Control read-modify-write in data sieving in collective I/O
+ via the hint ds_in_coll.
+ o Reduce extent lock conflicts by making each OST accessed by one or
+ more constant clients.
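+
+Example (a sketch, not part of the original distribution): these hints
+can be set through an MPI_Info object before the file is opened, e.g.
+
+    MPI_File fh;
+    MPI_Info info;
+    MPI_Info_create(&info);
+    MPI_Info_set(info, "romio_lustre_co_ratio", "4");
+    MPI_Info_set(info, "romio_lustre_coll_threshold", "1048576");
+    MPI_Info_set(info, "romio_lustre_ds_in_coll", "disable");
+    MPI_File_open(MPI_COMM_WORLD, "file", MPI_MODE_CREATE | MPI_MODE_WRONLY,
+                  info, &fh);
+
+(hint names follow ad_lustre_hints.c; the values are only illustrative)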
+
+-----------------------------------------------------
+V04:
+-----------------------------------------------------
+ o Direct IO and Lockless IO support
+
+-----------------------------------------------------
+V03:
+-----------------------------------------------------
+ o Correct detection of fs_type when lustre: prefix is not given
+ o Further fix on stripe alignment
+ o Tested/Enabled striping hints over Cray XT (Catamount and CNL)
+
+-----------------------------------------------------
+V02:
+-----------------------------------------------------
+The Lustre ADIO driver has been cleaned up quite a lot. Compared
+to the initial posting, here are the changes:
+ o Removal of dead/redundant code
+ o Removal of asynchronous IO piece as it appears outdated
+ o Bug fixes for setting Lustre Hints
+ o Bug fixes for data sieving
+ o Improved Setsize operation with one process calling ftruncate
+ o Improved collective IO with domain partitioning on
+ Lustre stripe boundary
+
+Contributing:
+ o You may contribute in many different ways, such as
+ testing results, bug reports, and new feature patches.
+ o We appreciate any courtesy reference to this work.
+ o Disclaimer: you are welcome to try the code, but at your own risk.
+
+Contact info:
+ For more info, visit http://ft.ornl.gov/projects/io/
+
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre.c
new file mode 100644
index 0000000000000000000000000000000000000000..b7f5108bbf210d370b55fac8959ff053759ee930
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_lustre.h"
+
+struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
+ ADIOI_LUSTRE_Open, /* Open */
+ ADIOI_GEN_OpenColl, /* OpenColl */
+ ADIOI_LUSTRE_ReadContig, /* ReadContig */
+ ADIOI_LUSTRE_WriteContig, /* WriteContig */
+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+ ADIOI_GEN_Fcntl, /* Fcntl */
+ ADIOI_LUSTRE_SetInfo, /* SetInfo */
+ ADIOI_GEN_ReadStrided, /* ReadStrided */
+ ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
+ ADIOI_GEN_Close, /* Close */
+#if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
+ ADIOI_GEN_IreadContig, /* IreadContig */
+ ADIOI_GEN_IwriteContig, /* IwriteContig */
+#else
+ ADIOI_FAKE_IreadContig, /* IreadContig */
+ ADIOI_FAKE_IwriteContig, /* IwriteContig */
+#endif
+ ADIOI_GEN_IODone, /* ReadDone */
+ ADIOI_GEN_IODone, /* WriteDone */
+ ADIOI_GEN_IOComplete, /* ReadComplete */
+ ADIOI_GEN_IOComplete, /* WriteComplete */
+ ADIOI_GEN_IreadStrided, /* IreadStrided */
+ ADIOI_GEN_IwriteStrided, /* IwriteStrided */
+ ADIOI_GEN_Flush, /* Flush */
+ ADIOI_GEN_Resize, /* Resize */
+ ADIOI_GEN_Delete, /* Delete */
+ ADIOI_GEN_Feature, /* Features */
+ "LUSTRE:",
+ ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+ ADIOI_GEN_IwriteStridedColl, /* IwriteStridedColl */
+#if defined(F_SETLKW64)
+ ADIOI_GEN_SetLock /* SetLock */
+#else
+ ADIOI_GEN_SetLock64 /* SetLock */
+#endif
+};
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre.h b/3rd-party/romio341/adio/ad_lustre/ad_lustre.h
new file mode 100644
index 0000000000000000000000000000000000000000..0190d7628b50948aa0f366ccca0929ea2d7aba71
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_LUSTRE_H_INCLUDED
+#define AD_LUSTRE_H_INCLUDED
+
+/* temp*/
+#define HAVE_ASM_TYPES_H 1
+
+#include "adio.h"
+#define _GNU_SOURCE 1
+#include <unistd.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+
+#include <sys/stat.h>
+#include <unistd.h>
+
+#ifdef __linux__
+#include <sys/ioctl.h> /* necessary for: */
+#include <linux/types.h>
+#ifndef __USE_GNU
+#define __USE_GNU 1 /* O_DIRECT and */
+#include <fcntl.h> /* IO operations */
+#endif
+#undef __USE_GNU
+#endif /* __linux__ */
+
+#include <string.h>
+
+#include "ad_tuning.h"
+
+#ifdef HAVE_LUSTRE_LUSTRE_USER_H
+#include <lustre/lustre_user.h>
+#endif
+#ifdef HAVE_LINUX_LUSTRE_LUSTRE_USER_H
+#include <linux/lustre/lustre_user.h>
+#endif
+
+
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+
+#ifdef HAVE_AIO_LITE_H
+#include <aio-lite.h>
+#else
+#ifdef HAVE_AIO_H
+#include <aio.h>
+#endif
+#ifdef HAVE_SYS_AIO_H
+#include <sys/aio.h>
+#endif
+#endif /* End of HAVE_AIO_LITE_H */
+
+void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
+void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_LUSTRE_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code);
+void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+
+/* the lustre utilities: */
+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
+ ADIO_Offset * len_list, int nprocs);
+
+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int *striping_info, int mode);
+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int contig_access_count,
+ int *striping_info, int nprocs,
+ int *count_my_req_procs_ptr,
+ int **count_my_req_per_proc_ptr,
+ ADIOI_Access ** my_req_ptr, ADIO_Offset *** buf_idx_ptr);
+
+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
+ ADIO_Offset * len, int *striping_info);
+#endif /* AD_LUSTRE_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre_aggregate.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre_aggregate.c
new file mode 100644
index 0000000000000000000000000000000000000000..493abaa0377499c790eafc91aa70db3b669e5684
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre_aggregate.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_lustre.h"
+#include "adio_extern.h"
+
+#undef AGG_DEBUG
+
+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int *striping_info, int mode)
+{
+ /* get striping information:
+ * striping_info[0]: stripe_size
+ * striping_info[1]: stripe_count
+ * striping_info[2]: avail_cb_nodes
+ */
+ int stripe_size, stripe_count, CO = 1;
+ int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes;
+
+ /* Get hints value */
+ /* stripe size */
+ stripe_size = fd->hints->striping_unit;
+ /* stripe count */
+ /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
+ stripe_count = fd->hints->striping_factor;
+
+ /* Calculate the available number of I/O clients */
+ if (!mode) {
+ /* for a collective read,
+ * if "CO" clients access the same OST simultaneously,
+ * the OST disk seek time becomes large. To avoid this,
+ * it is better if one client accesses only one OST,
+ * so we set CO = 1.
+ */
+ CO = 1;
+ /* XXX: there may be better strategies for collective reads */
+ } else {
+ /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
+ CO = fd->hints->fs_hints.lustre.co_ratio;
+ }
+ /* Calculate how many IO clients we need */
+ /* Algorithm courtesy Pascal Deveze (pascal.deveze@bull.net) */
+ /* To avoid extent lock conflicts,
+ * avail_cb_nodes should either
+ * - be a multiple of stripe_count,
+ * - or divide stripe_count exactly
+ * so that each OST is accessed by a maximum of CO constant clients. */
+ if (nprocs_for_coll >= stripe_count)
+ /* avail_cb_nodes should be a multiple of stripe_count and the number
+ * of procs per OST should be limited to the minimum between
+ * nprocs_for_coll/stripe_count and CO
+ *
+ * e.g. if stripe_count=20, nprocs_for_coll=42 and CO=3 then
+ * avail_cb_nodes should be equal to 40 */
+ avail_cb_nodes = stripe_count * MPL_MIN(nprocs_for_coll / stripe_count, CO);
+ else {
+ /* nprocs_for_coll is less than stripe_count */
+ /* avail_cb_nodes should divide stripe_count */
+ /* e.g. if stripe_count=60 and nprocs_for_coll=8 then
+ * avail_cb_nodes should be equal to 6 */
+ /* This could be done with :
+ * while (stripe_count % avail_cb_nodes != 0) avail_cb_nodes--;
+ * but this can be optimized for large values of nprocs_for_coll and
+ * stripe_count */
+ divisor = 2;
+ avail_cb_nodes = 1;
+ /* search for a suitable divisor */
+ while (stripe_count >= divisor * divisor) {
+ if ((stripe_count % divisor) == 0) {
+ if (stripe_count / divisor <= nprocs_for_coll) {
+ /* The value is found ! */
+ avail_cb_nodes = stripe_count / divisor;
+ break;
+ }
+ /* if divisor is less than nprocs_for_coll, divisor is a
+ * candidate solution, but not necessarily the best one */
+ else if (divisor <= nprocs_for_coll)
+ avail_cb_nodes = divisor;
+ }
+ divisor++;
+ }
+ }
+
+ striping_info[0] = stripe_size;
+ striping_info[1] = stripe_count;
+ striping_info[2] = avail_cb_nodes;
+}
+
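+/* Worked example (illustrative): with stripe_size = 1 MiB and
+ * avail_cb_nodes = 4, an access at offset 5 MiB falls in stripe 5 and
+ * 5 % 4 = 1, so the aggregator at index 1 handles it; avail_bytes =
+ * 6 MiB - 5 MiB = 1 MiB, so a longer request is truncated at the
+ * stripe boundary and the remainder goes to the next aggregator. */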
+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
+ ADIO_Offset * len, int *striping_info)
+{
+ int rank_index, rank;
+ ADIO_Offset avail_bytes;
+ int stripe_size = striping_info[0];
+ int avail_cb_nodes = striping_info[2];
+
+ /* Produce the stripe-contiguous pattern for Lustre */
+ rank_index = (int) ((off / stripe_size) % avail_cb_nodes);
+
+ /* we index into fd_end with rank_index, and fd_end was allocated to be no
+ * bigger than fd->hints->cb_nodes. If we ever violate that, we're
+ * overrunning arrays. Obviously, we should never ever hit this abort
+ */
+ if (rank_index >= fd->hints->cb_nodes)
+ MPI_Abort(MPI_COMM_WORLD, 1);
+
+ avail_bytes = (off / (ADIO_Offset) stripe_size + 1) * (ADIO_Offset) stripe_size - off;
+ if (avail_bytes < *len) {
+ /* this proc only has part of the requested contig. region */
+ *len = avail_bytes;
+ }
+ /* map our index to a rank */
+ /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
+ rank = fd->hints->ranklist[rank_index];
+
+ return rank;
+}
+
+/* ADIOI_LUSTRE_Calc_my_req() - calculate what portions of the access requests
+ * of this process are located in the file domains of various processes
+ * (including this one)
+ */
+
+
+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int contig_access_count,
+ int *striping_info, int nprocs,
+ int *count_my_req_procs_ptr,
+ int **count_my_req_per_proc_ptr,
+ ADIOI_Access ** my_req_ptr, ADIO_Offset *** buf_idx_ptr)
+{
+ /* Nothing different from ADIOI_Calc_my_req(), except calling
+ * ADIOI_Lustre_Calc_aggregator() instead of the old one */
+ int *count_my_req_per_proc, count_my_req_procs;
+ int i, l, proc;
+ size_t memLen;
+ ADIO_Offset avail_len, rem_len, curr_idx, off, **buf_idx, *ptr;
+ ADIOI_Access *my_req;
+
+ *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ count_my_req_per_proc = *count_my_req_per_proc_ptr;
+ /* count_my_req_per_proc[i] gives the no. of contig. requests of this
+ * process in process i's file domain. calloc initializes to zero.
+ * I'm allocating memory of size nprocs, so that I can do an
+ * MPI_Alltoall later on.
+ */
+
+ /* one pass just to calculate how much space to allocate for my_req;
+ * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
+ */
+ for (i = 0; i < contig_access_count; i++) {
+ /* short circuit offset/len processing if len == 0
+ * (zero-byte read/write
+ */
+ if (len_list[i] == 0)
+ continue;
+ off = offset_list[i];
+ avail_len = len_list[i];
+ /* note: we set avail_len to be the total size of the access.
+ * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
+ * the amount that was available.
+ */
+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
+ count_my_req_per_proc[proc]++;
+
+ /* figure out how much data remains in this access;
+ * we'll take care of it (if there is any)
+ * in the while loop below.
+ */
+ rem_len = len_list[i] - avail_len;
+
+ while (rem_len != 0) {
+ off += avail_len; /* point to first remaining byte */
+ avail_len = rem_len; /* save remaining size, pass to calc */
+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
+ count_my_req_per_proc[proc]++;
+ rem_len -= avail_len; /* reduce remaining length by amount from fd */
+ }
+ }
+
+ /* buf_idx is relevant only if buftype_is_contig.
+ * buf_idx[i] gives the index into user_buf where data received
+ * from proc 'i' should be placed. This allows receives to be done
+ * without an extra buffer. This can't be done if buftype is not contig.
+ */
+
+ memLen = 0;
+ for (i = 0; i < nprocs; i++)
+ memLen += count_my_req_per_proc[i];
+ ptr = (ADIO_Offset *) ADIOI_Malloc((memLen * 3 + nprocs) * sizeof(ADIO_Offset));
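+ /* this single allocation backs three things: the first memLen + nprocs
+ * entries hold the per-process buf_idx vectors (count_my_req_per_proc[i]
+ * + 1 entries each), and the remaining 2 * memLen entries hold each
+ * process's offsets and lens arrays */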
+
+ /* initialize buf_idx vectors */
+ buf_idx = (ADIO_Offset **) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset *));
+ buf_idx[0] = ptr;
+ for (i = 1; i < nprocs; i++)
+ buf_idx[i] = buf_idx[i - 1] + count_my_req_per_proc[i - 1] + 1;
+ ptr += memLen + nprocs; /* "+ nprocs" puts a terminal index at the end */
+
+ /* now allocate space for my_req, offset, and len */
+ *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
+ my_req = *my_req_ptr;
+ my_req[0].offsets = ptr;
+
+ count_my_req_procs = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (count_my_req_per_proc[i]) {
+ my_req[i].offsets = ptr;
+ ptr += count_my_req_per_proc[i];
+ my_req[i].lens = ptr;
+ ptr += count_my_req_per_proc[i];
+ count_my_req_procs++;
+ }
+ my_req[i].count = 0; /* will be incremented where needed later */
+ }
+
+ /* now fill in my_req */
+ curr_idx = 0;
+ for (i = 0; i < contig_access_count; i++) {
+ /* short circuit offset/len processing if len == 0
+ * (zero-byte read/write */
+ if (len_list[i] == 0)
+ continue;
+ off = offset_list[i];
+ avail_len = len_list[i];
+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
+
+ l = my_req[proc].count;
+
+ ADIOI_Assert(l < count_my_req_per_proc[proc]);
+ buf_idx[proc][l] = curr_idx;
+ curr_idx += avail_len;
+
+ rem_len = len_list[i] - avail_len;
+
+ /* store the proc, offset, and len information in an array
+ * of structures, my_req. Each structure contains the
+ * offsets and lengths located in that process's FD,
+ * and the associated count.
+ */
+ my_req[proc].offsets[l] = off;
+ ADIOI_Assert(avail_len == (int) avail_len);
+ my_req[proc].lens[l] = (int) avail_len;
+ my_req[proc].count++;
+
+ while (rem_len != 0) {
+ off += avail_len;
+ avail_len = rem_len;
+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
+
+ l = my_req[proc].count;
+ ADIOI_Assert(l < count_my_req_per_proc[proc]);
+ buf_idx[proc][l] = curr_idx;
+
+ curr_idx += avail_len;
+ rem_len -= avail_len;
+
+ my_req[proc].offsets[l] = off;
+ ADIOI_Assert(avail_len == (int) avail_len);
+ my_req[proc].lens[l] = (int) avail_len;
+ my_req[proc].count++;
+ }
+ }
+
+#ifdef AGG_DEBUG
+ for (i = 0; i < nprocs; i++) {
+ if (count_my_req_per_proc[i] > 0) {
+ FPRINTF(stdout, "data needed from %d (count = %d):\n", i, my_req[i].count);
+ for (l = 0; l < my_req[i].count; l++) {
+ FPRINTF(stdout, " off[%d] = %lld, len[%d] = %lld\n",
+ l, (long long) my_req[i].offsets[l], l, (long long) my_req[i].lens[l]);
+ }
+ }
+ }
+#endif
+
+ *count_my_req_procs_ptr = count_my_req_procs;
+ *buf_idx_ptr = buf_idx;
+}
+
+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
+ ADIO_Offset * len_list, int nprocs)
+{
+ /* If the processes are non-interleaved, we will check the req_size.
+ * if (avg_req_size > big_req_size) {
+ * docollect = 0;
+ * }
+ */
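+ /* e.g. (illustrative): with 4 processes each writing one 8 MiB block and
+ * romio_lustre_coll_threshold = 1 MiB, avg_req_size = 8 MiB exceeds the
+ * threshold, so independent I/O is used instead of collective I/O */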
+
+ int i, docollect = 1, big_req_size = 0;
+ ADIO_Offset req_size = 0, total_req_size;
+ int avg_req_size, total_access_count;
+
+ /* calculate total_req_size and total_access_count */
+ for (i = 0; i < contig_access_count; i++)
+ req_size += len_list[i];
+ MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM, fd->comm);
+ MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM, fd->comm);
+ /* avoid possible divide-by-zero */
+ if (total_access_count != 0) {
+ /* estimate average req_size */
+ avg_req_size = (int) (total_req_size / total_access_count);
+ } else {
+ avg_req_size = 0;
+ }
+ /* get hint of big_req_size */
+ big_req_size = fd->hints->fs_hints.lustre.coll_threshold;
+ /* Don't perform collective I/O if there are big requests */
+ if ((big_req_size > 0) && (avg_req_size > big_req_size))
+ docollect = 0;
+
+ return docollect;
+}
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre_fcntl.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre_fcntl.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d9f4026214aaf4d3b52b7676ab597a744f57ba0
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre_fcntl.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_lustre.h"
+#include "adio_extern.h"
+
+void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code)
+{
+ int i, ntimes;
+ ADIO_Offset curr_fsize, alloc_size, size, len, done;
+ ADIO_Status status;
+ char *buf;
+#if defined(MPICH) || !defined(PRINT_ERR_MSG)
+ static char myname[] = "ADIOI_LUSTRE_FCNTL";
+#endif
+
+ switch (flag) {
+ case ADIO_FCNTL_GET_FSIZE:
+ fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
+ if (fd->fp_sys_posn != -1)
+ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
+ if (fcntl_struct->fsize == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname, __LINE__,
+ MPI_ERR_IO, "**io", "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+ break;
+
+ case ADIO_FCNTL_SET_DISKSPACE:
+ /* will be called by one process only */
+ /* On file systems with no preallocation function, I have to
+ * explicitly write
+ * to allocate space. Since there could be holes in the file,
+ * I need to read up to the current file size, write it back,
+ * and then write beyond that depending on how much
+ * preallocation is needed.
+ * read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */
+
+ curr_fsize = lseek(fd->fd_sys, 0, SEEK_END);
+ alloc_size = fcntl_struct->diskspace;
+
+ size = MPL_MIN(curr_fsize, alloc_size);
+
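+ /* ceiling division: the number of ADIOI_PREALLOC_BUFSZ-sized chunks
+ * needed to cover "size" bytes */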
+ ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1) / ADIOI_PREALLOC_BUFSZ;
+ buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ);
+ done = 0;
+
+ for (i = 0; i < ntimes; i++) {
+ len = MPL_MIN(size - done, ADIOI_PREALLOC_BUFSZ);
+ ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, done,
+ &status, error_code);
+ if (*error_code != MPI_SUCCESS) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname, __LINE__,
+ MPI_ERR_IO, "**io", "**io %s",
+ strerror(errno));
+ return;
+ }
+ ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
+ done, &status, error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+ done += len;
+ }
+
+ if (alloc_size > curr_fsize) {
+ memset(buf, 0, ADIOI_PREALLOC_BUFSZ);
+ size = alloc_size - curr_fsize;
+ ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1) / ADIOI_PREALLOC_BUFSZ;
+ for (i = 0; i < ntimes; i++) {
+ len = MPL_MIN(alloc_size - done, ADIOI_PREALLOC_BUFSZ);
+ ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
+ done, &status, error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+ done += len;
+ }
+ }
+ ADIOI_Free(buf);
+ if (fd->fp_sys_posn != -1)
+ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
+ *error_code = MPI_SUCCESS;
+ break;
+
+ case ADIO_FCNTL_SET_ATOMICITY:
+ fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
+ *error_code = MPI_SUCCESS;
+ break;
+
+ default:
+ FPRINTF(stderr, "Unknown flag passed to ADIOI_LUSTRE_Fcntl\n");
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre_hints.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre_hints.c
new file mode 100644
index 0000000000000000000000000000000000000000..152ba31f6f93a50152170e35aee7a74fe7417cc1
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre_hints.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_lustre.h"
+#include "adio_extern.h"
+#include "hint_fns.h"
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
+
+void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
+{
+ char *value;
+ int flag;
+ ADIO_Offset stripe_val[3], str_factor = -1, str_unit = 0, start_iodev = -1;
+ int myrank;
+ static char myname[] = "ADIOI_LUSTRE_SETINFO";
+
+
+#ifdef HAVE_LUSTRE_LOCKAHEAD
+ /* Set lock ahead default hints */
+ fd->hints->fs_hints.lustre.lock_ahead_read = 0;
+ fd->hints->fs_hints.lustre.lock_ahead_write = 0;
+ fd->hints->fs_hints.lustre.lock_ahead_num_extents = 500;
+ fd->hints->fs_hints.lustre.lock_ahead_flags = 0;
+#endif
+
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ if ((fd->info) == MPI_INFO_NULL) {
+ /* This must be part of the open call; we can set striping parameters
+ * if necessary. */
+ MPI_Info_create(&(fd->info));
+
+ ADIOI_Info_set(fd->info, "direct_read", "false");
+ ADIOI_Info_set(fd->info, "direct_write", "false");
+ fd->direct_read = fd->direct_write = 0;
+ /* initialize lustre hints */
+ ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
+ fd->hints->fs_hints.lustre.co_ratio = 1;
+ ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
+ fd->hints->fs_hints.lustre.coll_threshold = 0;
+ ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
+ fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;
+
+ /* has the user specified striping or server-buffering parameters,
+ * and do they have the same value on all processes? */
+ if (users_info != MPI_INFO_NULL) {
+ /* striping information */
+ ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ ADIOI_Info_set(fd->info, "striping_unit", value);
+ str_unit = atoll(value);
+ }
+
+ ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ ADIOI_Info_set(fd->info, "striping_factor", value);
+ str_factor = atoll(value);
+ }
+
+ ADIOI_Info_get(users_info, "romio_lustre_start_iodevice",
+ MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
+ start_iodev = atoll(value);
+ }
+
+
+ /* direct read and write */
+ ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
+ ADIOI_Info_set(fd->info, "direct_read", "true");
+ fd->direct_read = 1;
+ }
+ ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
+ ADIOI_Info_set(fd->info, "direct_write", "true");
+ fd->direct_write = 1;
+ }
+#ifdef HAVE_LUSTRE_LOCKAHEAD
+ /* Get lock ahead hints */
+
+ ADIOI_Info_check_and_install_int(fd, users_info,
+ "romio_lustre_cb_lock_ahead_write",
+ &(fd->hints->fs_hints.lustre.lock_ahead_write),
+ myname, error_code);
+ ADIOI_Info_check_and_install_int(fd, users_info,
+ "romio_lustre_cb_lock_ahead_read",
+ &(fd->hints->fs_hints.lustre.lock_ahead_read),
+ myname, error_code);
+
+ /* If, and only if, we're using lock ahead,
+ * process/set the number of extents to pre-lock and the flags */
+ if (fd->hints->fs_hints.lustre.lock_ahead_read ||
+ fd->hints->fs_hints.lustre.lock_ahead_write) {
+ /* Get user's number of extents */
+ ADIOI_Info_check_and_install_int(fd, users_info,
+ "romio_lustre_cb_lock_ahead_num_extents",
+ &(fd->hints->fs_hints.
+ lustre.lock_ahead_num_extents), myname,
+ error_code);
+
+ /* ADIOI_Info_check_and_install_int doesn't set the
+ * value in fd unless it was in user_info, but knowing
+ * the value - default or explicit - is useful.
+ * Set the final number of extents in the fd->info */
+ MPL_snprintf(value, MPI_MAX_INFO_VAL + 1, "%d",
+ fd->hints->fs_hints.lustre.lock_ahead_num_extents);
+ ADIOI_Info_set(fd->info, "romio_lustre_cb_lock_ahead_num_extents", value);
+
+ /* Get user's flags */
+ ADIOI_Info_check_and_install_int(fd, users_info,
+ "romio_lustre_cb_lock_ahead_flags",
+ &(fd->hints->fs_hints.lustre.lock_ahead_flags),
+ myname, error_code);
+ }
+#endif
+ }
+
+
+
+ /* set striping information with ioctl */
+ MPI_Comm_rank(fd->comm, &myrank);
+ if (myrank == 0) {
+ stripe_val[0] = str_factor;
+ stripe_val[1] = str_unit;
+ stripe_val[2] = start_iodev;
+ }
+ MPI_Bcast(stripe_val, 3, MPI_OFFSET, 0, fd->comm);
+
+ /* do not open the file during hint processing; open it in the open
+ * routines, where we can better deal with the EXCL flag. Continue to
+ * check that the "all processes set the same value" condition holds. */
+ if (stripe_val[0] != str_factor
+ || stripe_val[1] != str_unit || stripe_val[2] != start_iodev) {
+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME("ADIOI_LUSTRE_SetInfo",
+ "str_factor or str_unit or start_iodev", error_code);
+ ADIOI_Free(value);
+ return;
+ }
+ }
+
+ /* get other hints */
+ if (users_info != MPI_INFO_NULL) {
+ /* CO: IO Clients/OST,
+ * to keep the load balancing between clients and OSTs */
+ ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_co_ratio",
+ &(fd->hints->fs_hints.lustre.co_ratio), myname,
+ error_code);
+
+ /* coll_threshold:
+ * if the req size is bigger than this, collective IO may not be performed.
+ */
+ ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_coll_threshold",
+ &(fd->hints->fs_hints.lustre.coll_threshold), myname,
+ error_code);
+
+ /* ds_in_coll: disable data sieving in collective IO */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_lustre_ds_in_coll",
+ &(fd->hints->fs_hints.lustre.ds_in_coll), myname,
+ error_code);
+
+ }
+ /* set the values for collective I/O and data sieving parameters */
+ ADIOI_GEN_SetInfo(fd, users_info, error_code);
+
+ /* generic hints might step on striping_unit */
+ if (users_info != MPI_INFO_NULL) {
+ ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit", NULL, myname, error_code);
+ }
+
+ if (ADIOI_Direct_read)
+ fd->direct_read = 1;
+ if (ADIOI_Direct_write)
+ fd->direct_write = 1;
+
+ ADIOI_Free(value);
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre_lock.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre_lock.c
new file mode 100644
index 0000000000000000000000000000000000000000..eb4e6307888bc49fc14044a89b0c02dc91ad3a9f
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre_lock.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+
+#include "ad_lustre.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include <sys/ioctl.h>
+
+/* If necessary (older lustre client headers), define the new
+ locking structures. */
+
+//#define LOCK_AHEAD_DEBUG
+
+#ifndef LL_IOC_LADVISE
+#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise)
+
+enum lu_ladvise_type {
+ LU_LADVISE_INVALID = 0,
+ LU_LADVISE_WILLREAD = 1,
+ LU_LADVISE_DONTNEED = 2,
+ LU_LADVISE_LOCKNOEXPAND = 3,
+ LU_LADVISE_LOCKAHEAD = 4,
+ LU_LADVISE_MAX
+};
+
+#define LU_LADVISE_NAMES { \
+ [LU_LADVISE_WILLREAD] = "willread", \
+ [LU_LADVISE_DONTNEED] = "dontneed", \
+ [LU_LADVISE_LOCKNOEXPAND] = "locknoexpand", \
+ [LU_LADVISE_LOCKAHEAD] = "lockahead", \
+}
+
+/* This is the userspace argument for ladvise. It is currently the same as
+ * what goes on the wire (struct lu_ladvise), but is defined separately as we
+ * may need info which is only used locally. */
+struct llapi_lu_ladvise {
+ __u16 lla_advice; /* advice type */
+ __u16 lla_value1; /* values for different advice types */
+ __u32 lla_value2;
+ __u64 lla_start; /* first byte of extent for advice */
+ __u64 lla_end; /* last byte of extent for advice */
+ __u32 lla_value3;
+ __u32 lla_value4;
+};
+enum ladvise_flag {
+ LF_ASYNC = 0x00000001,
+ LF_UNSET = 0x00000002,
+ /* For lock requests */
+ LF_NONBLOCK = 0x00000003,
+};
+
+#define LADVISE_MAGIC 0x1ADF1CE0
+/* Masks of valid flags for each advice */
+#define LF_LOCKNOEXPAND_MASK LF_UNSET
+#define LF_LOCKAHEAD_MASK LF_NONBLOCK
+/* Flags valid for all advices not explicitly specified */
+#define LF_DEFAULT_MASK LF_ASYNC
+/* All flags */
+#define LF_MASK (LF_ASYNC | LF_UNSET | LF_NONBLOCK)
+
+#define lla_lockahead_mode lla_value1
+#define lla_peradvice_flags lla_value2
+#define lla_lockahead_result lla_value3
+
+/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which
+ * is used on the wire. It is defined separately as we may need info which is
+ * only used locally. */
+struct llapi_ladvise_hdr {
+ __u32 lah_magic; /* LADVISE_MAGIC */
+ __u32 lah_count; /* number of advices */
+ __u64 lah_flags; /* from enum ladvise_flag */
+ __u32 lah_value1; /* unused */
+ __u32 lah_value2; /* unused */
+ __u64 lah_value3; /* unused */
+ struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */
+};
+
+#define LAH_COUNT_MAX (1024)
+
+enum lock_mode_user {
+ MODE_READ_USER = 1,
+ MODE_WRITE_USER,
+ MODE_MAX_USER,
+};
+
+#define LOCK_MODE_NAMES { \
+ [MODE_READ_USER] = "READ",\
+ [MODE_WRITE_USER] = "WRITE"\
+}
+
+enum lockahead_results {
+ LLA_RESULT_SENT = 0,
+ LLA_RESULT_DIFFERENT,
+ LLA_RESULT_SAME,
+};
+#endif
+
+
+int llapi_ladvise_lock(ADIO_File fd, unsigned long long flags, int num_advise,
+ ADIO_Offset * offset, int stripe_size, int num_extents,
+ ADIO_Offset step_size)
+{
+ struct llapi_ladvise_hdr *ladvise_hdr;
+ int rc;
+ int i;
+ enum lock_mode_user mode = 0;
+
+ if (num_advise < 1 || num_advise >= LAH_COUNT_MAX) {
+ errno = EINVAL;
+ /*llapi_error(LLAPI_MSG_ERROR, -EINVAL,
+ * "bad advice number %d", num_advise); */
+ return -1;
+ }
+
+ ladvise_hdr =
+ ADIOI_Malloc(sizeof(struct llapi_ladvise_hdr) +
+ sizeof(struct llapi_lu_ladvise) * num_advise);
+
+ if (ladvise_hdr == NULL) {
+ errno = ENOMEM;
+ //llapi_error(LLAPI_MSG_ERROR, -ENOMEM, "not enough memory");
+ return -1;
+ }
+ ladvise_hdr->lah_magic = LADVISE_MAGIC;
+ ladvise_hdr->lah_count = num_advise;
+ ladvise_hdr->lah_flags = flags & LF_MASK;
+ ladvise_hdr->lah_value1 = 0;
+ ladvise_hdr->lah_value2 = 0;
+ ladvise_hdr->lah_value3 = 0;
+
+ if (fd->hints->fs_hints.lustre.lock_ahead_write)
+ mode = MODE_WRITE_USER;
+ else if (fd->hints->fs_hints.lustre.lock_ahead_read) /* read only */
+ mode = MODE_READ_USER;
+ else
+ MPI_Abort(MPI_COMM_WORLD, 1);
+
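+ /* request one extent lock per stripe this aggregator owns; e.g.
+ * (illustrative) with stripe_size = 1 MiB, step_size = 4 MiB and
+ * num_extents = 3, this asks for [off, off+1MiB), [off+4MiB, off+5MiB)
+ * and [off+8MiB, off+9MiB) */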
+ for (i = 0; i < num_extents; ++i) {
+ ladvise_hdr->lah_advise[i].lla_advice = LU_LADVISE_LOCKAHEAD;
+ ladvise_hdr->lah_advise[i].lla_lockahead_mode = mode;
+ ladvise_hdr->lah_advise[i].lla_peradvice_flags = flags | LF_ASYNC;
+ ladvise_hdr->lah_advise[i].lla_start = *offset;
+ ladvise_hdr->lah_advise[i].lla_end = *offset + stripe_size - 1;
+ ladvise_hdr->lah_advise[i].lla_value3 = 0;
+ ladvise_hdr->lah_advise[i].lla_value4 = 0;
+ ladvise_hdr->lah_advise[i].lla_lockahead_result = 0;
+ *offset += step_size;
+ }
+
+
+ rc = ioctl(fd->fd_sys, LL_IOC_LADVISE, ladvise_hdr);
+
+ if (rc < 0) {
+ ADIOI_Free(ladvise_hdr);
+ //llapi_error(LLAPI_MSG_ERROR, -errno, "cannot give advice");
+ return -1;
+ }
+
+
+ /* Simply save the new start/end extents, forget what we already had locked
+ * since lustre may reclaim it at any time. */
+ fd->hints->fs_hints.lustre.lock_ahead_start_extent = ladvise_hdr->lah_advise[0].lla_start;
+ fd->hints->fs_hints.lustre.lock_ahead_end_extent =
+ ladvise_hdr->lah_advise[num_extents - 1].lla_end;
+
+
+#ifdef LOCK_AHEAD_DEBUG
+ /* Print any per extent errors */
+ for (i = 0; i < num_extents; ++i) {
+ if (ladvise_hdr->lah_advise[i].lla_lockahead_result) {
+ fprintf(stderr, "%s(%d) "
+ "lock ahead extent[%4.4d] {%ld,%ld} stripe {%ld,%ld} error %d\n",
+ __func__, __LINE__,
+ i,
+ (long int) ladvise_hdr->lah_advise[i].lla_start,
+ (long int) ladvise_hdr->lah_advise[i].lla_end,
+ (long int) ladvise_hdr->lah_advise[i].lla_start / stripe_size,
+ (long int) ladvise_hdr->lah_advise[i].lla_end / stripe_size,
+ ladvise_hdr->lah_advise[i].lla_lockahead_result);
+ }
+ }
+
+#endif
+ ADIOI_Free(ladvise_hdr);
+
+ return 0;
+}
+
+/* Set lustre locks to only lock the requested byte range, do not
+ extend any locks to 'infinity' which is the normal behavior.
+ This will enhance 'lock ahead' extent locking, which we do not
+ want to auto-extend. */
+int ADIOI_LUSTRE_request_only_lock_ioctl(ADIO_File fd)
+{
+ int err = 0;
+
+ struct llapi_ladvise_hdr *noexpand_hdr;
+ noexpand_hdr = ADIOI_Malloc(sizeof(struct llapi_ladvise_hdr) + sizeof(struct llapi_lu_ladvise));
+ if (!noexpand_hdr) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ noexpand_hdr->lah_magic = LADVISE_MAGIC;
+ noexpand_hdr->lah_count = 1;
+ noexpand_hdr->lah_flags = 0;
+ noexpand_hdr->lah_value1 = 0;
+ noexpand_hdr->lah_value2 = 0;
+ noexpand_hdr->lah_value3 = 0;
+ noexpand_hdr->lah_advise[0].lla_advice = LU_LADVISE_LOCKNOEXPAND;
+ noexpand_hdr->lah_advise[0].lla_peradvice_flags = 0;
+
+ noexpand_hdr->lah_advise[0].lla_value1 = 0;
+ noexpand_hdr->lah_advise[0].lla_start = 0;
+ noexpand_hdr->lah_advise[0].lla_end = 0;
+ noexpand_hdr->lah_advise[0].lla_value3 = 0;
+ noexpand_hdr->lah_advise[0].lla_value4 = 0;
+
+ int rc = ioctl(fd->fd_sys, LL_IOC_LADVISE, noexpand_hdr);
+ if (rc < 0) {
+ ADIOI_Free(noexpand_hdr);
+ //llapi_error(LLAPI_MSG_ERROR, -errno, "cannot give advice");
+ return -1;
+ }
+
+ ADIOI_Free(noexpand_hdr);
+
+
+ out:
+
+ return err;
+}
+
+/* Use group locks to 'clear' existing locks on the file
+ before attempting 'lock ahead' extent locking. */
+int ADIOI_LUSTRE_clear_locks(ADIO_File fd)
+{
+ int err = 0;
+ int id;
+
+ if (!fd->my_cb_nodes_index) {
+ srand(time(NULL));
+ id = rand();
+ err = ioctl(fd->fd_sys, LL_IOC_GROUP_LOCK, id);
+ err = ioctl(fd->fd_sys, LL_IOC_GROUP_UNLOCK, id);
+ }
+ return err;
+}
+
+/* Lock a predefined series of 'extents' in the file.
+ The intent is to match the aggregator locking pattern. */
+void ADIOI_LUSTRE_lock_ahead_ioctl(ADIO_File fd, int avail_cb_nodes, ADIO_Offset next_offset,
+ int *error_code)
+{
+
+ int err = 0;
+ int num_extents = fd->hints->fs_hints.lustre.lock_ahead_num_extents;
+ int flags = fd->hints->fs_hints.lustre.lock_ahead_flags;
+ ADIO_Offset offset = 0, step_size = 0;
+ int stripe_size = fd->hints->striping_unit;
+
+ int agg_idx = fd->my_cb_nodes_index;
+
+ /* Not a collective aggregator? Do nothing and return
+ * since current code is based on aggregator/stripes */
+ if (agg_idx < 0) {
+ /* Disable further lock ahead ...
+ * fd->hints->fs_hints.lustre.lock_ahead_read = 0;
+ * fd->hints->fs_hints.lustre.lock_ahead_write = 0;
+ * fd->hints->fs_hints.lustre.lock_ahead_start_extent = 0;
+ * fd->hints->fs_hints.lustre.lock_ahead_end_extent = INT64_MAX;
+ */
+ return;
+ }
+#ifdef LOCK_AHEAD_DEBUG
+ {
+ /* Debug check. Calculate the expected rank for this stripe */
+ int rank_index;
+ rank_index = (int) ((next_offset / stripe_size) % avail_cb_nodes);
+ /* Not sure why, but this happens in the generic read coll;
+ * it doesn't stripe the aggregation quite as expected.
+ * We'll probably lock the wrong stripes for this read ...
+ * but we're more interested in write locks than read locks
+ * so stick with the lustre specific calculations for now.
+ * Consider dropping read support if performance isn't improved
+ * or ad_lustre doesn't add read coll code.
+ */
+ if (agg_idx != rank_index) {
+ fprintf(stderr, "%s(%d) rank[%d] file system %d "
+ "lock ahead debug R(%d)/W(%d), "
+ "aggregator %d(%d)/%d(%d), "
+ "offset %lld, start offset %lld, stripe %lld "
+ "num_extents %d\n",
+ __func__, __LINE__,
+ fd->hints->ranklist[agg_idx],
+ fd->file_system,
+ fd->hints->fs_hints.lustre.lock_ahead_read,
+ fd->hints->fs_hints.lustre.lock_ahead_write,
+ agg_idx, rank_index,
+ avail_cb_nodes, fd->hints->cb_nodes,
+ (long long) next_offset, (long long) (next_offset / stripe_size * stripe_size),
+ (long long) next_offset / stripe_size, num_extents);
+ }
+ /* Just checking the config vs what was passed in */
+ if (agg_idx >= avail_cb_nodes) {
+ fprintf(stderr, "%s(%d) file system %d "
+ "lock ahead debug R(%d)/W(%d), "
+ "aggregator %d(%d)/%d(%d), "
+ "num_extents %d\n",
+ __func__, __LINE__, fd->file_system,
+ fd->hints->fs_hints.lustre.lock_ahead_read,
+ fd->hints->fs_hints.lustre.lock_ahead_write,
+ agg_idx, rank_index, avail_cb_nodes, fd->hints->cb_nodes, num_extents);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ }
+#endif
+
+ /* Check file access vs requested lock ahead */
+ if (fd->access_mode & ADIO_RDONLY) {
+ /* Don't need write lock ahead */
+ fd->hints->fs_hints.lustre.lock_ahead_write = 0;
+
+ /* Do need read lock ahead or give up. */
+ if (!(fd->hints->fs_hints.lustre.lock_ahead_read)) {
+ fd->hints->fs_hints.lustre.lock_ahead_start_extent = 0;
+ fd->hints->fs_hints.lustre.lock_ahead_end_extent = INT64_MAX;
+ return;
+ }
+ }
+ if (fd->access_mode & ADIO_WRONLY) {
+ /* Don't need read lock ahead */
+ fd->hints->fs_hints.lustre.lock_ahead_read = 0;
+
+ /* Do need write lock ahead or give up. */
+ if (!(fd->hints->fs_hints.lustre.lock_ahead_write)) {
+ fd->hints->fs_hints.lustre.lock_ahead_start_extent = 0;
+ fd->hints->fs_hints.lustre.lock_ahead_end_extent = INT64_MAX;
+ return;
+ }
+ }
+
+
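+ /* each aggregator locks every avail_cb_nodes-th stripe; e.g.
+ * (illustrative) with 1 MiB stripes and 4 aggregators, aggregator 2
+ * starts at offset 2 MiB and locks stripes 2, 6, 10, ... */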
+ step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
+
+ if (next_offset == 0) { /* 1st call, calculate our starting offset */
+ offset = (ADIO_Offset) agg_idx * stripe_size;
+ } else /* Have to assume we're writing to one of our stripes */
+ offset = next_offset / stripe_size * stripe_size; /* start of stripe */
+
+ err = llapi_ladvise_lock(fd, flags, num_extents, &offset, stripe_size, num_extents, step_size);
+
+
+ if (err == -1) { /* turn off lock ahead after a failure */
+#ifdef LOCK_AHEAD_DEBUG
+ fprintf(stderr, "%s(%d) file system %d "
+ "lock ahead failure R(%d)/W(%d), "
+ "aggregator %d/%d, "
+ "next offset %lld, stripe %lld, "
+ "last offset %lld, stripe %lld, "
+ "step %lld, stripe size %lld "
+ "num_extents %d\n",
+ __func__, __LINE__, fd->file_system,
+ fd->hints->fs_hints.lustre.lock_ahead_read,
+ fd->hints->fs_hints.lustre.lock_ahead_write,
+ agg_idx,
+ avail_cb_nodes,
+ (long long) next_offset, (long long) next_offset / stripe_size,
+ (long long) offset, (long long) offset / stripe_size,
+ (long long) step_size, (long long) stripe_size, num_extents);
+#endif
+ fd->hints->fs_hints.lustre.lock_ahead_read = 0;
+ fd->hints->fs_hints.lustre.lock_ahead_write = 0;
+ fd->hints->fs_hints.lustre.lock_ahead_start_extent = 0;
+ fd->hints->fs_hints.lustre.lock_ahead_end_extent = INT64_MAX;
+
+ *error_code = ADIOI_Err_create_code("ADIOI_LUSTRE_lock_ahead_ioctl", fd->filename, errno);
+ if (agg_idx == 0) {
+ fprintf(stderr, "%s: ioctl(LL_IOC_LADVISE) \'%s\'\n", __func__, strerror(errno));
+ }
+ /* Note: it's too late to turn off 'request only' locking, which
+ * could affect performance without also having 'lock ahead'.
+ *
+ * We expect lustre to support this (turning it off) later */
+ }
+
+
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre_open.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre_open.c
new file mode 100644
index 0000000000000000000000000000000000000000..7fbd4d03f628f816f23cdbdc8e27594e9cac90de
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre_open.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_lustre.h"
+
+/* what is the basis for this define?
+ * what happens if there are more than 1k UUIDs? */
+
+#define MAX_LOV_UUID_COUNT 1000
+
+int ADIOI_LUSTRE_clear_locks(ADIO_File fd); /* in ad_lustre_lock.c */
+int ADIOI_LUSTRE_request_only_lock_ioctl(ADIO_File fd); /* in ad_lustre_lock.c */
+
+void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
+{
+ int perm, old_mask, amode, amode_direct;
+ int lumlen, myrank, flag, set_layout = 0, err;
+ struct lov_user_md *lum = NULL;
+ char *value;
+ ADIO_Offset str_factor = -1, str_unit = 0, start_iodev = -1;
+ size_t value_sz = (MPI_MAX_INFO_VAL + 1) * sizeof(char);
+
+#if defined(MPICH) || !defined(PRINT_ERR_MSG)
+ static char myname[] = "ADIOI_LUSTRE_OPEN";
+#endif
+
+ MPI_Comm_rank(fd->comm, &myrank);
+
+ if (fd->perm == ADIO_PERM_NULL) {
+ old_mask = umask(022);
+ umask(old_mask);
+ perm = old_mask ^ 0666;
+ } else
+ perm = fd->perm;
+
+ amode = 0;
+ if (fd->access_mode & ADIO_CREATE)
+ amode = amode | O_CREAT;
+ if (fd->access_mode & ADIO_RDONLY)
+ amode = amode | O_RDONLY;
+ if (fd->access_mode & ADIO_WRONLY)
+ amode = amode | O_WRONLY;
+ if (fd->access_mode & ADIO_RDWR)
+ amode = amode | O_RDWR;
+ if (fd->access_mode & ADIO_EXCL)
+ amode = amode | O_EXCL;
+
+ amode_direct = amode | O_DIRECT;
+
+ /* odd length here because lov_user_md contains some fixed data and
+ * then a list of 'lmm_objects' representing the stripes */
+ lumlen = sizeof(struct lov_user_md) + MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
+ lum = (struct lov_user_md *) ADIOI_Calloc(1, lumlen);
+
+ value = (char *) ADIOI_Malloc(value_sz);
+ /* we already validated in LUSTRE_SetInfo that these are going to be the same */
+ if (fd->info != MPI_INFO_NULL) {
+ /* striping information */
+ ADIOI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag)
+ str_unit = atoll(value);
+
+ ADIOI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag)
+ str_factor = atoll(value);
+
+ ADIOI_Info_get(fd->info, "romio_lustre_start_iodevice", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag)
+ start_iodev = atoll(value);
+ }
+ if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0))
+ set_layout = 1;
+
+ /* if hints were set, we need to delay creation of any lustre objects.
+ * However, if we open the file with O_LOV_DELAY_CREATE and don't call the
+ * follow-up ioctl, subsequent writes will fail */
+ if (myrank == 0 && set_layout)
+ amode = amode | O_LOV_DELAY_CREATE;
+
+ fd->fd_sys = open(fd->filename, amode, perm);
+ if (fd->fd_sys == -1)
+ goto fn_exit;
+
+ /* we can only set these hints on new files */
+ /* It was strange and buggy to open the file in the hint path. Instead,
+ * we'll apply the file tunings at open time */
+ if ((amode & O_CREAT) && set_layout) {
+ /* if user has specified striping info, first aggregator tries to set
+ * it */
+ if (myrank == fd->hints->ranklist[0] || fd->comm == MPI_COMM_SELF) {
+ lum->lmm_magic = LOV_USER_MAGIC;
+ lum->lmm_pattern = 0;
+ /* crude check for overflow of lustre internal datatypes.
+ * Silently cap to large value if user provides a value
+ * larger than lustre supports */
+ if (str_unit > UINT_MAX)
+ lum->lmm_stripe_size = UINT_MAX;
+ else
+ lum->lmm_stripe_size = str_unit;
+
+ if (str_factor > USHRT_MAX)
+ lum->lmm_stripe_count = USHRT_MAX;
+ else
+ lum->lmm_stripe_count = str_factor;
+
+ if (start_iodev > USHRT_MAX)
+ lum->lmm_stripe_offset = USHRT_MAX;
+ else
+ lum->lmm_stripe_offset = start_iodev;
+ err = ioctl(fd->fd_sys, LL_IOC_LOV_SETSTRIPE, lum);
+ if (err == -1 && errno != EEXIST) {
+ fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
+ /* not a fatal error, but user might care to know */
+ }
+ } /* End of striping parameters validation */
+ }
+
+ /* Pascal Deveze reports that, even though we pass a
+ * "GETSTRIPE" (read) flag to the ioctl, if some of the values of this
+ * struct are uninitialized, the call can give an error. Zero it out in
+ * case there are other members that must be initialized and in case the
+ * lov_user_md struct changes in the future */
+ memset(lum, 0, lumlen);
+ lum->lmm_magic = LOV_USER_MAGIC;
+ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) lum);
+ if (!err) {
+
+ fd->hints->striping_unit = lum->lmm_stripe_size;
+ MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_size);
+ ADIOI_Info_set(fd->info, "striping_unit", value);
+
+ fd->hints->striping_factor = lum->lmm_stripe_count;
+ MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_count);
+ ADIOI_Info_set(fd->info, "striping_factor", value);
+
+ fd->hints->start_iodevice = lum->lmm_stripe_offset;
+ MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_offset);
+ ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
+
+ }
+
+ if (fd->access_mode & ADIO_APPEND)
+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
+
+ fd->fd_direct = -1;
+ if (fd->direct_write || fd->direct_read) {
+ fd->fd_direct = open(fd->filename, amode_direct, perm);
+ if (fd->fd_direct != -1) {
+ fd->d_mem = fd->d_miniosz = (1 << 12);
+ } else {
+ perror("cannot open file with O_DIRECT");
+ fd->direct_write = fd->direct_read = 0;
+ }
+ }
+#ifdef HAVE_LUSTRE_LOCKAHEAD
+ if (fd->hints->fs_hints.lustre.lock_ahead_read || fd->hints->fs_hints.lustre.lock_ahead_write) {
+ ADIOI_LUSTRE_clear_locks(fd);
+ ADIOI_LUSTRE_request_only_lock_ioctl(fd);
+ }
+#endif
+
+
+ fn_exit:
+ ADIOI_Free(lum);
+ ADIOI_Free(value);
+ /* --BEGIN ERROR HANDLING-- */
+ if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && (fd->direct_write || fd->direct_read))) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
+ }
+ /* --END ERROR HANDLING-- */
+ else
+ *error_code = MPI_SUCCESS;
+
+}
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre_rwcontig.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre_rwcontig.c
new file mode 100644
index 0000000000000000000000000000000000000000..f0c04127400471262c4601a2199c9555a9743178
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre_rwcontig.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_lustre.h"
+
+#include <unistd.h>
+
+#include <stdlib.h>
+#include <malloc.h>
+
+#define LUSTRE_MEMALIGN (1<<12) /* to use page_shift */
+
+static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, const void *buf, MPI_Count len,
+ ADIO_Offset offset, ssize_t * err);
+static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, const void *buf, MPI_Count len,
+ ADIO_Offset offset, ssize_t * err)
+{
+ ssize_t rem, size, nbytes;
+ if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz)) {
+ *err = pwrite(fd->fd_direct, buf, len, offset);
+ } else if (len < fd->d_miniosz) {
+ *err = pwrite(fd->fd_sys, buf, len, offset);
+ } else {
+ rem = len % fd->d_miniosz;
+ size = len - rem;
+ *err = pwrite(fd->fd_direct, buf, size, offset);
+ if (*err == -1)
+ return;
+ nbytes = *err;
+ *err = pwrite(fd->fd_sys, ((char *) buf) + size, rem, offset + size);
+ if (*err == -1)
+ return;
+ *err += nbytes;
+ }
+}
+
+static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, const void *buf, MPI_Count len,
+ ADIO_Offset offset, ssize_t * err);
+static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, const void *buf, MPI_Count len,
+ ADIO_Offset offset, ssize_t * err)
+{
+ MPI_Count rem, size;
+ ssize_t nbytes;
+ if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz))
+ *err = pread(fd->fd_direct, (void *) buf, len, offset);
+ else if (len < fd->d_miniosz)
+ *err = pread(fd->fd_sys, (void *) buf, len, offset);
+ else {
+ rem = len % fd->d_miniosz;
+ size = len - rem;
+ *err = pread(fd->fd_direct, (void *) buf, size, offset);
+ if (*err == -1)
+ return;
+ nbytes = *err;
+ *err = pread(fd->fd_sys, ((char *) buf) + size, rem, offset + size);
+ if (*err == -1)
+ return;
+ *err += nbytes;
+ }
+}
+
+static ssize_t ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, MPI_Count len,
+ off_t offset, int rw);
+static ssize_t ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, MPI_Count len,
+ off_t offset, int rw)
+{
+ ssize_t err = -1, diff, nbytes = 0;
+ MPI_Count size = len;
+ void *newbuf;
+
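+ /* an unaligned prefix must go through the buffered descriptor first;
+ * e.g. (illustrative) offset = 5000, d_miniosz = 4096: the first
+ * 4096 - (5000 % 4096) = 3192 bytes use fd_sys, after which offset is
+ * 8192 and the aligned remainder can use fd_direct */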
+ if (offset % fd->d_miniosz) {
+ diff = fd->d_miniosz - (offset % fd->d_miniosz);
+ diff = MPL_MIN(diff, len);
+ if (rw)
+ nbytes = pwrite(fd->fd_sys, (void *) buf, diff, offset);
+ else
+ nbytes = pread(fd->fd_sys, (void *) buf, diff, offset);
+ if (nbytes == -1)
+ return -1;
+ buf = ((char *) buf) + diff;
+ offset += diff;
+ size = len - diff;
+ }
+
+ if (!size) {
+ return nbytes;
+ }
+
+ if (rw) { /* direct I/O enabled */
+ if (!(((long) buf) % fd->d_mem)) {
+ ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, buf, size, offset, &err);
+ if (err == -1)
+ return -1;
+ nbytes += err;
+ } else {
+ newbuf = (void *) memalign(LUSTRE_MEMALIGN, size);
+ if (newbuf) {
+ memcpy(newbuf, buf, size);
+ ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, newbuf, size, offset, &err);
+ if (err == -1)
+ return -1;
+ nbytes += err;
+ ADIOI_Free(newbuf);
+ } else {
+ err = pwrite(fd->fd_sys, buf, size, offset);
+ if (err == -1)
+ return -1;
+ nbytes += err;
+ }
+ }
+ } else {
+ if (!(((long) buf) % fd->d_mem)) {
+ ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, buf, size, offset, &err);
+ if (err == -1)
+ return -1;
+ nbytes += err;
+ } else {
+ newbuf = (void *) memalign(LUSTRE_MEMALIGN, size);
+ if (newbuf) {
+ ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, newbuf, size, offset, &err);
+ if (err == -1)
+ return -1;
+ if (err > 0)
+ memcpy((void *) buf, newbuf, err);
+ nbytes += err;
+ ADIOI_Free(newbuf);
+ } else {
+ err = pread(fd->fd_sys, (void *) buf, size, offset);
+ if (err == -1)
+ return -1;
+ nbytes += err;
+ }
+ }
+ }
+ return nbytes;
+}
+
+static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status,
+ int io_mode, int *error_code);
+static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status,
+ int io_mode, int *error_code)
+{
+ ssize_t err = 0;
+ size_t rw_count;
+ ADIO_Offset bytes_xfered = 0;
+ MPI_Count datatype_size, len;
+    static char myname[] = "ADIOI_LUSTRE_IOContig";
+ char *p;
+
+ if (count == 0) {
+ err = 0;
+ goto fn_exit;
+ }
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = datatype_size * count;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind;
+ }
+
+ if ((!io_mode && !fd->direct_read) || (io_mode && !fd->direct_write)) {
+
+ p = (char *) buf;
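+        /* pread/pwrite may transfer fewer bytes than requested, so loop
+         * until the request completes or 0 bytes come back (no progress
+         * or end of file). */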
+ if (io_mode) {
+ while (bytes_xfered < len) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ rw_count = len - bytes_xfered;
+ err = pwrite(fd->fd_sys, p, rw_count, offset + bytes_xfered);
+ if (err == -1)
+ goto ioerr;
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ if (err == 0)
+ break;
+ bytes_xfered += err;
+ p += err;
+ }
+ } else {
+ while (bytes_xfered < len) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ rw_count = len - bytes_xfered;
+ err = pread(fd->fd_sys, p, rw_count, offset + bytes_xfered);
+ if (err == -1)
+ goto ioerr;
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ if (err == 0)
+ break;
+ bytes_xfered += err;
+ p += err;
+ }
+ }
+ } else {
+ err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
+ if (err == -1)
+ goto ioerr;
+ bytes_xfered = err;
+ }
+
+ fd->fp_sys_posn = offset + bytes_xfered;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += bytes_xfered;
+ }
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status && err != -1)
+ MPIR_Status_set_bytes(status, datatype, bytes_xfered);
+#endif
+ *error_code = MPI_SUCCESS;
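+    /* deliberate fall-through into ioerr: the error branch below only
+     * fires when we jumped via "goto ioerr" with err == -1 */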
+
+ ioerr:
+ /* --BEGIN ERROR HANDLING-- */
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_IO, "**io", "**io %s", strerror(errno));
+ fd->fp_sys_posn = -1;
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+}
+
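+/* Thin wrappers mapping the ADIO contiguous entry points onto the common
+ * helper above: io_mode 1 selects the write path, 0 the read path. */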
+void ADIOI_LUSTRE_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ ADIOI_LUSTRE_IOContig(fd, buf, count, datatype, file_ptr_type, offset, status, 1, error_code);
+}
+
+void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ ADIOI_LUSTRE_IOContig(fd, buf, count, datatype, file_ptr_type, offset, status, 0, error_code);
+}
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre_wrcoll.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre_wrcoll.c
new file mode 100644
index 0000000000000000000000000000000000000000..c96136282bb43d7b202fe6c72e494da37d6f90ae
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre_wrcoll.c
@@ -0,0 +1,1418 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_lustre.h"
+#include "adio_extern.h"
+
+
+#ifdef HAVE_LUSTRE_LOCKAHEAD
+/* in ad_lustre_lock.c */
+void ADIOI_LUSTRE_lock_ahead_ioctl(ADIO_File fd,
+ int avail_cb_nodes, ADIO_Offset next_offset, int *error_code);
+
+/* Handle lock ahead. If this write is outside our locked region, lock it now */
+#define ADIOI_LUSTRE_WR_LOCK_AHEAD(fd,cb_nodes,offset,error_code) \
+if (fd->hints->fs_hints.lustre.lock_ahead_write) { \
+ if (offset > fd->hints->fs_hints.lustre.lock_ahead_end_extent) { \
+ ADIOI_LUSTRE_lock_ahead_ioctl(fd, cb_nodes, offset, error_code); \
+ } \
+ else if (offset < fd->hints->fs_hints.lustre.lock_ahead_start_extent) { \
+ ADIOI_LUSTRE_lock_ahead_ioctl(fd, cb_nodes, offset, error_code); \
+ } \
+}
+#else
+#define ADIOI_LUSTRE_WR_LOCK_AHEAD(fd,cb_nodes,offset,error_code)
+
+#endif
+
+
+/* prototypes of functions used for collective writes only. */
+static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
+ MPI_Datatype datatype, int nprocs,
+ int myrank,
+ ADIOI_Access * others_req,
+ ADIOI_Access * my_req,
+ ADIO_Offset * offset_list,
+ ADIO_Offset * len_list,
+ int contig_access_count,
+ int *striping_info,
+ ADIO_Offset ** buf_idx, int *error_code);
+static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
+ ADIOI_Flatlist_node * flat_buf,
+ char **send_buf,
+ ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int *send_size,
+ MPI_Request * requests,
+ int *sent_to_proc, int nprocs,
+ int myrank, int contig_access_count,
+ int *striping_info,
+ ADIO_Offset * send_buf_idx,
+ int *curr_to_proc,
+ int *done_to_proc, int iter, MPI_Aint buftype_extent);
+static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
+ char *write_buf,
+ ADIOI_Flatlist_node * flat_buf,
+ ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int *send_size,
+ int *recv_size, ADIO_Offset off,
+ int size, int *count,
+ int *start_pos,
+ int *sent_to_proc, int nprocs,
+ int myrank, int buftype_is_contig,
+ int contig_access_count,
+ int *striping_info,
+ ADIOI_Access * others_req,
+ ADIO_Offset * send_buf_idx,
+ int *curr_to_proc,
+ int *done_to_proc, int *hole,
+ int iter, MPI_Aint buftype_extent,
+ ADIO_Offset * buf_idx,
+ ADIO_Offset ** srt_off, int **srt_len, int *srt_num,
+ int *error_code);
+void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
+ ADIO_Offset * srt_off, int *srt_len, int *start_pos,
+ int nprocs, int nprocs_recv, int total_elements);
+
+static void ADIOI_LUSTRE_IterateOneSided(ADIO_File fd, const void *buf, int *striping_info,
+ ADIO_Offset * offset_list, ADIO_Offset * len_list,
+ int contig_access_count, int currentValidDataIndex,
+ int count, int file_ptr_type, ADIO_Offset offset,
+ ADIO_Offset start_offset, ADIO_Offset end_offset,
+ ADIO_Offset firstFileOffset, ADIO_Offset lastFileOffset,
+ MPI_Datatype datatype, int myrank, int *error_code);
+
+void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype,
+ int file_ptr_type, ADIO_Offset offset,
+ ADIO_Status * status, int *error_code)
+{
+ /* Uses a generalized version of the extended two-phase method described
+ * in "An Extended Two-Phase Method for Accessing Sections of
+ * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
+ * Scientific Programming, (5)4:301--317, Winter 1996.
+ * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
+ */
+
+ ADIOI_Access *my_req;
+    /* array of nprocs access structures, one for each process that holds
+     * part of this process's request */
+
+ ADIOI_Access *others_req;
+ /* array of nprocs access structures, one for each other process
+ * whose request is written by this process. */
+
+ int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
+ int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
+ int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
+ ADIO_Offset orig_fp, start_offset, end_offset, off;
+ ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
+ ADIO_Offset *len_list = NULL;
+ int striping_info[3];
+ ADIO_Offset **buf_idx = NULL;
+ int old_error, tmp_error;
+ ADIO_Offset *lustre_offsets0, *lustre_offsets, *count_sizes = NULL;
+
+ MPI_Comm_size(fd->comm, &nprocs);
+ MPI_Comm_rank(fd->comm, &myrank);
+
+ orig_fp = fd->fp_ind;
+
+    /* I/O pattern identification if cb_write isn't disabled */
+ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
+ /* For this process's request, calculate the list of offsets and
+ * lengths in the file and determine the start and end offsets.
+ * Note: end_offset points to the last byte-offset to be accessed.
+ * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
+ */
+ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
+ &offset_list, &len_list, &start_offset,
+ &end_offset, &contig_access_count);
+
+ /* each process communicates its start and end offsets to other
+ * processes. The result is an array each of start and end offsets
+ * stored in order of process rank.
+ */
+ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * 2 * sizeof(ADIO_Offset));
+ end_offsets = st_offsets + nprocs;
+ ADIO_Offset my_count_size = 0;
+        /* One-sided aggregation needs the amount of data per rank as well,
+         * because the difference between starting and ending offsets is 0
+         * for a 1-byte access just as for a 0-byte access, so the two
+         * cannot be distinguished from the offsets alone.
+         */
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ count_sizes = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
+ MPI_Count buftype_size;
+ MPI_Type_size_x(datatype, &buftype_size);
+            my_count_size = (ADIO_Offset) count * (ADIO_Offset) buftype_size;
+ }
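+        /* When romio_tunegather is set, the two (or three) MPI_Allgather
+         * calls below are replaced by a single MPI_Allreduce with MPI_MAX:
+         * each rank contributes zeros everywhere except its own slot, so
+         * the max-reduction reproduces the gather in one collective. */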
+ if (romio_tunegather) {
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ lustre_offsets0 = (ADIO_Offset *) ADIOI_Malloc(6 * nprocs * sizeof(ADIO_Offset));
+ lustre_offsets = lustre_offsets0 + 3 * nprocs;
+ for (i = 0; i < nprocs; i++) {
+ lustre_offsets0[i * 3] = 0;
+ lustre_offsets0[i * 3 + 1] = 0;
+ lustre_offsets0[i * 3 + 2] = 0;
+ }
+ lustre_offsets0[myrank * 3] = start_offset;
+ lustre_offsets0[myrank * 3 + 1] = end_offset;
+ lustre_offsets0[myrank * 3 + 2] = my_count_size;
+ MPI_Allreduce(lustre_offsets0, lustre_offsets, nprocs * 3, ADIO_OFFSET, MPI_MAX,
+ fd->comm);
+ for (i = 0; i < nprocs; i++) {
+ st_offsets[i] = lustre_offsets[i * 3];
+ end_offsets[i] = lustre_offsets[i * 3 + 1];
+ count_sizes[i] = lustre_offsets[i * 3 + 2];
+ }
+ } else {
+ lustre_offsets0 = (ADIO_Offset *) ADIOI_Malloc(4 * nprocs * sizeof(ADIO_Offset));
+ lustre_offsets = lustre_offsets0 + 2 * nprocs;
+ for (i = 0; i < nprocs; i++) {
+ lustre_offsets0[i * 2] = 0;
+ lustre_offsets0[i * 2 + 1] = 0;
+ }
+ lustre_offsets0[myrank * 2] = start_offset;
+ lustre_offsets0[myrank * 2 + 1] = end_offset;
+
+ MPI_Allreduce(lustre_offsets0, lustre_offsets, nprocs * 2, ADIO_OFFSET, MPI_MAX,
+ fd->comm);
+
+ for (i = 0; i < nprocs; i++) {
+ st_offsets[i] = lustre_offsets[i * 2];
+ end_offsets[i] = lustre_offsets[i * 2 + 1];
+ }
+ }
+ ADIOI_Free(lustre_offsets0);
+ } else {
+ MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm);
+ MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm);
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ MPI_Allgather(&my_count_size, 1, ADIO_OFFSET, count_sizes, 1,
+ ADIO_OFFSET, fd->comm);
+ }
+ }
+ /* are the accesses of different processes interleaved? */
+ for (i = 1; i < nprocs; i++)
+ if ((st_offsets[i] < end_offsets[i - 1]) && (st_offsets[i] <= end_offsets[i]))
+ interleave_count++;
+ /* This is a rudimentary check for interleaving, but should suffice
+ * for the moment. */
+
+ /* Two typical access patterns can benefit from collective write.
+ * 1) the processes are interleaved, and
+ * 2) the req size is small.
+ */
+ if (interleave_count > 0) {
+ do_collect = 1;
+ } else {
+ do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, len_list, nprocs);
+ }
+ }
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+
+ /* Decide if collective I/O should be done */
+ if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
+ fd->hints->cb_write == ADIOI_HINT_DISABLE) {
+
+ /* use independent accesses */
+ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2))
+ ADIOI_Free(count_sizes);
+ }
+
+ fd->fp_ind = orig_fp;
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+ if (buftype_is_contig && filetype_is_contig) {
+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+ off = fd->disp + (ADIO_Offset) (fd->etype_size) * offset;
+ ADIO_WriteContig(fd, buf, count, datatype,
+ ADIO_EXPLICIT_OFFSET, off, status, error_code);
+ } else
+ ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code);
+ } else {
+ ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code);
+ }
+ return;
+ }
+
+ ADIO_Offset lastFileOffset = 0, firstFileOffset = -1;
+ int currentValidDataIndex = 0;
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+ /* Take out the 0-data offsets by shifting the indexes with data to the front
+ * and keeping track of the valid data index for use as the length.
+ */
+ for (i = 0; i < nprocs; i++) {
+ if (count_sizes[i] > 0) {
+ st_offsets[currentValidDataIndex] = st_offsets[i];
+ end_offsets[currentValidDataIndex] = end_offsets[i];
+
+ lastFileOffset = MPL_MAX(lastFileOffset, end_offsets[currentValidDataIndex]);
+ if (firstFileOffset == -1)
+ firstFileOffset = st_offsets[currentValidDataIndex];
+ else
+ firstFileOffset = MPL_MIN(firstFileOffset, st_offsets[currentValidDataIndex]);
+
+ currentValidDataIndex++;
+ }
+ }
+ }
+
+ /* Get Lustre hints information */
+ ADIOI_LUSTRE_Get_striping_info(fd, striping_info, 1);
+ /* If the user has specified to use a one-sided aggregation method then do
+ * that at this point instead of the two-phase I/O.
+ */
+ if ((romio_write_aggmethod == 1) || (romio_write_aggmethod == 2)) {
+
+ ADIOI_LUSTRE_IterateOneSided(fd, buf, striping_info, offset_list, len_list,
+ contig_access_count, currentValidDataIndex, count,
+ file_ptr_type, offset, start_offset, end_offset,
+ firstFileOffset, lastFileOffset, datatype, myrank, error_code);
+
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+ ADIOI_Free(count_sizes);
+ goto fn_exit;
+ } // onesided aggregation
+
+ /* calculate what portions of the access requests of this process are
+ * located in which process
+ */
+ ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
+ striping_info, nprocs, &count_my_req_procs,
+ &count_my_req_per_proc, &my_req, &buf_idx);
+
+ /* based on everyone's my_req, calculate what requests of other processes
+ * will be accessed by this process.
+ * count_others_req_procs = number of processes whose requests (including
+ * this process itself) will be accessed by this process
+ * count_others_req_per_proc[i] indicates how many separate contiguous
+ * requests of proc. i will be accessed by this process.
+ */
+
+ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
+ my_req, nprocs, myrank, &count_others_req_procs, &others_req);
+ ADIOI_Free(count_my_req_per_proc);
+
+ /* exchange data and write in sizes of no more than stripe_size. */
+ ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
+ others_req, my_req, offset_list, len_list,
+ contig_access_count, striping_info, buf_idx, error_code);
+
+ /* If this collective write is followed by an independent write,
+ * it's possible to have those subsequent writes on other processes
+ * race ahead and sneak in before the read-modify-write completes.
+ * We carry out a collective communication at the end here so no one
+ * can start independent i/o before collective I/O completes.
+ *
+ * need to do some gymnastics with the error codes so that if something
+ * went wrong, all processes report error, but if a process has a more
+ * specific error code, we can still have that process report the
+ * additional information */
+
+ old_error = *error_code;
+ if (*error_code != MPI_SUCCESS)
+ *error_code = MPI_ERR_IO;
+
+    /* optimization: if only one process is performing I/O, we can use a
+     * less-expensive Bcast instead of an Allreduce */
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
+#endif
+ if (fd->hints->cb_nodes == 1)
+ MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
+ else {
+ tmp_error = *error_code;
+ MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, MPI_MAX, fd->comm);
+ }
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
+#endif
+
+ if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
+ *error_code = old_error;
+
+ /* free all memory allocated for collective I/O */
+ /* free others_req */
+ ADIOI_Free(others_req[0].offsets);
+ ADIOI_Free(others_req[0].mem_ptrs);
+ ADIOI_Free(others_req);
+ ADIOI_Free(buf_idx[0]); /* also my_req[*].offsets and my_req[*].lens */
+ ADIOI_Free(buf_idx);
+ ADIOI_Free(my_req);
+ ADIOI_Free(offset_list);
+ ADIOI_Free(st_offsets);
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status) {
+ MPI_Count bufsize, size;
+ /* Don't set status if it isn't needed */
+ MPI_Type_size_x(datatype, &size);
+ bufsize = size * count;
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+ }
+ /* This is a temporary way of filling in status. The right way is to
+ * keep track of how much data was actually written during collective I/O.
+ */
+#endif
+
+    fd->fp_sys_posn = -1;   /* the system file position is now unknown */
+}
+
+/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error
+ * code is created and returned in error_code.
+ */
+static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
+ MPI_Datatype datatype, int nprocs,
+ int myrank, ADIOI_Access * others_req,
+ ADIOI_Access * my_req,
+ ADIO_Offset * offset_list,
+ ADIO_Offset * len_list,
+ int contig_access_count,
+ int *striping_info, ADIO_Offset ** buf_idx, int *error_code)
+{
+ /* Send data to appropriate processes and write in sizes of no more
+ * than lustre stripe_size.
+ * The idea is to reduce the amount of extra memory required for
+ * collective I/O. If all data were written all at once, which is much
+ * easier, it would require temp space more than the size of user_buf,
+ * which is often unacceptable. For example, to write a distributed
+ * array to a file, where each local array is 8Mbytes, requiring
+ * at least another 8Mbytes of temp space is unacceptable.
+ */
+
+ int hole, i, j, m, flag, ntimes = 1, max_ntimes, buftype_is_contig;
+ ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
+ ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
+ ADIO_Offset max_size, step_size = 0;
+ int real_size, req_len, send_len;
+ int *recv_curr_offlen_ptr, *recv_count, *recv_size;
+ int *send_curr_offlen_ptr, *send_size;
+ int *sent_to_proc, *recv_start_pos;
+ int *curr_to_proc, *done_to_proc;
+ ADIO_Offset *send_buf_idx, *this_buf_idx;
+ char *write_buf = NULL;
+ MPI_Status status;
+ ADIOI_Flatlist_node *flat_buf = NULL;
+ MPI_Aint lb, buftype_extent;
+ int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
+ int data_sieving = 0;
+ ADIO_Offset *srt_off = NULL;
+ int *srt_len = NULL;
+ int srt_num = 0;
+ ADIO_Offset block_offset;
+ int block_len;
+
+ *error_code = MPI_SUCCESS; /* changed below if error */
+ /* only I/O errors are currently reported */
+
+ /* calculate the number of writes of stripe size to be done.
+ * That gives the no. of communication phases as well.
+ * Note:
+ * Because we redistribute data in stripe-contiguous pattern for Lustre,
+ * each process has the same no. of communication phases.
+ */
+
+ for (i = 0; i < nprocs; i++) {
+ if (others_req[i].count) {
+ st_loc = others_req[i].offsets[0];
+ end_loc = others_req[i].offsets[0];
+ break;
+ }
+ }
+ for (i = 0; i < nprocs; i++) {
+ for (j = 0; j < others_req[i].count; j++) {
+ st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
+ end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1));
+ }
+ }
+ /* this process does no writing. */
+ if ((st_loc == -1) && (end_loc == -1))
+ ntimes = 0;
+ MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
+    /* avoid min_st_loc being -1 */
+ if (st_loc == -1)
+ st_loc = max_end_loc;
+ MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
+ /* align downward */
+ min_st_loc -= min_st_loc % (ADIO_Offset) stripe_size;
+
+    /* In each round, only avail_cb_nodes I/O clients perform I/O, so at
+     * most step_size = avail_cb_nodes * stripe_size bytes are written per
+     * round, and ntimes = whole_file_portion / step_size.
+     */
+    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
+ max_ntimes = (max_end_loc - min_st_loc + 1) / step_size
+ + (((max_end_loc - min_st_loc + 1) % step_size) ? 1 : 0);
+/* max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); */
+ if (ntimes)
+ write_buf = (char *) ADIOI_Malloc(stripe_size);
+
+ /* calculate the start offset for each iteration */
+ off_list = (ADIO_Offset *) ADIOI_Malloc((max_ntimes + 2 * nprocs) * sizeof(ADIO_Offset));
+ send_buf_idx = off_list + max_ntimes;
+ this_buf_idx = send_buf_idx + nprocs;
+
+ for (m = 0; m < max_ntimes; m++)
+ off_list[m] = max_end_loc;
+ for (i = 0; i < nprocs; i++) {
+ for (j = 0; j < others_req[i].count; j++) {
+ req_off = others_req[i].offsets[j];
+ m = (int) ((req_off - min_st_loc) / step_size);
+ off_list[m] = MPL_MIN(off_list[m], req_off);
+ }
+ }
+
+ recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs * 9, sizeof(int));
+ send_curr_offlen_ptr = recv_curr_offlen_ptr + nprocs;
+ /* their use is explained below. calloc initializes to 0. */
+
+ recv_count = send_curr_offlen_ptr + nprocs;
+ /* to store count of how many off-len pairs per proc are satisfied
+ * in an iteration. */
+
+ send_size = recv_count + nprocs;
+ /* total size of data to be sent to each proc. in an iteration.
+ * Of size nprocs so that I can use MPI_Alltoall later. */
+
+ recv_size = send_size + nprocs;
+ /* total size of data to be recd. from each proc. in an iteration. */
+
+ sent_to_proc = recv_size + nprocs;
+ /* amount of data sent to each proc so far. Used in
+ * ADIOI_Fill_send_buffer. initialized to 0 here. */
+
+ curr_to_proc = sent_to_proc + nprocs;
+ done_to_proc = curr_to_proc + nprocs;
+ /* Above three are used in ADIOI_Fill_send_buffer */
+
+ recv_start_pos = done_to_proc + nprocs;
+ /* used to store the starting value of recv_curr_offlen_ptr[i] in
+ * this iteration */
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ if (!buftype_is_contig) {
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+ }
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+ /* I need to check if there are any outstanding nonblocking writes to
+ * the file, which could potentially interfere with the writes taking
+ * place in this collective write call. Since this is not likely to be
+     * common, let me do the simplest thing possible here: each process
+     * completes all pending nonblocking operations before proceeding.
+ */
+ /*ADIOI_Complete_async(error_code);
+ * if (*error_code != MPI_SUCCESS) return;
+ * MPI_Barrier(fd->comm);
+ */
+
+ iter_st_off = min_st_loc;
+
+    /* Although we have reorganized the data according to OST index,
+     * a read-modify-write is still needed if there is a hole in the data.
+ * For example: if blocksize=60, xfersize=30 and stripe_size=100,
+ * then rank0 will collect data [0, 30] and [60, 90] then write. There
+ * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
+ *
+ * To reduce its impact on the performance, we can disable data sieving
+ * by hint "ds_in_coll".
+ */
+ /* check the hint for data sieving */
+ data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;
+
+ for (m = 0; m < max_ntimes; m++) {
+ /* go through all others_req and my_req to check which will be received
+ * and sent in this iteration.
+ */
+
+ /* Note that MPI guarantees that displacements in filetypes are in
+ * monotonically nondecreasing order and that, for writes, the
+ * filetypes cannot specify overlapping regions in the file. This
+ * simplifies implementation a bit compared to reads. */
+
+ /*
+ * off = start offset in the file for the data to be written in
+ * this iteration
+ * iter_st_off = start offset of this iteration
+ * real_size = size of data written (bytes) corresponding to off
+ * max_size = possible maximum size of data written in this iteration
+         * req_off = offset in the file for a particular contiguous request,
+         *           minus what was satisfied in previous iterations
+         * send_off = offset of a request needed by another process in this
+         *            iteration
+ * req_len = size corresponding to req_off
+ * send_len = size corresponding to send_off
+ */
+
+ /* first calculate what should be communicated */
+ for (i = 0; i < nprocs; i++)
+ recv_count[i] = recv_size[i] = send_size[i] = 0;
+
+ off = off_list[m];
+ max_size = MPL_MIN(step_size, max_end_loc - iter_st_off + 1);
+ real_size = (int) MPL_MIN((off / stripe_size + 1) * stripe_size - off, end_loc - off + 1);
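+        /* Example: with stripe_size = 1 MiB, off = 2.5 MiB and enough data
+         * remaining, real_size = min(3 MiB - 2.5 MiB, ...) = 0.5 MiB, so a
+         * round never writes across a stripe boundary. */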
+
+ for (i = 0; i < nprocs; i++) {
+ if (my_req[i].count) {
+ this_buf_idx[i] = buf_idx[i][send_curr_offlen_ptr[i]];
+ for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
+ send_off = my_req[i].offsets[j];
+ send_len = my_req[i].lens[j];
+ if (send_off < iter_st_off + max_size) {
+ send_size[i] += send_len;
+ } else {
+ break;
+ }
+ }
+ send_curr_offlen_ptr[i] = j;
+ }
+ if (others_req[i].count) {
+ recv_start_pos[i] = recv_curr_offlen_ptr[i];
+ for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
+ req_off = others_req[i].offsets[j];
+ req_len = others_req[i].lens[j];
+ if (req_off < iter_st_off + max_size) {
+ recv_count[i]++;
+ ADIOI_Assert((((ADIO_Offset) (uintptr_t) write_buf) + req_off - off) ==
+ (ADIO_Offset) (uintptr_t) (write_buf + req_off - off));
+ MPI_Get_address(write_buf + req_off - off, &(others_req[i].mem_ptrs[j]));
+ recv_size[i] += req_len;
+ } else {
+ break;
+ }
+ }
+ recv_curr_offlen_ptr[i] = j;
+ }
+ }
+ /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
+ hole = data_sieving;
+ ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
+ len_list, send_size, recv_size, off, real_size,
+ recv_count, recv_start_pos,
+ sent_to_proc, nprocs, myrank,
+ buftype_is_contig, contig_access_count,
+ striping_info, others_req, send_buf_idx,
+ curr_to_proc, done_to_proc, &hole, m,
+ buftype_extent, this_buf_idx,
+ &srt_off, &srt_len, &srt_num, error_code);
+
+ if (*error_code != MPI_SUCCESS)
+ goto over;
+
+ flag = 0;
+ for (i = 0; i < nprocs; i++)
+ if (recv_count[i]) {
+ flag = 1;
+ break;
+ }
+ if (flag) {
+ /* check whether to do data sieving */
+ if (data_sieving == ADIOI_HINT_ENABLE) {
+ ADIOI_LUSTRE_WR_LOCK_AHEAD(fd, striping_info[2], off, error_code);
+ ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status, error_code);
+ } else {
+                /* if there is no hole, write the data in one shot;
+                 * otherwise, write each contiguous run separately */
+ if (!hole) {
+ ADIOI_LUSTRE_WR_LOCK_AHEAD(fd, striping_info[2], off, error_code);
+ ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status, error_code);
+ } else {
+ block_offset = -1;
+ block_len = 0;
+ for (i = 0; i < srt_num; ++i) {
+ if (srt_off[i] < off + real_size && srt_off[i] >= off) {
+ if (block_offset == -1) {
+ block_offset = srt_off[i];
+ block_len = srt_len[i];
+ } else {
+ if (srt_off[i] == block_offset + block_len) {
+ block_len += srt_len[i];
+ } else {
+ ADIOI_LUSTRE_WR_LOCK_AHEAD(fd, striping_info[2], block_offset,
+ error_code);
+ ADIO_WriteContig(fd, write_buf + block_offset - off, block_len,
+ MPI_BYTE, ADIO_EXPLICIT_OFFSET, block_offset,
+ &status, error_code);
+ if (*error_code != MPI_SUCCESS)
+ goto over;
+ block_offset = srt_off[i];
+ block_len = srt_len[i];
+ }
+ }
+ }
+ }
+ if (block_offset != -1) {
+ ADIOI_LUSTRE_WR_LOCK_AHEAD(fd, striping_info[2], block_offset, error_code);
+ ADIO_WriteContig(fd,
+ write_buf + block_offset - off,
+ block_len,
+ MPI_BYTE, ADIO_EXPLICIT_OFFSET,
+ block_offset, &status, error_code);
+ if (*error_code != MPI_SUCCESS)
+ goto over;
+ }
+ }
+ }
+ if (*error_code != MPI_SUCCESS)
+ goto over;
+ }
+ iter_st_off += max_size;
+ }
+ over:
+ if (srt_off)
+ ADIOI_Free(srt_off);
+ if (srt_len)
+ ADIOI_Free(srt_len);
+ if (ntimes)
+ ADIOI_Free(write_buf);
+ ADIOI_Free(recv_curr_offlen_ptr);
+ ADIOI_Free(off_list);
+}
+
+/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
+ * in the case of error.
+ */
+static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
+ char *write_buf,
+ ADIOI_Flatlist_node * flat_buf,
+ ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int *send_size,
+ int *recv_size, ADIO_Offset off,
+ int size, int *count,
+ int *start_pos,
+ int *sent_to_proc, int nprocs,
+ int myrank, int buftype_is_contig,
+ int contig_access_count,
+ int *striping_info,
+ ADIOI_Access * others_req,
+ ADIO_Offset * send_buf_idx,
+ int *curr_to_proc, int *done_to_proc,
+ int *hole, int iter,
+ MPI_Aint buftype_extent,
+ ADIO_Offset * buf_idx,
+ ADIO_Offset ** srt_off, int **srt_len, int *srt_num,
+ int *error_code)
+{
+ int i, j, k, nprocs_recv, nprocs_send, err;
+ char **send_buf = NULL;
+ MPI_Request *requests, *send_req;
+ MPI_Datatype *recv_types;
+ MPI_Status *statuses, status;
+ int sum_recv;
+ int data_sieving = *hole;
+ static size_t malloc_srt_num = 0;
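+    /* caches the size of the srt_off/srt_len allocation across calls so
+     * the arrays are only regrown when needed; being static, it also
+     * makes this routine non-reentrant */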
+ size_t send_total_size;
+    static char myname[] = "ADIOI_LUSTRE_W_Exchange_data";
+
+ /* create derived datatypes for recv */
+ *srt_num = 0;
+ sum_recv = 0;
+ nprocs_recv = 0;
+ nprocs_send = 0;
+ send_total_size = 0;
+ for (i = 0; i < nprocs; i++) {
+ *srt_num += count[i];
+ sum_recv += recv_size[i];
+ if (recv_size[i])
+ nprocs_recv++;
+ if (send_size[i]) {
+ nprocs_send++;
+ send_total_size += send_size[i];
+ }
+ }
+
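+    /* First-cut hole detection: receiving fewer bytes than the region
+     * covers implies a gap somewhere. Overlapping requests can make
+     * sum_recv exceed size, so the merged offset list is scanned further
+     * below to catch holes this test misses. */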
+ *hole = (size > sum_recv) ? 1 : 0;
+
+ recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype));
+ /* +1 to avoid a 0-size malloc */
+
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+ ADIOI_Type_create_hindexed_x(count[i],
+ &(others_req[i].lens[start_pos[i]]),
+ &(others_req[i].mem_ptrs[start_pos[i]]),
+ MPI_BYTE, recv_types + j);
+ /* absolute displacements; use MPI_BOTTOM in recv */
+ MPI_Type_commit(recv_types + j);
+ j++;
+ }
+ }
+
+ /* To avoid a read-modify-write,
+ * check if there are holes in the data to be written.
+ * For this, merge the (sorted) offset lists others_req using a heap-merge.
+ */
+
+ if (*srt_num) {
+ if (*srt_off == NULL || *srt_num > malloc_srt_num) {
+ /* must check srt_off against NULL, as the collective write can be
+ * called more than once */
+ if (*srt_off != NULL) {
+ ADIOI_Free(*srt_off);
+ ADIOI_Free(*srt_len);
+ }
+ *srt_off = (ADIO_Offset *) ADIOI_Malloc(*srt_num * sizeof(ADIO_Offset));
+ *srt_len = (int *) ADIOI_Malloc(*srt_num * sizeof(int));
+ malloc_srt_num = *srt_num;
+ }
+
+ ADIOI_Heap_merge(others_req, count, *srt_off, *srt_len, start_pos,
+ nprocs, nprocs_recv, *srt_num);
+ }
+
+    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
+     * between aggregation, nominally contiguous regions, and cb_buffer_size
+     * must be handled with a read-modify-write: otherwise we would write out
+     * more data than we received from everyone else. Scan the merged offset
+     * list for gaps and override the hole detection above if any are found.
+     */
+ if (*hole == 0) {
+ for (i = 0; i < *srt_num - 1; i++) {
+ if ((*srt_off)[i] + (*srt_len)[i] < (*srt_off)[i + 1]) {
+ *hole = 1;
+ break;
+ }
+ }
+ }
+
+ /* check the hint for data sieving */
+ if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
+ ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, &err);
+ // --BEGIN ERROR HANDLING--
+ if (err != MPI_SUCCESS) {
+ *error_code = MPIO_Err_create_code(err,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0);
+ ADIOI_Free(recv_types);
+ return;
+ }
+ // --END ERROR HANDLING--
+ }
+
+ if (fd->atomicity) {
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Request));
+ send_req = requests;
+ } else {
+ requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
+ sizeof(MPI_Request));
+ /* +1 to avoid a 0-size malloc */
+
+ /* post receives */
+ j = 0;
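+        /* the tag myrank + i + 100 * iter is symmetric in the sender and
+         * receiver ranks, so the matching MPI_Issend below computes the
+         * same value */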
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+ MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
+ myrank + i + 100 * iter, fd->comm, requests + j);
+ j++;
+ }
+ }
+ send_req = requests + nprocs_recv;
+ }
+
+ /* post sends.
+ * if buftype_is_contig, data can be directly sent from
+ * user buf at location given by buf_idx. else use send_buf.
+ */
+ if (buftype_is_contig) {
+ j = 0;
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i]) {
+ ADIOI_Assert(buf_idx[i] != -1);
+ MPI_Issend(((char *) buf) + buf_idx[i], send_size[i],
+ MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, send_req + j);
+ j++;
+ }
+ } else if (nprocs_send) {
+ /* buftype is not contig */
+ send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
+ send_buf[0] = (char *) ADIOI_Malloc(send_total_size);
+ for (i = 1; i < nprocs; i++)
+ send_buf[i] = send_buf[i - 1] + send_size[i - 1];
+
+ ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
+ len_list, send_size, send_req,
+ sent_to_proc, nprocs, myrank,
+ contig_access_count, striping_info,
+ send_buf_idx, curr_to_proc, done_to_proc,
+ iter, buftype_extent);
+ /* the send is done in ADIOI_Fill_send_buffer */
+ }
+
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ if (fd->atomicity) {
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ MPI_Status wkl_status;
+ if (recv_size[i]) {
+ MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
+ myrank + i + 100 * iter, fd->comm, &wkl_status);
+ j++;
+ }
+ }
+ }
+
+ for (i = 0; i < nprocs_recv; i++)
+ MPI_Type_free(recv_types + i);
+ ADIOI_Free(recv_types);
+
+#ifdef MPI_STATUSES_IGNORE
+ statuses = MPI_STATUSES_IGNORE;
+#else
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ /* +1 to avoid a 0-size malloc */
+ if (fd->atomicity) {
+ statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Status));
+ } else {
+ statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
+ sizeof(MPI_Status));
+ }
+#endif
+
+#ifdef NEEDS_MPI_TEST
+ i = 0;
+ if (fd->atomicity) {
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ while (!i)
+ MPI_Testall(nprocs_send, send_req, &i, statuses);
+ } else {
+ while (!i)
+ MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
+ }
+#else
+ /* bug fix from Wei-keng Liao and Kenin Coloma */
+ if (fd->atomicity)
+ MPI_Waitall(nprocs_send, send_req, statuses);
+ else
+ MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
+#endif
+
+#ifndef MPI_STATUSES_IGNORE
+ ADIOI_Free(statuses);
+#endif
+ ADIOI_Free(requests);
+ if (!buftype_is_contig && nprocs_send) {
+ ADIOI_Free(send_buf[0]);
+ ADIOI_Free(send_buf);
+ }
+}
+
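+/* ADIOI_BUF_INCR advances user_buf_idx by buf_incr bytes through the
+ * flattened (noncontiguous) user buffer, wrapping to the next buftype
+ * instance whenever the current contiguous piece is exhausted.
+ * ADIOI_BUF_COPY does the same walk but also copies each piece into
+ * send_buf[p]. */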
+#define ADIOI_BUF_INCR \
+{ \
+ while (buf_incr) { \
+ size_in_buf = MPL_MIN(buf_incr, flat_buf_sz); \
+ user_buf_idx += size_in_buf; \
+ flat_buf_sz -= size_in_buf; \
+ if (!flat_buf_sz) { \
+ if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
+ else { \
+ flat_buf_idx = 0; \
+ n_buftypes++; \
+ } \
+ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
+ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
+ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
+ } \
+ buf_incr -= size_in_buf; \
+ } \
+}
+
+
+#define ADIOI_BUF_COPY \
+{ \
+ while (size) { \
+ size_in_buf = MPL_MIN(size, flat_buf_sz); \
+ ADIOI_Assert((((ADIO_Offset)(uintptr_t)buf) + user_buf_idx) == (ADIO_Offset)(uintptr_t)((uintptr_t)buf + user_buf_idx)); \
+ ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
+ memcpy(&(send_buf[p][send_buf_idx[p]]), \
+ ((char *) buf) + user_buf_idx, size_in_buf); \
+ send_buf_idx[p] += size_in_buf; \
+ user_buf_idx += size_in_buf; \
+ flat_buf_sz -= size_in_buf; \
+ if (!flat_buf_sz) { \
+ if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
+ else { \
+ flat_buf_idx = 0; \
+ n_buftypes++; \
+ } \
+ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
+ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
+ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
+ } \
+ size -= size_in_buf; \
+ buf_incr -= size_in_buf; \
+ } \
+ ADIOI_BUF_INCR \
+}
+
+static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
+ ADIOI_Flatlist_node * flat_buf,
+ char **send_buf,
+ ADIO_Offset * offset_list,
+ ADIO_Offset * len_list, int *send_size,
+ MPI_Request * requests,
+ int *sent_to_proc, int nprocs,
+ int myrank,
+ int contig_access_count,
+ int *striping_info,
+ ADIO_Offset * send_buf_idx,
+ int *curr_to_proc,
+ int *done_to_proc, int iter, MPI_Aint buftype_extent)
+{
+ /* this function is only called if buftype is not contig */
+ int i, p, flat_buf_idx, size;
+ int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
+ ADIO_Offset off, len, rem_len, user_buf_idx;
+
+ /* curr_to_proc[p] = amount of data sent to proc. p that has already
+ * been accounted for so far
+ * done_to_proc[p] = amount of data already sent to proc. p in
+ * previous iterations
+ * user_buf_idx = current location in user buffer
+ * send_buf_idx[p] = current location in send_buf of proc. p
+ */
+
+ for (i = 0; i < nprocs; i++) {
+ send_buf_idx[i] = curr_to_proc[i] = 0;
+ done_to_proc[i] = sent_to_proc[i];
+ }
+ jj = 0;
+
+ user_buf_idx = flat_buf->indices[0];
+ flat_buf_idx = 0;
+ n_buftypes = 0;
+ flat_buf_sz = flat_buf->blocklens[0];
+
+ /* flat_buf_idx = current index into flattened buftype
+ * flat_buf_sz = size of current contiguous component in flattened buf
+ */
+ for (i = 0; i < contig_access_count; i++) {
+ off = offset_list[i];
+ rem_len = (ADIO_Offset) len_list[i];
+
+        /* this request may span more than one process */
+ while (rem_len != 0) {
+ len = rem_len;
+ /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
+ * longer than the single region that processor "p" is responsible
+ * for.
+ */
+ p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);
+
+ if (send_buf_idx[p] < send_size[p]) {
+ if (curr_to_proc[p] + len > done_to_proc[p]) {
+ if (done_to_proc[p] > curr_to_proc[p]) {
+ size = (int) MPL_MIN(curr_to_proc[p] + len -
+ done_to_proc[p], send_size[p] - send_buf_idx[p]);
+ buf_incr = done_to_proc[p] - curr_to_proc[p];
+                        ADIOI_BUF_INCR
+                        ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) ==
+                                     (unsigned) (curr_to_proc[p] + len - done_to_proc[p]));
+ buf_incr = (int) (curr_to_proc[p] + len - done_to_proc[p]);
+ ADIOI_Assert((done_to_proc[p] + size) ==
+ (unsigned) (done_to_proc[p] + size));
+ curr_to_proc[p] = done_to_proc[p] + size;
+                        ADIOI_BUF_COPY
+                    } else {
+ size = (int) MPL_MIN(len, send_size[p] - send_buf_idx[p]);
+ buf_incr = (int) len;
+ ADIOI_Assert((curr_to_proc[p] + size) ==
+ (unsigned) ((ADIO_Offset) curr_to_proc[p] + size));
+ curr_to_proc[p] += size;
+                        ADIOI_BUF_COPY
+                    }
+ if (send_buf_idx[p] == send_size[p]) {
+ MPI_Issend(send_buf[p], send_size[p], MPI_BYTE, p,
+ myrank + p + 100 * iter, fd->comm, requests + jj);
+ jj++;
+ }
+ } else {
+ ADIOI_Assert((curr_to_proc[p] + len) ==
+ (unsigned) ((ADIO_Offset) curr_to_proc[p] + len));
+ curr_to_proc[p] += (int) len;
+ buf_incr = (int) len;
+                    ADIOI_BUF_INCR
+                }
+ } else {
+ buf_incr = (int) len;
+                ADIOI_BUF_INCR
+            }
+ off += len;
+ rem_len -= len;
+ }
+ }
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i])
+ sent_to_proc[i] = curr_to_proc[i];
+}
+
+/* This function calls ADIOI_OneSidedWriteAggregation iteratively to
+ * essentially pack stripes of data into the collective buffer and then
+ * flush the collective buffer to the file when fully packed, repeating this
+ * process until all the data is written to the file.
+ */
+static void ADIOI_LUSTRE_IterateOneSided(ADIO_File fd, const void *buf, int *striping_info,
+ ADIO_Offset * offset_list, ADIO_Offset * len_list,
+ int contig_access_count, int currentValidDataIndex,
+ int count, int file_ptr_type, ADIO_Offset offset,
+ ADIO_Offset start_offset, ADIO_Offset end_offset,
+ ADIO_Offset firstFileOffset, ADIO_Offset lastFileOffset,
+ MPI_Datatype datatype, int myrank, int *error_code)
+{
+ int i;
+ int stripesPerAgg = fd->hints->cb_buffer_size / striping_info[0];
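+    /* e.g. a 16 MiB cb_buffer_size with a 1 MiB stripe gives
+     * stripesPerAgg = 16 */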
+ if (stripesPerAgg == 0) {
+ /* The striping unit is larger than the collective buffer size
+ * therefore we must abort since the buffer has already been
+ * allocated during the open.
+ */
+ FPRINTF(stderr, "Error: The collective buffer size %d is less "
+ "than the striping unit size %d - the ROMIO "
+ "Lustre one-sided write aggregation algorithm "
+ "cannot continue.\n", fd->hints->cb_buffer_size, striping_info[0]);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+
+    /* Based on the co_ratio, the number of aggregators we can use is the
+     * number of stripes used in the file times the co_ratio; each stripe
+     * is written by co_ratio aggregators. This information is contained
+     * in striping_info.
+     */
+ int numStripedAggs = striping_info[2];
+
+ int orig_cb_nodes = fd->hints->cb_nodes;
+ fd->hints->cb_nodes = numStripedAggs;
+
+ /* Declare ADIOI_OneSidedStripeParms here - these parameters will be locally managed
+     * for this invocation of ADIOI_LUSTRE_IterateOneSided. This will allow for concurrent
+ * one-sided collective writes via multi-threading as well as multiple communicators.
+ */
+ ADIOI_OneSidedStripeParms stripeParms;
+ stripeParms.stripeSize = striping_info[0];
+ stripeParms.stripedLastFileOffset = lastFileOffset;
+ stripeParms.iWasUsedStripingAgg = 0;
+ stripeParms.numStripesUsed = 0;
+ stripeParms.amountOfStripedDataExpected = 0;
+ stripeParms.bufTypeExtent = 0;
+ stripeParms.lastDataTypeExtent = 0;
+ stripeParms.lastFlatBufIndice = 0;
+ stripeParms.lastIndiceOffset = 0;
+
+    /* The general algorithm here is to divide the file up into segments, a segment
+ * being defined as a contiguous region of the file which has up to one occurrence
+ * of each stripe - the data for each stripe being written out by a particular
+ * aggregator. The segmentLen is the maximum size in bytes of each segment
+ * (stripeSize*number of aggs). Iteratively call ADIOI_OneSidedWriteAggregation
+ * for each segment to aggregate the data to the collective buffers, but only do
+ * the actual write (via flushCB stripe parm) once stripesPerAgg stripes
+ * have been packed or the aggregation for all the data is complete, minimizing
+ * synchronization.
+ */
+ stripeParms.segmentLen = ((ADIO_Offset) numStripedAggs) * ((ADIO_Offset) (striping_info[0]));
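+    /* Example: 8 aggregators and a 1 MiB stripe give an 8 MiB segmentLen;
+     * a 100 MiB file is then covered in ceil(100 / 8) = 13 segments. */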
+
+ /* These arrays define the file offsets for the stripes for a given segment - similar
+     * to the concept of file domains in GPFS, essentially file domains for the segment.
+ */
+ ADIO_Offset *segment_stripe_start =
+ (ADIO_Offset *) ADIOI_Malloc(numStripedAggs * sizeof(ADIO_Offset));
+ ADIO_Offset *segment_stripe_end =
+ (ADIO_Offset *) ADIOI_Malloc(numStripedAggs * sizeof(ADIO_Offset));
+
+ /* Find the actual range of stripes in the file that have data in the offset
+ * ranges being written -- skip holes at the front and back of the file.
+ */
+ int currentOffsetListIndex = 0;
+ int fileSegmentIter = 0;
+ int startingStripeWithData = 0;
+ int foundStartingStripeWithData = 0;
+ while (!foundStartingStripeWithData) {
+ if (((startingStripeWithData + 1) * (ADIO_Offset) (striping_info[0])) > firstFileOffset)
+ foundStartingStripeWithData = 1;
+ else
+ startingStripeWithData++;
+ }
+
+ ADIO_Offset currentSegementOffset =
+ (ADIO_Offset) startingStripeWithData * (ADIO_Offset) (striping_info[0]);
+
+ int numSegments =
+ (int) ((lastFileOffset + (ADIO_Offset) 1 - currentSegementOffset) / stripeParms.segmentLen);
+ if ((lastFileOffset + (ADIO_Offset) 1 - currentSegementOffset) % stripeParms.segmentLen > 0)
+ numSegments++;
+
+ /* To support read-modify-write use a while-loop to redo the aggregation if necessary
+ * to fill in the holes.
+ */
+ int doAggregation = 1;
+ int holeFound = 0;
+
+ /* Remember romio_onesided_no_rmw setting if we have to re-do
+ * the aggregation if holes are found.
+ */
+ int prev_romio_onesided_no_rmw = romio_onesided_no_rmw;
+
+ while (doAggregation) {
+
+ int totalDataWrittenLastRound = 0;
+
+ /* This variable tracks how many segment stripes we have packed into the agg
+ * buffers so we know when to flush to the file system.
+ */
+ stripeParms.segmentIter = 0;
+
+ /* stripeParms.stripesPerAgg is the number of stripes to aggregate before doing a flush.
+ */
+ stripeParms.stripesPerAgg = stripesPerAgg;
+ if (stripeParms.stripesPerAgg > numSegments)
+ stripeParms.stripesPerAgg = numSegments;
+
+ for (fileSegmentIter = 0; fileSegmentIter < numSegments; fileSegmentIter++) {
+
+ int dataWrittenThisRound = 0;
+
+ /* Define the segment range in terms of file offsets.
+ */
+ ADIO_Offset segmentFirstFileOffset = currentSegementOffset;
+ if ((currentSegementOffset + stripeParms.segmentLen - (ADIO_Offset) 1) > lastFileOffset)
+ currentSegementOffset = lastFileOffset;
+ else
+ currentSegementOffset += (stripeParms.segmentLen - (ADIO_Offset) 1);
+ ADIO_Offset segmentLastFileOffset = currentSegementOffset;
+ currentSegementOffset++;
+
+ ADIO_Offset segment_stripe_offset = segmentFirstFileOffset;
+ for (i = 0; i < numStripedAggs; i++) {
+ if (firstFileOffset > segment_stripe_offset)
+ segment_stripe_start[i] = firstFileOffset;
+ else
+ segment_stripe_start[i] = segment_stripe_offset;
+ if ((segment_stripe_offset + (ADIO_Offset) (striping_info[0])) > lastFileOffset)
+ segment_stripe_end[i] = lastFileOffset;
+ else
+ segment_stripe_end[i] =
+ segment_stripe_offset + (ADIO_Offset) (striping_info[0]) - (ADIO_Offset) 1;
+ segment_stripe_offset += (ADIO_Offset) (striping_info[0]);
+ }
+
+ /* In the interest of performance for non-contiguous data with large offset lists
+ * essentially modify the given offset and length list appropriately for this segment
+ * and then pass pointers to the sections of the lists being used for this segment
+ * to ADIOI_OneSidedWriteAggregation. Remember how we have modified the list for this
+ * segment, and then restore it appropriately after processing for this segment has
+ * concluded, so it is ready for the next segment.
+ */
+ int segmentContigAccessCount = 0;
+ int startingOffsetListIndex = -1;
+ int endingOffsetListIndex = -1;
+ ADIO_Offset startingOffsetAdvancement = 0;
+ ADIO_Offset startingLenTrim = 0;
+ ADIO_Offset endingLenTrim = 0;
+
+ while (((offset_list[currentOffsetListIndex] +
+ ((ADIO_Offset) (len_list[currentOffsetListIndex])) - (ADIO_Offset) 1) <
+ segmentFirstFileOffset) && (currentOffsetListIndex < (contig_access_count - 1)))
+ currentOffsetListIndex++;
+ startingOffsetListIndex = currentOffsetListIndex;
+ endingOffsetListIndex = currentOffsetListIndex;
+ int offsetInSegment = 0;
+ ADIO_Offset offsetStart = offset_list[currentOffsetListIndex];
+ ADIO_Offset offsetEnd =
+ (offset_list[currentOffsetListIndex] +
+ ((ADIO_Offset) (len_list[currentOffsetListIndex])) - (ADIO_Offset) 1);
+
+ if (len_list[currentOffsetListIndex] == 0)
+ offsetInSegment = 0;
+ else if ((offsetStart >= segmentFirstFileOffset) &&
+ (offsetStart <= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ } else if ((offsetEnd >= segmentFirstFileOffset) &&
+ (offsetEnd <= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ } else if ((offsetStart <= segmentFirstFileOffset) &&
+ (offsetEnd >= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ }
+
+ if (!offsetInSegment) {
+ segmentContigAccessCount = 0;
+ } else {
+ /* We are in the segment, advance currentOffsetListIndex until we are out of segment.
+ */
+ segmentContigAccessCount = 1;
+
+ while ((offset_list[currentOffsetListIndex] <= segmentLastFileOffset) &&
+ (currentOffsetListIndex < contig_access_count)) {
+ dataWrittenThisRound += (int) len_list[currentOffsetListIndex];
+ currentOffsetListIndex++;
+ }
+
+ if (currentOffsetListIndex > startingOffsetListIndex) {
+                    /* If we did advance, check whether the entry we stopped
+                     * on is still inside the segment.
+                     */
+ if (currentOffsetListIndex == contig_access_count) {
+ currentOffsetListIndex--;
+ } else if (offset_list[currentOffsetListIndex] > segmentLastFileOffset) {
+                        /* We advanced one entry past the segment, so back
+                         * up.
+                         */
+ currentOffsetListIndex--;
+ } else {
+ dataWrittenThisRound += (int) len_list[currentOffsetListIndex];
+ }
+ segmentContigAccessCount += (currentOffsetListIndex - startingOffsetListIndex);
+ endingOffsetListIndex = currentOffsetListIndex;
+ }
+ }
+
+ if (segmentContigAccessCount > 0) {
+ /* Trim edges here so all data in the offset list range fits exactly in the segment.
+ */
+ if (offset_list[startingOffsetListIndex] < segmentFirstFileOffset) {
+ startingOffsetAdvancement =
+ segmentFirstFileOffset - offset_list[startingOffsetListIndex];
+ offset_list[startingOffsetListIndex] += startingOffsetAdvancement;
+ dataWrittenThisRound -= (int) startingOffsetAdvancement;
+ startingLenTrim = startingOffsetAdvancement;
+ len_list[startingOffsetListIndex] -= startingLenTrim;
+ }
+
+ if ((offset_list[endingOffsetListIndex] +
+ ((ADIO_Offset) (len_list[endingOffsetListIndex])) - (ADIO_Offset) 1) >
+ segmentLastFileOffset) {
+ endingLenTrim =
+ offset_list[endingOffsetListIndex] +
+ ((ADIO_Offset) (len_list[endingOffsetListIndex])) - (ADIO_Offset) 1 -
+ segmentLastFileOffset;
+ len_list[endingOffsetListIndex] -= endingLenTrim;
+ dataWrittenThisRound -= (int) endingLenTrim;
+ }
+ }
+
+ int holeFoundThisRound = 0;
+
+ /* Once we have packed the collective buffers do the actual write.
+ */
+ if ((stripeParms.segmentIter == (stripeParms.stripesPerAgg - 1)) ||
+ (fileSegmentIter == (numSegments - 1))) {
+ stripeParms.flushCB = 1;
+ } else
+ stripeParms.flushCB = 0;
+
+ stripeParms.firstStripedWriteCall = 0;
+ stripeParms.lastStripedWriteCall = 0;
+ if (fileSegmentIter == 0) {
+ stripeParms.firstStripedWriteCall = 1;
+ } else if (fileSegmentIter == (numSegments - 1))
+ stripeParms.lastStripedWriteCall = 1;
+
+            /* The difference in calls to ADIOI_OneSidedWriteAggregation is based on whether the buftype is
+ * contiguous. The algorithm tracks the position in the source buffer when called
+ * multiple times -- in the case of contiguous data this is simple and can be externalized with
+ * a buffer offset, in the case of non-contiguous data this is complex and the state must be tracked
+ * internally, therefore no external buffer offset. Care was taken to minimize
+ * ADIOI_OneSidedWriteAggregation changes at the expense of some added complexity to the caller.
+ */
+ int bufTypeIsContig;
+ ADIOI_Datatype_iscontig(datatype, &bufTypeIsContig);
+ if (bufTypeIsContig) {
+ ADIOI_OneSidedWriteAggregation(fd,
+ (ADIO_Offset *) &
+ (offset_list[startingOffsetListIndex]),
+ (ADIO_Offset *) &
+ (len_list[startingOffsetListIndex]),
+ segmentContigAccessCount,
+                                               (char *) buf + totalDataWrittenLastRound, datatype,
+ error_code, segmentFirstFileOffset,
+ segmentLastFileOffset, currentValidDataIndex,
+ segment_stripe_start, segment_stripe_end,
+ &holeFoundThisRound, &stripeParms);
+ } else {
+ ADIOI_OneSidedWriteAggregation(fd,
+ (ADIO_Offset *) &
+ (offset_list[startingOffsetListIndex]),
+ (ADIO_Offset *) &
+ (len_list[startingOffsetListIndex]),
+ segmentContigAccessCount, buf, datatype, error_code,
+ segmentFirstFileOffset, segmentLastFileOffset,
+ currentValidDataIndex, segment_stripe_start,
+ segment_stripe_end, &holeFoundThisRound,
+ &stripeParms);
+ }
+
+ if (stripeParms.flushCB) {
+ stripeParms.segmentIter = 0;
+ if (stripesPerAgg > (numSegments - fileSegmentIter - 1))
+ stripeParms.stripesPerAgg = numSegments - fileSegmentIter - 1;
+ else
+ stripeParms.stripesPerAgg = stripesPerAgg;
+ } else
+ stripeParms.segmentIter++;
+
+ if (holeFoundThisRound)
+ holeFound = 1;
+
+            /* A subsequent call to ADIOI_OneSidedWriteAggregation that does
+             * a pre-read will itself barrier, which keeps feeder ranks from
+             * doing RMA to the collective buffer before the write we
+             * requested via the stripeParms.flushCB flag completes. If no
+             * such pre-read is coming, we need to do that barrier here.
+             */
+ if (!romio_onesided_always_rmw && stripeParms.flushCB) {
+ if (fileSegmentIter < (numSegments - 1)) {
+ MPI_Barrier(fd->comm);
+ }
+ }
+
+ /* Restore the offset_list and len_list to values that are ready for the
+ * next iteration.
+ */
+ if (segmentContigAccessCount > 0) {
+ offset_list[endingOffsetListIndex] += len_list[endingOffsetListIndex];
+ len_list[endingOffsetListIndex] = endingLenTrim;
+ }
+ totalDataWrittenLastRound += dataWrittenThisRound;
+ } // fileSegmentIter for-loop
+
+ /* Check for holes in the data unless romio_onesided_no_rmw is set.
+ * If a hole is found redo the entire aggregation and write.
+ */
+ if (!romio_onesided_no_rmw) {
+ int anyHolesFound = 0;
+ MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, fd->comm);
+
+ if (anyHolesFound) {
+ ADIOI_Free(offset_list);
+ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
+ &offset_list, &len_list, &start_offset,
+ &end_offset, &contig_access_count);
+
+ currentSegementOffset =
+                    (ADIO_Offset) startingStripeWithData * (ADIO_Offset) (striping_info[0]);
+ romio_onesided_always_rmw = 1;
+ romio_onesided_no_rmw = 1;
+
+                /* Holes were found in the data and the user has not set
+                 * romio_onesided_no_rmw, so set romio_onesided_always_rmw
+                 * to 1 and redo the entire aggregation and write. If the
+                 * user has romio_onesided_inform_rmw set, report this
+                 * condition and the resulting behavior.
+                 */
+ if (romio_onesided_inform_rmw && (myrank == 0)) {
+ FPRINTF(stderr, "Information: Holes found during one-sided "
+ "write aggregation algorithm --- re-running one-sided "
+ "write aggregation with ROMIO_ONESIDED_ALWAYS_RMW set to 1.\n");
+ }
+ } else
+ doAggregation = 0;
+ } else
+ doAggregation = 0;
+ } // while doAggregation
+ romio_onesided_no_rmw = prev_romio_onesided_no_rmw;
+
+ ADIOI_Free(segment_stripe_start);
+ ADIOI_Free(segment_stripe_end);
+
+ fd->hints->cb_nodes = orig_cb_nodes;
+
+}
diff --git a/3rd-party/romio341/adio/ad_lustre/ad_lustre_wrstr.c b/3rd-party/romio341/adio/ad_lustre/ad_lustre_wrstr.c
new file mode 100644
index 0000000000000000000000000000000000000000..f9e03d58a0aeaf6313db5d39962737ec2d8c2049
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_lustre/ad_lustre_wrstr.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_lustre.h"
+#include "adio_extern.h"
+
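+/* Data-sieving write helper: when the request falls outside the current
+ * write buffer, flush the buffer, refill it with a stripe-aligned read
+ * (read-modify-write), and copy the new data in, looping while the
+ * request spans multiple buffer windows. */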
+#define ADIOI_BUFFERED_WRITE \
+ { \
+ if (req_off >= writebuf_off + writebuf_len) { \
+ if (writebuf_len) { \
+ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
+ ADIO_EXPLICIT_OFFSET, writebuf_off, \
+ &status1, error_code); \
+ if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \
+ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (*error_code != MPI_SUCCESS) { \
+ *error_code = MPIO_Err_create_code(*error_code, \
+ MPIR_ERR_RECOVERABLE, \
+ myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowswc", 0); \
+ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ } \
+ writebuf_off = req_off; \
+ /* stripe_size alignment */ \
+ writebuf_len = (unsigned) MPL_MIN(end_offset - writebuf_off + 1, \
+ (writebuf_off / stripe_size + 1) * \
+ stripe_size - writebuf_off); \
+ if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \
+ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \
+ ADIO_EXPLICIT_OFFSET, \
+ writebuf_off, &status1, error_code); \
+ if (*error_code != MPI_SUCCESS) { \
+ *error_code = MPIO_Err_create_code(*error_code, \
+ MPIR_ERR_RECOVERABLE, \
+ myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowsrc", 0); \
+ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ } \
+ write_sz = (unsigned) (MPL_MIN(req_len, \
+ writebuf_off + writebuf_len - req_off)); \
+ ADIOI_Assert((ADIO_Offset)write_sz == \
+ MPL_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
+ memcpy(writebuf + req_off - writebuf_off, (char *)buf +userbuf_off, write_sz); \
+ while (write_sz != req_len) { \
+ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
+ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
+ if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \
+ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (*error_code != MPI_SUCCESS) { \
+ *error_code = MPIO_Err_create_code(*error_code, \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowswc", 0); \
+ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ req_len -= write_sz; \
+ userbuf_off += write_sz; \
+ writebuf_off += writebuf_len; \
+ /* stripe_size alignment */ \
+ writebuf_len = (unsigned) MPL_MIN(end_offset - writebuf_off + 1, \
+ (writebuf_off / stripe_size + 1) * \
+ stripe_size - writebuf_off); \
+ if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \
+ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \
+ ADIO_EXPLICIT_OFFSET, \
+ writebuf_off, &status1, error_code); \
+ if (*error_code != MPI_SUCCESS) { \
+ *error_code = MPIO_Err_create_code(*error_code, \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowsrc", 0); \
+ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ write_sz = MPL_MIN(req_len, writebuf_len); \
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
+ } \
+ }
+
+
+/* This macro is used when the filetype is contiguous and the buftype is not.
+   It does not do a read-modify-write and does not lock. */
+#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
+ { \
+ if (req_off >= writebuf_off + writebuf_len) { \
+ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
+ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, \
+ error_code); \
+ if (*error_code != MPI_SUCCESS) { \
+ *error_code = MPIO_Err_create_code(*error_code, \
+ MPIR_ERR_RECOVERABLE, \
+ myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowswc", 0); \
+ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ writebuf_off = req_off; \
+ /* stripe_size alignment */ \
+ writebuf_len = (unsigned) MPL_MIN(end_offset - writebuf_off + 1, \
+ (writebuf_off / stripe_size + 1) * \
+ stripe_size - writebuf_off); \
+ } \
+ write_sz = (unsigned) MPL_MIN(req_len, writebuf_off + writebuf_len - req_off); \
+ ADIOI_Assert((ADIO_Offset)write_sz == MPL_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
+ memcpy(writebuf + req_off - writebuf_off, \
+ (char *)buf + userbuf_off, write_sz); \
+ while (write_sz != req_len) { \
+ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
+ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
+ if (*error_code != MPI_SUCCESS) { \
+ *error_code = MPIO_Err_create_code(*error_code, \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowswc", 0); \
+ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ req_len -= write_sz; \
+ userbuf_off += write_sz; \
+ writebuf_off += writebuf_len; \
+ /* stripe_size alignment */ \
+ writebuf_len = (unsigned) MPL_MIN(end_offset - writebuf_off + 1, \
+ (writebuf_off / stripe_size + 1) * \
+ stripe_size - writebuf_off); \
+ write_sz = MPL_MIN(req_len, writebuf_len); \
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
+ } \
+ }
+
+void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ /* offset is in units of etype relative to the filetype. */
+ ADIOI_Flatlist_node *flat_buf, *flat_file;
+ ADIO_Offset i_offset, sum, size_in_filetype;
+ int i, j, k, st_index = 0;
+ int n_etypes_in_filetype;
+ ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
+ ADIO_Offset abs_off_in_filetype = 0;
+ MPI_Count filetype_size, etype_size, buftype_size;
+ MPI_Aint lb, filetype_extent, buftype_extent;
+ int buf_count, buftype_is_contig, filetype_is_contig;
+ ADIO_Offset userbuf_off;
+ ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
+ char *writebuf;
+ unsigned bufsize, writebuf_len, write_sz;
+ ADIO_Status status1;
+ ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size = 0, bwr_size, req_len;
+ int stripe_size;
+ static char myname[] = "ADIOI_LUSTRE_WriteStrided";
+
+ if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
+        /* if the user has disabled data sieving on writes, use the naive
+         * approach instead.
+         */
+ ADIOI_GEN_WriteStrided_naive(fd,
+ buf,
+ count, datatype, file_ptr_type, offset, status, error_code);
+ return;
+ }
+
+ *error_code = MPI_SUCCESS; /* changed below if error */
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ MPI_Type_size_x(fd->filetype, &filetype_size);
+ if (!filetype_size) {
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
+ MPI_Type_size_x(datatype, &buftype_size);
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+ etype_size = fd->etype_size;
+
+ ADIOI_Assert((buftype_size * count) ==
+ ((ADIO_Offset) (unsigned) buftype_size * (ADIO_Offset) count));
+ bufsize = buftype_size * count;
+
+ /* get striping info */
+ stripe_size = fd->hints->striping_unit;
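+    /* keeping the write buffer within a single stripe means each flushed
+     * write targets one OST; this is the usual rationale for the
+     * stripe_size alignment arithmetic in the macros above */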
+
+    /* handle the different buftype/filetype contiguity combinations */
+ if (!buftype_is_contig && filetype_is_contig) {
+ /* noncontiguous in memory, contiguous in file. */
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
+ fd->disp + (ADIO_Offset) etype_size *offset;
+
+ start_off = off;
+ end_offset = start_off + bufsize - 1;
+ /* write stripe size buffer each time */
+ writebuf = (char *) ADIOI_Malloc(MPL_MIN(bufsize, stripe_size));
+ writebuf_off = 0;
+ writebuf_len = 0;
+
+    /* if atomicity is true or data sieving is not disabled, lock the region
+     * to be accessed */
+ if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE)
+ ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
+
+ for (j = 0; j < count; j++) {
+ for (i = 0; i < flat_buf->count; i++) {
+ userbuf_off = (ADIO_Offset) j *(ADIO_Offset) buftype_extent + flat_buf->indices[i];
+ req_off = off;
+ req_len = flat_buf->blocklens[i];
+ ADIOI_BUFFERED_WRITE_WITHOUT_READ;
+ off += flat_buf->blocklens[i];
+ }
+ }
+
+ /* write the buffer out finally */
+ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code);
+
+ if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE)
+ ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
+ if (*error_code != MPI_SUCCESS) {
+ ADIOI_Free(writebuf);
+ return;
+ }
+ ADIOI_Free(writebuf);
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind = off;
+ } else {
+ /* noncontiguous in file */
+ flat_file = ADIOI_Flatten_and_find(fd->filetype);
+ disp = fd->disp;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ /* Wei-keng reworked type processing to be a bit more efficient */
+ offset = fd->fp_ind - disp;
+ n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
+ offset -= (ADIO_Offset) n_filetypes *filetype_extent;
+ /* now offset is local to this extent */
+
+ /* find the block where offset is located, skip blocklens[i]==0 */
+ for (i = 0; i < flat_file->count; i++) {
+ ADIO_Offset dist;
+ if (flat_file->blocklens[i] == 0)
+ continue;
+ dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
+ /* fwr_size is from offset to the end of block i */
+ if (dist == 0) {
+ i++;
+ offset = flat_file->indices[i];
+ fwr_size = flat_file->blocklens[i];
+ break;
+ }
+ if (dist > 0) {
+ fwr_size = dist;
+ break;
+ }
+ }
+ st_index = i; /* starting index in flat_file->indices[] */
+ offset += disp + (ADIO_Offset) n_filetypes *filetype_extent;
+ } else {
+ n_etypes_in_filetype = filetype_size / etype_size;
+ n_filetypes = offset / n_etypes_in_filetype;
+ etype_in_filetype = offset % n_etypes_in_filetype;
+ size_in_filetype = etype_in_filetype * etype_size;
+
+ sum = 0;
+ for (i = 0; i < flat_file->count; i++) {
+ sum += flat_file->blocklens[i];
+ if (sum > size_in_filetype) {
+ st_index = i;
+ fwr_size = sum - size_in_filetype;
+ abs_off_in_filetype = flat_file->indices[i] +
+ size_in_filetype - (sum - flat_file->blocklens[i]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype;
+ }
+
+ start_off = offset;
+
+        /* Wei-keng Liao: write request is within a single flat_file
+         * contig block */
+ /* this could happen, for example, with subarray types that are
+ * actually fairly contiguous */
+ if (buftype_is_contig && bufsize <= fwr_size) {
+ req_off = start_off;
+ req_len = bufsize;
+ end_offset = start_off + bufsize - 1;
+ writebuf = (char *) ADIOI_Malloc(MPL_MIN(bufsize, stripe_size));
+ memset(writebuf, -1, MPL_MIN(bufsize, stripe_size));
+ writebuf_off = 0;
+ writebuf_len = 0;
+ userbuf_off = 0;
+ ADIOI_BUFFERED_WRITE_WITHOUT_READ;
+ /* write the buffer out finally */
+ if (fd->hints->ds_write != ADIOI_HINT_DISABLE)
+ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code);
+ if (fd->hints->ds_write != ADIOI_HINT_DISABLE)
+ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ /* update MPI-IO file pointer to point to the first byte
+ * that can be accessed in the fileview. */
+ fd->fp_ind = offset + bufsize;
+ if (bufsize == fwr_size) {
+ do {
+ st_index++;
+ if (st_index == flat_file->count) {
+ st_index = 0;
+ n_filetypes++;
+ }
+ } while (flat_file->blocklens[st_index] == 0);
+ fd->fp_ind = disp + flat_file->indices[st_index]
+ + (ADIO_Offset) n_filetypes *filetype_extent;
+ }
+ }
+ fd->fp_sys_posn = -1; /* set it to null. */
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+#endif
+ ADIOI_Free(writebuf);
+ return;
+ }
+
+ /* Calculate end_offset, the last byte-offset that will be accessed.
+     * e.g., if start_offset=0 and 100 bytes are to be written, end_offset=99 */
+
+ st_fwr_size = fwr_size;
+ st_n_filetypes = n_filetypes;
+ i_offset = 0;
+ j = st_index;
+ off = offset;
+ fwr_size = MPL_MIN(st_fwr_size, bufsize);
+ while (i_offset < bufsize) {
+ i_offset += fwr_size;
+ end_offset = off + fwr_size - 1;
+
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+
+ off = disp + flat_file->indices[j] + n_filetypes * (ADIO_Offset) filetype_extent;
+ fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset);
+ }
+
+        /* if atomicity is true or data sieving is not disabled, lock the region
+         * to be accessed */
+ if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE)
+ ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ writebuf_off = 0;
+ writebuf_len = 0;
+ writebuf = (char *) ADIOI_Malloc(stripe_size);
+ memset(writebuf, -1, stripe_size);
+
+ if (buftype_is_contig && !filetype_is_contig) {
+
+/* contiguous in memory, noncontiguous in file. should be the most
+ common case. */
+
+ i_offset = 0;
+ j = st_index;
+ off = offset;
+ n_filetypes = st_n_filetypes;
+ fwr_size = MPL_MIN(st_fwr_size, bufsize);
+ while (i_offset < bufsize) {
+ if (fwr_size) {
+ /* TYPE_UB and TYPE_LB can result in
+ * fwr_size = 0. save system call in such cases */
+ /* lseek(fd->fd_sys, off, SEEK_SET);
+ * err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size); */
+
+ req_off = off;
+ req_len = fwr_size;
+ userbuf_off = i_offset;
+ ADIOI_BUFFERED_WRITE;
+ }
+ i_offset += fwr_size;
+
+ if (off + fwr_size < disp + flat_file->indices[j] +
+ flat_file->blocklens[j] + n_filetypes * (ADIO_Offset) filetype_extent)
+ off += fwr_size;
+ /* did not reach end of contiguous block in filetype.
+ * no more I/O needed. off is incremented by fwr_size. */
+ else {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+ off = disp + flat_file->indices[j] +
+ n_filetypes * (ADIO_Offset) filetype_extent;
+ fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset);
+ }
+ }
+ } else {
+/* noncontiguous in memory as well as in file */
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ k = num = buf_count = 0;
+ i_offset = flat_buf->indices[0];
+ j = st_index;
+ off = offset;
+ n_filetypes = st_n_filetypes;
+ fwr_size = st_fwr_size;
+ bwr_size = flat_buf->blocklens[0];
+
+ while (num < bufsize) {
+ size = MPL_MIN(fwr_size, bwr_size);
+ if (size) {
+ /* lseek(fd->fd_sys, off, SEEK_SET);
+ * err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */
+
+ req_off = off;
+ req_len = size;
+ userbuf_off = i_offset;
+ ADIOI_BUFFERED_WRITE;
+ }
+
+ new_fwr_size = fwr_size;
+ new_bwr_size = bwr_size;
+
+ if (size == fwr_size) {
+/* reached end of contiguous block in file */
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+
+ off = disp + flat_file->indices[j] +
+ n_filetypes * (ADIO_Offset) filetype_extent;
+
+ new_fwr_size = flat_file->blocklens[j];
+ if (size != bwr_size) {
+ i_offset += size;
+ new_bwr_size -= size;
+ }
+ }
+
+ if (size == bwr_size) {
+/* reached end of contiguous block in memory */
+
+ k = (k + 1) % flat_buf->count;
+ buf_count++;
+ i_offset = (ADIO_Offset) buftype_extent *
+ (ADIO_Offset) (buf_count / flat_buf->count) + flat_buf->indices[k];
+ new_bwr_size = flat_buf->blocklens[k];
+ if (size != fwr_size) {
+ off += size;
+ new_fwr_size -= size;
+ }
+ }
+ num += size;
+ fwr_size = new_fwr_size;
+ bwr_size = new_bwr_size;
+ }
+ }
+
+ /* write the buffer out finally */
+ if (writebuf_len) {
+ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code);
+ if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE)
+ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+        if (*error_code != MPI_SUCCESS) {
+            ADIOI_Free(writebuf);
+            return;
+        }
+ }
+ if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE)
+ ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ ADIOI_Free(writebuf);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind = off;
+ }
+
+ fd->fp_sys_posn = -1; /* set it to null. */
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+/* This is a temporary way of filling in status. The right way is to
+ keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
+#endif
+
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/Makefile.mk b/3rd-party/romio341/adio/ad_nfs/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..73d0fb279df4a493ceb231384402e8e5bf9a2c19
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/Makefile.mk
@@ -0,0 +1,25 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_NFS
+
+noinst_HEADERS += adio/ad_nfs/ad_nfs.h
+
+romio_other_sources += \
+ adio/ad_nfs/ad_nfs_read.c \
+ adio/ad_nfs/ad_nfs_open.c \
+ adio/ad_nfs/ad_nfs_write.c \
+ adio/ad_nfs/ad_nfs_done.c \
+ adio/ad_nfs/ad_nfs_fcntl.c \
+ adio/ad_nfs/ad_nfs_iread.c \
+ adio/ad_nfs/ad_nfs_iwrite.c \
+ adio/ad_nfs/ad_nfs_wait.c \
+ adio/ad_nfs/ad_nfs_setsh.c \
+ adio/ad_nfs/ad_nfs_getsh.c \
+ adio/ad_nfs/ad_nfs.c \
+ adio/ad_nfs/ad_nfs_resize.c \
+ adio/ad_nfs/ad_nfs_features.c
+
+endif BUILD_AD_NFS
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..d9806d497ddc566e41de7c393565f5b7cf3c85ba
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+/* adioi.h has the ADIOI_Fns_struct define */
+#include "adioi.h"
+
+struct ADIOI_Fns_struct ADIO_NFS_operations = {
+ ADIOI_NFS_Open, /* Open */
+ ADIOI_FAILSAFE_OpenColl, /* OpenColl */
+ ADIOI_NFS_ReadContig, /* ReadContig */
+ ADIOI_NFS_WriteContig, /* WriteContig */
+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+ ADIOI_NFS_Fcntl, /* Fcntl */
+ ADIOI_GEN_SetInfo, /* SetInfo */
+ ADIOI_NFS_ReadStrided, /* ReadStrided */
+ ADIOI_NFS_WriteStrided, /* WriteStrided */
+ ADIOI_GEN_Close, /* Close */
+    /* Even with lockd running and NFS mounted 'noac', we have been unable to
+     * guarantee correct behavior over NFS with asynchronous I/O operations */
+ ADIOI_FAKE_IreadContig, /* IreadContig */
+ ADIOI_FAKE_IwriteContig, /* IwriteContig */
+ ADIOI_NFS_ReadDone, /* ReadDone */
+ ADIOI_NFS_WriteDone, /* WriteDone */
+ ADIOI_NFS_ReadComplete, /* ReadComplete */
+ ADIOI_NFS_WriteComplete, /* WriteComplete */
+ ADIOI_GEN_IreadStrided, /* IreadStrided */
+ ADIOI_GEN_IwriteStrided, /* IwriteStrided */
+ ADIOI_GEN_Flush, /* Flush */
+ ADIOI_NFS_Resize, /* Resize */
+ ADIOI_GEN_Delete, /* Delete */
+ ADIOI_NFS_Feature, /* Features */
+ "NFS:", /* fsname: just a string */
+ ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+ ADIOI_GEN_IwriteStridedColl, /* IwriteStridedColl */
+#if defined(F_SETLKW64)
+ ADIOI_GEN_SetLock /* SetLock */
+#else
+ ADIOI_GEN_SetLock64 /* SetLock */
+#endif
+};
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs.h b/3rd-party/romio341/adio/ad_nfs/ad_nfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..05f6d74613ef6d579059a058c9cdc9b37d8b0227
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_NFS_H_INCLUDED
+#define AD_NFS_H_INCLUDED
+
+#include "adio.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_AIO_LITE_H
+#include <aio-lite.h>
+#else
+#ifdef HAVE_AIO_H
+#include <aio.h>
+#endif
+#ifdef HAVE_SYS_AIO_H
+#include <sys/aio.h>
+#endif
+#endif
+
+/* Workaround for incomplete set of definitions if __REDIRECT is not
+ defined and large file support is used in aio.h */
+#if !defined(__REDIRECT) && defined(__USE_FILE_OFFSET64)
+#define aiocb aiocb64
+#endif
+
+int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
+ int wr, MPI_Request * request);
+
+#ifdef SX4
+#define lseek llseek
+#endif
+
+void ADIOI_NFS_Open(ADIO_File fd, int *error_code);
+void ADIOI_NFS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_NFS_IwriteContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Request * request, int
+ *error_code);
+void ADIOI_NFS_IreadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Request * request, int
+ *error_code);
+int ADIOI_NFS_ReadDone(ADIO_Request * request, ADIO_Status * status, int
+ *error_code);
+int ADIOI_NFS_WriteDone(ADIO_Request * request, ADIO_Status * status, int
+ *error_code);
+void ADIOI_NFS_ReadComplete(ADIO_Request * request, ADIO_Status * status, int
+ *error_code);
+void ADIOI_NFS_WriteComplete(ADIO_Request * request, ADIO_Status * status, int *error_code);
+void ADIOI_NFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int
+ *error_code);
+void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_NFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+void ADIOI_NFS_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset * shared_fp,
+ int *error_code);
+void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
+void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
+int ADIOI_NFS_Feature(ADIO_File fd, int feature_flag);
+
+#endif /* AD_NFS_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_done.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_done.c
new file mode 100644
index 0000000000000000000000000000000000000000..0fdee009a403527861bac33d4c3c744bcdb142e1
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_done.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+int ADIOI_NFS_ReadDone(ADIO_Request * request, ADIO_Status * status, int *error_code)
+{
+ *error_code = MPI_SUCCESS;
+ return 1;
+}
+
+int ADIOI_NFS_WriteDone(ADIO_Request * request, ADIO_Status * status, int *error_code)
+{
+ return ADIOI_NFS_ReadDone(request, status, error_code);
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_fcntl.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_fcntl.c
new file mode 100644
index 0000000000000000000000000000000000000000..2b82d2018048706b5dca71b31a44460f76f950d8
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_fcntl.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+#include "adio_extern.h"
+
+void ADIOI_NFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code)
+{
+ static char myname[] = "ADIOI_NFS_FCNTL";
+
+ switch (flag) {
+ case ADIO_FCNTL_GET_FSIZE:
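+            /* the byte-range lock is believed to force NFS to revalidate
+             * cached attributes so the SEEK_END below observes the true end
+             * of file; the original file position is restored afterwards */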
+ ADIOI_READ_LOCK(fd, 0, SEEK_SET, 1);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ ADIOI_UNLOCK(fd, 0, SEEK_SET, 1);
+ if (fd->fp_sys_posn != -1) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ }
+ if (fcntl_struct->fsize == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**io",
+ "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+ break;
+
+ case ADIO_FCNTL_SET_DISKSPACE:
+ ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
+ break;
+
+ case ADIO_FCNTL_SET_ATOMICITY:
+ fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
+ *error_code = MPI_SUCCESS;
+ break;
+
+ default:
+ /* --BEGIN ERROR HANDLING-- */
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_ARG, "**flag", "**flag %d", flag);
+ return;
+ /* --END ERROR HANDLING-- */
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_features.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_features.c
new file mode 100644
index 0000000000000000000000000000000000000000..41edbc3d411d25571cf6b59fa25c756f90084956
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_features.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "ad_nfs.h"
+
+int ADIOI_NFS_Feature(ADIO_File fd, int flag)
+{
+ switch (flag) {
+ case ADIO_SHARED_FP:
+ case ADIO_LOCKS:
+ case ADIO_SEQUENTIAL:
+ case ADIO_DATA_SIEVING_WRITES:
+ return 1;
+ case ADIO_SCALABLE_OPEN:
+ case ADIO_UNLINK_AFTER_CLOSE:
+ case ADIO_SCALABLE_RESIZE:
+ default:
+ return 0;
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_getsh.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_getsh.c
new file mode 100644
index 0000000000000000000000000000000000000000..484add75c04bab7df17d495dc411882bf0225ab8
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_getsh.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+/* returns the current location of the shared_fp in terms of the
+ no. of etypes relative to the current view, and also increments the
+ shared_fp by the number of etypes to be accessed (incr) in the read
+ or write following this function. */
+
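+/* The shared-fp file is kept write-locked from the read of the current value
+ * until the incremented value is written back (the unlock at the "done"
+ * label), so the read-increment-update is atomic across processes. */
+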
+void ADIOI_NFS_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset * shared_fp,
+ int *error_code)
+{
+ ADIO_Offset new_fp;
+ ssize_t err;
+ MPI_Comm dupcommself;
+ static char myname[] = "ADIOI_NFS_GET_SHARED_FP";
+
+ if (fd->shared_fp_fd == ADIO_FILE_NULL) {
+ MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
+ fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
+ fd->shared_fp_fname,
+ fd->file_system,
+ fd->fns,
+ ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
+ 0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
+ ADIO_PERM_NULL, error_code);
+ if (*error_code != MPI_SUCCESS)
+ return;
+ *shared_fp = 0;
+ ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+        /* if the file is empty, the above read may return an error
+ * (reading beyond end of file). In that case, shared_fp = 0,
+ * set above, is the correct value. */
+ } else {
+ ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ if (err == 0) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ }
+ if (err == -1) {
+ ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**io",
+ "**io %s", strerror(errno));
+ return;
+ }
+ }
+
+ if (incr == 0) {
+ goto done;
+ }
+
+ new_fp = *shared_fp + incr;
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ if (err == 0) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ err = write(fd->shared_fp_fd->fd_sys, &new_fp, sizeof(ADIO_Offset));
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ }
+ done:
+ ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO,
+ "**io", "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_hints.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_hints.c
new file mode 100644
index 0000000000000000000000000000000000000000..3cacb905dfe86ac7e377016d52c723b71e0b5659
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_hints.c
@@ -0,0 +1,11 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+void ADIOI_NFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
+{
+ ADIOI_GEN_SetInfo(fd, users_info, error_code);
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_iread.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_iread.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc1689dbcd7ebdde29d2a354b4c17e9ab9b370d0
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_iread.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+#ifdef ROMIO_HAVE_WORKING_AIO
+/* nearly identical to ADIOI_GEN_IreadContig, except we lock around I/O */
+void ADIOI_NFS_IreadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Request * request, int *error_code)
+{
+ MPI_Count len, typesize;
+ int aio_errno = 0;
+ static char myname[] = "ADIOI_NFS_IREADCONTIG";
+
+ MPI_Type_size_x(datatype, &typesize);
+ len = count * typesize;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ offset = fd->fp_ind;
+ aio_errno = ADIOI_NFS_aio(fd, buf, len, offset, 0, request);
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind += len;
+
+ fd->fp_sys_posn = -1;
+
+ if (aio_errno != 0) {
+ /* --BEGIN ERROR HANDLING-- */
+ MPIO_ERR_CREATE_CODE_ERRNO(myname, aio_errno, error_code);
+ return;
+ /* --END ERROR HANDLING-- */
+ } else
+ *error_code = MPI_SUCCESS;
+}
+#endif
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_iwrite.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_iwrite.c
new file mode 100644
index 0000000000000000000000000000000000000000..9424ca5591a4fa15ac71d62ee4d9725f73d9885d
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_iwrite.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+#include "../../mpi-io/mpioimpl.h"
+#ifdef MPIO_BUILD_PROFILING
+#include "../../mpi-io/mpioprof.h"
+#endif
+#include "mpiu_greq.h"
+
+#include <string.h>
+
+#ifdef ROMIO_HAVE_WORKING_AIO
+static MPIX_Grequest_class ADIOI_GEN_greq_class = 0;
+/* this routine is nearly identical to ADIOI_GEN_IwriteContig, except we lock
+ * around I/O */
+void ADIOI_NFS_IwriteContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Request * request, int *error_code)
+{
+ MPI_Count len, typesize;
+ int aio_errno = 0;
+ static char myname[] = "ADIOI_NFS_IWRITECONTIG";
+
+ MPI_Type_size_x(datatype, &typesize);
+ len = count * typesize;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ offset = fd->fp_ind;
+ aio_errno = ADIOI_NFS_aio(fd, buf, len, offset, 1, request);
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind += len;
+
+ fd->fp_sys_posn = -1;
+
+ if (aio_errno != 0) {
+ /* --BEGIN ERROR HANDLING-- */
+ MPIO_ERR_CREATE_CODE_ERRNO(myname, aio_errno, error_code);
+ return;
+ /* --END ERROR HANDLING-- */
+ } else
+ *error_code = MPI_SUCCESS;
+ return;
+}
+#endif
+
+/* This function is for implementation convenience. It is not user-visible.
+ * It takes care of the differences in the interface for nonblocking I/O
+ * on various Unix machines! If wr==1 write, wr==0 read.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+#ifdef ROMIO_HAVE_WORKING_AIO
+int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
+ int wr, MPI_Request * request)
+{
+ int err = -1, fd_sys;
+ int error_code, this_errno;
+
+ struct aiocb *aiocbp;
+ ADIOI_AIO_Request *aio_req;
+ MPI_Status status;
+
+ fd_sys = fd->fd_sys;
+
+ aio_req = (ADIOI_AIO_Request *) ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);
+ aiocbp = (struct aiocb *) ADIOI_Calloc(sizeof(struct aiocb), 1);
+ aiocbp->aio_offset = offset;
+ aiocbp->aio_buf = buf;
+ aiocbp->aio_nbytes = len;
+
+#ifdef HAVE_STRUCT_AIOCB_AIO_WHENCE
+ aiocbp->aio_whence = SEEK_SET;
+#endif
+#ifdef HAVE_STRUCT_AIOCB_AIO_FILDES
+ aiocbp->aio_fildes = fd_sys;
+#endif
+#ifdef HAVE_STRUCT_AIOCB_AIO_SIGEVENT
+#ifdef AIO_SIGNOTIFY_NONE
+ aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE;
+#endif
+ aiocbp->aio_sigevent.sigev_signo = 0;
+#endif
+#ifdef HAVE_STRUCT_AIOCB_AIO_REQPRIO
+#ifdef AIO_PRIO_DFL
+ aiocbp->aio_reqprio = AIO_PRIO_DFL; /* not needed in DEC Unix 4.0 */
+#else
+ aiocbp->aio_reqprio = 0;
+#endif
+#endif
+
+ if (wr)
+ ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
+ else
+ ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
+
+#ifndef ROMIO_HAVE_AIO_CALLS_NEED_FILEDES
+ if (wr)
+ err = aio_write(aiocbp);
+ else
+ err = aio_read(aiocbp);
+#else
+ /* Broken IBM interface */
+ if (wr)
+ err = aio_write(fd_sys, aiocbp);
+ else
+ err = aio_read(fd_sys, aiocbp);
+#endif
+
+ this_errno = errno;
+ ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
+
+ if (err == -1) {
+ if (this_errno == EAGAIN) {
+ /* exceeded the max. no. of outstanding requests.
+ * complete all previous async. requests and try again. */
+ ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
+ offset, &status, &error_code);
+ MPIO_Completed_request_create(&fd, len, &error_code, request);
+ return 0;
+ } else {
+ return -this_errno;
+ }
+ }
+ aio_req->aiocbp = aiocbp;
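+    /* lazily create the generalized-request class on first use; it supplies
+     * the query/free/poll/wait callbacks MPI uses to drive this request */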
+ if (ADIOI_GEN_greq_class == 0) {
+ MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn,
+ ADIOI_GEN_aio_free_fn, MPIU_Greq_cancel_fn,
+ ADIOI_GEN_aio_poll_fn, ADIOI_GEN_aio_wait_fn,
+ &ADIOI_GEN_greq_class);
+ }
+ MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, aio_req, request);
+ memcpy(&(aio_req->req), request, sizeof(MPI_Request));
+ return 0;
+}
+#endif
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_open.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_open.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab898e134af1452d3a87c245cc21c890f89d9ab0
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_open.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+void ADIOI_NFS_Open(ADIO_File fd, int *error_code)
+{
+ int perm, amode;
+ mode_t old_mask;
+ static char myname[] = "ADIOI_NFS_OPEN";
+
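+    /* no permissions supplied: derive a default from the process umask
+     * (read by setting and immediately restoring it); old_mask ^ 0666
+     * clears the masked bits from rw-rw-rw-, assuming the umask only
+     * contains bits within 0666 */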
+ if (fd->perm == ADIO_PERM_NULL) {
+ old_mask = umask(022);
+ umask(old_mask);
+ perm = old_mask ^ 0666;
+ } else
+ perm = fd->perm;
+
+ amode = 0;
+ if (fd->access_mode & ADIO_CREATE)
+ amode = amode | O_CREAT;
+ if (fd->access_mode & ADIO_RDONLY)
+ amode = amode | O_RDONLY;
+ if (fd->access_mode & ADIO_WRONLY)
+ amode = amode | O_WRONLY;
+ if (fd->access_mode & ADIO_RDWR)
+ amode = amode | O_RDWR;
+ if (fd->access_mode & ADIO_EXCL)
+ amode = amode | O_EXCL;
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
+#endif
+ fd->fd_sys = open(fd->filename, amode, perm);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
+#endif
+ fd->fd_direct = -1;
+
+ if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ }
+
+ if (fd->fd_sys == -1) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
+ } else
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_read.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_read.c
new file mode 100644
index 0000000000000000000000000000000000000000..1b5daa6f8acb6901b9bd0b1ace1fc2165f383c50
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_read.c
@@ -0,0 +1,545 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+#include "adio_extern.h"
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+void ADIOI_NFS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ ssize_t err = -1;
+ MPI_Count datatype_size, len;
+ ADIO_Offset bytes_xfered = 0;
+ size_t rd_count;
+ static char myname[] = "ADIOI_NFS_READCONTIG";
+ char *p;
+
+ if (count == 0) {
+ err = 0;
+ goto fn_exit;
+ }
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = datatype_size * count;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind;
+ }
+
+ p = buf;
+ while (bytes_xfered < len) {
+ rd_count = len - bytes_xfered;
+ /* FreeBSD and Darwin workaround: bigger than INT_MAX is an error */
+ if (rd_count > INT_MAX)
+ rd_count = INT_MAX;
+ if (fd->atomicity)
+ ADIOI_WRITE_LOCK(fd, offset + bytes_xfered, SEEK_SET, rd_count);
+ else
+ ADIOI_READ_LOCK(fd, offset + bytes_xfered, SEEK_SET, rd_count);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err = pread(fd->fd_sys, p, rd_count, offset + bytes_xfered);
+ /* --BEGIN ERROR HANDLING-- */
+        if (err == -1) {
+            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+                                               MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO,
+                                               "**io", "**io %s", strerror(errno));
+            ADIOI_UNLOCK(fd, offset + bytes_xfered, SEEK_SET, rd_count);
+            fd->fp_sys_posn = -1;
+            return;
+        }
+ /* --END ERROR HANDLING-- */
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ ADIOI_UNLOCK(fd, offset + bytes_xfered, SEEK_SET, rd_count);
+ if (err == 0) {
+ /* end of file */
+ break;
+ }
+ bytes_xfered += err;
+ p += err;
+ }
+
+ fd->fp_sys_posn = offset + bytes_xfered;
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += bytes_xfered;
+ }
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status && err != -1)
+ MPIR_Status_set_bytes(status, datatype, bytes_xfered);
+#endif
+
+ *error_code = MPI_SUCCESS;
+}
+
+
+
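+/* ADIOI_BUFFERED_READ implements data sieving for strided reads: it keeps a
+ * large contiguous chunk of the file in readbuf and memcpy's each strided
+ * piece out of it, growing the buffer when a request straddles its end. The
+ * macro is defined twice; the two versions are identical except that the
+ * first wraps each lseek/read in MPE logging events. */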
+#ifdef ADIOI_MPE_LOGGING
+#define ADIOI_BUFFERED_READ \
+ { \
+ if (req_off >= readbuf_off + readbuf_len) { \
+ readbuf_off = req_off; \
+ readbuf_len = (int) (MPL_MIN(max_bufsize, end_offset-readbuf_off+1)); \
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); \
+ lseek(fd->fd_sys, readbuf_off, SEEK_SET); \
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len); \
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); \
+ err = read(fd->fd_sys, readbuf, readbuf_len); \
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len); \
+ if (err == -1) err_flag = 1; \
+ } \
+ while (req_len > readbuf_off + readbuf_len - req_off) { \
+ partial_read = (int) (readbuf_off + readbuf_len - req_off); \
+ tmp_buf = (char *) ADIOI_Malloc(partial_read); \
+ memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
+ ADIOI_Free(readbuf); \
+ readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
+ memcpy(readbuf, tmp_buf, partial_read); \
+ ADIOI_Free(tmp_buf); \
+ readbuf_off += readbuf_len-partial_read; \
+ readbuf_len = (int) (partial_read + MPL_MIN(max_bufsize, \
+ end_offset-readbuf_off+1)); \
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); \
+ lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET); \
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read); \
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); \
+ err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read); \
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read); \
+ if (err == -1) err_flag = 1; \
+ } \
+ memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
+ }
+#else
+#define ADIOI_BUFFERED_READ \
+ { \
+ if (req_off >= readbuf_off + readbuf_len) { \
+ readbuf_off = req_off; \
+ readbuf_len = (int) (MPL_MIN(max_bufsize, end_offset-readbuf_off+1)); \
+ lseek(fd->fd_sys, readbuf_off, SEEK_SET); \
+ if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len); \
+ err = read(fd->fd_sys, readbuf, readbuf_len); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len); \
+ if (err == -1) err_flag = 1; \
+ } \
+ while (req_len > readbuf_off + readbuf_len - req_off) { \
+ partial_read = (int) (readbuf_off + readbuf_len - req_off); \
+ tmp_buf = (char *) ADIOI_Malloc(partial_read); \
+ memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
+ ADIOI_Free(readbuf); \
+ readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
+ memcpy(readbuf, tmp_buf, partial_read); \
+ ADIOI_Free(tmp_buf); \
+ readbuf_off += readbuf_len-partial_read; \
+ readbuf_len = (int) (partial_read + MPL_MIN(max_bufsize, \
+ end_offset-readbuf_off+1)); \
+ lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET); \
+ if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read); \
+ err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read); \
+ if (err == -1) err_flag = 1; \
+ } \
+ memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
+ }
+#endif
+
+
+void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code)
+{
+/* offset is in units of etype relative to the filetype. */
+
+ ADIOI_Flatlist_node *flat_buf, *flat_file;
+ ADIO_Offset i_offset, new_brd_size, brd_size, size;
+ int i, j, k, err, err_flag = 0, st_index = 0;
+ MPI_Count num, bufsize;
+ int n_etypes_in_filetype;
+ ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
+ ADIO_Offset abs_off_in_filetype = 0, new_frd_size, frd_size = 0, st_frd_size;
+ MPI_Count filetype_size, etype_size, buftype_size, partial_read;
+ MPI_Aint lb, filetype_extent, buftype_extent;
+ int buf_count, buftype_is_contig, filetype_is_contig;
+ ADIO_Offset userbuf_off, req_len, sum;
+ ADIO_Offset off, req_off, disp, end_offset = 0, readbuf_off, start_off;
+ char *readbuf, *tmp_buf, *value;
+ int info_flag;
+ unsigned max_bufsize, readbuf_len;
+
+ static char myname[] = "ADIOI_NFS_READSTRIDED";
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ MPI_Type_size_x(fd->filetype, &filetype_size);
+ if (!filetype_size) {
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
+ MPI_Type_size_x(datatype, &buftype_size);
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+ etype_size = fd->etype_size;
+
+ ADIOI_Assert((buftype_size * count) ==
+ ((ADIO_Offset) (MPI_Count) buftype_size * (ADIO_Offset) count));
+ bufsize = buftype_size * count;
+
+/* get max_bufsize from the info object. */
+
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag);
+ max_bufsize = atoi(value);
+ ADIOI_Free(value);
+
+ if (!buftype_is_contig && filetype_is_contig) {
+
+/* noncontiguous in memory, contiguous in file. */
+
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset;
+
+ start_off = off;
+ end_offset = off + bufsize - 1;
+ readbuf_off = off;
+ readbuf = (char *) ADIOI_Malloc(max_bufsize);
+ readbuf_len = (unsigned) (MPL_MIN(max_bufsize, end_offset - readbuf_off + 1));
+
+/* if atomicity is true, lock (exclusive) the region to be accessed */
+ if (fd->atomicity)
+ ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ lseek(fd->fd_sys, readbuf_off, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ if (!(fd->atomicity))
+ ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err = read(fd->fd_sys, readbuf, readbuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ if (!(fd->atomicity))
+ ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
+ if (err == -1)
+ err_flag = 1;
+
+ for (j = 0; j < count; j++)
+ for (i = 0; i < flat_buf->count; i++) {
+ userbuf_off = (ADIO_Offset) j *buftype_extent + flat_buf->indices[i];
+ req_off = off;
+ req_len = flat_buf->blocklens[i];
+                ADIOI_BUFFERED_READ;
+                off += flat_buf->blocklens[i];
+ }
+
+ if (fd->atomicity)
+ ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind = off;
+
+ ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
+
+ if (err_flag) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**io",
+ "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+ }
+
+ else { /* noncontiguous in file */
+
+ flat_file = ADIOI_Flatten_and_find(fd->filetype);
+ disp = fd->disp;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ /* Wei-keng reworked type processing to be a bit more efficient */
+ offset = fd->fp_ind - disp;
+ n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
+ offset -= (ADIO_Offset) n_filetypes *filetype_extent;
+ /* now offset is local to this extent */
+
+ /* find the block where offset is located, skip blocklens[i]==0 */
+ for (i = 0; i < flat_file->count; i++) {
+ ADIO_Offset dist;
+ if (flat_file->blocklens[i] == 0)
+ continue;
+ dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
+ /* frd_size is from offset to the end of block i */
+ if (dist == 0) {
+ i++;
+ offset = flat_file->indices[i];
+ frd_size = flat_file->blocklens[i];
+ break;
+ }
+ if (dist > 0) {
+ frd_size = dist;
+ break;
+ }
+ }
+ st_index = i; /* starting index in flat_file->indices[] */
+ offset += disp + (ADIO_Offset) n_filetypes *filetype_extent;
+ } else {
+ n_etypes_in_filetype = filetype_size / etype_size;
+ n_filetypes = offset / n_etypes_in_filetype;
+ etype_in_filetype = offset % n_etypes_in_filetype;
+ size_in_filetype = etype_in_filetype * etype_size;
+
+ sum = 0;
+ for (i = 0; i < flat_file->count; i++) {
+ sum += flat_file->blocklens[i];
+ if (sum > size_in_filetype) {
+ st_index = i;
+ frd_size = sum - size_in_filetype;
+ abs_off_in_filetype = flat_file->indices[i] +
+ size_in_filetype - (sum - flat_file->blocklens[i]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype;
+ }
+
+ start_off = offset;
+
+ /* Wei-keng Liao: read request is within a single flat_file contig
+ * block e.g. with subarray types that actually describe the whole
+ * array */
+ if (buftype_is_contig && bufsize <= frd_size) {
+ /* a count of bytes can overflow. operate on original type instead */
+ ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
+ offset, status, error_code);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ /* update MPI-IO file pointer to point to the first byte that
+ * can be accessed in the fileview. */
+ fd->fp_ind = offset + bufsize;
+ if (bufsize == frd_size) {
+ do {
+ st_index++;
+ if (st_index == flat_file->count) {
+ st_index = 0;
+ n_filetypes++;
+ }
+ } while (flat_file->blocklens[st_index] == 0);
+ fd->fp_ind = disp + flat_file->indices[st_index]
+ + n_filetypes * filetype_extent;
+ }
+ }
+ fd->fp_sys_posn = -1; /* set it to null. */
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+#endif
+ return;
+ }
+
+ /* Calculate end_offset, the last byte-offset that will be accessed.
+         * e.g., if start_offset=0 and 100 bytes are to be read, end_offset=99 */
+
+ st_frd_size = frd_size;
+ st_n_filetypes = n_filetypes;
+ i_offset = 0;
+ j = st_index;
+ off = offset;
+ frd_size = MPL_MIN(st_frd_size, bufsize);
+ while (i_offset < bufsize) {
+ i_offset += frd_size;
+ end_offset = off + frd_size - 1;
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+
+ off = disp + flat_file->indices[j] + n_filetypes * (ADIO_Offset) filetype_extent;
+ frd_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset);
+ }
+
+/* if atomicity is true, lock (exclusive) the region to be accessed */
+ if (fd->atomicity)
+ ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ /* initial read into readbuf */
+ readbuf_off = offset;
+ readbuf = (char *) ADIOI_Malloc(max_bufsize);
+ readbuf_len = (int) (MPL_MIN(max_bufsize, end_offset - readbuf_off + 1));
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ lseek(fd->fd_sys, offset, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ if (!(fd->atomicity))
+ ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err = read(fd->fd_sys, readbuf, readbuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ if (!(fd->atomicity))
+ ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len);
+
+ if (err == -1)
+ err_flag = 1;
+
+ if (buftype_is_contig && !filetype_is_contig) {
+
+/* contiguous in memory, noncontiguous in file. should be the most
+ common case. */
+
+ i_offset = 0;
+ j = st_index;
+ off = offset;
+ n_filetypes = st_n_filetypes;
+ frd_size = MPL_MIN(st_frd_size, bufsize);
+ while (i_offset < bufsize) {
+ if (frd_size) {
+ /* TYPE_UB and TYPE_LB can result in
+ * frd_size = 0. save system call in such cases */
+ /* lseek(fd->fd_sys, off, SEEK_SET);
+ * err = read(fd->fd_sys, ((char *) buf) + i, frd_size); */
+
+ req_off = off;
+ req_len = frd_size;
+ userbuf_off = i_offset;
+                ADIOI_BUFFERED_READ;
+            }
+ i_offset += frd_size;
+
+ if (off + frd_size < disp + flat_file->indices[j] +
+ flat_file->blocklens[j] + n_filetypes * (ADIO_Offset) filetype_extent)
+ off += frd_size;
+ /* did not reach end of contiguous block in filetype.
+ * no more I/O needed. off is incremented by frd_size. */
+ else {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+ off = disp + flat_file->indices[j] +
+ n_filetypes * (ADIO_Offset) filetype_extent;
+ frd_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset);
+ }
+ }
+ } else {
+/* noncontiguous in memory as well as in file */
+
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ k = num = buf_count = 0;
+ i_offset = flat_buf->indices[0];
+ j = st_index;
+ off = offset;
+ n_filetypes = st_n_filetypes;
+ frd_size = st_frd_size;
+ brd_size = flat_buf->blocklens[0];
+
+ while (num < bufsize) {
+ size = MPL_MIN(frd_size, brd_size);
+ if (size) {
+ /* lseek(fd->fd_sys, off, SEEK_SET);
+ * err = read(fd->fd_sys, ((char *) buf) + i, size); */
+
+ req_off = off;
+ req_len = size;
+ userbuf_off = i_offset;
+                ADIOI_BUFFERED_READ;
+            }
+
+ new_frd_size = frd_size;
+ new_brd_size = brd_size;
+
+ if (size == frd_size) {
+/* reached end of contiguous block in file */
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+ off = disp + flat_file->indices[j] +
+ n_filetypes * (ADIO_Offset) filetype_extent;
+
+ new_frd_size = flat_file->blocklens[j];
+ if (size != brd_size) {
+ i_offset += size;
+ new_brd_size -= size;
+ }
+ }
+
+ if (size == brd_size) {
+/* reached end of contiguous block in memory */
+
+ k = (k + 1) % flat_buf->count;
+ buf_count++;
+ i_offset = buftype_extent * (buf_count / flat_buf->count) +
+ flat_buf->indices[k];
+ new_brd_size = flat_buf->blocklens[k];
+ if (size != frd_size) {
+ off += size;
+ new_frd_size -= size;
+ }
+ }
+ num += size;
+ frd_size = new_frd_size;
+ brd_size = new_brd_size;
+ }
+ }
+
+ if (fd->atomicity)
+ ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind = off;
+
+ ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
+
+ if (err_flag) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**io",
+ "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+ }
+
+ fd->fp_sys_posn = -1; /* set it to null. */
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+/* This is a temporary way of filling in status. The right way is to
+ keep track of how much data was actually read and placed in buf
+ by ADIOI_BUFFERED_READ. */
+#endif
+
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_resize.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_resize.c
new file mode 100644
index 0000000000000000000000000000000000000000..0b4e1553629ab97e2e55c9f1fd0d2fa2551dcc5b
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_resize.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+/* NFS resize
+ *
+ * Note: we resize on all processors to guarantee that all processors
+ * will have updated cache values. This used to be the generic
+ * implementation used by the majority of the ADIO implementations.
+ */
+void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
+{
+ int err;
+ static char myname[] = "ADIOI_NFS_RESIZE";
+
+ err = ftruncate(fd->fd_sys, size);
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (err == -1) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_setsh.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_setsh.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8cafa96916d11e44444a538812cd4cda182b7b4
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_setsh.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+/* set the shared file pointer to "offset" etypes relative to the current
+ view */
+
+/*
+This looks very similar to ADIOI_GEN_Set_shared_fp, except this
+function avoids locking the file twice. The generic version does
+
+Write lock
+ADIO_WriteContig
+Unlock
+
+For NFS, ADIOI_NFS_WriteContig does a lock before writing to disable
+caching. To avoid the lock being called twice, this version for NFS does
+
+Write lock
+Lseek
+Write
+Unlock
+
+*/
+
+void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
+{
+ ssize_t err;
+ MPI_Comm dupcommself;
+ static char myname[] = "ADIOI_NFS_SET_SHARED_FP";
+
+ if (fd->shared_fp_fd == ADIO_FILE_NULL) {
+ MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
+ fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
+ fd->shared_fp_fname,
+ fd->file_system, fd->fns,
+ ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
+ 0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
+ ADIO_PERM_NULL, error_code);
+ }
+
+ if (*error_code != MPI_SUCCESS)
+ return;
+
+ ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
+
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO,
+ "**io", "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_wait.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_wait.c
new file mode 100644
index 0000000000000000000000000000000000000000..66f1aa545774942ef5c5fa95859d0562abd2db11
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_wait.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+
+void ADIOI_NFS_ReadComplete(ADIO_Request * request, ADIO_Status * status, int *error_code)
+{
+ return;
+}
+
+
+void ADIOI_NFS_WriteComplete(ADIO_Request * request, ADIO_Status * status, int *error_code)
+{
+ ADIOI_NFS_ReadComplete(request, status, error_code);
+}
diff --git a/3rd-party/romio341/adio/ad_nfs/ad_nfs_write.c b/3rd-party/romio341/adio/ad_nfs/ad_nfs_write.c
new file mode 100644
index 0000000000000000000000000000000000000000..992d52ffe412442f17c0e248bf31c44454ce4e58
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_nfs/ad_nfs_write.c
@@ -0,0 +1,684 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_nfs.h"
+#include "adio_extern.h"
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ ssize_t err = -1;
+ MPI_Count datatype_size, len;
+ ADIO_Offset bytes_xfered = 0;
+ size_t wr_count;
+ static char myname[] = "ADIOI_NFS_WRITECONTIG";
+ char *p;
+
+ if (count == 0) {
+ err = 0;
+ goto fn_exit;
+ }
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = datatype_size * (ADIO_Offset) count;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind;
+ }
+
+ p = (char *) buf;
+ while (bytes_xfered < len) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ wr_count = len - bytes_xfered;
+ /* work around FreeBSD and OS X defects */
+ if (wr_count > INT_MAX)
+ wr_count = INT_MAX;
+
+ ADIOI_WRITE_LOCK(fd, offset + bytes_xfered, SEEK_SET, wr_count);
+ err = pwrite(fd->fd_sys, p, wr_count, offset + bytes_xfered);
+ /* --BEGIN ERROR HANDLING-- */
+        if (err == -1) {
+            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+                                               MPIR_ERR_RECOVERABLE,
+                                               myname, __LINE__, MPI_ERR_IO, "**io",
+                                               "**io %s", strerror(errno));
+            ADIOI_UNLOCK(fd, offset + bytes_xfered, SEEK_SET, wr_count);
+            fd->fp_sys_posn = -1;
+            return;
+        }
+ /* --END ERROR HANDLING-- */
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ ADIOI_UNLOCK(fd, offset + bytes_xfered, SEEK_SET, wr_count);
+ bytes_xfered += err;
+ p += err;
+ }
+ fd->fp_sys_posn = offset + bytes_xfered;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += bytes_xfered;
+ }
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status && err != -1)
+ MPIR_Status_set_bytes(status, datatype, bytes_xfered);
+#endif
+
+ *error_code = MPI_SUCCESS;
+}
+
+
+
+
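+/* The NFS version of ADIOI_BUFFERED_WRITE does a locked read-modify-write
+ * through writebuf; as with ADIOI_BUFFERED_READ, the macro is defined twice,
+ * differing only in the MPE logging events around each lseek/read/write. */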
+#ifdef ADIOI_MPE_LOGGING
+#define ADIOI_BUFFERED_WRITE \
+ { \
+ if (req_off >= writebuf_off + writebuf_len) { \
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); \
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); \
+ err = write(fd->fd_sys, writebuf, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (err == -1) err_flag = 1; \
+ writebuf_off = req_off; \
+ writebuf_len = (int) (MPL_MIN(max_bufsize,end_offset-writebuf_off+1)); \
+ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); \
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); \
+ err = read(fd->fd_sys, writebuf, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); \
+ if (err == -1) { \
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**ioRMWrdwr", 0); \
+ goto fn_exit; \
+ } \
+ } \
+ write_sz = (int) (MPL_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
+ memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz); \
+ while (write_sz != req_len) { \
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); \
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); \
+ err = write(fd->fd_sys, writebuf, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (err == -1) err_flag = 1; \
+ req_len -= write_sz; \
+ userbuf_off += write_sz; \
+ writebuf_off += writebuf_len; \
+ writebuf_len = (int) (MPL_MIN(max_bufsize,end_offset-writebuf_off+1)); \
+ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); \
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); \
+ err = read(fd->fd_sys, writebuf, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); \
+ if (err == -1) { \
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**ioRMWrdwr", 0); \
+ goto fn_exit; \
+ } \
+ write_sz = MPL_MIN(req_len, writebuf_len); \
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
+ } \
+ }
+#else
+#define ADIOI_BUFFERED_WRITE \
+ { \
+ if (req_off >= writebuf_off + writebuf_len) { \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ err = write(fd->fd_sys, writebuf, writebuf_len); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (err == -1) err_flag = 1; \
+ writebuf_off = req_off; \
+ writebuf_len = (int) (MPL_MIN(max_bufsize,end_offset-writebuf_off+1)); \
+ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ err = read(fd->fd_sys, writebuf, writebuf_len); \
+ if (err == -1) { \
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**ioRMWrdwr", 0); \
+ goto fn_exit; \
+ } \
+ } \
+ write_sz = (int) (MPL_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
+ memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz); \
+ while (write_sz != req_len) { \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ err = write(fd->fd_sys, writebuf, writebuf_len); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (err == -1) err_flag = 1; \
+ req_len -= write_sz; \
+ userbuf_off += write_sz; \
+ writebuf_off += writebuf_len; \
+ writebuf_len = (int) (MPL_MIN(max_bufsize,end_offset-writebuf_off+1)); \
+ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ err = read(fd->fd_sys, writebuf, writebuf_len); \
+ if (err == -1) { \
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**ioRMWrdwr", 0); \
+ goto fn_exit; \
+ } \
+ write_sz = MPL_MIN(req_len, writebuf_len); \
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
+ } \
+ }
+#endif
+
+/* This macro is used when the filetype is contiguous and the buftype is not.
+   It does not do a read-modify-write and does not lock. */
+#ifdef ADIOI_MPE_LOGGING
+#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
+ { \
+ if (req_off >= writebuf_off + writebuf_len) { \
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); \
+ err = write(fd->fd_sys, writebuf, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (err == -1) err_flag = 1; \
+ writebuf_off = req_off; \
+ writebuf_len = (int) (MPL_MIN(max_bufsize,end_offset-writebuf_off+1)); \
+ } \
+ write_sz = (int) (MPL_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
+ memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz); \
+ while (write_sz != req_len) { \
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); \
+ err = write(fd->fd_sys, writebuf, writebuf_len); \
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (err == -1) err_flag = 1; \
+ req_len -= write_sz; \
+ userbuf_off += write_sz; \
+ writebuf_off += writebuf_len; \
+ writebuf_len = (int) (MPL_MIN(max_bufsize,end_offset-writebuf_off+1)); \
+ write_sz = MPL_MIN(req_len, writebuf_len); \
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
+ } \
+ }
+#else
+#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
+ { \
+ if (req_off >= writebuf_off + writebuf_len) { \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ err = write(fd->fd_sys, writebuf, writebuf_len); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (err == -1) err_flag = 1; \
+ writebuf_off = req_off; \
+ writebuf_len = (int) (MPL_MIN(max_bufsize,end_offset-writebuf_off+1)); \
+ } \
+ write_sz = (int) (MPL_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
+ memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz); \
+ while (write_sz != req_len) { \
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
+ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ err = write(fd->fd_sys, writebuf, writebuf_len); \
+ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (err == -1) err_flag = 1; \
+ req_len -= write_sz; \
+ userbuf_off += write_sz; \
+ writebuf_off += writebuf_len; \
+ writebuf_len = (int) (MPL_MIN(max_bufsize,end_offset-writebuf_off+1)); \
+ write_sz = MPL_MIN(req_len, writebuf_len); \
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
+ } \
+ }
+#endif
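
Both macro families implement write-side data sieving: stage a contiguous file region in writebuf, patch the requested byte ranges into it (after an initial read when intervening bytes must be preserved), and flush it with a single write. A condensed sketch of one read-modify-write cycle under those assumptions, with hypothetical names (sieve_write, stage) and without ROMIO's locking, atomicity, and MPE-logging details:

    #include <string.h>
    #include <unistd.h>

    /* One read-modify-write pass of write-side data sieving (sketch).
     * Patches [req_off, req_off + req_len) from src into the file through a
     * bufsize-byte staging buffer; error and EOF handling omitted. */
    static void sieve_write(int fd, const char *src, off_t req_off,
                            size_t req_len, char *stage, size_t bufsize)
    {
        off_t stage_off = req_off - (req_off % (off_t) bufsize);
        while (req_len > 0) {
            pread(fd, stage, bufsize, stage_off);   /* read existing bytes */
            size_t begin = (size_t) (req_off - stage_off);
            size_t n = bufsize - begin;
            if (n > req_len)
                n = req_len;
            memcpy(stage + begin, src, n);          /* modify in memory */
            pwrite(fd, stage, bufsize, stage_off);  /* write region back */
            src += n;
            req_off += (off_t) n;
            req_len -= n;
            stage_off += (off_t) bufsize;
        }
    }
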
+
+
+void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code)
+{
+/* offset is in units of etype relative to the filetype. */
+
+ ADIOI_Flatlist_node *flat_buf, *flat_file;
+ int i, j, k, err = -1, bwr_size, st_index = 0;
+ ADIO_Offset i_offset, sum, size_in_filetype;
+ ADIO_Offset num, size, n_etypes_in_filetype;
+ MPI_Count bufsize;
+ ADIO_Offset n_filetypes, etype_in_filetype;
+ ADIO_Offset abs_off_in_filetype = 0;
+ int req_len;
+ MPI_Count filetype_size, etype_size, buftype_size;
+ MPI_Aint lb, filetype_extent, buftype_extent;
+ int buf_count, buftype_is_contig, filetype_is_contig;
+ ADIO_Offset userbuf_off;
+ ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
+ char *writebuf = NULL, *value;
+ int st_n_filetypes, writebuf_len, write_sz;
+ ADIO_Offset fwr_size = 0, new_fwr_size, st_fwr_size;
+ int new_bwr_size, err_flag = 0, info_flag, max_bufsize;
+ static char myname[] = "ADIOI_NFS_WRITESTRIDED";
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ MPI_Type_size_x(fd->filetype, &filetype_size);
+ if (!filetype_size) {
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
+ MPI_Type_size_x(datatype, &buftype_size);
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+ etype_size = fd->etype_size;
+
+ bufsize = buftype_size * count;
+
+/* get max_bufsize from the info object. */
+
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag);
+ max_bufsize = atoi(value);
+ ADIOI_Free(value);
+
+ if (!buftype_is_contig && filetype_is_contig) {
+
+/* noncontiguous in memory, contiguous in file. */
+
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset;
+
+ start_off = off;
+ end_offset = off + bufsize - 1;
+ writebuf_off = off;
+ writebuf = (char *) ADIOI_Malloc(max_bufsize);
+ writebuf_len = (int) (MPL_MIN(max_bufsize, end_offset - writebuf_off + 1));
+
+/* if atomicity is true, lock the region to be accessed */
+ if (fd->atomicity)
+ ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ for (j = 0; j < count; j++)
+ for (i = 0; i < flat_buf->count; i++) {
+ userbuf_off = j * buftype_extent + flat_buf->indices[i];
+ req_off = off;
+ req_len = flat_buf->blocklens[i];
+ ADIOI_BUFFERED_WRITE_WITHOUT_READ;
+ off += flat_buf->blocklens[i];
+ }
+
+ /* write the buffer out finally */
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ if (!(fd->atomicity))
+ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ err = write(fd->fd_sys, writebuf, writebuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ if (!(fd->atomicity))
+ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+ if (err == -1)
+ err_flag = 1;
+
+ if (fd->atomicity)
+ ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind = off;
+ if (err_flag) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**io",
+ "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+ }
+
+ else { /* noncontiguous in file */
+
+ flat_file = ADIOI_Flatten_and_find(fd->filetype);
+ disp = fd->disp;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ /* Wei-keng reworked type processing to be a bit more efficient */
+ offset = fd->fp_ind - disp;
+ n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
+ offset -= (ADIO_Offset) n_filetypes *filetype_extent;
+ /* now offset is local to this extent */
+
+ /* find the block where offset is located, skip blocklens[i]==0 */
+ for (i = 0; i < flat_file->count; i++) {
+ ADIO_Offset dist;
+ if (flat_file->blocklens[i] == 0)
+ continue;
+ dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
+ /* fwr_size is from offset to the end of block i */
+ if (dist == 0) {
+ i++;
+ offset = flat_file->indices[i];
+ fwr_size = flat_file->blocklens[i];
+ break;
+ }
+ if (dist > 0) {
+ fwr_size = dist;
+ break;
+ }
+ }
+ st_index = i; /* starting index in flat_file->indices[] */
+ offset += disp + (ADIO_Offset) n_filetypes *filetype_extent;
+ } else {
+ n_etypes_in_filetype = filetype_size / etype_size;
+ n_filetypes = offset / n_etypes_in_filetype;
+ etype_in_filetype = offset % n_etypes_in_filetype;
+ size_in_filetype = etype_in_filetype * etype_size;
+
+ sum = 0;
+ for (i = 0; i < flat_file->count; i++) {
+ sum += flat_file->blocklens[i];
+ if (sum > size_in_filetype) {
+ st_index = i;
+ fwr_size = sum - size_in_filetype;
+ abs_off_in_filetype = flat_file->indices[i] +
+ size_in_filetype - (sum - flat_file->blocklens[i]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype;
+ }
+
+ start_off = offset;
+        /* Wei-keng Liao: write request is within a single flat_file contig block */
+ /* this could happen, for example, with subarray types that are
+ * actually fairly contiguous */
+ if (buftype_is_contig && bufsize <= fwr_size) {
+            /* Though the MPI API has an integer 'count' parameter, derived
+             * datatypes might describe more bytes than can fit into an integer.
+             * If we've made it this far, we can pass a count of original
+             * datatypes instead of a count of bytes (which might overflow).
+             * Other WriteContig calls in this path operate on the data
+             * sieving buffer. */
+ ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
+ offset, status, error_code);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ /* update MPI-IO file pointer to point to the first byte
+ * that can be accessed in the fileview. */
+ fd->fp_ind = offset + bufsize;
+ if (bufsize == fwr_size) {
+ do {
+ st_index++;
+ if (st_index == flat_file->count) {
+ st_index = 0;
+ n_filetypes++;
+ }
+ } while (flat_file->blocklens[st_index] == 0);
+ fd->fp_ind = disp + flat_file->indices[st_index]
+ + (ADIO_Offset) n_filetypes *filetype_extent;
+ }
+ }
+ fd->fp_sys_posn = -1; /* set it to null. */
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+#endif
+ goto fn_exit;
+ }
+
+        /* Calculate end_offset, the last byte-offset that will be accessed.
+         * e.g., if start_offset=0 and 100 bytes are to be written, end_offset=99 */
+
+ st_fwr_size = fwr_size;
+ st_n_filetypes = n_filetypes;
+ i_offset = 0;
+ j = st_index;
+ off = offset;
+ fwr_size = MPL_MIN(st_fwr_size, bufsize);
+ while (i_offset < bufsize) {
+ i_offset += fwr_size;
+ end_offset = off + fwr_size - 1;
+
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+
+ off = disp + flat_file->indices[j] + n_filetypes * (ADIO_Offset) filetype_extent;
+ fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset);
+ }
+
+/* if atomicity is true, lock the region to be accessed */
+ if (fd->atomicity)
+ ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ /* initial read for the read-modify-write */
+ writebuf_off = offset;
+ writebuf = (char *) ADIOI_Malloc(max_bufsize);
+ memset(writebuf, -1, max_bufsize);
+ writebuf_len = (int) (MPL_MIN(max_bufsize, end_offset - writebuf_off + 1));
+ if (!(fd->atomicity))
+ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err = read(fd->fd_sys, writebuf, writebuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_IO,
+ "ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.",
+ 0);
+ goto fn_exit;
+ }
+
+ if (buftype_is_contig && !filetype_is_contig) {
+
+/* contiguous in memory, noncontiguous in file. should be the most
+ common case. */
+
+ i_offset = 0;
+ j = st_index;
+ off = offset;
+ n_filetypes = st_n_filetypes;
+ fwr_size = MPL_MIN(st_fwr_size, bufsize);
+ while (i_offset < bufsize) {
+ if (fwr_size) {
+                    /* TYPE_UB and TYPE_LB can result in
+                     * fwr_size = 0. Save a system call in such cases. */
+ /* lseek(fd->fd_sys, off, SEEK_SET);
+ * err = write(fd->fd_sys, ((char *) buf) + i, fwr_size); */
+
+ req_off = off;
+ req_len = fwr_size;
+ userbuf_off = i_offset;
+ ADIOI_BUFFERED_WRITE;
+ }
+ i_offset += fwr_size;
+
+ if (off + fwr_size < disp + flat_file->indices[j] +
+ flat_file->blocklens[j] + n_filetypes * (ADIO_Offset) filetype_extent)
+ off += fwr_size;
+ /* did not reach end of contiguous block in filetype.
+ * no more I/O needed. off is incremented by fwr_size. */
+ else {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+ off = disp + flat_file->indices[j] +
+ n_filetypes * (ADIO_Offset) filetype_extent;
+ fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset);
+ }
+ }
+ } else {
+/* noncontiguous in memory as well as in file */
+
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ k = num = buf_count = 0;
+ i_offset = flat_buf->indices[0];
+ j = st_index;
+ off = offset;
+ n_filetypes = st_n_filetypes;
+ fwr_size = st_fwr_size;
+ bwr_size = flat_buf->blocklens[0];
+
+ while (num < bufsize) {
+ size = MPL_MIN(fwr_size, bwr_size);
+ if (size) {
+ /* lseek(fd->fd_sys, off, SEEK_SET);
+ * err = write(fd->fd_sys, ((char *) buf) + i, size); */
+
+ req_off = off;
+ req_len = size;
+ userbuf_off = i_offset;
+ ADIOI_BUFFERED_WRITE;
+ }
+
+ new_fwr_size = fwr_size;
+ new_bwr_size = bwr_size;
+
+ if (size == fwr_size) {
+/* reached end of contiguous block in file */
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ while (flat_file->blocklens[j] == 0) {
+ j = (j + 1) % flat_file->count;
+ n_filetypes += (j == 0) ? 1 : 0;
+ }
+
+ off = disp + flat_file->indices[j] +
+ n_filetypes * (ADIO_Offset) filetype_extent;
+
+ new_fwr_size = flat_file->blocklens[j];
+ if (size != bwr_size) {
+ i_offset += size;
+ new_bwr_size -= size;
+ }
+ }
+
+ if (size == bwr_size) {
+/* reached end of contiguous block in memory */
+
+ k = (k + 1) % flat_buf->count;
+ buf_count++;
+ i_offset =
+ (ADIO_Offset) buftype_extent *(ADIO_Offset) (buf_count / flat_buf->count) +
+ flat_buf->indices[k];
+ new_bwr_size = flat_buf->blocklens[k];
+ if (size != fwr_size) {
+ off += size;
+ new_fwr_size -= size;
+ }
+ }
+ num += size;
+ fwr_size = new_fwr_size;
+ bwr_size = new_bwr_size;
+ }
+ }
+
+ /* write the buffer out finally */
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ lseek(fd->fd_sys, writebuf_off, SEEK_SET);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ if (!(fd->atomicity))
+ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ err = write(fd->fd_sys, writebuf, writebuf_len);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+
+ if (!(fd->atomicity))
+ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+ else
+ ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ if (err == -1)
+ err_flag = 1;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind = off;
+ if (err_flag) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**io",
+ "**io %s", strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+ }
+
+ fd->fp_sys_posn = -1; /* set it to null. */
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+/* This is a temporary way of filling in status. The right way is to
+ keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
+#endif
+
+ fn_exit:
+ if (writebuf != NULL)
+ ADIOI_Free(writebuf);
+
+ return;
+}
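
The strided path above repeatedly advances through the flattened filetype: the parallel arrays indices[]/blocklens[] describe the blocks inside one extent, the pattern repeats every filetype_extent bytes, and j wrapping back to 0 bumps n_filetypes to the next repetition. A minimal sketch of that traversal, with hypothetical types and names (struct flat_type, walk_flat) standing in for ADIOI_Flatlist_node:

    #include <stddef.h>

    /* Flattened datatype (sketch): count blocks per extent, block i being
     * blocklens[i] bytes at byte offset indices[i]; the pattern repeats
     * every extent bytes.  At least one blocklens[i] must be nonzero. */
    struct flat_type {
        int count;
        const long *indices;
        const long *blocklens;
        long extent;
    };

    /* Emit absolute (offset, length) pairs covering nbytes of file data,
     * starting at block 'start' of filetype repetition 'ntypes'. */
    static void walk_flat(const struct flat_type *ft, long disp, int start,
                          long ntypes, long nbytes,
                          void (*emit)(long off, long len))
    {
        int j = start;
        while (nbytes > 0) {
            long len = ft->blocklens[j];
            if (len > nbytes)
                len = nbytes;
            if (len > 0)
                emit(disp + ft->indices[j] + ntypes * ft->extent, len);
            nbytes -= len;
            j = (j + 1) % ft->count;    /* next block ... */
            if (j == 0)
                ntypes++;               /* ... wrapping into the next extent */
        }
    }
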
diff --git a/3rd-party/romio341/adio/ad_panfs/Makefile.mk b/3rd-party/romio341/adio/ad_panfs/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..59573df63074f7bd20cf4b18dd4a087b70b9f3da
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/Makefile.mk
@@ -0,0 +1,25 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_PANFS_OPEN6
+# override open with one that uses newer features
+panfs_open = adio/ad_panfs/ad_panfs_open6.c
+else
+panfs_open = adio/ad_panfs/ad_panfs_open.c
+endif
+
+if BUILD_AD_PANFS
+
+noinst_HEADERS += adio/ad_panfs/ad_panfs.h
+
+romio_other_sources += \
+ adio/ad_panfs/ad_panfs.c \
+ $(panfs_open) \
+ adio/ad_panfs/ad_panfs_hints.c \
+ adio/ad_panfs/ad_panfs_read.c \
+ adio/ad_panfs/ad_panfs_resize.c \
+ adio/ad_panfs/ad_panfs_write.c
+
+endif BUILD_AD_PANFS
diff --git a/3rd-party/romio341/adio/ad_panfs/ad_panfs.c b/3rd-party/romio341/adio/ad_panfs/ad_panfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..1a59e0503f9acef082d80e3bf84b483a3a10cc32
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/ad_panfs.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_panfs.h"
+
+/* adioi.h has the ADIOI_Fns_struct define */
+#include "adioi.h"
+
+#ifndef ROMIOCONF_H_INCLUDED
+#include "romioconf.h"
+#define ROMIOCONF_H_INCLUDED
+#endif
+
+
+struct ADIOI_Fns_struct ADIO_PANFS_operations = {
+#ifdef HAVE_PAN_FS_CLIENT_RAIDN_ENCODING_T
+ ADIOI_PANFS_Open6, /* Open, using newer Panasas features */
+#else
+ ADIOI_PANFS_Open, /* open, but using Panasas5 and earlier features */
+#endif
+ ADIOI_GEN_OpenColl,
+ ADIOI_PANFS_ReadContig, /* ReadContig */
+ ADIOI_PANFS_WriteContig, /* WriteContig */
+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+ ADIOI_GEN_Fcntl, /* Fcntl */
+ ADIOI_PANFS_SetInfo, /* SetInfo */
+ ADIOI_GEN_ReadStrided, /* ReadStrided */
+ ADIOI_GEN_WriteStrided, /* WriteStrided */
+ ADIOI_GEN_Close, /* Close */
+#ifdef ROMIO_HAVE_WORKING_AIO
+ ADIOI_GEN_IreadContig, /* IreadContig */
+ ADIOI_GEN_IwriteContig, /* IwriteContig */
+#else
+ ADIOI_FAKE_IreadContig, /* IreadContig */
+ ADIOI_FAKE_IwriteContig, /* IwriteContig */
+#endif
+ ADIOI_GEN_IODone, /* ReadDone */
+ ADIOI_GEN_IODone, /* WriteDone */
+ ADIOI_GEN_IOComplete, /* ReadComplete */
+ ADIOI_GEN_IOComplete, /* WriteComplete */
+ ADIOI_GEN_IreadStrided, /* IreadStrided */
+ ADIOI_GEN_IwriteStrided, /* IwriteStrided */
+ ADIOI_GEN_Flush, /* Flush */
+ ADIOI_PANFS_Resize, /* Resize */
+ ADIOI_GEN_Delete, /* Delete */
+ ADIOI_GEN_Feature,
+ "PANFS: Panasas PanFS",
+ ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+ ADIOI_GEN_IwriteStridedColl, /* IwriteStridedColl */
+#if defined(F_SETLKW64)
+ ADIOI_GEN_SetLock /* SetLock */
+#else
+ ADIOI_GEN_SetLock64 /* SetLock */
+#endif
+};
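
ADIO_PANFS_operations is one instance of ROMIO's per-filesystem dispatch table: the common ADIO layer calls through these pointers, and a driver supplies the generic ADIOI_GEN_* fallbacks for every operation it does not specialize. A toy illustration of the pattern (hypothetical struct and names, much smaller than the real ADIOI_Fns_struct):

    #include <stdio.h>

    /* Toy per-backend dispatch table (hypothetical). */
    struct io_ops {
        void (*open_file)(const char *path);
        void (*write_contig)(const void *buf, long len, long off);
        const char *name;
    };

    static void generic_open(const char *path)
    {
        printf("generic open of %s\n", path);
    }

    static void specialized_write(const void *buf, long len, long off)
    {
        (void) buf;
        printf("specialized write of %ld bytes at offset %ld\n", len, off);
    }

    /* A backend overrides only what it specializes and keeps generic
     * fallbacks elsewhere -- just as ADIO_PANFS_operations overrides
     * Open/ReadContig/WriteContig but keeps ADIOI_GEN_* for the rest. */
    static const struct io_ops toy_ops = {
        .open_file = generic_open,
        .write_contig = specialized_write,
        .name = "toy backend",
    };
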
diff --git a/3rd-party/romio341/adio/ad_panfs/ad_panfs.h b/3rd-party/romio341/adio/ad_panfs/ad_panfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6af35e78d2e2dd34c72f19cdce06e0c17c8c48e
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/ad_panfs.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_PANFS_H_INCLUDED
+#define AD_PANFS_H_INCLUDED
+
+#include "adio.h"
+#include <string.h>
+#include <sys/ioctl.h>
+#include <pan_fs_client_cw_mode.h>
+
+#ifndef NO_AIO
+#ifdef AIO_SUN
+#include <sys/asynch.h>
+#else
+#ifdef HAVE_AIO_LITE_H
+#include <aio-lite.h>
+#else
+#ifdef HAVE_AIO_H
+#include <aio.h>
+#endif
+#ifdef HAVE_SYS_AIO_H
+#include <sys/aio.h>
+#endif
+#endif
+#endif
+#endif
+
+void ADIOI_PANFS_Open(ADIO_File fd, int *error_code);
+/* Panasas 6 introduced some new features */
+void ADIOI_PANFS_Open6(ADIO_File fd, int *error_code);
+void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+void ADIOI_PANFS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_PANFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
+void ADIOI_PANFS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+
+/* TODO: move this to common code and have all routines retry. */
+/* TODO: also check for EWOULDBLOCK */
+#if defined(NEEDS_USLEEP_DECL)
+int usleep(useconds_t usec);
+#endif
+
+/* Delay 1 ms */
+#define AD_PANFS_RETRY_DELAY 1000
+
+#define AD_PANFS_RETRY(_op_,_rc_) \
+{ \
+ _rc_ = (_op_); \
+ while (_rc_ == -1 && errno == EAGAIN) \
+ { \
+ if (usleep(AD_PANFS_RETRY_DELAY) == -1) \
+ { \
+ break; \
+ } \
+ _rc_ = (_op_); \
+ } \
+}
+
+#endif /* AD_PANFS_H_INCLUDED */
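
AD_PANFS_RETRY re-issues the wrapped operation for as long as it fails with EAGAIN, sleeping AD_PANFS_RETRY_DELAY microseconds between attempts and giving up if usleep() itself fails. A small usage sketch, mirroring the real call in ad_panfs_read.c; panfs_read_retry is a hypothetical wrapper and fd_sys a plain POSIX descriptor:

    #include <errno.h>
    #include <unistd.h>
    #include "ad_panfs.h"

    /* Read len bytes, retrying transient EAGAIN failures the PanFS way. */
    static ssize_t panfs_read_retry(int fd_sys, void *buf, size_t len)
    {
        ssize_t err;
        AD_PANFS_RETRY(read(fd_sys, buf, len), err);
        return err;     /* -1 with errno set if the retries gave up */
    }
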
diff --git a/3rd-party/romio341/adio/ad_panfs/ad_panfs_hints.c b/3rd-party/romio341/adio/ad_panfs/ad_panfs_hints.c
new file mode 100644
index 0000000000000000000000000000000000000000..daa5ab7560ad9ee6f79621a419d81fdc4ef12329
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/ad_panfs_hints.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_panfs.h"
+#include <pan_fs_client_cw_mode.h>
+#include "hint_fns.h"
+
+void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
+{
+#if defined(MPICH) || !defined(PRINT_ERR_MSG)
+ static char myname[] = "ADIOI_PANFS_SETINFO";
+#endif
+ int gen_error_code;
+
+ *error_code = MPI_SUCCESS;
+
+ if (fd->info == MPI_INFO_NULL) {
+        /* This must be part of the open call. We can set striping parameters
+         * if necessary.
+         */
+ MPI_Info_create(&(fd->info));
+
+ /* anticipate concurrent writes in an MPI-IO application */
+ ADIOI_Info_set(fd->info, "panfs_concurrent_write", "1");
+
+        /* Has the user specified striping parameters,
+         * and do they have the same value on all processes? */
+ if (users_info != MPI_INFO_NULL) {
+
+ ADIOI_Info_check_and_install_int(fd, users_info, "panfs_concurrent_write",
+ NULL, myname, error_code);
+
+ ADIOI_Info_check_and_install_int(fd, users_info, "panfs_layout_type",
+ NULL, myname, error_code);
+
+ ADIOI_Info_check_and_install_int(fd, users_info, "panfs_layout_stripe_unit",
+ NULL, myname, error_code);
+
+            /* strange: there was a check "layout_type ==
+             * PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE", but
+             * nothing ever touched layout_type */
+ ADIOI_Info_check_and_install_int(fd, users_info,
+ "panfs_layout_parity_stripe_width", NULL, myname,
+ error_code);
+
+ ADIOI_Info_check_and_install_int(fd, users_info,
+ "panfs_layout_parity_stripe_depth", NULL, myname,
+ error_code);
+
+ ADIOI_Info_check_and_install_int(fd, users_info,
+ "panfs_layout_total_num_comps", NULL, myname,
+ error_code);
+ /* this hint used to check for
+ * PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE or
+ * PAN_FS_CLIENT_LAYOUT_TYPE__RAID10, but again, layout_type never
+ * gets updated */
+ ADIOI_Info_check_and_install_int(fd, users_info,
+ "panfs_layout_visit_policy", NULL, myname, error_code);
+ }
+ }
+
+ ADIOI_GEN_SetInfo(fd, users_info, &gen_error_code);
+    /* If this function was successful, use the error code returned from ADIOI_GEN_SetInfo;
+     * otherwise use the error_code generated by this function.
+ */
+ if (*error_code == MPI_SUCCESS) {
+ *error_code = gen_error_code;
+ }
+}
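
From the application side, these hints arrive through the MPI_Info object passed to MPI_File_open. A sketch of a user program requesting concurrent-write mode and a RAID1_5 parity-stripe layout at create time; the numeric hint values are illustrative only (layout type 3 follows the enumeration documented in ad_panfs_open6.c):

    #include <mpi.h>

    int main(int argc, char **argv)
    {
        MPI_Info info;
        MPI_File fh;

        MPI_Init(&argc, &argv);
        MPI_Info_create(&info);
        /* Hints consumed by ADIOI_PANFS_SetInfo/ADIOI_PANFS_Open; the
         * values below are illustrative, not recommendations. */
        MPI_Info_set(info, "panfs_concurrent_write", "1");
        MPI_Info_set(info, "panfs_layout_type", "3"); /* RAID1_5 parity stripe */
        MPI_Info_set(info, "panfs_layout_stripe_unit", "65536");
        MPI_Info_set(info, "panfs_layout_parity_stripe_width", "10");
        MPI_Info_set(info, "panfs_layout_parity_stripe_depth", "8");
        MPI_Info_set(info, "panfs_layout_total_num_comps", "11");
        MPI_File_open(MPI_COMM_WORLD, "out.dat",
                      MPI_MODE_CREATE | MPI_MODE_WRONLY, info, &fh);
        MPI_File_close(&fh);
        MPI_Info_free(&info);
        MPI_Finalize();
        return 0;
    }
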
diff --git a/3rd-party/romio341/adio/ad_panfs/ad_panfs_open.c b/3rd-party/romio341/adio/ad_panfs/ad_panfs_open.c
new file mode 100644
index 0000000000000000000000000000000000000000..a0d3890ed493109d3779f9bd1bdbb7c6ac12c9d9
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/ad_panfs_open.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_panfs.h"
+#include <string.h>
+#include <sys/ioctl.h>
+#define TEMP_BUFFER_SIZE 64
+
+void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
+{
+ char *value;
+ int perm, old_mask, amode, flag;
+ static char myname[] = "ADIOI_PANFS_OPEN";
+
+ if (fd->perm == ADIO_PERM_NULL) {
+ old_mask = umask(022);
+ umask(old_mask);
+ perm = ~old_mask & 0666;
+ } else
+ perm = fd->perm;
+
+ amode = 0;
+ if (fd->access_mode & ADIO_CREATE) {
+ pan_fs_client_layout_agg_type_t layout_type = PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT;
+ unsigned long int layout_stripe_unit = 0;
+ unsigned long int layout_parity_stripe_width = 0;
+ unsigned long int layout_parity_stripe_depth = 0;
+ unsigned long int layout_total_num_comps = 0;
+ pan_fs_client_layout_visit_t layout_visit_policy = PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN;
+ int myrank;
+
+ MPI_Comm_rank(fd->comm, &myrank);
+
+ *error_code = MPI_SUCCESS;
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ ADIOI_Info_get(fd->info, "panfs_layout_type", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_type = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_stripe_unit = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_total_num_comps = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
+ value, &flag);
+ if (flag) {
+ layout_parity_stripe_width = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
+ value, &flag);
+ if (flag) {
+ layout_parity_stripe_depth = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_visit_policy = strtoul(value, NULL, 10);
+ }
+ ADIOI_Free(value);
+
+ amode = amode | O_CREAT;
+ /* Check for valid set of hints */
+ if ((layout_type < PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT) ||
+ (layout_type > PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
+ FPRINTF(stderr, "%s: panfs_layout_type is not a valid value: %u.\n", myname,
+ layout_type);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ if ((layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0) &&
+ ((layout_stripe_unit == 0) || (layout_total_num_comps == 0))) {
+ if (layout_stripe_unit == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID0 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_total_num_comps == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID0 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) {
+ if ((layout_stripe_unit == 0) ||
+ (layout_parity_stripe_width == 0) ||
+ (layout_parity_stripe_depth == 0) || (layout_total_num_comps == 0)) {
+ if (layout_stripe_unit == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_total_num_comps == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_parity_stripe_width == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_parity_stripe_width hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_parity_stripe_depth == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_parity_stripe_depth hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ if ((layout_visit_policy < PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN) ||
+ (layout_visit_policy > PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN_WITH_HASHED_OFFSET))
+ {
+ FPRINTF(stderr, "%s: panfs_layout_visit_policy is not a valid value: %u.\n", myname,
+ layout_visit_policy);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ }
+ if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10) {
+ if ((layout_stripe_unit == 0) || (layout_total_num_comps == 0)) {
+ if (layout_stripe_unit == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID10 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_total_num_comps == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID10 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ if ((layout_visit_policy < PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN) ||
+ (layout_visit_policy > PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN_WITH_HASHED_OFFSET))
+ {
+ FPRINTF(stderr, "%s: panfs_layout_visit_policy is not a valid value: %u.\n", myname,
+ layout_visit_policy);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ }
+ /* Create the file via ioctl() or open(). ADIOI_PANFS_Open's caller
+ * already optimizes performance by only calling this function with
+ * ADIO_CREATE on rank 0. Therefore, we don't need to worry about
+ * implementing that optimization here. */
+ if ((layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0) ||
+ (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)
+ || (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
+ pan_fs_client_layout_create_args_t file_create_args;
+ int fd_dir;
+ char *slash;
+ struct stat stat_buf;
+ int err;
+ char *path;
+
+ /* Check that the file does not exist before
+ * trying to create it. The ioctl itself should
+ * be able to handle this condition. Currently,
+ * the ioctl will return successfully if the file
+ * has been previously created. Filed bug 33862
+ * to track the problem.
+ */
+ err = stat(fd->filename, &stat_buf);
+ if ((err == -1) && (errno != ENOENT)) {
+ FPRINTF(stderr, "%s: Unexpected I/O Error calling stat() on PanFS file: %s.\n",
+ myname, strerror(errno));
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ } else if (err == 0) {
+ FPRINTF(stderr,
+ "%s: Cannot create PanFS file with ioctl when file already exists.\n",
+ myname);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ } else {
+ /* (err == -1) && (errno == ENOENT) */
+ /* File does not exist */
+ path = ADIOI_Strdup(fd->filename);
+ slash = strrchr(path, '/');
+ if (!slash)
+ ADIOI_Strncpy(path, ".", 2);
+ else {
+ if (slash == path)
+ *(path + 1) = '\0';
+ else
+ *slash = '\0';
+ }
+
+ /* create PanFS object */
+ memset(&file_create_args, 0, sizeof(pan_fs_client_layout_create_args_t));
+ /* open directory */
+ fd_dir = open(path, O_RDONLY);
+ if (fd_dir < 0) {
+ FPRINTF(stderr,
+ "%s: I/O Error opening parent directory to create PanFS file using ioctl: %s.\n",
+ myname, strerror(errno));
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ } else {
+ char *file_name_ptr = fd->filename;
+ slash = strrchr(fd->filename, '/');
+ if (slash) {
+ file_name_ptr = slash + 1;
+ }
+ /* create file in the directory */
+ file_create_args.mode = perm;
+ file_create_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
+ file_create_args.flags = PAN_FS_CLIENT_LAYOUT_CREATE_F__NONE;
+ ADIOI_Strncpy(file_create_args.filename, file_name_ptr,
+ strlen(fd->filename) + 1);
+ file_create_args.layout.agg_type = layout_type;
+ file_create_args.layout.layout_is_valid = 1;
+ if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) {
+ file_create_args.layout.u.raid1_5_parity_stripe.total_num_comps =
+ layout_total_num_comps;
+ file_create_args.layout.u.raid1_5_parity_stripe.parity_stripe_width =
+ layout_parity_stripe_width;
+ file_create_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth =
+ layout_parity_stripe_depth;
+ file_create_args.layout.u.raid1_5_parity_stripe.stripe_unit =
+ layout_stripe_unit;
+ file_create_args.layout.u.raid1_5_parity_stripe.layout_visit_policy =
+ layout_visit_policy;
+ } else if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0) {
+ file_create_args.layout.u.raid0.total_num_comps = layout_total_num_comps;
+ file_create_args.layout.u.raid0.stripe_unit = layout_stripe_unit;
+ } else if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10) {
+ file_create_args.layout.u.raid10.total_num_comps = layout_total_num_comps;
+ file_create_args.layout.u.raid10.stripe_unit = layout_stripe_unit;
+ file_create_args.layout.u.raid10.layout_visit_policy = layout_visit_policy;
+ }
+ err = ioctl(fd_dir, PAN_FS_CLIENT_LAYOUT_CREATE_FILE, &file_create_args);
+ if (err < 0) {
+ FPRINTF(stderr,
+ "%s: I/O Error doing ioctl on parent directory to create PanFS file using ioctl: %s.\n",
+ myname, strerror(errno));
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ err = close(fd_dir);
+ }
+ ADIOI_Free(path);
+ }
+ } else {
+ int create_fd = open(fd->filename, amode, perm);
+ if (create_fd != -1) {
+ close(create_fd);
+ } else {
+ FPRINTF(stderr, "%s: I/O Error creating PanFS file using open: %s.\n", myname,
+ strerror(errno));
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ }
+ }
+ if (fd->access_mode & ADIO_RDONLY)
+ amode = amode | O_RDONLY;
+ if (fd->access_mode & ADIO_WRONLY)
+ amode = amode | O_WRONLY;
+ if (fd->access_mode & ADIO_RDWR)
+ amode = amode | O_RDWR;
+ if (fd->access_mode & ADIO_EXCL)
+ amode = amode | O_EXCL;
+
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ ADIOI_Info_get(fd->info, "panfs_concurrent_write", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ unsigned long int concurrent_write = strtoul(value, NULL, 10);
+ if (concurrent_write == 1) {
+ amode = amode | O_CONCURRENT_WRITE;
+ }
+ }
+ ADIOI_Free(value);
+
+ fd->fd_sys = open(fd->filename, amode, perm);
+ fd->fd_direct = -1;
+
+ if (fd->fd_sys != -1) {
+ int rc;
+ char temp_buffer[TEMP_BUFFER_SIZE];
+ pan_fs_client_layout_query_args_t file_query_args;
+ memset(&file_query_args, 0, sizeof(pan_fs_client_layout_query_args_t));
+ file_query_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
+ rc = ioctl(fd->fd_sys, PAN_FS_CLIENT_LAYOUT_QUERY_FILE, &file_query_args);
+ if (rc < 0) {
+ /* Error - set layout type to unknown */
+ ADIOI_Info_set(fd->info, "panfs_layout_type", "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
+ } else {
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.agg_type);
+ ADIOI_Info_set(fd->info, "panfs_layout_type", temp_buffer);
+ if (file_query_args.layout.layout_is_valid == 1) {
+ switch (file_query_args.layout.agg_type) {
+ case PAN_FS_CLIENT_LAYOUT_TYPE__RAID0:
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid0.stripe_unit);
+ ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid0.total_num_comps);
+ ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
+ break;
+ case PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE:
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid1_5_parity_stripe.stripe_unit);
+ ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.
+ raid1_5_parity_stripe.parity_stripe_width);
+ ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.
+ raid1_5_parity_stripe.parity_stripe_depth);
+ ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.
+ raid1_5_parity_stripe.total_num_comps);
+ ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.
+ raid1_5_parity_stripe.layout_visit_policy);
+ ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
+ break;
+ case PAN_FS_CLIENT_LAYOUT_TYPE__RAID10:
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid10.stripe_unit);
+ ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid10.total_num_comps);
+ ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid10.layout_visit_policy);
+ ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
+ break;
+ case PAN_FS_CLIENT_LAYOUT_TYPE__INVALID:
+ case PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT:
+ MPI_Info_set(fd->info, "panfs_layout_type",
+ "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
+
+ if (fd->fd_sys == -1) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
+ } else
+ *error_code = MPI_SUCCESS;
+}
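
One idiom in the open path deserves a note: POSIX offers no way to read the umask without setting it, so the code calls umask(022), immediately restores the old value, and derives default permissions as ~old_mask & 0666. The idiom in isolation (a sketch; default_perm is a hypothetical name):

    #include <sys/stat.h>

    /* Derive default file permissions from the process umask.  umask()
     * always sets a new mask, so read it by set-and-restore. */
    static mode_t default_perm(void)
    {
        mode_t old_mask = umask(022);   /* set a throwaway value, get old */
        umask(old_mask);                /* put the original mask back */
        return ~old_mask & 0666;        /* rw bits minus whatever is masked */
    }
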
diff --git a/3rd-party/romio341/adio/ad_panfs/ad_panfs_open6.c b/3rd-party/romio341/adio/ad_panfs/ad_panfs_open6.c
new file mode 100644
index 0000000000000000000000000000000000000000..762bf84189558561efcee32443a086d80ffc7e6e
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/ad_panfs_open6.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_panfs.h"
+#include <string.h>
+#include <sys/ioctl.h>
+#define TEMP_BUFFER_SIZE 64
+
+void ADIOI_PANFS_Open6(ADIO_File fd, int *error_code)
+{
+ char *value;
+ int perm, old_mask, amode, flag;
+ static char myname[] = "ADIOI_PANFS_OPEN6";
+
+ if (fd->perm == ADIO_PERM_NULL) {
+ old_mask = umask(022);
+ umask(old_mask);
+ perm = ~old_mask & 0666;
+ } else
+ perm = fd->perm;
+
+ amode = 0;
+ if (fd->access_mode & ADIO_CREATE) {
+ pan_fs_client_layout_agg_type_t layout_type = PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT;
+ unsigned long int layout_stripe_unit = 0;
+ unsigned long int layout_parity_stripe_width = 0;
+ unsigned long int layout_parity_stripe_depth = 0;
+ unsigned long int layout_total_num_comps = 0;
+ unsigned long int layout_max_faults = 2;
+ pan_fs_client_layout_visit_t layout_visit_policy = PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN;
+ pan_fs_client_raidn_encoding_t layout_encoding = PAN_FS_CLIENT_LAYOUT_RAIDN_ENCODING_RS;
+ int myrank;
+
+ MPI_Comm_rank(fd->comm, &myrank);
+
+ *error_code = MPI_SUCCESS;
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ ADIOI_Info_get(fd->info, "panfs_layout_type", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_type = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_stripe_unit = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_total_num_comps = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
+ value, &flag);
+ if (flag) {
+ layout_parity_stripe_width = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
+ value, &flag);
+ if (flag) {
+ layout_parity_stripe_depth = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_max_faults", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_max_faults = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_visit_policy = strtoul(value, NULL, 10);
+ }
+ ADIOI_Info_get(fd->info, "panfs_layout_encoding", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ layout_encoding = strtoul(value, NULL, 10);
+ }
+ ADIOI_Free(value);
+
+ amode = amode | O_CREAT;
+ /* Check for valid set of hints
+ *
+     * Note that RAID0 has been dropped. In the event the PAN_FS_CLIENT_LAYOUT_TYPE__RAID0
+     * enumeration no longer exists, the following check will still be correct.
+     *
+     * The enumeration looks as follows:
+ *
+ * enum pan_fs_client_layout_agg_type_e {
+ * PAN_FS_CLIENT_LAYOUT_TYPE__INVALID = 0, - *INVALID
+ * PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT = 1, - VALID
+ * PAN_FS_CLIENT_LAYOUT_TYPE__RAID0 = 2, - *INVALID
+ * PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE = 3, - VALID
+ * PAN_FS_CLIENT_LAYOUT_TYPE__RAID10 = 4, - VALID
+ * PAN_FS_CLIENT_LAYOUT_TYPE__RAIDN_PARITY_STRIPE = 5 - VALID
+ * };
+ */
+ if (((layout_type < PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) &&
+ (layout_type != PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT)) ||
+ (layout_type > PAN_FS_CLIENT_LAYOUT_TYPE__RAIDN_PARITY_STRIPE)) {
+ FPRINTF(stderr, "%s: panfs_layout_type is not a valid value: %u.\n", myname,
+ layout_type);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAIDN_PARITY_STRIPE) {
+ if ((layout_stripe_unit == 0) ||
+ (layout_parity_stripe_width == 0) ||
+ (layout_parity_stripe_depth == 0) || (layout_total_num_comps == 0)) {
+ if (layout_stripe_unit == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAIDN parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_total_num_comps == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAIDN parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_parity_stripe_width == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_parity_stripe_width hint which is necessary to specify a valid RAIDN parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_parity_stripe_depth == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_parity_stripe_depth hint which is necessary to specify a valid RAIDN parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ /* as of 6.0.x release, we only support max_faults == 2 */
+ if (layout_max_faults != 2) {
+                FPRINTF(stderr,
+                        "%s: panfs_layout_max_faults is not a valid value. Setting to the default of 2\n",
+                        myname);
+ layout_max_faults = 2;
+ }
+            /* as of the 6.0.x release, we only support RS encoding */
+ if (layout_encoding != PAN_FS_CLIENT_LAYOUT_RAIDN_ENCODING_RS) {
+ FPRINTF(stderr,
+ "%s: panfs_layout_encoding is not a valid value: %u. Setting to default of %u\n",
+ myname, layout_encoding, PAN_FS_CLIENT_LAYOUT_RAIDN_ENCODING_RS);
+ layout_encoding = PAN_FS_CLIENT_LAYOUT_RAIDN_ENCODING_RS;
+ }
+ }
+ if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) {
+ if ((layout_stripe_unit == 0) ||
+ (layout_parity_stripe_width == 0) ||
+ (layout_parity_stripe_depth == 0) || (layout_total_num_comps == 0)) {
+ if (layout_stripe_unit == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_total_num_comps == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_parity_stripe_width == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_parity_stripe_width hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_parity_stripe_depth == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_parity_stripe_depth hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ if ((layout_visit_policy < PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN) ||
+ (layout_visit_policy > PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN_WITH_HASHED_OFFSET))
+ {
+ FPRINTF(stderr, "%s: panfs_layout_visit_policy is not a valid value: %u.\n", myname,
+ layout_visit_policy);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ }
+ if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10) {
+ if ((layout_stripe_unit == 0) || (layout_total_num_comps == 0)) {
+ if (layout_stripe_unit == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID10 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ if (layout_total_num_comps == 0) {
+ FPRINTF(stderr,
+ "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID10 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n",
+ myname);
+ }
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ if ((layout_visit_policy < PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN) ||
+ (layout_visit_policy > PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN_WITH_HASHED_OFFSET))
+ {
+ FPRINTF(stderr, "%s: panfs_layout_visit_policy is not a valid value: %u.\n", myname,
+ layout_visit_policy);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ }
+ /* Create the file via ioctl() or open(). ADIOI_PANFS_Open's caller
+ * already optimizes performance by only calling this function with
+ * ADIO_CREATE on rank 0. Therefore, we don't need to worry about
+ * implementing that optimization here. */
+ if ((layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) ||
+ (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10) ||
+ (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAIDN_PARITY_STRIPE)) {
+ pan_fs_client_layout_create_args_t file_create_args;
+ int fd_dir;
+ char *slash;
+ struct stat stat_buf;
+ int err;
+ char *path;
+
+ /* Check that the file does not exist before
+ * trying to create it. The ioctl itself should
+ * be able to handle this condition. Currently,
+ * the ioctl will return successfully if the file
+ * has been previously created. Filed bug 33862
+ * to track the problem.
+ */
+ err = stat(fd->filename, &stat_buf);
+ if ((err == -1) && (errno != ENOENT)) {
+ FPRINTF(stderr, "%s: Unexpected I/O Error calling stat() on PanFS file: %s.\n",
+ myname, strerror(errno));
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ } else if (err == 0) {
+ /* ensure that we have the same semantics here and in the call to creat(). In the latter, we do not
+ * use O_EXCL so a create on an existing file should not fail.
+ */
+ FPRINTF(stderr,
+ "%s: Cannot create PanFS file with ioctl when file already exists, using open() syscall.\n",
+ myname);
+ goto use_open_syscall;
+ } else {
+ /* (err == -1) && (errno == ENOENT) */
+ /* File does not exist */
+ path = ADIOI_Strdup(fd->filename);
+ slash = strrchr(path, '/');
+ if (!slash)
+ ADIOI_Strncpy(path, ".", 2);
+ else {
+ if (slash == path)
+ *(path + 1) = '\0';
+ else
+ *slash = '\0';
+ }
+
+ /* create PanFS object */
+ memset(&file_create_args, 0, sizeof(pan_fs_client_layout_create_args_t));
+ /* open directory */
+ fd_dir = open(path, O_RDONLY);
+ if (fd_dir < 0) {
+ FPRINTF(stderr,
+ "%s: I/O Error opening parent directory to create PanFS file using ioctl: %s.\n",
+ myname, strerror(errno));
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ } else {
+ char *file_name_ptr = fd->filename;
+ slash = strrchr(fd->filename, '/');
+ if (slash) {
+ file_name_ptr = slash + 1;
+ }
+ /* create file in the directory */
+ file_create_args.mode = perm;
+ file_create_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
+ file_create_args.flags = PAN_FS_CLIENT_LAYOUT_CREATE_F__NONE;
+ ADIOI_Strncpy(file_create_args.filename, file_name_ptr,
+ strlen(fd->filename) + 1);
+ file_create_args.layout.agg_type = layout_type;
+ file_create_args.layout.layout_is_valid = 1;
+ if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAIDN_PARITY_STRIPE) {
+ file_create_args.layout.u.raidn_parity_stripe.total_num_comps =
+ layout_total_num_comps;
+ file_create_args.layout.u.raidn_parity_stripe.parity_stripe_width =
+ layout_parity_stripe_width;
+ file_create_args.layout.u.raidn_parity_stripe.parity_stripe_depth =
+ layout_parity_stripe_depth;
+ file_create_args.layout.u.raidn_parity_stripe.stripe_unit =
+ layout_stripe_unit;
+ file_create_args.layout.u.raidn_parity_stripe.max_faults =
+ layout_max_faults;
+ file_create_args.layout.u.raidn_parity_stripe.encoding = layout_encoding;
+ } else if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) {
+ file_create_args.layout.u.raid1_5_parity_stripe.total_num_comps =
+ layout_total_num_comps;
+ file_create_args.layout.u.raid1_5_parity_stripe.parity_stripe_width =
+ layout_parity_stripe_width;
+ file_create_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth =
+ layout_parity_stripe_depth;
+ file_create_args.layout.u.raid1_5_parity_stripe.stripe_unit =
+ layout_stripe_unit;
+ file_create_args.layout.u.raid1_5_parity_stripe.layout_visit_policy =
+ layout_visit_policy;
+ } else if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10) {
+ file_create_args.layout.u.raid10.total_num_comps = layout_total_num_comps;
+ file_create_args.layout.u.raid10.stripe_unit = layout_stripe_unit;
+ file_create_args.layout.u.raid10.layout_visit_policy = layout_visit_policy;
+ }
+ err = ioctl(fd_dir, PAN_FS_CLIENT_LAYOUT_CREATE_FILE, &file_create_args);
+ if (err < 0) {
+ FPRINTF(stderr,
+ "%s: I/O Error doing ioctl on parent directory to create PanFS file using ioctl: %s.\n",
+ myname, strerror(errno));
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ err = close(fd_dir);
+ }
+ ADIOI_Free(path);
+ }
+ } else {
+ use_open_syscall:;
+ int create_fd = open(fd->filename, amode, perm);
+ if (create_fd != -1) {
+ close(create_fd);
+ } else {
+ FPRINTF(stderr, "%s: I/O Error creating PanFS file using open: %s.\n", myname,
+ strerror(errno));
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ }
+ }
+ if (fd->access_mode & ADIO_RDONLY)
+ amode = amode | O_RDONLY;
+ if (fd->access_mode & ADIO_WRONLY)
+ amode = amode | O_WRONLY;
+ if (fd->access_mode & ADIO_RDWR)
+ amode = amode | O_RDWR;
+ if (fd->access_mode & ADIO_EXCL)
+ amode = amode | O_EXCL;
+
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ ADIOI_Info_get(fd->info, "panfs_concurrent_write", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ unsigned long int concurrent_write = strtoul(value, NULL, 10);
+ if (concurrent_write == 1) {
+ amode = amode | O_CONCURRENT_WRITE;
+ }
+ }
+ ADIOI_Free(value);
+
+ fd->fd_sys = open(fd->filename, amode, perm);
+ fd->fd_direct = -1;
+
+ if (fd->fd_sys != -1) {
+ int rc;
+ char temp_buffer[TEMP_BUFFER_SIZE];
+ pan_fs_client_layout_query_args_t file_query_args;
+ memset(&file_query_args, 0, sizeof(pan_fs_client_layout_query_args_t));
+ file_query_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
+ rc = ioctl(fd->fd_sys, PAN_FS_CLIENT_LAYOUT_QUERY_FILE, &file_query_args);
+ if (rc < 0) {
+ /* Error - set layout type to unknown */
+ ADIOI_Info_set(fd->info, "panfs_layout_type", "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
+ } else {
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.agg_type);
+ ADIOI_Info_set(fd->info, "panfs_layout_type", temp_buffer);
+ if (file_query_args.layout.layout_is_valid == 1) {
+ switch (file_query_args.layout.agg_type) {
+ case PAN_FS_CLIENT_LAYOUT_TYPE__RAIDN_PARITY_STRIPE:
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raidn_parity_stripe.stripe_unit);
+ ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raidn_parity_stripe.
+ parity_stripe_width);
+ ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raidn_parity_stripe.
+ parity_stripe_depth);
+ ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raidn_parity_stripe.total_num_comps);
+ ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raidn_parity_stripe.max_faults);
+ ADIOI_Info_set(fd->info, "panfs_layout_max_faults", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raidn_parity_stripe.encoding);
+ ADIOI_Info_set(fd->info, "panfs_layout_encoding", temp_buffer);
+ break;
+ case PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE:
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid1_5_parity_stripe.stripe_unit);
+ ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid1_5_parity_stripe.
+ parity_stripe_width);
+ ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid1_5_parity_stripe.
+ parity_stripe_depth);
+ ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.
+ raid1_5_parity_stripe.total_num_comps);
+ ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid1_5_parity_stripe.
+ layout_visit_policy);
+ ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
+ break;
+ case PAN_FS_CLIENT_LAYOUT_TYPE__RAID10:
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid10.stripe_unit);
+ ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid10.total_num_comps);
+ ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
+ MPL_snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u",
+ file_query_args.layout.u.raid10.layout_visit_policy);
+ ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
+ break;
+ case PAN_FS_CLIENT_LAYOUT_TYPE__INVALID:
+ case PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT:
+                    ADIOI_Info_set(fd->info, "panfs_layout_type",
+                                   "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
+                    /* fall through */
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
+
+ if (fd->fd_sys == -1) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
+ } else
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_panfs/ad_panfs_read.c b/3rd-party/romio341/adio/ad_panfs/ad_panfs_read.c
new file mode 100644
index 0000000000000000000000000000000000000000..5fcc10a638a90cc001fc3d799999635cb1988fb9
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/ad_panfs_read.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_panfs.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+void ADIOI_PANFS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ MPI_Count err = -1, datatype_size, len;
+ static char myname[] = "ADIOI_PANFS_READCONTIG";
+
+ if (count == 0) {
+ err = 0;
+ goto fn_exit;
+ }
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = datatype_size * count;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind;
+ }
+
+ if (fd->fp_sys_posn != offset) {
+ err = lseek(fd->fd_sys, offset, SEEK_SET);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_IO, "**io", "**io %s", strerror(errno));
+ fd->fp_sys_posn = -1;
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+ }
+
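+    /* AD_PANFS_RETRY (from ad_panfs.h) is assumed to retry the wrapped
+     * syscall on transient errors, leaving the final result in err */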
+ AD_PANFS_RETRY(read(fd->fd_sys, buf, len), err)
+ /* --BEGIN ERROR HANDLING-- */
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_IO, "**io", "**io %s", strerror(errno));
+ fd->fp_sys_posn = -1;
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ fd->fp_sys_posn = offset + err;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += err;
+ }
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status && err != -1)
+ MPIR_Status_set_bytes(status, datatype, err);
+#endif
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_panfs/ad_panfs_resize.c b/3rd-party/romio341/adio/ad_panfs/ad_panfs_resize.c
new file mode 100644
index 0000000000000000000000000000000000000000..3fb67d6ae577a40c5cdbd598f691773025b85cdd
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/ad_panfs_resize.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_panfs.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+void ADIOI_PANFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
+{
+ int err;
+ int myrank;
+ struct stat stat_buf;
+ static char myname[] = "ADIOI_PANFS_RESIZE";
+
+ MPI_Comm_rank(fd->comm, &myrank);
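+    /* only rank 0 truncates the file; the other ranks wait at the barrier
+     * and then verify the new size with fstat */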
+ if (!myrank) {
+ AD_PANFS_RETRY(ftruncate(fd->fd_sys, size), err);
+ MPI_Barrier(fd->comm);
+ } else {
+ MPI_Barrier(fd->comm);
+ AD_PANFS_RETRY(fstat(fd->fd_sys, &stat_buf), err);
+ if (((ADIO_Offset) stat_buf.st_size) != size) {
+            /* This should never happen; otherwise there is a coherency problem. */
+            FPRINTF(stderr, "%s: Rank %d: Resize failed: requested=%llu actual=%llu.\n", myname,
+                    myrank, (unsigned long long) size, (unsigned long long) stat_buf.st_size);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ }
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO,
+ "**io", "**io %s", strerror(errno));
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_panfs/ad_panfs_write.c b/3rd-party/romio341/adio/ad_panfs/ad_panfs_write.c
new file mode 100644
index 0000000000000000000000000000000000000000..dc6feb87304a7996e1a5a7d40f0fc130a206175d
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_panfs/ad_panfs_write.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_panfs.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+void ADIOI_PANFS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ MPI_Count err = -1, datatype_size, len;
+ static char myname[] = "ADIOI_PANFS_WRITECONTIG";
+
+ if (count == 0) {
+ err = 0;
+ goto fn_exit;
+ }
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = datatype_size * count;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind;
+ }
+
+ if (fd->fp_sys_posn != offset) {
+ err = lseek(fd->fd_sys, offset, SEEK_SET);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_IO, "**io", "**io %s", strerror(errno));
+ fd->fp_sys_posn = -1;
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+ }
+
+ AD_PANFS_RETRY(write(fd->fd_sys, buf, len), err)
+ /* --BEGIN ERROR HANDLING-- */
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_IO, "**io", "**io %s", strerror(errno));
+ fd->fp_sys_posn = -1;
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ fd->fp_sys_posn = offset + err;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += err;
+ }
+
+ fn_exit:
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status && err != -1)
+ MPIR_Status_set_bytes(status, datatype, err);
+#endif
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/Makefile.mk b/3rd-party/romio341/adio/ad_pvfs2/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..c5bf0e1fed8f1d651e14cdebc59c70e747bd88c6
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/Makefile.mk
@@ -0,0 +1,32 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_PVFS2
+
+noinst_HEADERS += \
+ adio/ad_pvfs2/ad_pvfs2.h \
+ adio/ad_pvfs2/ad_pvfs2_io.h \
+ adio/ad_pvfs2/ad_pvfs2_common.h
+
+romio_other_sources += \
+ adio/ad_pvfs2/ad_pvfs2_close.c \
+ adio/ad_pvfs2/ad_pvfs2_read.c \
+ adio/ad_pvfs2/ad_pvfs2_open.c \
+ adio/ad_pvfs2/ad_pvfs2_write.c \
+ adio/ad_pvfs2/ad_pvfs2_fcntl.c \
+ adio/ad_pvfs2/ad_pvfs2_flush.c \
+ adio/ad_pvfs2/ad_pvfs2_resize.c \
+ adio/ad_pvfs2/ad_pvfs2_hints.c \
+ adio/ad_pvfs2/ad_pvfs2_delete.c \
+ adio/ad_pvfs2/ad_pvfs2.c \
+ adio/ad_pvfs2/ad_pvfs2_common.c \
+ adio/ad_pvfs2/ad_pvfs2_aio.c \
+ adio/ad_pvfs2/ad_pvfs2_read_list_classic.c \
+ adio/ad_pvfs2/ad_pvfs2_io_list.c \
+ adio/ad_pvfs2/ad_pvfs2_io_dtype.c \
+ adio/ad_pvfs2/ad_pvfs2_write_list_classic.c \
+ adio/ad_pvfs2/ad_pvfs2_features.c
+
+endif BUILD_AD_PVFS2
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa591644749899e78e8e1ef0774f80eb5aa7cac1
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+
+#include "adio.h"
+
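+/* function table mapping the generic ADIO interface onto the PVFS2-specific
+ * implementations; ADIOI_GEN_* entries fall back to generic versions and
+ * ADIOI_FAKE_* entries are no-op stubs */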
+struct ADIOI_Fns_struct ADIO_PVFS2_operations = {
+ ADIOI_PVFS2_Open, /* Open */
+ ADIOI_SCALEABLE_OpenColl, /* OpenColl */
+ ADIOI_PVFS2_ReadContig, /* ReadContig */
+ ADIOI_PVFS2_WriteContig, /* WriteContig */
+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+ ADIOI_PVFS2_Fcntl, /* Fcntl */
+ ADIOI_PVFS2_SetInfo, /* SetInfo */
+ ADIOI_PVFS2_ReadStrided, /* ReadStrided */
+ ADIOI_PVFS2_WriteStrided, /* WriteStrided */
+ ADIOI_PVFS2_Close, /* Close */
+#ifdef HAVE_MPI_GREQUEST_EXTENSIONS
+ ADIOI_PVFS2_IReadContig, /* IreadContig */
+ ADIOI_PVFS2_IWriteContig, /* IwriteContig */
+#else
+ ADIOI_FAKE_IreadContig, /* IreadContig */
+ ADIOI_FAKE_IwriteContig, /* IwriteContig */
+#endif
+ ADIOI_FAKE_IODone, /* ReadDone */
+ ADIOI_FAKE_IODone, /* WriteDone */
+ ADIOI_FAKE_IOComplete, /* ReadComplete */
+ ADIOI_FAKE_IOComplete, /* WriteComplete */
+ ADIOI_FAKE_IreadStrided, /* IreadStrided */
+ ADIOI_FAKE_IwriteStrided, /* IwriteStrided */
+ ADIOI_PVFS2_Flush, /* Flush */
+ ADIOI_PVFS2_Resize, /* Resize */
+ ADIOI_PVFS2_Delete, /* Delete */
+ ADIOI_PVFS2_Feature,
+ "PVFS2: the PVFS v2 or OrangeFS file systems",
+ ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+ ADIOI_GEN_IwriteStridedColl, /* IwriteStridedColl */
+#if defined(F_SETLKW64)
+ ADIOI_GEN_SetLock /* SetLock */
+#else
+ ADIOI_GEN_SetLock64 /* SetLock */
+#endif
+};
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2.h b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c14802fa14d7ce0c056128cddcb4dc9b5867e03
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_PVFS2_H_INCLUDED
+#define AD_PVFS2_H_INCLUDED
+
+#include "adio.h"
+#ifdef HAVE_PVFS2_H
+#include "pvfs2.h"
+#endif
+
+#ifdef PVFS2_VERSION_MAJOR
+#include "pvfs2-compat.h"
+#endif
+
+void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code);
+void ADIOI_PVFS2_Close(ADIO_File fd, int *error_code);
+void ADIOI_PVFS2_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_PVFS2_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_PVFS2_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int
+ *error_code);
+void ADIOI_PVFS2_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_PVFS2_ReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_PVFS2_Flush(ADIO_File fd, int *error_code);
+void ADIOI_PVFS2_Delete(const char *filename, int *error_code);
+void ADIOI_PVFS2_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
+void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+int ADIOI_PVFS2_Feature(ADIO_File fd, int flag);
+
+void ADIOI_PVFS2_IReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code);
+void ADIOI_PVFS2_IWriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code);
+void ADIOI_PVFS2_AIO_contig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int flag, int *error_code);
+void ADIOI_PVFS2_OldWriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+void ADIOI_PVFS2_OldReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code);
+
+int ADIOI_PVFS2_WriteStridedListIO(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+int ADIOI_PVFS2_WriteStridedDtypeIO(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+
+#endif /* AD_PVFS2_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_aio.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_aio.c
new file mode 100644
index 0000000000000000000000000000000000000000..058db4a8b113932087ac62cb790c18b355d99ec2
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_aio.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "ad_pvfs2.h"
+#include <string.h>
+
+#include "ad_pvfs2_common.h"
+#include "mpiu_greq.h"
+#include "../../mpi-io/mpioimpl.h"
+
+#define READ 0
+#define WRITE 1
+
+static int ADIOI_PVFS2_greq_class = 0;
+int ADIOI_PVFS2_aio_free_fn(void *extra_state);
+int ADIOI_PVFS2_aio_poll_fn(void *extra_state, MPI_Status * status);
+int ADIOI_PVFS2_aio_wait_fn(int count, void **array_of_states, double timeout, MPI_Status * status);
+
+void ADIOI_PVFS2_IReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code)
+{
+ ADIOI_PVFS2_AIO_contig(fd, buf, count, datatype, file_ptr_type,
+ offset, request, READ, error_code);
+}
+
+void ADIOI_PVFS2_IWriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code)
+{
+ ADIOI_PVFS2_AIO_contig(fd, (void *) buf, count, datatype, file_ptr_type,
+ offset, request, WRITE, error_code);
+}
+
+void ADIOI_PVFS2_AIO_contig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int flag, int *error_code)
+{
+
+ int ret;
+ MPI_Count datatype_size, len;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ ADIOI_AIO_Request *aio_req;
+ static char myname[] = "ADIOI_PVFS2_AIO_contig";
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ aio_req = (ADIOI_AIO_Request *) ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = datatype_size * count;
+
+ ret = PVFS_Request_contiguous(len, PVFS_BYTE, &(aio_req->mem_req));
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in pvfs_request_contig (memory)", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ ret = PVFS_Request_contiguous(len, PVFS_BYTE, &(aio_req->file_req));
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in pvfs_request_contig (file)", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ /* copy individual file pointer into offset variable, continue */
+ offset = fd->fp_ind;
+ }
+ if (flag == READ) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_iread_a, 0, NULL);
+#endif
+ ret = PVFS_isys_read(pvfs_fs->object_ref, aio_req->file_req, offset,
+ buf, aio_req->mem_req, &(pvfs_fs->credentials),
+ &(aio_req->resp_io), &(aio_req->op_id), NULL);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_iread_b, 0, NULL);
+#endif
+ } else if (flag == WRITE) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_iwrite_a, 0, NULL);
+#endif
+ ret = PVFS_isys_write(pvfs_fs->object_ref, aio_req->file_req, offset,
+ buf, aio_req->mem_req, &(pvfs_fs->credentials),
+ &(aio_req->resp_io), &(aio_req->op_id), NULL);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_iwrite_b, 0, NULL);
+#endif
+ }
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret < 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_isys_io", 0);
+ goto fn_exit;
+ }
+ /* --END ERROR HANDLING-- */
+
+    /* posted; deferred completion */
+ if (ret == 0) {
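+        /* wrap the posted operation in an MPI generalized request so that
+         * MPI_Test/MPI_Wait can drive PVFS completion through the poll and
+         * wait callbacks registered with the request class */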
+ if (ADIOI_PVFS2_greq_class == 0) {
+ MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn,
+ ADIOI_PVFS2_aio_free_fn, MPIU_Greq_cancel_fn,
+ ADIOI_PVFS2_aio_poll_fn, ADIOI_PVFS2_aio_wait_fn,
+ &ADIOI_PVFS2_greq_class);
+ }
+ MPIX_Grequest_class_allocate(ADIOI_PVFS2_greq_class, aio_req, request);
+ memcpy(&(aio_req->req), request, sizeof(*request));
+ }
+
+ /* immediate completion */
+ if (ret == 1) {
+ MPIO_Completed_request_create(&fd, len, error_code, request);
+ }
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += len;
+ }
+ fd->fp_sys_posn = offset + len;
+
+ *error_code = MPI_SUCCESS;
+ fn_exit:
+ return;
+}
+
+int ADIOI_PVFS2_aio_free_fn(void *extra_state)
+{
+ ADIOI_AIO_Request *aio_req;
+ aio_req = (ADIOI_AIO_Request *) extra_state;
+
+ PVFS_Request_free(&(aio_req->mem_req));
+ PVFS_Request_free(&(aio_req->file_req));
+ ADIOI_Free(aio_req);
+
+ return MPI_SUCCESS;
+}
+
+int ADIOI_PVFS2_aio_poll_fn(void *extra_state, MPI_Status * status)
+{
+ ADIOI_AIO_Request *aio_req;
+ int ret, error;
+
+ aio_req = (ADIOI_AIO_Request *) extra_state;
+
+    /* BUG: we cannot use PVFS_sys_testsome here: it does not work for a specific request */
+ ret = PVFS_sys_wait(aio_req->op_id, "ADIOI_PVFS2_aio_poll_fn", &error);
+ if (ret == 0) {
+ aio_req->nbytes = aio_req->resp_io.total_completed;
+ MPI_Grequest_complete(aio_req->req);
+ return MPI_SUCCESS;
+ } else
+ return MPI_UNDEFINED; /* TODO: what's this error? */
+}
+
+/* wait for multiple requests to complete */
+int ADIOI_PVFS2_aio_wait_fn(int count, void **array_of_states, double timeout, MPI_Status * status)
+{
+
+ ADIOI_AIO_Request **aio_reqlist;
+ PVFS_sys_op_id *op_id_array;
+ int i, j, greq_count, completed_count = 0;
+ int *error_array;
+
+ aio_reqlist = (ADIOI_AIO_Request **) array_of_states;
+
+ op_id_array = (PVFS_sys_op_id *) ADIOI_Calloc(count, sizeof(PVFS_sys_op_id));
+ error_array = (int *) ADIOI_Calloc(count, sizeof(int));
+ greq_count = count;
+
+
+    /* PVFS-2.6: testsome actually tests all requests and fills in op_id_array
+     * with the ones that have completed. count is an in/out parameter; it
+     * returns with the number of completed operations. what a mess! */
+ while (completed_count < greq_count) {
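+        /* busy-poll until every request in the list has completed */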
+ count = greq_count;
+ PVFS_sys_testsome(op_id_array, &count, NULL, error_array, INT_MAX);
+ completed_count += count;
+ for (i = 0; i < count; i++) {
+ for (j = 0; j < greq_count; j++) {
+ if (op_id_array[i] == aio_reqlist[j]->op_id) {
+ aio_reqlist[j]->nbytes = aio_reqlist[j]->resp_io.total_completed;
+ MPI_Grequest_complete(aio_reqlist[j]->req);
+ }
+ }
+ }
+ }
+    ADIOI_Free(op_id_array);
+    ADIOI_Free(error_array);
+    return MPI_SUCCESS;         /* TODO: no idea how to deal with errors */
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_close.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_close.c
new file mode 100644
index 0000000000000000000000000000000000000000..a040927685bf3f96354a12545a8328cbc73d7f29
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_close.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+
+void ADIOI_PVFS2_Close(ADIO_File fd, int *error_code)
+{
+ ADIOI_Free(fd->fs_ptr);
+ fd->fs_ptr = NULL;
+ /* PVFS2 doesn't have a 'close', but MPI-IO semantics dictate that we
+ * ensure all data has been flushed.
+ */
+
+ /* At some point or another it was decided that ROMIO would not
+ * explicitly flush (other than any local cache) on close, because
+ * there is no way to *avoid* that overhead if you implement it here
+ * and don't actually want it.
+ */
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_common.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_common.c
new file mode 100644
index 0000000000000000000000000000000000000000..371285460c66d84d6edcb3fd658965a0c9b3227e
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_common.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+#include "ad_pvfs2_common.h"
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <time.h>
+
+/* maybe give romio access to the globalconfig struct */
+/* keyval hack: tells us whether we've already initialized pvfs2 and also
+ * closes it down when MPI exits */
+int ADIOI_PVFS2_Initialized = MPI_KEYVAL_INVALID;
+
+void ADIOI_PVFS2_End(int *error_code)
+{
+ int ret;
+ static char myname[] = "ADIOI_PVFS2_END";
+
+ ret = PVFS_sys_finalize();
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_finalize", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ *error_code = MPI_SUCCESS;
+}
+
+int ADIOI_PVFS2_End_call(MPI_Comm comm, int keyval, void *attribute_val, void *extra_state)
+{
+ int error_code;
+ ADIOI_PVFS2_End(&error_code);
+ MPI_Keyval_free(&keyval);
+ return error_code;
+}
+
+void ADIOI_PVFS2_Init(int *error_code)
+{
+ int ret;
+ static char myname[] = "ADIOI_PVFS2_INIT";
+ char *ncache_timeout;
+
+ /* do nothing if we've already fired up the pvfs2 interface */
+ if (ADIOI_PVFS2_Initialized != MPI_KEYVAL_INVALID) {
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+    /* for consistency, we should disable the pvfs2 ncache. If the
+     * environment variable is already set, assume the user knows it
+     * won't be a problem */
+ ncache_timeout = getenv("PVFS2_NCACHE_TIMEOUT");
+ if (ncache_timeout == NULL)
+ setenv("PVFS2_NCACHE_TIMEOUT", "0", 1);
+
+ ret = PVFS_util_init_defaults();
+ if (ret < 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_util_init_defaults", 0);
+ PVFS_perror("PVFS_util_init_defaults", ret);
+
+ return;
+ }
+
+ MPI_Keyval_create(MPI_NULL_COPY_FN, ADIOI_PVFS2_End_call, &ADIOI_PVFS2_Initialized, (void *) 0);
+ /* just like romio does, we make a dummy attribute so we
+ * get cleaned up */
+ MPI_Attr_put(MPI_COMM_SELF, ADIOI_PVFS2_Initialized, (void *) 0);
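+    /* attributes on MPI_COMM_SELF are deleted at the start of MPI_Finalize,
+     * so ADIOI_PVFS2_End_call runs while MPI is still usable */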
+}
+
+void ADIOI_PVFS2_makeattribs(PVFS_sys_attr * attribs)
+{
+ memset(attribs, 0, sizeof(PVFS_sys_attr));
+
+ attribs->owner = geteuid();
+ attribs->group = getegid();
+ attribs->perms = 0644;
+ attribs->mask = PVFS_ATTR_SYS_ALL_SETABLE;
+ attribs->atime = time(NULL);
+ attribs->mtime = attribs->atime;
+ attribs->ctime = attribs->atime;
+}
+
+
+void ADIOI_PVFS2_makecredentials(PVFS_credentials * credentials)
+{
+ memset(credentials, 0, sizeof(PVFS_credentials));
+
+ PVFS_util_gen_credentials(credentials);
+}
+
+int ADIOI_PVFS2_error_convert(int pvfs_error)
+{
+ switch (pvfs_error) {
+ case PVFS_EPERM:
+ case PVFS_EACCES:
+ return MPI_ERR_ACCESS;
+ case PVFS_ENOENT:
+ case PVFS_ENXIO:
+ case PVFS_ENODEV:
+ return MPI_ERR_NO_SUCH_FILE;
+ case PVFS_EIO:
+ return MPI_ERR_IO;
+ case PVFS_EEXIST:
+ return MPI_ERR_FILE_EXISTS;
+ case PVFS_ENOTDIR: /* ??? */
+ case PVFS_EISDIR: /* ??? */
+ case PVFS_ENAMETOOLONG:
+ return MPI_ERR_BAD_FILE;
+ case PVFS_EINVAL:
+ return MPI_ERR_FILE;
+ case PVFS_EFBIG: /* ??? */
+ case PVFS_ENOSPC:
+ return MPI_ERR_NO_SPACE;
+ case PVFS_EROFS:
+ return MPI_ERR_READ_ONLY;
+ case PVFS_ENOSYS:
+ return MPI_ERR_UNSUPPORTED_OPERATION;
+ /* PVFS does not support quotas */
+ case EDQUOT:
+ return MPI_ERR_QUOTA;
+ case PVFS_ENOMEM:
+ return MPI_ERR_INTERN;
+ default:
+ return MPI_UNDEFINED;
+ }
+
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_common.h b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..93305ac3e0e1ea826958c1ee50050581945f7a1d
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_common.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_PVFS2_COMMON_H_INCLUDED
+#define AD_PVFS2_COMMON_H_INCLUDED
+#include "ad_pvfs2.h"
+
+/* useful values:
+ * 0: no debugging
+ * CLIENT_DEBUG: debug client state machines
+ */
+#define ADIOI_PVFS2_DEBUG_MASK 0
+
+
+struct ADIOI_PVFS2_fs_s {
+    PVFS_object_ref object_ref;
+    PVFS_credentials credentials;
+};
+
+typedef struct ADIOI_PVFS2_fs_s ADIOI_PVFS2_fs;
+
+
+void ADIOI_PVFS2_Init(int *error_code);
+void ADIOI_PVFS2_makeattribs(PVFS_sys_attr * attribs);
+void ADIOI_PVFS2_makecredentials(PVFS_credentials * credentials);
+void ADIOI_PVFS2_End(int *error_code);
+int ADIOI_PVFS2_End_call(MPI_Comm comm, int keyval, void *attribute_val, void *extra_state);
+int ADIOI_PVFS2_error_convert(int pvfs_error);
+
+#endif /* AD_PVFS2_COMMON_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_delete.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_delete.c
new file mode 100644
index 0000000000000000000000000000000000000000..fff32069a7c036cbbc2e4e8abacda3c975a880de
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_delete.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+#include "adio.h"
+
+#include "ad_pvfs2_common.h"
+
+void ADIOI_PVFS2_Delete(const char *filename, int *error_code)
+{
+ PVFS_credentials credentials;
+ PVFS_sysresp_getparent resp_getparent;
+ int ret;
+ PVFS_fs_id cur_fs;
+ static char myname[] = "ADIOI_PVFS2_DELETE";
+ char pvfs_path[PVFS_NAME_MAX] = { 0 };
+
+ ADIOI_PVFS2_Init(error_code);
+ /* --BEGIN ERROR HANDLING-- */
+ if (*error_code != MPI_SUCCESS) {
+ /* ADIOI_PVFS2_INIT handles creating error codes itself */
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ /* in most cases we'll store the credentials in the fs struct, but we don't
+ * have one of those in Delete */
+ ADIOI_PVFS2_makecredentials(&credentials);
+
+ /* given the filename, figure out which pvfs filesystem it is on */
+ ret = PVFS_util_resolve(filename, &cur_fs, pvfs_path, PVFS_NAME_MAX);
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_util_resolve", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ ret = PVFS_sys_getparent(cur_fs, pvfs_path, &credentials, &resp_getparent);
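+    /* note: the getparent return code is not checked; a failure here will
+     * surface from the PVFS_sys_remove call below */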
+
+ ret = PVFS_sys_remove(resp_getparent.basename, resp_getparent.parent_ref, &credentials);
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_remove", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ *error_code = MPI_SUCCESS;
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_fcntl.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_fcntl.c
new file mode 100644
index 0000000000000000000000000000000000000000..af467bb958d96152390e7a15de2fd72c51ca7280
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_fcntl.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+#include "adio_extern.h"
+#include "ad_pvfs2_common.h"
+
+void ADIOI_PVFS2_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code)
+{
+ int ret;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ PVFS_sysresp_getattr resp_getattr;
+ static char myname[] = "ADIOI_PVFS2_FCNTL";
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ switch (flag) {
+ case ADIO_FCNTL_GET_FSIZE:
+ ret = PVFS_sys_getattr(pvfs_fs->object_ref, PVFS_ATTR_SYS_SIZE,
+ &(pvfs_fs->credentials), &resp_getattr);
+ if (ret != 0) {
+ /* --BEGIN ERROR HANDLING-- */
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_getattr", 0);
+ /* --END ERROR HANDLING-- */
+ } else {
+ *error_code = MPI_SUCCESS;
+ }
+ fcntl_struct->fsize = resp_getattr.attr.size;
+ return;
+
+ case ADIO_FCNTL_SET_DISKSPACE:
+ ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
+ break;
+
+ /* --BEGIN ERROR HANDLING-- */
+ case ADIO_FCNTL_SET_ATOMICITY:
+ default:
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_ARG, "**flag", "**flag %d", flag);
+ /* --END ERROR HANDLING-- */
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_features.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_features.c
new file mode 100644
index 0000000000000000000000000000000000000000..43980a3d84e08e7a21b299f11f11c80639cf0031
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_features.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "ad_pvfs2.h"
+
+int ADIOI_PVFS2_Feature(ADIO_File fd, int flag)
+{
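+    /* capability queries from the ADIO layer: PVFS2's stateless design makes
+     * open and resize scalable, but it provides no locks, no shared file
+     * pointer, and no support for data sieving on writes */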
+ switch (flag) {
+ case ADIO_SCALABLE_OPEN:
+ case ADIO_SCALABLE_RESIZE:
+ return 1;
+ case ADIO_SHARED_FP:
+ case ADIO_LOCKS:
+ case ADIO_SEQUENTIAL:
+ case ADIO_DATA_SIEVING_WRITES:
+ default:
+ return 0;
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_flush.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_flush.c
new file mode 100644
index 0000000000000000000000000000000000000000..82f5f259b70a7120360bc232d853556766a56362
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_flush.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+#include "ad_pvfs2_common.h"
+
+/* we want to be a bit clever here: at scale, if every client sends a
+ * flush request, it will stress the PVFS2 servers with redundant
+ * PVFS_sys_flush requests. Instead, one process should wait for
+ * everyone to catch up, do the sync, then broadcast the result. We can
+ * get away with this thanks to PVFS2's stateless design
+ */
+
+void ADIOI_PVFS2_Flush(ADIO_File fd, int *error_code)
+{
+ int ret, rank, dummy = 0, dummy_in = 0;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ static char myname[] = "ADIOI_PVFS2_FLUSH";
+
+ *error_code = MPI_SUCCESS;
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ MPI_Comm_rank(fd->comm, &rank);
+
+
+    /* unlike ADIOI_PVFS2_Resize, MPI_File_sync() does not imply any
+     * synchronization among the ranks, so this dummy reduce makes sure
+     * everyone has arrived before the io_worker issues the flush */
+ MPI_Reduce(&dummy_in, &dummy, 1, MPI_INT, MPI_SUM, fd->hints->ranklist[0], fd->comm);
+
+ /* io_worker computed in ADIO_Open */
+ if (rank == fd->hints->ranklist[0]) {
+ ret = PVFS_sys_flush(pvfs_fs->object_ref, &(pvfs_fs->credentials));
+ }
+ MPI_Bcast(&ret, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_flush", 0);
+ }
+ /* --END ERROR HANDLING-- */
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_hints.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_hints.c
new file mode 100644
index 0000000000000000000000000000000000000000..a637f7636ddd655353672b6151b5299c0ec06e78
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_hints.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include <assert.h>
+#include "ad_pvfs2.h"
+
+#include "hint_fns.h"
+
+void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
+{
+ char *value;
+ int flag, tmp_value;
+ static char myname[] = "ADIOI_PVFS_SETINFO";
+
+ if ((fd->info) == MPI_INFO_NULL) {
+ /* part of the open call */
+ MPI_Info_create(&(fd->info));
+ ADIOI_Info_set(fd->info, "romio_pvfs2_debugmask", "0");
+ fd->hints->fs_hints.pvfs2.debugmask = 0;
+
+ ADIOI_Info_set(fd->info, "striping_factor", "0");
+ fd->hints->striping_factor = 0;
+
+ ADIOI_Info_set(fd->info, "striping_unit", "0");
+ fd->hints->striping_unit = 0;
+
+ /* disable the aggressive strided optimizations by default */
+ ADIOI_Info_set(fd->info, "romio_pvfs2_posix_read", "disable");
+ ADIOI_Info_set(fd->info, "romio_pvfs2_posix_write", "disable");
+ fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_DISABLE;
+ fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_DISABLE;
+
+ ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_read", "disable");
+ ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_write", "disable");
+ fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_DISABLE;
+ fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_DISABLE;
+
+ ADIOI_Info_set(fd->info, "romio_pvfs2_listio_read", "disable");
+ ADIOI_Info_set(fd->info, "romio_pvfs2_listio_write", "disable");
+ fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_DISABLE;
+ fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_DISABLE;
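+
+        /* a user can enable these from application code through an info
+         * object, e.g. (sketch):
+         *   MPI_Info_create(&info);
+         *   MPI_Info_set(info, "romio_pvfs2_listio_read", "enable");
+         *   MPI_File_open(comm, path, amode, info, &fh);
+         */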
+
+
+ /* any user-provided hints? */
+ if (users_info != MPI_INFO_NULL) {
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ /* pvfs2 debugging */
+ ADIOI_Info_get(users_info, "romio_pvfs2_debugmask", MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
+ tmp_value = fd->hints->fs_hints.pvfs2.debugmask =
+ PVFS_debug_eventlog_to_mask(value);
+
+ MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
+ /* --BEGIN ERROR HANDLING-- */
+ if (tmp_value != fd->hints->fs_hints.pvfs2.debugmask) {
+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_pvfs2_debugmask", error_code);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ ADIOI_Info_set(fd->info, "romio_pvfs2_debugmask", value);
+ }
+
+ /* the striping factor */
+ ADIOI_Info_check_and_install_int(fd, users_info, "striping_factor",
+ &(fd->hints->striping_factor), myname, error_code);
+
+
+ /* the striping unit */
+ ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
+ &(fd->hints->striping_unit), myname, error_code);
+
+ /* distribution name */
+ ADIOI_Info_get(users_info, "romio_pvfs2_distribution_name",
+ MPI_MAX_INFO_VAL, value, &flag);
+ if (flag) {
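+                /* TODO: the distribution name hint is read but not yet
+                 * applied */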
+ }
+
+ /* POSIX read */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_pvfs2_posix_read",
+ &(fd->hints->fs_hints.pvfs2.posix_read), myname,
+ error_code);
+
+ /* POSIX write */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_pvfs2_posix_write",
+ &(fd->hints->fs_hints.pvfs2.posix_write), myname,
+ error_code);
+
+ /* Datatype read */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_pvfs2_dtype_read",
+ &(fd->hints->fs_hints.pvfs2.dtype_read), myname,
+ error_code);
+
+ /* Datatype write */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_pvfs2_dtype_write",
+ &(fd->hints->fs_hints.pvfs2.dtype_write), myname,
+ error_code);
+
+ /* Listio read */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_pvfs2_listio_read",
+ &(fd->hints->fs_hints.pvfs2.listio_read), myname,
+ error_code);
+
+            /* Listio write */
+ ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_pvfs2_listio_write",
+ &(fd->hints->fs_hints.pvfs2.listio_write), myname,
+ error_code);
+ ADIOI_Free(value);
+ }
+ }
+ /* set the values for collective I/O and data sieving parameters */
+ ADIOI_GEN_SetInfo(fd, users_info, error_code);
+
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io.h b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..881912db48c01a6388cba5fe1b284b46176691c1
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_PVFS2_IO_H_INCLUDED
+#define AD_PVFS2_IO_H_INCLUDED
+
+/* Contig I/O helper prototypes */
+
+#define READ 0
+#define WRITE 1
+
+/* #define DEBUG_CONTIG */
+/* #define DEBUG_LIST */
+/* #define DEBUG_DTYPE */
+
+/* Contig I/O helper prototypes */
+int ADIOI_PVFS2_Contig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code, int rw_type);
+
+/* List I/O helper prototypes */
+int ADIOI_PVFS2_StridedListIO(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status,
+ int *error_code, int rw_type);
+
+int gen_listio_arr(ADIOI_Flatlist_node * flat_buf,
+ int *flat_buf_index_p,
+ int64_t * cur_flat_buf_reg_off_p,
+ int flat_buf_size,
+ int flat_buf_extent,
+ ADIOI_Flatlist_node * flat_file,
+ int *flat_file_index_p,
+ int64_t * cur_flat_file_reg_off_p,
+ int flat_file_size,
+ int flat_file_extent,
+ int max_ol_count,
+ ADIO_Offset disp,
+ int bytes_into_filetype,
+ int64_t * bytes_completed,
+ int64_t total_io_size,
+ int64_t buf_off_arr[],
+ int32_t buf_len_arr[],
+ int32_t * buf_ol_count_p,
+ int64_t file_off_arr[], int32_t file_len_arr[], int32_t * file_ol_count_p);
+
+void print_buf_file_ol_pairs(int64_t buf_off_arr[],
+ int32_t buf_len_arr[],
+ int32_t buf_ol_count,
+ int64_t file_off_arr[],
+ int32_t file_len_arr[], int32_t file_ol_count, void *buf, int rw_type);
+
+/* Datatype I/O helper prototypes */
+int ADIOI_PVFS2_StridedDtypeIO(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status,
+ int *error_code, int rw_type);
+
+int convert_named(MPI_Datatype * mpi_dtype, PVFS_Request * pvfs_dtype, int combiner);
+
+void print_dtype_info(int combiner,
+ int num_int,
+ int num_addr,
+ int num_dtype, int *arr_int, MPI_Aint * arr_addr, MPI_Datatype * arr_dtype);
+
+int convert_mpi_pvfs2_dtype(MPI_Datatype * mpi_dtype, PVFS_Request * pvfs_dtype);
+
+#endif /* AD_PVFS2_IO_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io_dtype.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io_dtype.c
new file mode 100644
index 0000000000000000000000000000000000000000..491af6d3858237f34147eee8ee40f3deec4ab7a3
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io_dtype.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include <assert.h>
+#include "adio_extern.h"
+#include "ad_pvfs2.h"
+#include "ad_pvfs2_io.h"
+#include "ad_pvfs2_common.h"
+
+int ADIOI_PVFS2_StridedDtypeIO(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code, int rw_type)
+{
+ int ret = -1, filetype_is_contig = -1;
+ MPI_Count filetype_size = -1;
+ int num_filetypes = 0, cur_flat_file_reg_off = 0;
+ PVFS_Request tmp_mem_req, mem_req, tmp_file_req, file_req;
+ PVFS_sysresp_io resp_io;
+ ADIO_Offset off = -1, bytes_into_filetype = 0;
+ MPI_Aint lb, filetype_extent = -1;
+ int i = -1;
+ MPI_Count etype_size;
+ PVFS_size pvfs_disp = -1;
+ ADIOI_Flatlist_node *flat_file_p;
+
+ /* Use for offseting the PVFS2 filetype */
+ int pvfs_blk = 1;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ static char myname[] = "ADIOI_PVFS2_STRIDED_DTYPE";
+
+ memset(&tmp_mem_req, 0, sizeof(PVFS_Request));
+ memset(&mem_req, 0, sizeof(PVFS_Request));
+ memset(&tmp_file_req, 0, sizeof(PVFS_Request));
+ memset(&file_req, 0, sizeof(PVFS_Request));
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ /* changed below if error */
+ *error_code = MPI_SUCCESS;
+
+ /* datatype is the memory type
+ * fd->filetype is the file type */
+ MPI_Type_size_x(fd->filetype, &filetype_size);
+ if (filetype_size == 0) {
+ *error_code = MPI_SUCCESS;
+ return -1;
+ }
+ MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
+ MPI_Type_size_x(fd->etype, &etype_size);
+ if (filetype_size == 0) {
+ *error_code = MPI_SUCCESS;
+ return -1;
+ }
+
+ /* offset is in units of etype relative to the filetype. We
+ * convert this to off in terms of actual data bytes (the offset
+ * minus the number of bytes that are not used). We are allowed
+ * to do this since PVFS2 handles offsets with respect to a
+ * file_req in bytes, otherwise we would have to convert into a
+ * pure byte offset as is done in other methods. Explicit offset
+ * case is handled by using fd->disp and byte-converted off. */
+
+ pvfs_disp = fd->disp;
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ if (filetype_is_contig) {
+ off = fd->fp_ind - fd->disp;
+ } else {
+ int flag = 0;
+ flat_file_p = ADIOI_Flatten_and_find(fd->filetype);
+ num_filetypes = -1;
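+            /* walk whole filetype instances until we find the flattened
+             * region that contains fp_ind, accumulating the data bytes
+             * that precede it */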
+ while (!flag) {
+ num_filetypes++;
+ for (i = 0; i < flat_file_p->count; i++) {
+ /* Start on a non zero-length region */
+ if (flat_file_p->blocklens[i]) {
+ if (fd->disp + flat_file_p->indices[i] +
+ (num_filetypes * filetype_extent) +
+ flat_file_p->blocklens[i] > fd->fp_ind &&
+ fd->disp + flat_file_p->indices[i] <= fd->fp_ind) {
+ cur_flat_file_reg_off = fd->fp_ind -
+ (fd->disp + flat_file_p->indices[i] +
+ (num_filetypes * filetype_extent));
+ flag = 1;
+ break;
+ } else
+ bytes_into_filetype += flat_file_p->blocklens[i];
+ }
+ }
+ }
+ /* Impossible that we don't find it in this datatype */
+ assert(i != flat_file_p->count);
+ off = bytes_into_filetype + cur_flat_file_reg_off;
+ }
+ } else { /* ADIO_EXPLICIT */
+
+ off = etype_size * offset;
+ }
+
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: (fd->fp_ind=%Ld,fd->disp=%Ld,"
+ " offset=%Ld),(pvfs_disp=%Ld,off=%Ld)\n", fd->fp_ind, fd->disp, offset, pvfs_disp, off);
+#endif
+
+
+ /* Convert the MPI memory and file datatypes into
+ * PVFS2 datatypes */
+ ret = convert_mpi_pvfs2_dtype(&datatype, &tmp_mem_req);
+ if (ret < 0) {
+ goto error_state;
+ }
+ ret = convert_mpi_pvfs2_dtype(&(fd->filetype), &tmp_file_req);
+ if (ret < 0) {
+ goto error_state;
+ }
+
+ ret = PVFS_Request_contiguous(count, tmp_mem_req, &mem_req);
+ if (ret != 0) /* TODO: convert this to MPIO error handling */
+ fprintf(stderr, "ADIOI_PVFS2_stridedDtypeIO: error in final" " CONTIG memory type\n");
+ PVFS_Request_free(&tmp_mem_req);
+
+ /* pvfs_disp is used to offset the filetype */
+ ret = PVFS_Request_hindexed(1, &pvfs_blk, &pvfs_disp, tmp_file_req, &file_req);
+ if (ret != 0)
+ fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: error in final" " HINDEXED file type\n");
+ PVFS_Request_free(&tmp_file_req);
+
+ if (rw_type == READ)
+ ret = PVFS_sys_read(pvfs_fs->object_ref, file_req, off, buf,
+ mem_req, &(pvfs_fs->credentials), &resp_io);
+ else
+ ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, off, buf,
+ mem_req, &(pvfs_fs->credentials), &resp_io);
+
+ if (ret != 0) {
+ fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: Warning - PVFS_sys_"
+ "read/write returned %d and completed %Ld bytes.\n",
+ ret, (long long) resp_io.total_completed);
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_io \n", 0);
+ goto error_state;
+ }
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind = off += resp_io.total_completed;
+ }
+
+ error_state:
+    fd->fp_sys_posn = -1;       /* invalidate the cached system file position */
+
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: "
+ "resp_io.total_completed=%Ld,ret=%d\n", resp_io.total_completed, ret);
+#endif
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, resp_io.total_completed);
+ /* This is a temporary way of filling in status. The right way is to
+     * keep track of how much data was actually accessed by
+ * ADIOI_BUFFERED operations */
+#endif
+ return ret;
+}
+
+/* convert_mpi_pvfs2_dtype - Convert a MPI datatype into
+ * a PVFS2 datatype so that we can natively use the PVFS2
+ * datatypes in the PVFS2 I/O calls instead of converting
+ * all datatypes to the hindexed method
+ * return 1 - a leaf node
+ * return 0 - normal return
+ * return -1 - problems */
+
+int convert_mpi_pvfs2_dtype(MPI_Datatype * mpi_dtype, PVFS_Request * pvfs_dtype)
+{
+ int num_int = -1, num_addr = -1, num_dtype = -1, combiner = -1, i = -1, ret = -1, leaf = -1;
+ int *arr_int = NULL;
+ MPI_Aint *arr_addr = NULL;
+ MPI_Datatype *arr_dtype = NULL;
+ PVFS_Request *old_pvfs_dtype = NULL;
+ PVFS_Request *old_pvfs_dtype_arr = NULL;
+ int arr_count = -1;
+ PVFS_size *pvfs_arr_disp = NULL;
+ int *pvfs_arr_len = NULL;
+
+ MPI_Type_get_envelope(*mpi_dtype, &num_int, &num_addr, &num_dtype, &combiner);
+
+ /* Depending on type of datatype do the following
+ * operations */
+
+ if (combiner == MPI_COMBINER_NAMED) {
+ convert_named(mpi_dtype, pvfs_dtype, combiner);
+ return 1;
+ }
+
+ /* Allocate space for the arrays necessary for
+ * MPI_Type_get_contents */
+
+ if ((arr_int = ADIOI_Malloc(sizeof(int) * num_int)) == NULL) {
+ fprintf(stderr, "Failed to allocate array_int\n");
+ return -1;
+ }
+ if ((arr_addr = ADIOI_Malloc(sizeof(int) * num_addr)) == NULL) {
+ ADIOI_Free(arr_int);
+ fprintf(stderr, "Failed to allocate array_addr\n");
+ return -1;
+ }
+ if ((arr_dtype = ADIOI_Malloc(sizeof(MPI_Datatype) * num_dtype)) == NULL) {
+ ADIOI_Free(arr_int);
+ ADIOI_Free(arr_addr);
+ fprintf(stderr, "Failed to allocate array_dtypes\n");
+ return -1;
+ }
+
+ MPI_Type_get_contents(*mpi_dtype, num_int, num_addr, num_dtype, arr_int, arr_addr, arr_dtype);
+
+ /* If it's not a predefined datatype, it is either a
+ * derived datatype or a structured datatype */
+
+ if (combiner != MPI_COMBINER_STRUCT) {
+ if ((old_pvfs_dtype = ADIOI_Malloc(sizeof(PVFS_Request))) == NULL)
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "Failed to allocate PVFS_Request\n");
+ switch (combiner) {
+ case MPI_COMBINER_CONTIGUOUS:
+ leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
+ ret = PVFS_Request_contiguous(arr_int[0], *old_pvfs_dtype, pvfs_dtype);
+ break;
+ case MPI_COMBINER_VECTOR:
+ leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
+ ret = PVFS_Request_vector(arr_int[0], arr_int[1],
+ arr_int[2], *old_pvfs_dtype, pvfs_dtype);
+ break;
+ case MPI_COMBINER_HVECTOR:
+ leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
+ ret = PVFS_Request_hvector(arr_int[0], arr_int[1],
+ arr_addr[0], *old_pvfs_dtype, pvfs_dtype);
+ break;
+ /* Both INDEXED and HINDEXED types require PVFS_size
+ * address arrays. Therefore, we need to copy and
+             * convert the data from MPI_Type_get_contents() into
+ * a PVFS_size buffer */
+ case MPI_COMBINER_INDEXED:
+ leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
+ if ((pvfs_arr_disp = ADIOI_Malloc(arr_int[0] * sizeof(PVFS_size))) == 0) {
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: "
+ "Failed to allocate pvfs_arr_disp\n");
+ }
+ for (i = 0; i < arr_int[0]; i++) {
+ pvfs_arr_disp[i] = (PVFS_size) arr_int[arr_int[0] + 1 + i];
+ }
+ ret = PVFS_Request_indexed(arr_int[0], &arr_int[1],
+ pvfs_arr_disp, *old_pvfs_dtype, pvfs_dtype);
+ ADIOI_Free(pvfs_arr_disp);
+ break;
+ case MPI_COMBINER_HINDEXED:
+ leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
+ if ((pvfs_arr_disp = ADIOI_Malloc(arr_int[0] * sizeof(PVFS_size))) == 0) {
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: "
+ "Failed to allocate pvfs_arr_disp\n");
+ }
+ for (i = 0; i < arr_int[0]; i++) {
+ pvfs_arr_disp[i] = (PVFS_size) arr_addr[i];
+ }
+            ret = PVFS_Request_hindexed(arr_int[0], &arr_int[1],
+                                        pvfs_arr_disp, *old_pvfs_dtype, pvfs_dtype);
+ ADIOI_Free(pvfs_arr_disp);
+ break;
+ case MPI_COMBINER_DUP:
+ leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
+ ret = PVFS_Request_contiguous(1, *old_pvfs_dtype, pvfs_dtype);
+
+ break;
+ case MPI_COMBINER_INDEXED_BLOCK:
+ /* No native PVFS2 support for this operation currently */
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "INDEXED_BLOCK is unsupported\n");
+ break;
+ case MPI_COMBINER_HINDEXED_BLOCK:
+ /* No native PVFS2 support for this operation currently */
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "HINDEXED_BLOCK is unsupported\n");
+ break;
+ case MPI_COMBINER_HINDEXED_INTEGER:
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "HINDEXED_INTEGER is unsupported\n");
+ break;
+ case MPI_COMBINER_STRUCT_INTEGER:
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "STRUCT_INTEGER is unsupported\n");
+ break;
+ case MPI_COMBINER_SUBARRAY:
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "SUBARRAY is unsupported\n");
+ break;
+ case MPI_COMBINER_DARRAY:
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "DARRAY is unsupported\n");
+ break;
+ case MPI_COMBINER_F90_REAL:
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "F90_REAL is unsupported\n");
+ break;
+ case MPI_COMBINER_F90_COMPLEX:
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "F90_COMPLEX is unsupported\n");
+ break;
+ case MPI_COMBINER_F90_INTEGER:
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "F90_INTEGER is unsupported\n");
+ break;
+ case MPI_COMBINER_RESIZED:
+ ADIOI_Free(old_pvfs_dtype);
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "RESIZED is unsupported\n");
+ break;
+ default:
+ break;
+ }
+
+ if (ret != 0)
+ fprintf(stderr, "Error in PVFS_Request_* " "for a derived datatype\n");
+
+#ifdef DEBUG_DTYPE
+ print_dtype_info(combiner, num_int, num_addr, num_dtype, arr_int, arr_addr, arr_dtype);
+#endif
+
+ if (leaf != 1 && combiner != MPI_COMBINER_DUP)
+ MPI_Type_free(&arr_dtype[0]);
+
+ ADIOI_Free(arr_int);
+ ADIOI_Free(arr_addr);
+ ADIOI_Free(arr_dtype);
+
+ PVFS_Request_free(old_pvfs_dtype);
+ ADIOI_Free(old_pvfs_dtype);
+
+ return ret;
+ } else { /* MPI_COMBINER_STRUCT */
+
+ MPI_Aint mpi_lb = -1, mpi_extent = -1;
+ PVFS_offset pvfs_lb = -1;
+ PVFS_size pvfs_extent = -1;
+ int has_lb_ub = 0;
+
+        /* When converting into a PVFS_Request_struct, we can no longer
+         * use MPI_LB and MPI_UB. Therefore, we simply ignore all the
+         * MPI_LB and MPI_UB types, get the lb and extent, and pass them
+         * on through a PVFS resized_req */
+
+ arr_count = 0;
+ for (i = 0; i < arr_int[0]; i++) {
+ if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB) {
+ arr_count++;
+ }
+ }
+
+ if (arr_int[0] != arr_count) {
+ MPI_Type_get_extent(*mpi_dtype, &mpi_lb, &mpi_extent);
+ pvfs_lb = mpi_lb;
+ pvfs_extent = mpi_extent;
+ if ((pvfs_arr_len = ADIOI_Malloc(arr_count * sizeof(int)))
+ == NULL) {
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "Failed to allocate pvfs_arr_len\n");
+ }
+ has_lb_ub = 1;
+ }
+
+ if ((old_pvfs_dtype_arr = ADIOI_Malloc(arr_count * sizeof(PVFS_Request))) == NULL)
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "Failed to allocate PVFS_Requests\n");
+
+ if ((pvfs_arr_disp = ADIOI_Malloc(arr_count * sizeof(PVFS_size)))
+ == NULL) {
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "Failed to allocate pvfs_arr_disp\n");
+ }
+
+ arr_count = 0;
+ for (i = 0; i < arr_int[0]; i++) {
+ if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB) {
+ leaf = convert_mpi_pvfs2_dtype(&arr_dtype[i], &old_pvfs_dtype_arr[arr_count]);
+ if (leaf != 1)
+ MPI_Type_free(&arr_dtype[i]);
+ pvfs_arr_disp[arr_count] = (PVFS_size) arr_addr[i];
+ if (has_lb_ub) {
+ pvfs_arr_len[arr_count] = arr_int[i + 1];
+ }
+ arr_count++;
+ }
+ }
+
+ /* If a MPI_UB or MPI_LB did exist, we have to
+ * resize the datatype */
+ if (has_lb_ub) {
+ PVFS_Request *tmp_pvfs_dtype = NULL;
+ if ((tmp_pvfs_dtype = ADIOI_Malloc(sizeof(PVFS_Request))) == NULL)
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: " "Failed to allocate PVFS_Request\n");
+
+ ret = PVFS_Request_struct(arr_count, pvfs_arr_len,
+ pvfs_arr_disp, old_pvfs_dtype_arr, tmp_pvfs_dtype);
+ if (ret != 0)
+ fprintf(stderr, "Error in PVFS_Request_struct\n");
+
+ arr_count = 0;
+ for (i = 0; i < arr_int[0]; i++) {
+ if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB) {
+ PVFS_Request_free(&old_pvfs_dtype_arr[arr_count]);
+ arr_count++;
+ }
+ }
+
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "STRUCT(WITHOUT %d LB or UB)(%d,[", arr_int[0] - arr_count, arr_count);
+ for (i = 0; i < arr_count; i++)
+ fprintf(stderr, "(%d,%Ld) ", pvfs_arr_len[i], pvfs_arr_disp[i]);
+ fprintf(stderr, "]\n");
+ fprintf(stderr, "RESIZED(LB = %Ld, EXTENT = %Ld)\n", pvfs_lb, pvfs_extent);
+#endif
+ ret = PVFS_Request_resized(*tmp_pvfs_dtype, pvfs_lb, pvfs_extent, pvfs_dtype);
+ if (ret != 0)
+ fprintf(stderr, "Error in PVFS_Request_resize\n");
+
+ PVFS_Request_free(tmp_pvfs_dtype);
+ ADIOI_Free(tmp_pvfs_dtype);
+ } else { /* No MPI_LB or MPI_UB datatypes */
+
+ ret = PVFS_Request_struct(arr_int[0], &arr_int[1],
+ pvfs_arr_disp, old_pvfs_dtype_arr, pvfs_dtype);
+ if (ret != 0)
+ fprintf(stderr, "Error in PVFS_Request_struct\n");
+
+ for (i = 0; i < arr_int[0]; i++) {
+ if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB)
+ PVFS_Request_free(&old_pvfs_dtype_arr[i]);
+ }
+
+#ifdef DEBUG_DTYPE
+ print_dtype_info(combiner, num_int, num_addr, num_dtype, arr_int, arr_addr, arr_dtype);
+#endif
+ }
+
+ ADIOI_Free(arr_int);
+ ADIOI_Free(arr_addr);
+ ADIOI_Free(arr_dtype);
+
+ ADIOI_Free(old_pvfs_dtype_arr);
+ ADIOI_Free(pvfs_arr_disp);
+ ADIOI_Free(pvfs_arr_len);
+
+ return ret;
+ }
+
+ /* Shouldn't have gotten here */
+ fprintf(stderr, "convert_mpi_pvfs2_dtype: SERIOUS ERROR\n");
+ return -1;
+}
+
+int convert_named(MPI_Datatype * mpi_dtype, PVFS_Request * pvfs_dtype, int combiner)
+{
+ int ret = -1;
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "NAMED");
+#endif
+
+ if (MPI_CHAR == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_CHAR, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_CHAR\n");
+#endif
+ } else if (MPI_BYTE == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_BYTE, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_BYTE\n");
+#endif
+ } else if (MPI_SHORT == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_SHORT, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_SHORT\n");
+#endif
+ } else if (MPI_INT == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_INT, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_INT\n");
+#endif
+ } else if (MPI_LONG == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_LONG, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_LONG\n");
+#endif
+ } else if (MPI_FLOAT == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_FLOAT, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_FLOAT\n");
+#endif
+ } else if (MPI_DOUBLE == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_DOUBLE, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_DOUBLE\n");
+#endif
+ } else if (MPI_UNSIGNED_CHAR == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_UNSIGNED_CHAR, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_UNSIGNED_CHAR\n");
+#endif
+ } else if (MPI_UNSIGNED_SHORT == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_UNSIGNED, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_UNSIGNED_SHORT\n");
+#endif
+ } else if (MPI_UNSIGNED == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_UNSIGNED, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+        fprintf(stderr, "-MPI_UNSIGNED\n");
+#endif
+ } else if (MPI_UNSIGNED_LONG == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_UNSIGNED_LONG, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_UNSIGNED_LONG\n");
+#endif
+ } else if (MPI_LONG_DOUBLE == *mpi_dtype) {
+ ret = PVFS_Request_contiguous(1, PVFS_LONG_DOUBLE, pvfs_dtype);
+#ifdef DEBUG_DTYPE
+ fprintf(stderr, "-MPI_LONG_DOUBLE\n");
+#endif
+ } else {
+        fprintf(stderr, "convert_named: predefined type not found\n");
+ return -1;
+ }
+ if (ret != 0)
+ fprintf(stderr, "convert_named: Datatype creation failed\n");
+ return ret;
+}
+
+void print_dtype_info(int combiner,
+ int num_int,
+ int num_addr,
+ int num_dtype, int *arr_int, MPI_Aint * arr_addr, MPI_Datatype * arr_dtype)
+{
+ int i = -1;
+ switch (combiner) {
+ case MPI_COMBINER_CONTIGUOUS:
+ fprintf(stderr, "CONTIG(%d)\n", arr_int[0]);
+ break;
+ case MPI_COMBINER_VECTOR:
+ fprintf(stderr, "VECTOR(%d,%d,%d)\n", arr_int[0], arr_int[1], arr_int[2]);
+ break;
+ case MPI_COMBINER_HVECTOR:
+ fprintf(stderr, "HVECTOR(%d,%d,%ld)\n", arr_int[0], arr_int[1], arr_addr[0]);
+ break;
+ case MPI_COMBINER_INDEXED:
+ fprintf(stderr, "INDEXED(%d,[", arr_int[0]);
+ for (i = 0; i < arr_int[0]; i++)
+ fprintf(stderr, "(%d,%d) ", arr_int[1 + i], arr_int[arr_int[0] + 1 + i]);
+ fprintf(stderr, "]\n");
+ break;
+ case MPI_COMBINER_HINDEXED:
+ fprintf(stderr, "HINDEXED(%d,[", arr_int[0]);
+ for (i = 0; i < arr_int[0]; i++)
+ fprintf(stderr, "(%d,%lld) ", arr_int[1 + i], (long long) arr_addr[i]);
+ fprintf(stderr, "]\n");
+ break;
+ case MPI_COMBINER_STRUCT:
+ fprintf(stderr, "STRUCT(%d,[", arr_int[0]);
+ for (i = 0; i < arr_int[0]; i++)
+ fprintf(stderr, "(%d,%lld) ", arr_int[1 + i], (long long) arr_addr[i]);
+ fprintf(stderr, "]\n");
+ break;
+ case MPI_COMBINER_DUP:
+ fprintf(stderr, "DUP\n");
+ break;
+ default:
+ fprintf(stderr, "no available information on this datatype");
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io_list.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io_list.c
new file mode 100644
index 0000000000000000000000000000000000000000..8030f2a917406622376f7012cb523a96b89e8dca
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_io_list.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include <assert.h>
+#include "adio_extern.h"
+#include "ad_pvfs2.h"
+#include "ad_pvfs2_io.h"
+#include "ad_pvfs2_common.h"
+
+#define COALESCE_REGIONS /* TODO: would we ever want to *not* coalesce? */
+#define MAX_OL_COUNT 64
+int ADIOI_PVFS2_StridedListIO(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status,
+ int *error_code, int rw_type)
+{
+ /* list I/O parameters */
+ int i = -1, ret = -1;
+ int tmp_filetype_size = -1;
+ int64_t cur_io_size = 0, io_size = 0;
+ int etype_size = -1;
+ int num_etypes_in_filetype = -1, num_filetypes = -1;
+ int etypes_in_filetype = -1, size_in_filetype = -1;
+ int bytes_into_filetype = 0;
+ MPI_Offset total_bytes_accessed = 0;
+
+ /* parameters for offset-length pairs arrays */
+ int64_t buf_off_arr[MAX_OL_COUNT];
+ int32_t buf_len_arr[MAX_OL_COUNT];
+ int64_t file_off_arr[MAX_OL_COUNT];
+ int32_t file_len_arr[MAX_OL_COUNT];
+ int32_t buf_ol_count = 0;
+ int32_t file_ol_count = 0;
+
+ /* parameters for flattened memory and file datatypes */
+ int flat_buf_index = 0;
+ int flat_file_index = 0;
+ int64_t cur_flat_buf_reg_off = 0;
+ int64_t cur_flat_file_reg_off = 0;
+ ADIOI_Flatlist_node *flat_buf_p, *flat_file_p;
+ MPI_Count buftype_size = -1, filetype_size = -1;
+ MPI_Aint lb, filetype_extent = -1, buftype_extent = -1;
+ int buftype_is_contig = -1, filetype_is_contig = -1;
+
+ /* PVFS2 specific parameters */
+ PVFS_Request mem_req, file_req;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ PVFS_sysresp_io resp_io;
+ static char myname[] = "ADIOI_PVFS2_STRIDED_LISTIO";
+
+ if (fd->atomicity) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_ARG,
+ "Atomic noncontiguous writes"
+ " are not supported by PVFS2", 0);
+ return -1;
+ }
+
+ MPI_Type_size_x(fd->filetype, &filetype_size);
+ if (filetype_size == 0) {
+ *error_code = MPI_SUCCESS;
+ return -1;
+ }
+ MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
+ MPI_Type_size_x(datatype, &buftype_size);
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+ io_size = buftype_size * count;
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ /* Flatten the memory datatype
+ * (file datatype has already been flattened in ADIO open
+ * unless it is contiguous, in which case we flatten it manually)
+ * and set the correct buffers for flat_buf and flat_file */
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+ if (buftype_is_contig == 0) {
+ flat_buf_p = ADIOI_Flatten_and_find(datatype);
+ } else {
+ /* flatten and add to the list */
+ flat_buf_p = (ADIOI_Flatlist_node *) ADIOI_Malloc(sizeof(ADIOI_Flatlist_node));
+ flat_buf_p->blocklens = (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset));
+ flat_buf_p->indices = (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset));
+ /* Since the buftype is contiguous we can collapse it into a single
+ * block covering all 'count' elements; this is not possible with the
+ * filetype since it is tiled across the file */
+ buftype_size = buftype_size * count;
+ buftype_extent = buftype_size; /* buftype_size already includes count */
+ flat_buf_p->blocklens[0] = buftype_size;
+ flat_buf_p->indices[0] = 0;
+ flat_buf_p->count = 1;
+ }
+ if (filetype_is_contig == 0) {
+ /* TODO: why does avery say this should already have been
+ * flattened in Open, but also says contig types don't get
+ * flattened */
+ flat_file_p = ADIOI_Flatten_and_find(fd->filetype);
+ } else {
+ /* flatten and add to the list */
+ flat_file_p = (ADIOI_Flatlist_node *) ADIOI_Malloc(sizeof(ADIOI_Flatlist_node));
+ flat_file_p->blocklens = (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset));
+ flat_file_p->indices = (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset));
+ flat_file_p->blocklens[0] = filetype_size;
+ flat_file_p->indices[0] = 0;
+ flat_file_p->count = 1;
+ }
+
+ /* Find out where we are in the flattened filetype (the block index,
+ * how far into the block, and how many bytes_into_filetype)
+ * If the file_ptr_type == ADIO_INDIVIDUAL we will use disp, fp_ind
+ * to figure this out (offset should always be zero)
+ * If file_ptr_type == ADIO_EXPLICIT, we will use disp and offset
+ * to figure this out. */
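+ /* Worked example (values invented for illustration): disp=0, a
+ * filetype of two 4-byte blocks at indices 0 and 8 (extent 16), and
+ * fp_ind=26. The search below skips three data blocks (12 bytes)
+ * and lands 2 bytes into the fourth, so flat_file_index=1,
+ * cur_flat_file_reg_off=2 and bytes_into_filetype=12. */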
+
+ etype_size = fd->etype_size;
+ num_etypes_in_filetype = filetype_size / etype_size;
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ int flag = 0;
+ /* Should have already been flattened in ADIO_Open */
+ num_filetypes = -1;
+ while (!flag) {
+ num_filetypes++;
+ for (i = 0; i < flat_file_p->count; i++) {
+ /* Start on a non zero-length region */
+ if (flat_file_p->blocklens[i]) {
+ if (fd->disp + flat_file_p->indices[i] +
+ (num_filetypes * filetype_extent) +
+ flat_file_p->blocklens[i] > fd->fp_ind &&
+ fd->disp + flat_file_p->indices[i] <= fd->fp_ind) {
+ flat_file_index = i;
+ cur_flat_file_reg_off = fd->fp_ind -
+ (fd->disp + flat_file_p->indices[i] +
+ (num_filetypes * filetype_extent));
+ flag = 1;
+ break;
+ } else
+ bytes_into_filetype += flat_file_p->blocklens[i];
+ }
+ }
+ }
+ /* Impossible that we don't find it in this datatype */
+ assert(i != flat_file_p->count);
+ } else {
+ num_filetypes = (int) (offset / num_etypes_in_filetype);
+ etypes_in_filetype = (int) (offset % num_etypes_in_filetype);
+ size_in_filetype = etypes_in_filetype * etype_size;
+
+ tmp_filetype_size = 0;
+ for (i = 0; i < flat_file_p->count; i++) {
+ tmp_filetype_size += flat_file_p->blocklens[i];
+ if (tmp_filetype_size > size_in_filetype) {
+ flat_file_index = i;
+ cur_flat_file_reg_off = flat_file_p->blocklens[i] -
+ (tmp_filetype_size - size_in_filetype);
+ bytes_into_filetype = offset * filetype_size - flat_file_p->blocklens[i];
+ break;
+ }
+ }
+ }
+#ifdef DEBUG_LIST
+ fprintf(stderr, "ADIOI_PVFS2_StridedListIO: (fd->fp_ind=%Ld,fd->disp=%Ld,"
+ " offset=%Ld)\n(flat_file_index=%d,cur_flat_file_reg_off=%Ld,"
+ "bytes_into_filetype=%d)\n",
+ fd->fp_ind, fd->disp, offset, flat_file_index,
+ cur_flat_file_reg_off, bytes_into_filetype);
+#endif
+#ifdef DEBUG_LIST2
+ fprintf(stderr, "flat_buf:\n");
+ for (i = 0; i < flat_buf_p->count; i++)
+ fprintf(stderr, "(offset, length) = (%Ld, %d)\n",
+ flat_buf_p->indices[i], flat_buf_p->blocklens[i]);
+ fprintf(stderr, "flat_file:\n");
+ for (i = 0; i < flat_file_p->count; i++)
+ fprintf(stderr, "(offset, length) = (%Ld, %d)\n",
+ flat_file_p->indices[i], flat_file_p->blocklens[i]);
+#endif
+
+ /* total data moved so far (read or written, depending on rw_type) */
+ cur_io_size = 0;
+ while (cur_io_size != io_size) {
+ /* Initialize the temporarily unrolling lists and
+ * and associated variables */
+ buf_ol_count = 0;
+ file_ol_count = 0;
+ for (i = 0; i < MAX_OL_COUNT; i++) {
+ buf_off_arr[i] = 0;
+ buf_len_arr[i] = 0;
+ file_off_arr[i] = 0;
+ file_len_arr[i] = 0;
+ }
+
+ /* Generate the offset-length pairs for a
+ * list I/O operation */
+ gen_listio_arr(flat_buf_p,
+ &flat_buf_index,
+ &cur_flat_buf_reg_off,
+ buftype_size,
+ buftype_extent,
+ flat_file_p,
+ &flat_file_index,
+ &cur_flat_file_reg_off,
+ filetype_size,
+ filetype_extent,
+ MAX_OL_COUNT,
+ fd->disp,
+ bytes_into_filetype,
+ &cur_io_size,
+ io_size,
+ buf_off_arr,
+ buf_len_arr, &buf_ol_count, file_off_arr, file_len_arr, &file_ol_count);
+
+ assert(buf_ol_count <= MAX_OL_COUNT);
+ assert(file_ol_count <= MAX_OL_COUNT);
+#ifdef DEBUG_LIST2
+ print_buf_file_ol_pairs(buf_off_arr,
+ buf_len_arr,
+ buf_ol_count,
+ file_off_arr, file_len_arr, file_ol_count, buf, rw_type);
+#endif
+#ifdef DEBUG_LIST2
+ do {
+ int y, z;
+ fprintf(stderr, "ad_pvfs2_io_list.c::\n");
+ for (y = 0; y < buf_ol_count; y++) {
+ for (z = 0; z < buf_len_arr[y]; z++) {
+ fprintf(stderr, "buf[%d][%d]=%c\n", y, z, ((char *) buf + buf_off_arr[y])[z]);
+ }
+ }
+ } while (0);
+#endif
+
+ /* Run list I/O operation */
+ ret = PVFS_Request_hindexed(buf_ol_count, buf_len_arr, buf_off_arr, PVFS_BYTE, &mem_req);
+
+ ret = PVFS_Request_hindexed(file_ol_count, file_len_arr,
+ file_off_arr, PVFS_BYTE, &file_req);
+ if (rw_type == READ) {
+ ret = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
+ buf, mem_req, &(pvfs_fs->credentials), &resp_io);
+ } else {
+ ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
+ buf, mem_req, &(pvfs_fs->credentials), &resp_io);
+ }
+ if (ret != 0) {
+ fprintf(stderr, "ADIOI_PVFS2_StridedListIO: Warning - PVFS_sys_"
+ "read/write returned %d and completed %lld bytes.\n",
+ ret, (long long) resp_io.total_completed);
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_io \n", 0);
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ goto error_state;
+ }
+ total_bytes_accessed += resp_io.total_completed;
+
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ }
+
+#ifdef DEBUG_LIST
+ fprintf(stderr, "ADIOI_PVFS2_StridedListIO: "
+ "total_bytes_accessed=%Ld,ret=%d\n", total_bytes_accessed, ret);
+#endif
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind += total_bytes_accessed;
+ *error_code = MPI_SUCCESS;
+
+ error_state:
+#ifdef HAVE_STATUS_SET_BYTES
+ /* TODO: why the cast? */
+ MPIR_Status_set_bytes(status, datatype, total_bytes_accessed);
+/* This is a temporary way of filling in status. The right way is to
+ keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
+#endif
+ if (buftype_is_contig != 0) {
+ ADIOI_Free(flat_buf_p->blocklens);
+ ADIOI_Free(flat_buf_p->indices);
+ ADIOI_Free(flat_buf_p);
+ }
+
+ if (filetype_is_contig != 0) {
+ ADIOI_Free(flat_file_p->blocklens);
+ ADIOI_Free(flat_file_p->indices);
+ ADIOI_Free(flat_file_p);
+ }
+
+ return 0;
+}
+
+/* To do: Fix the code to coalesce the offset-length pairs for memory
+ * and file. */
+
+/* gen_listio_arr - fills in offset-length pairs for memory and file
+ * for list I/O */
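+/* Usage contract, as exercised by ADIOI_PVFS2_StridedListIO above:
+ * each call resumes from the saved cursors (*flat_buf_index_p,
+ * *cur_flat_buf_reg_off_p and their file-side equivalents), emits at
+ * most max_ol_count offset-length pairs per side, and advances
+ * *bytes_completed; the caller keeps calling until *bytes_completed
+ * reaches total_io_size. */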
+int gen_listio_arr(ADIOI_Flatlist_node * flat_buf_p,
+ int *flat_buf_index_p,
+ int64_t * cur_flat_buf_reg_off_p,
+ int flat_buf_size,
+ int flat_buf_extent,
+ ADIOI_Flatlist_node * flat_file_p,
+ int *flat_file_index_p,
+ int64_t * cur_flat_file_reg_off_p,
+ int flat_file_size,
+ int flat_file_extent,
+ int max_ol_count,
+ ADIO_Offset disp,
+ int bytes_into_filetype,
+ int64_t * bytes_completed,
+ int64_t total_io_size,
+ int64_t buf_off_arr[],
+ int32_t buf_len_arr[],
+ int32_t * buf_ol_count_p,
+ int64_t file_off_arr[], int32_t file_len_arr[], int32_t * file_ol_count_p)
+{
+ int region_size = -1;
+
+ /* parameters for flattened memory and file datatypes */
+ int64_t cur_flat_buf_reg_left = 0;
+ int64_t cur_flat_file_reg_left = 0;
+
+#ifdef DEBUG_LIST2
+ fprintf(stderr, "gen_list_arr:\n");
+#endif
+
+ if ((*buf_ol_count_p) != 0 || (*file_ol_count_p) != 0) {
+ fprintf(stderr, "buf_ol_count != 0 || file_ol_count != 0\n");
+ return -1;
+ }
+
+ /* Start on non-zero-length memory and file regions.
+ * Note this does not affect bytes_completed since no
+ * data lives in the skipped zero-length regions. Initialize
+ * the first memory and file offsets. */
+ while (flat_buf_p->blocklens[(*flat_buf_index_p)] == 0) {
+ (*flat_buf_index_p) = ((*flat_buf_index_p) + 1) % flat_buf_p->count;
+ }
+ buf_off_arr[*buf_ol_count_p] =
+ (*bytes_completed / flat_buf_size) *
+ flat_buf_extent + flat_buf_p->indices[*flat_buf_index_p] + *cur_flat_buf_reg_off_p;
+ buf_len_arr[*buf_ol_count_p] = 0;
+
+ while (flat_file_p->blocklens[(*flat_file_index_p)] == 0) {
+ (*flat_file_index_p) = ((*flat_file_index_p) + 1) % flat_file_p->count;
+ }
+ file_off_arr[*file_ol_count_p] = disp +
+ (((bytes_into_filetype + *bytes_completed) / flat_file_size) *
+ flat_file_extent) + flat_file_p->indices[*flat_file_index_p] + *cur_flat_file_reg_off_p;
+ file_len_arr[*file_ol_count_p] = 0;
+
+#ifdef DEBUG_LIST2
+ fprintf(stderr, "initial buf_off_arr[%d] = %Ld\n", *buf_ol_count_p,
+ buf_off_arr[*buf_ol_count_p]);
+ fprintf(stderr, "initial file_off_arr[%d] = %Ld\n", *file_ol_count_p,
+ file_off_arr[*file_ol_count_p]);
+#endif
+
+ while (*bytes_completed != total_io_size
+ && (*buf_ol_count_p) < max_ol_count && (*file_ol_count_p) < max_ol_count) {
+ /* How much data is left in the current piece in
+ * the flattened datatypes */
+ cur_flat_buf_reg_left = flat_buf_p->blocklens[*flat_buf_index_p]
+ - *cur_flat_buf_reg_off_p;
+ cur_flat_file_reg_left = flat_file_p->blocklens[*flat_file_index_p]
+ - *cur_flat_file_reg_off_p;
+
+#ifdef DEBUG_LIST2
+ fprintf(stderr,
+ "flat_buf_index=%d flat_buf->blocklens[%d]=%d\n"
+ "cur_flat_buf_reg_left=%Ld "
+ "*cur_flat_buf_reg_off_p=%Ld\n"
+ "flat_file_index=%d flat_file->blocklens[%d]=%d\n"
+ "cur_flat_file_reg_left=%Ld "
+ "*cur_flat_file_reg_off_p=%Ld\n"
+ "bytes_completed=%Ld\n"
+ "buf_ol_count=%d file_ol_count=%d\n"
+ "buf_len_arr[%d]=%d file_len_arr[%d]=%d\n\n",
+ *flat_buf_index_p, *flat_buf_index_p,
+ flat_buf_p->blocklens[*flat_buf_index_p],
+ cur_flat_buf_reg_left,
+ *cur_flat_buf_reg_off_p,
+ *flat_file_index_p, *flat_file_index_p,
+ flat_file_p->blocklens[*flat_file_index_p],
+ cur_flat_file_reg_left,
+ *cur_flat_file_reg_off_p,
+ *bytes_completed,
+ *buf_ol_count_p, *file_ol_count_p,
+ *buf_ol_count_p,
+ buf_len_arr[*buf_ol_count_p], *file_ol_count_p, file_len_arr[*file_ol_count_p]);
+#endif
+
+ /* What is the size of the next contiguous region agreed
+ * upon by both the memory and file regions that does not
+ * surpass the remaining total I/O size */
+ if (cur_flat_buf_reg_left > cur_flat_file_reg_left)
+ region_size = cur_flat_file_reg_left;
+ else
+ region_size = cur_flat_buf_reg_left;
+
+ if (region_size > total_io_size - *bytes_completed)
+ region_size = total_io_size - *bytes_completed;
+
+ /* Add this piece to both the mem and file arrays, coalescing
+ * offset-length pairs where possible, and advance the cursors
+ * through the flattened mem and file datatypes as well. Note: no
+ * more than a single piece can be consumed here since we take the
+ * smallest region possible. */
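+ /* Coalescing, with invented values: if the previous pair is
+ * (off=100, len=8) and the region just completed starts at offset
+ * 108, it is folded into (100, 8+region_size) rather than consuming
+ * another of the MAX_OL_COUNT slots. */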
+
+ if (cur_flat_buf_reg_left == region_size) {
+#ifdef DEBUG_LIST2
+ fprintf(stderr, "reached end of memory block...\n");
+#endif
+ (*flat_buf_index_p) = ((*flat_buf_index_p) + 1) % flat_buf_p->count;
+ while (flat_buf_p->blocklens[(*flat_buf_index_p)] == 0) {
+ (*flat_buf_index_p) = ((*flat_buf_index_p) + 1) % flat_buf_p->count;
+ }
+ *cur_flat_buf_reg_off_p = 0;
+
+#ifdef COALESCE_REGIONS
+ if (*buf_ol_count_p != 0) {
+ if (buf_off_arr[(*buf_ol_count_p) - 1] +
+ buf_len_arr[(*buf_ol_count_p) - 1] == buf_off_arr[*buf_ol_count_p]) {
+ buf_len_arr[(*buf_ol_count_p) - 1] += region_size;
+ } else {
+ buf_len_arr[*buf_ol_count_p] += region_size;
+ (*buf_ol_count_p)++;
+ }
+ } else {
+#endif
+ buf_len_arr[*buf_ol_count_p] += region_size;
+ (*buf_ol_count_p)++;
+#ifdef COALESCE_REGIONS
+ }
+#endif
+
+ /* Don't prepare for the next piece if we have reached
+ * the limit, or else it will segfault. */
+ if ((*buf_ol_count_p) != max_ol_count) {
+ buf_off_arr[*buf_ol_count_p] =
+ ((*bytes_completed + region_size) / flat_buf_size) *
+ flat_buf_extent +
+ flat_buf_p->indices[*flat_buf_index_p] + (*cur_flat_buf_reg_off_p);
+ buf_len_arr[*buf_ol_count_p] = 0;
+ }
+ } else if (cur_flat_buf_reg_left > region_size) {
+#ifdef DEBUG_LIST2
+ fprintf(stderr, "advanced %d in memory block...\n", region_size);
+#endif
+ (*cur_flat_buf_reg_off_p) += region_size;
+ buf_len_arr[*buf_ol_count_p] += region_size;
+ } else {
+ fprintf(stderr, "gen_listio_arr: Error\n");
+ }
+
+ /* To calculate the absolute file offset we need to
+ * add the disp, how many filetypes we have gone through,
+ * the relative block offset in the filetype and how far
+ * into the block we have gone. */
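+ /* With invented values: disp=1024, a filetype of two 4-byte blocks
+ * at indices 0 and 8 (size 8, extent 16), and 12 data bytes done:
+ * 12/8 = 1 full tile (+16); the next region is block 1 of tile 1 at
+ * index 8, offset 0 into it, so the absolute file offset is
+ * 1024 + 16 + 8 + 0 = 1048. */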
+ if (cur_flat_file_reg_left == region_size) {
+#ifdef DEBUG_LIST2
+ fprintf(stderr, "reached end of file block...\n");
+#endif
+ (*flat_file_index_p) = ((*flat_file_index_p) + 1) % flat_file_p->count;
+ while (flat_file_p->blocklens[(*flat_file_index_p)] == 0) {
+ (*flat_file_index_p) = ((*flat_file_index_p) + 1) % flat_file_p->count;
+ }
+ (*cur_flat_file_reg_off_p) = 0;
+
+#ifdef COALESCE_REGIONS
+ if (*file_ol_count_p != 0) {
+ if (file_off_arr[(*file_ol_count_p) - 1] +
+ file_len_arr[(*file_ol_count_p) - 1] == file_off_arr[*file_ol_count_p]) {
+ file_len_arr[(*file_ol_count_p) - 1] += region_size;
+ } else {
+ file_len_arr[*file_ol_count_p] += region_size;
+ (*file_ol_count_p)++;
+ }
+ } else {
+#endif
+ file_len_arr[*file_ol_count_p] += region_size;
+ (*file_ol_count_p)++;
+#ifdef COALESCE_REGIONS
+ }
+#endif
+
+ /* Don't prepare for the next piece if we have reached
+ * the limit, or else it will segfault. */
+ if ((*file_ol_count_p) != max_ol_count) {
+ file_off_arr[*file_ol_count_p] = disp +
+ (((bytes_into_filetype + *bytes_completed + region_size)
+ / flat_file_size) *
+ flat_file_extent) +
+ flat_file_p->indices[*flat_file_index_p] + (*cur_flat_file_reg_off_p);
+ file_len_arr[*file_ol_count_p] = 0;
+ }
+ } else if (cur_flat_file_reg_left > region_size) {
+#ifdef DEBUG_LIST2
+ fprintf(stderr, "advanced %d in file block...\n", region_size);
+#endif
+ (*cur_flat_file_reg_off_p) += region_size;
+ file_len_arr[*file_ol_count_p] += region_size;
+ } else {
+ fprintf(stderr, "gen_listio_arr: Error\n");
+ }
+#ifdef DEBUG_LIST2
+ fprintf(stderr, "------------------------------\n\n");
+#endif
+ *bytes_completed += region_size;
+ }
+ /* Increment the count if we stopped in the middle of a
+ * memory or file region */
+ if (*cur_flat_buf_reg_off_p != 0)
+ (*buf_ol_count_p)++;
+ if (*cur_flat_file_reg_off_p != 0)
+ (*file_ol_count_p)++;
+
+ return 0;
+}
+
+void print_buf_file_ol_pairs(int64_t buf_off_arr[],
+ int32_t buf_len_arr[],
+ int32_t buf_ol_count,
+ int64_t file_off_arr[],
+ int32_t file_len_arr[], int32_t file_ol_count, void *buf, int rw_type)
+{
+ int i = -1;
+
+ fprintf(stderr, "buf_ol_pairs(offset,length) count = %d\n", buf_ol_count);
+ for (i = 0; i < buf_ol_count; i++) {
+ fprintf(stderr, "(%lld, %d) ", (long long) buf_off_arr[i], buf_len_arr[i]);
+ }
+ fprintf(stderr, "\n");
+
+ fprintf(stderr, "file_ol_pairs(offset,length) count = %d\n", file_ol_count);
+ for (i = 0; i < file_ol_count; i++) {
+ fprintf(stderr, "(%lld, %d) ", (long long) file_off_arr[i], file_len_arr[i]);
+ }
+ fprintf(stderr, "\n\n");
+
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_open.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_open.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea80f6b56586a007772e979a8200c42e82576f11
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_open.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+#include "ad_pvfs2_common.h"
+
+/* open_status is helpful for bcasting values around */
+struct open_status_s {
+ int error;
+ PVFS_object_ref object_ref;
+};
+typedef struct open_status_s open_status;
+
+ /* steps for getting a handle: (it gets a little convoluted, but at least
+ * it's deterministic)
+ * . lookup the file.
+ * . if lookup succeeds, but we were passed MPI_MODE_EXCL, that's an error
+ * . if lookup fails, the file might not exist.
+ * in that case, create the file if we were passed MPI_MODE_CREATE
+ * . if the create fails, that means someone else created the file between
+ * our call to lookup and our call to create (like if N processors all
+ * open the same file with MPI_COMM_SELF). Then we can just look up the
+ * file (which now exists).
+ *
+ * the good news is that only one processor does this and broadcasts the
+ * handle to everyone else in the communicator
+ */
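+ /* Concrete scenario (invented for illustration): 4 processes each
+ * open the same new file with MPI_COMM_SELF and MPI_MODE_CREATE.
+ * All four lookups can miss, all four then call create, three of
+ * them get -PVFS_EEXIST back, and those three simply repeat the
+ * lookup; every process ends up with the same handle. */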
+static void fake_an_open(PVFS_fs_id fs_id, char *pvfs_name, int access_mode,
+ int nr_datafiles, PVFS_size strip_size,
+ ADIOI_PVFS2_fs * pvfs2_fs, open_status * o_status)
+{
+ int ret;
+ PVFS_sysresp_lookup resp_lookup;
+ PVFS_sysresp_getparent resp_getparent;
+ PVFS_sysresp_create resp_create;
+ PVFS_sys_attr attribs;
+ PVFS_sys_dist *dist;
+
+ ADIOI_PVFS2_makeattribs(&attribs);
+ if (nr_datafiles > 0) {
+ attribs.dfile_count = nr_datafiles;
+ attribs.mask |= PVFS_ATTR_SYS_DFILE_COUNT;
+ }
+
+ dist = NULL;
+
+ memset(&resp_lookup, 0, sizeof(resp_lookup));
+ memset(&resp_getparent, 0, sizeof(resp_getparent));
+ memset(&resp_create, 0, sizeof(resp_create));
+
+
+ ret = PVFS_sys_lookup(fs_id, pvfs_name,
+ &(pvfs2_fs->credentials), &resp_lookup, PVFS2_LOOKUP_LINK_FOLLOW);
+ if (ret == (-PVFS_ENOENT)) {
+ if (access_mode & ADIO_CREATE) {
+ ret = PVFS_sys_getparent(fs_id, pvfs_name, &(pvfs2_fs->credentials), &resp_getparent);
+ if (ret < 0) {
+ FPRINTF(stderr, "pvfs_sys_getparent returns with %d\n", ret);
+ o_status->error = ret;
+ return;
+ }
+
+ /* Set the distribution strip size if specified */
+ if (0 < strip_size) {
+ /* Note that the distribution is hardcoded here */
+ dist = PVFS_sys_dist_lookup("simple_stripe");
+ ret = PVFS_sys_dist_setparam(dist, "strip_size", &strip_size);
+ if (ret < 0) {
+ FPRINTF(stderr, "pvfs_sys_dist_setparam returns with %d\n", ret);
+ o_status->error = ret;
+ }
+ }
+
+ /* Perform file creation */
+#ifdef HAVE_PVFS2_CREATE_WITHOUT_LAYOUT
+ ret = PVFS_sys_create(resp_getparent.basename,
+ resp_getparent.parent_ref, attribs,
+ &(pvfs2_fs->credentials), dist, &resp_create);
+#else
+ ret = PVFS_sys_create(resp_getparent.basename,
+ resp_getparent.parent_ref, attribs,
+ &(pvfs2_fs->credentials), dist, NULL, &resp_create);
+#endif
+
+ /* if many creates are happening in this directory, the earlier
+ * sys_lookup may have returned ENOENT, but the sys_create could
+ * return EEXIST. That means the file has been created anyway, so
+ * less work for us and we can just open it up and return the
+ * handle */
+ if (ret == (-PVFS_EEXIST)) {
+ ret = PVFS_sys_lookup(fs_id, pvfs_name,
+ &(pvfs2_fs->credentials), &resp_lookup,
+ PVFS2_LOOKUP_LINK_FOLLOW);
+ if (ret < 0) {
+ o_status->error = ret;
+ return;
+ }
+ o_status->error = ret;
+ o_status->object_ref = resp_lookup.ref;
+ return;
+ }
+ o_status->object_ref = resp_create.ref;
+ } else {
+ FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n");
+ o_status->error = ret;
+ return;
+ }
+ } else if (access_mode & ADIO_EXCL) {
+ /* lookup should not succeed if opened with EXCL */
+ o_status->error = -PVFS_EEXIST;
+ return;
+ } else {
+ o_status->object_ref = resp_lookup.ref;
+ }
+ o_status->error = ret;
+ return;
+
+}
+
+
+/* ADIOI_PVFS2_Open:
+ * one process opens (or creates) the file, then broadcasts the result to the
+ * remaining processors.
+ *
+ * ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
+ * that, MPI_MODE_EXCL) was set. Because PVFS2 handles file lookup and
+ * creation more scalably than other file systems, ADIO_Open now skips any
+ * special handling when CREATE is set. */
+void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code)
+{
+ int rank, ret;
+ PVFS_fs_id cur_fs;
+ static char myname[] = "ADIOI_PVFS2_OPEN";
+ char pvfs_path[PVFS_NAME_MAX] = { 0 };
+
+ ADIOI_PVFS2_fs *pvfs2_fs;
+
+ /* since one process is doing the open, that means one process is also
+ * doing the error checking. define a struct for both the object reference
+ * and the error code to broadcast to all the processors */
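+ /* The broadcast below avoids a packing copy by describing o_status
+ * in place: MPI_Get_address records the absolute addresses of
+ * o_status.error and o_status.object_ref, MPI_Type_create_struct
+ * turns them into a struct type, and the Bcast buffer is MPI_BOTTOM
+ * so those addresses are interpreted as absolute. */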
+
+ open_status o_status = { 0, {0, 0} };
+ MPI_Datatype open_status_type;
+ MPI_Datatype types[2] = { MPI_INT, MPI_BYTE };
+ int lens[2] = { 1, sizeof(PVFS_object_ref) };
+ MPI_Aint offsets[2];
+
+ pvfs2_fs = (ADIOI_PVFS2_fs *) ADIOI_Malloc(sizeof(ADIOI_PVFS2_fs));
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (pvfs2_fs == NULL) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_UNKNOWN, "Error allocating memory", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ MPI_Comm_rank(fd->comm, &rank);
+
+ ADIOI_PVFS2_Init(error_code);
+ if (*error_code != MPI_SUCCESS) {
+ /* ADIOI_PVFS2_INIT handles creating error codes on its own */
+ return;
+ }
+
+ /* currently everyone gets their own credentials */
+ ADIOI_PVFS2_makecredentials(&(pvfs2_fs->credentials));
+
+ /* one process resolves name and will later bcast to others */
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
+#endif
+ if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
+ /* given the filename, figure out which pvfs filesystem it is on */
+ ret = PVFS_util_resolve(fd->filename, &cur_fs, pvfs_path, PVFS_NAME_MAX);
+ if (ret < 0) {
+ PVFS_perror("PVFS_util_resolve", ret);
+ /* TODO: pick a good error for this */
+ o_status.error = -1;
+ } else {
+ fake_an_open(cur_fs, pvfs_path,
+ fd->access_mode, fd->hints->striping_factor,
+ fd->hints->striping_unit, pvfs2_fs, &o_status);
+ }
+
+ /* store credentials and object reference in fd */
+ pvfs2_fs->object_ref = o_status.object_ref;
+ fd->fs_ptr = pvfs2_fs;
+ }
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
+#endif
+
+ /* broadcast status and (possibly valid) object reference */
+ MPI_Get_address(&o_status.error, &offsets[0]);
+ MPI_Get_address(&o_status.object_ref, &offsets[1]);
+
+ MPI_Type_create_struct(2, lens, offsets, types, &open_status_type);
+ MPI_Type_commit(&open_status_type);
+
+ /* Assertion: if we hit this Bcast, then all processes collectively
+ * called this open.
+ *
+ * That's because deferred open never happens with PVFS2.
+ */
+ MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0], fd->comm);
+ MPI_Type_free(&open_status_type);
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (o_status.error != 0) {
+ ADIOI_Free(pvfs2_fs);
+ fd->fs_ptr = NULL;
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(o_status.error),
+ "Unknown error", 0);
+ /* TODO: FIX STRING */
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ pvfs2_fs->object_ref = o_status.object_ref;
+ fd->fs_ptr = pvfs2_fs;
+
+ *error_code = MPI_SUCCESS;
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_read.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_read.c
new file mode 100644
index 0000000000000000000000000000000000000000..b6a01fcc0a61812714f94d892393753424dd1f2c
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_read.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "ad_pvfs2.h"
+#include "ad_pvfs2_io.h"
+#include "ad_pvfs2_common.h"
+
+void ADIOI_PVFS2_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ int ret;
+ MPI_Count datatype_size, len;
+ PVFS_Request file_req, mem_req;
+ PVFS_sysresp_io resp_io;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ static char myname[] = "ADIOI_PVFS2_READCONTIG";
+
+ if (count == 0) {
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status)
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = datatype_size * count;
+
+ ret = PVFS_Request_contiguous(len, PVFS_BYTE, &mem_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in pvfs_request_contig (memory)", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ ret = PVFS_Request_contiguous(len, PVFS_BYTE, &file_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in pvfs_request_contig (file)", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ /* copy individual file pointer into offset variable, continue */
+ offset = fd->fp_ind;
+ }
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ ret = PVFS_sys_read(pvfs_fs->object_ref, file_req, offset, buf,
+ mem_req, &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_read", 0);
+ goto fn_exit;
+ }
+ /* --END ERROR HANDLING-- */
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += (int) resp_io.total_completed;
+ /* TODO: WHY THE INT CAST? */
+ }
+ fd->fp_sys_posn = offset + (int) resp_io.total_completed;
+
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status)
+ MPIR_Status_set_bytes(status, datatype, resp_io.total_completed);
+#endif
+
+ *error_code = MPI_SUCCESS;
+ fn_exit:
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ return;
+}
+
+static int ADIOI_PVFS2_ReadStridedListIO(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ return ADIOI_PVFS2_StridedListIO(fd, buf, count,
+ datatype, file_ptr_type, offset, status, error_code, READ);
+}
+
+static int ADIOI_PVFS2_ReadStridedDtypeIO(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ return ADIOI_PVFS2_StridedDtypeIO(fd, buf, count,
+ datatype, file_ptr_type, offset, status, error_code, READ);
+}
+
+void ADIOI_PVFS2_ReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code)
+{
+ /* four ways (to date) that we can carry out strided i/o accesses:
+ * - naive posix
+ * - 'true' Datatype (from avery)
+ * - new List I/O (from avery)
+ * - classic List I/O (the one that's always been in ROMIO)
+ * I imagine we'll keep Datatype as an optional optimization, and after a
+ * release or two promote it to the default
+ */
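+ /* Selection order as implemented below: naive posix
+ * (ADIOI_GEN_ReadStrided) when the posix_read hint is enabled;
+ * otherwise datatype I/O when dtype_read is enabled, falling back to
+ * list I/O on failure; otherwise list I/O when listio_read is
+ * enabled; otherwise the classic list I/O path. */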
+ int ret = -1;
+
+ if (fd->hints->fs_hints.pvfs2.posix_read == ADIOI_HINT_ENABLE) {
+ ADIOI_GEN_ReadStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code);
+ return;
+ }
+ if (fd->hints->fs_hints.pvfs2.dtype_read == ADIOI_HINT_ENABLE) {
+ ret = ADIOI_PVFS2_ReadStridedDtypeIO(fd, buf, count,
+ datatype, file_ptr_type, offset, status, error_code);
+
+ /* Fall back to list I/O if datatype I/O didn't work */
+ if (ret != 0) {
+ fprintf(stderr, "Falling back to list I/O since datatype I/O failed\n");
+ ret = ADIOI_PVFS2_ReadStridedListIO(fd, buf, count,
+ datatype, file_ptr_type,
+ offset, status, error_code);
+ }
+ return;
+ }
+ if (fd->hints->fs_hints.pvfs2.listio_read == ADIOI_HINT_ENABLE) {
+ ret = ADIOI_PVFS2_ReadStridedListIO(fd, buf, count, datatype,
+ file_ptr_type, offset, status, error_code);
+ return;
+ }
+ /* Use classic list I/O in the base case, when no hints were given */
+
+ ADIOI_PVFS2_OldReadStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code);
+ return;
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_read_list_classic.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_read_list_classic.c
new file mode 100644
index 0000000000000000000000000000000000000000..86212b19f08aa0d65171370cf5cb45a092b744dc
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_read_list_classic.c
@@ -0,0 +1,872 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "ad_pvfs2.h"
+
+#include "ad_pvfs2_common.h"
+
+void ADIOI_PVFS2_OldReadStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int
+ *error_code)
+{
+ /* offset is in units of etype relative to the filetype. */
+ ADIOI_Flatlist_node *flat_buf, *flat_file;
+ int i, j, k, brd_size, frd_size = 0, st_index = 0;
+ int sum, n_etypes_in_filetype, size_in_filetype;
+ MPI_Count bufsize;
+ int n_filetypes, etype_in_filetype;
+ ADIO_Offset abs_off_in_filetype = 0;
+ MPI_Count filetype_size, etype_size, buftype_size;
+ MPI_Aint lb, filetype_extent, buftype_extent;
+ int buf_count, buftype_is_contig, filetype_is_contig;
+ ADIO_Offset off, disp, start_off, initial_off;
+ int flag, st_frd_size, st_n_filetypes;
+
+ int mem_list_count, file_list_count;
+ PVFS_size *mem_offsets;
+ int64_t *file_offsets;
+ int *mem_lengths;
+ int32_t *file_lengths;
+ int total_blks_to_read;
+
+ int max_mem_list, max_file_list;
+
+ int b_blks_read;
+ int f_data_read;
+ int size_read = 0, n_read_lists, extra_blks;
+
+ int end_brd_size, end_frd_size;
+ int start_k, start_j, new_file_read, new_buffer_read;
+ int start_mem_offset;
+ PVFS_Request mem_req, file_req;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ PVFS_sysresp_io resp_io;
+ int err_flag = 0;
+ MPI_Offset total_bytes_read = 0;
+ static char myname[] = "ADIOI_PVFS2_ReadStrided";
+
+#define MAX_ARRAY_SIZE 64
+
+ *error_code = MPI_SUCCESS; /* changed below if error */
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ /* the HDF5 tests showed a bug in this list processing code (see many
+ * lines down below). We added a workaround, but common HDF5 file types
+ * are actually contiguous and do not need the expensive workaround */
+ if (!filetype_is_contig) {
+ flat_file = ADIOI_Flatten_and_find(fd->filetype);
+ if (flat_file->count == 1 && !buftype_is_contig)
+ filetype_is_contig = 1;
+ }
+
+ MPI_Type_size_x(fd->filetype, &filetype_size);
+ if (!filetype_size) {
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
+ MPI_Type_size_x(datatype, &buftype_size);
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+ etype_size = fd->etype_size;
+
+ bufsize = buftype_size * count;
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ if (!buftype_is_contig && filetype_is_contig) {
+
+/* noncontiguous in memory, contiguous in file. */
+ int64_t file_offset;
+ int32_t file_length;
+
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset;
+
+ file_list_count = 1;
+ file_offset = off;
+ file_length = 0;
+ total_blks_to_read = count * flat_buf->count;
+ b_blks_read = 0;
+
+ /* allocate arrays according to max usage */
+ if (total_blks_to_read > MAX_ARRAY_SIZE)
+ mem_list_count = MAX_ARRAY_SIZE;
+ else
+ mem_list_count = total_blks_to_read;
+ mem_offsets = (PVFS_size *) ADIOI_Malloc(mem_list_count * sizeof(PVFS_size));
+ mem_lengths = (int *) ADIOI_Malloc(mem_list_count * sizeof(int));
+
+ /* TODO: CHECK RESULTS OF MEMORY ALLOCATION */
+
+ j = 0;
+ /* step through each block in memory, filling memory arrays */
+ while (b_blks_read < total_blks_to_read) {
+ for (i = 0; i < flat_buf->count; i++) {
+ mem_offsets[b_blks_read % MAX_ARRAY_SIZE] =
+ /* TODO: fix this compiler warning */
+ ((PVFS_size) buf + j * buftype_extent + flat_buf->indices[i]);
+ mem_lengths[b_blks_read % MAX_ARRAY_SIZE] = flat_buf->blocklens[i];
+ file_length += flat_buf->blocklens[i];
+ b_blks_read++;
+ if (!(b_blks_read % MAX_ARRAY_SIZE) || (b_blks_read == total_blks_to_read)) {
+
+ /* in the case of the last read list call,
+ * adjust mem_list_count */
+ if (b_blks_read == total_blks_to_read) {
+ mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE;
+ /* in case last read list call fills max arrays */
+ if (!mem_list_count)
+ mem_list_count = MAX_ARRAY_SIZE;
+ }
+ err_flag = PVFS_Request_hindexed(mem_list_count,
+ mem_lengths, mem_offsets, PVFS_BYTE, &mem_req);
+ if (err_flag < 0)
+ break;
+ err_flag = PVFS_Request_contiguous(file_length, PVFS_BYTE, &file_req);
+ if (err_flag < 0)
+ break;
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req,
+ file_offset, PVFS_BOTTOM, mem_req,
+ &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_sys_read", 0);
+ goto error_state;
+ }
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ total_bytes_read += resp_io.total_completed;
+ /* --END ERROR HANDLING-- */
+
+ /* in the case of error or the last read list call,
+ * leave here */
+ if (err_flag || b_blks_read == total_blks_to_read)
+ break;
+
+ file_offset += file_length;
+ file_length = 0;
+ }
+ } /* for (i=0; i<flat_buf->count; i++) */
+ j++;
+ } /* while (b_blks_read < total_blks_to_read) */
+ ADIOI_Free(mem_offsets);
+ ADIOI_Free(mem_lengths);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind += total_bytes_read;
+
+ fd->fp_sys_posn = -1; /* no longer valid */
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+ /* This is a temporary way of filling in status. The right way is to
+ * keep track of how much data was actually read and placed in buf
+ * by ADIOI_BUFFERED_READ. */
+#endif
+
+ return;
+ }
+
+
+
+
+ /* if (!buftype_is_contig && filetype_is_contig) */
+ /* know file is noncontiguous from above */
+ /* noncontiguous in file */
+ /* filetype already flattened in ADIO_Open */
+ flat_file = ADIOI_Flatten_and_find(fd->filetype);
+
+ disp = fd->disp;
+ initial_off = offset;
+
+
+ /* for each case - ADIO_Individual pointer or explicit, find the file
+ * offset in bytes (offset), n_filetypes (how many filetypes into
+ * file to start), frd_size (remaining amount of data in present
+ * file block), and st_index (start point in terms of blocks in
+ * starting filetype) */
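+ /* Worked example (values invented for illustration): disp=0, a
+ * filetype of two 4-byte blocks at indices 0 and 8 (extent 16), and
+ * fp_ind=26. The scan below stops in the second filetype tile at
+ * block 1, which spans file bytes [24,28): n_filetypes=1, st_index=1,
+ * and frd_size=2 (bytes 26..27 remain in that block). */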
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind; /* in bytes */
+ n_filetypes = -1;
+ flag = 0;
+ while (!flag) {
+ n_filetypes++;
+ for (i = 0; i < flat_file->count; i++) {
+ if (disp + flat_file->indices[i] +
+ ((ADIO_Offset) n_filetypes) * filetype_extent +
+ flat_file->blocklens[i] >= offset) {
+ st_index = i;
+ frd_size = (int) (disp + flat_file->indices[i] +
+ ((ADIO_Offset) n_filetypes) * filetype_extent
+ + flat_file->blocklens[i] - offset);
+ flag = 1;
+ break;
+ }
+ }
+ } /* while (!flag) */
+ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */
+ else {
+ n_etypes_in_filetype = filetype_size / etype_size;
+ n_filetypes = (int) (offset / n_etypes_in_filetype);
+ etype_in_filetype = (int) (offset % n_etypes_in_filetype);
+ size_in_filetype = etype_in_filetype * etype_size;
+
+ sum = 0;
+ for (i = 0; i < flat_file->count; i++) {
+ sum += flat_file->blocklens[i];
+ if (sum > size_in_filetype) {
+ st_index = i;
+ frd_size = sum - size_in_filetype;
+ abs_off_in_filetype = flat_file->indices[i] +
+ size_in_filetype - (sum - flat_file->blocklens[i]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ offset = disp + ((ADIO_Offset) n_filetypes) * filetype_extent + abs_off_in_filetype;
+ } /* else [file_ptr_type != ADIO_INDIVIDUAL] */
+
+ start_off = offset;
+ st_frd_size = frd_size;
+ st_n_filetypes = n_filetypes;
+
+ if (buftype_is_contig && !filetype_is_contig) {
+
+/* contiguous in memory, noncontiguous in file. should be the most
+ common case. */
+
+ int mem_length = 0;
+ intptr_t mem_offset;
+
+ i = 0;
+ j = st_index;
+ n_filetypes = st_n_filetypes;
+
+ mem_list_count = 1;
+
+ /* determine how many blocks in file to read */
+ f_data_read = MPL_MIN(st_frd_size, bufsize);
+ total_blks_to_read = 1;
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ while (f_data_read < bufsize) {
+ f_data_read += flat_file->blocklens[j];
+ total_blks_to_read++;
+ if (j < (flat_file->count - 1))
+ j++;
+ else
+ j = 0;
+ }
+
+ j = st_index;
+ n_filetypes = st_n_filetypes;
+ n_read_lists = total_blks_to_read / MAX_ARRAY_SIZE;
+ extra_blks = total_blks_to_read % MAX_ARRAY_SIZE;
+
+ mem_offset = (intptr_t) buf;
+ mem_length = 0;
+
+ /* if at least one full readlist, allocate file arrays
+ * at max array size and don't free until very end */
+ if (n_read_lists) {
+ file_offsets = (int64_t *) ADIOI_Malloc(MAX_ARRAY_SIZE * sizeof(int64_t));
+ file_lengths = (int32_t *) ADIOI_Malloc(MAX_ARRAY_SIZE * sizeof(int32_t));
+ }
+ /* if there's no full readlist allocate file arrays according
+ * to needed size (extra_blks) */
+ else {
+ file_offsets = (int64_t *) ADIOI_Malloc(extra_blks * sizeof(int64_t));
+ file_lengths = (int32_t *) ADIOI_Malloc(extra_blks * sizeof(int32_t));
+ }
+
+ /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
+ for (i = 0; i < n_read_lists; i++) {
+ file_list_count = MAX_ARRAY_SIZE;
+ if (!i) {
+ file_offsets[0] = offset;
+ file_lengths[0] = st_frd_size;
+ mem_length = st_frd_size;
+ }
+ for (k = 0; k < MAX_ARRAY_SIZE; k++) {
+ if (i || k) {
+ file_offsets[k] = disp +
+ ((ADIO_Offset) n_filetypes) * filetype_extent + flat_file->indices[j];
+ file_lengths[k] = flat_file->blocklens[j];
+ mem_length += file_lengths[k];
+ }
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
+
+ err_flag = PVFS_Request_contiguous(mem_length, PVFS_BYTE, &mem_req);
+ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
+ file_offsets, PVFS_BYTE, &file_req);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
+ (void *) mem_offset, mem_req,
+ &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_sys_read", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+
+ total_bytes_read += resp_io.total_completed;
+
+ mem_offset += mem_length;
+ mem_lengths = 0;
+ } /* for (i=0; i<n_read_lists; i++) */
+
+ /* build arrays for any extra blocks */
+ if (extra_blks) {
+ file_list_count = extra_blks;
+ if (!i) {
+ file_offsets[0] = offset;
+ file_lengths[0] = st_frd_size;
+ }
+ for (k = 0; k < extra_blks; k++) {
+ if (i || k) {
+ file_offsets[k] = disp +
+ ((ADIO_Offset) n_filetypes) * filetype_extent + flat_file->indices[j];
+ if (k == (extra_blks - 1)) {
+ file_lengths[k] = bufsize - (int32_t) mem_length
+ - (int32_t) (mem_offset - (intptr_t) buf);
+ } else
+ file_lengths[k] = flat_file->blocklens[j];
+ } /* if (i || k) */
+ mem_length += file_lengths[k];
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ } /* for (k=0; k<extra_blks; k++) */
+
+ err_flag = PVFS_Request_contiguous(mem_length, PVFS_BYTE, &mem_req);
+ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
+ file_offsets, PVFS_BYTE, &file_req);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
+ (void *) mem_offset, mem_req, &(pvfs_fs->credentials),
+ &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_sys_read", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ total_bytes_read += resp_io.total_completed;
+ }
+ } else {
+/* noncontiguous in memory as well as in file */
+
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ size_read = 0;
+ n_filetypes = st_n_filetypes;
+ frd_size = st_frd_size;
+ brd_size = flat_buf->blocklens[0];
+ buf_count = 0;
+ start_mem_offset = 0;
+ start_k = k = 0;
+ start_j = st_index;
+ max_mem_list = 0;
+ max_file_list = 0;
+
+ /* run through and find max_file_list and max_mem_list so that you
+ * can allocate the file and memory arrays less than MAX_ARRAY_SIZE
+ * if possible */
+
+ while (size_read < bufsize) {
+ k = start_k;
+ new_buffer_read = 0;
+ mem_list_count = 0;
+ while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize - size_read)) {
+ /* find mem_list_count and file_list_count such that both are
+ * less than MAX_ARRAY_SIZE, the sum of their lengths are
+ * equal, and the sum of all the data read and data to be
+ * read in the next immediate read list is less than
+ * bufsize */
+ if (mem_list_count) {
+ if ((new_buffer_read + flat_buf->blocklens[k] + size_read) > bufsize) {
+ end_brd_size = new_buffer_read +
+ flat_buf->blocklens[k] - (bufsize - size_read);
+ new_buffer_read = bufsize - size_read;
+ } else {
+ new_buffer_read += flat_buf->blocklens[k];
+ end_brd_size = flat_buf->blocklens[k];
+ }
+ } else {
+ if (brd_size > (bufsize - size_read)) {
+ new_buffer_read = bufsize - size_read;
+ brd_size = new_buffer_read;
+ } else
+ new_buffer_read = brd_size;
+ }
+ mem_list_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
+ * (new_buffer_read < bufsize-size_read)) */
+ j = start_j;
+ new_file_read = 0;
+ file_list_count = 0;
+ while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_read < new_buffer_read)) {
+ if (file_list_count) {
+ if ((new_file_read + flat_file->blocklens[j]) > new_buffer_read) {
+ end_frd_size = new_buffer_read - new_file_read;
+ new_file_read = new_buffer_read;
+ j--;
+ } else {
+ new_file_read += flat_file->blocklens[j];
+ end_frd_size = flat_file->blocklens[j];
+ }
+ } else {
+ if (frd_size > new_buffer_read) {
+ new_file_read = new_buffer_read;
+ frd_size = new_file_read;
+ } else
+ new_file_read = frd_size;
+ }
+ file_list_count++;
+ if (j < (flat_file->count - 1))
+ j++;
+ else
+ j = 0;
+
+ k = start_k;
+ if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) {
+ new_buffer_read = 0;
+ mem_list_count = 0;
+ while (new_buffer_read < new_file_read) {
+ if (mem_list_count) {
+ if ((new_buffer_read + flat_buf->blocklens[k]) > new_file_read) {
+ end_brd_size = new_file_read - new_buffer_read;
+ new_buffer_read = new_file_read;
+ k--;
+ } else {
+ new_buffer_read += flat_buf->blocklens[k];
+ end_brd_size = flat_buf->blocklens[k];
+ }
+ } else {
+ new_buffer_read = brd_size;
+ if (brd_size > (bufsize - size_read)) {
+ new_buffer_read = bufsize - size_read;
+ brd_size = new_buffer_read;
+ }
+ }
+ mem_list_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* while (new_buffer_read < new_file_read) */
+ } /* if ((new_file_read < new_buffer_read) && (file_list_count
+ * == MAX_ARRAY_SIZE)) */
+ } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
+ * (new_buffer_read < bufsize-size_read)) */
+
+ /* fakes filling the readlist arrays of lengths found above */
+ k = start_k;
+ j = start_j;
+ for (i = 0; i < mem_list_count; i++) {
+ if (i) {
+ if (i == (mem_list_count - 1)) {
+ if (flat_buf->blocklens[k] == end_brd_size)
+ brd_size = flat_buf->blocklens[(k + 1) % flat_buf->count];
+ else {
+ brd_size = flat_buf->blocklens[k] - end_brd_size;
+ k--;
+ buf_count--;
+ }
+ }
+ }
+ buf_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* for (i=0; i<mem_list_count; i++) */
+
+ for (i = 0; i < file_list_count; i++) {
+ if (i) {
+ if (i == (file_list_count - 1)) {
+ if (flat_file->blocklens[j] == end_frd_size)
+ frd_size = flat_file->blocklens[(j + 1) % flat_file->count];
+ else {
+ frd_size = flat_file->blocklens[j] - end_frd_size;
+ j--;
+ }
+ }
+ }
+ if (j < flat_file->count - 1)
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ } /* for (i=0; i<file_list_count; i++) */
+
+ if (max_mem_list < mem_list_count)
+ max_mem_list = mem_list_count;
+ if (max_file_list < file_list_count)
+ max_file_list = file_list_count;
+ size_read += new_buffer_read;
+ start_k = k;
+ start_j = j;
+ } /* while (size_read < bufsize) */
+
+ /* one last check before we actually carry out the operation: this
+ * code has hard-to-fix issues when a noncontiguous file type has
+ * pieces so large that a full memory list cannot cover one of them
+ * (and vice versa for large memory types and many small file
+ * pieces) -- the HDF5 tests mentioned above exposed this. In those
+ * cases, give up and fall back to naive reads. */
+ if (((file_list_count == 1) &&
+ (new_file_read < flat_file->blocklens[0])) ||
+ ((mem_list_count == 1) &&
+ (new_buffer_read < flat_buf->blocklens[0])) ||
+ ((file_list_count == MAX_ARRAY_SIZE) &&
+ (new_file_read < flat_buf->blocklens[0])) ||
+ ((mem_list_count == MAX_ARRAY_SIZE) && (new_buffer_read < flat_file->blocklens[0]))) {
+
+ ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype,
+ file_ptr_type, initial_off, status, error_code);
+ return;
+ }
+
+ mem_offsets = (PVFS_size *) ADIOI_Malloc(max_mem_list * sizeof(PVFS_size));
+ mem_lengths = (int *) ADIOI_Malloc(max_mem_list * sizeof(int));
+ file_offsets = (int64_t *) ADIOI_Malloc(max_file_list * sizeof(int64_t));
+ file_lengths = (int32_t *) ADIOI_Malloc(max_file_list * sizeof(int32_t));
+
+ size_read = 0;
+ n_filetypes = st_n_filetypes;
+ frd_size = st_frd_size;
+ brd_size = flat_buf->blocklens[0];
+ buf_count = 0;
+ start_mem_offset = 0;
+ start_k = k = 0;
+ start_j = st_index;
+
+ /* this section calculates mem_list_count and file_list_count
+ * and also finds the possibly odd sized last array elements
+ * in end_frd_size and end_brd_size */
+
+ while (size_read < bufsize) {
+ k = start_k;
+ new_buffer_read = 0;
+ mem_list_count = 0;
+ while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize - size_read)) {
+ /* find mem_list_count and file_list_count such that both are
+ * less than MAX_ARRAY_SIZE, the sum of their lengths are
+ * equal, and the sum of all the data read and data to be
+ * read in the next immediate read list is less than
+ * bufsize */
+ if (mem_list_count) {
+ if ((new_buffer_read + flat_buf->blocklens[k] + size_read) > bufsize) {
+ end_brd_size = new_buffer_read +
+ flat_buf->blocklens[k] - (bufsize - size_read);
+ new_buffer_read = bufsize - size_read;
+ } else {
+ new_buffer_read += flat_buf->blocklens[k];
+ end_brd_size = flat_buf->blocklens[k];
+ }
+ } else {
+ if (brd_size > (bufsize - size_read)) {
+ new_buffer_read = bufsize - size_read;
+ brd_size = new_buffer_read;
+ } else
+ new_buffer_read = brd_size;
+ }
+ mem_list_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
+ * (new_buffer_read < bufsize-size_read)) */
+ j = start_j;
+ new_file_read = 0;
+ file_list_count = 0;
+ while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_read < new_buffer_read)) {
+ if (file_list_count) {
+ if ((new_file_read + flat_file->blocklens[j]) > new_buffer_read) {
+ end_frd_size = new_buffer_read - new_file_read;
+ new_file_read = new_buffer_read;
+ j--;
+ } else {
+ new_file_read += flat_file->blocklens[j];
+ end_frd_size = flat_file->blocklens[j];
+ }
+ } else {
+ if (frd_size > new_buffer_read) {
+ new_file_read = new_buffer_read;
+ frd_size = new_file_read;
+ } else
+ new_file_read = frd_size;
+ }
+ file_list_count++;
+ if (j < (flat_file->count - 1))
+ j++;
+ else
+ j = 0;
+
+ k = start_k;
+ if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) {
+ new_buffer_read = 0;
+ mem_list_count = 0;
+ while (new_buffer_read < new_file_read) {
+ if (mem_list_count) {
+ if ((new_buffer_read + flat_buf->blocklens[k]) > new_file_read) {
+ end_brd_size = new_file_read - new_buffer_read;
+ new_buffer_read = new_file_read;
+ k--;
+ } else {
+ new_buffer_read += flat_buf->blocklens[k];
+ end_brd_size = flat_buf->blocklens[k];
+ }
+ } else {
+ new_buffer_read = brd_size;
+ if (brd_size > (bufsize - size_read)) {
+ new_buffer_read = bufsize - size_read;
+ brd_size = new_buffer_read;
+ }
+ }
+ mem_list_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* while (new_buffer_read < new_file_read) */
+ } /* if ((new_file_read < new_buffer_read) && (file_list_count
+ * == MAX_ARRAY_SIZE)) */
+ } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
+ * (new_buffer_read < bufsize-size_read)) */
+
+ /* fills the allocated readlist arrays */
+ k = start_k;
+ j = start_j;
+ for (i = 0; i < mem_list_count; i++) {
+ mem_offsets[i] = ((PVFS_size) buf + buftype_extent *
+ (buf_count / flat_buf->count) + (int) flat_buf->indices[k]);
+ if (!i) {
+ mem_lengths[0] = brd_size;
+ mem_offsets[0] += flat_buf->blocklens[k] - brd_size;
+ } else {
+ if (i == (mem_list_count - 1)) {
+ mem_lengths[i] = end_brd_size;
+ if (flat_buf->blocklens[k] == end_brd_size)
+ brd_size = flat_buf->blocklens[(k + 1) % flat_buf->count];
+ else {
+ brd_size = flat_buf->blocklens[k] - end_brd_size;
+ k--;
+ buf_count--;
+ }
+ } else {
+ mem_lengths[i] = flat_buf->blocklens[k];
+ }
+ }
+ buf_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* for (i=0; i<mem_list_count; i++) */
+
+ for (i = 0; i < file_list_count; i++) {
+ file_offsets[i] = disp + flat_file->indices[j] +
+ ((ADIO_Offset) n_filetypes) * filetype_extent;
+ if (!i) {
+ file_lengths[0] = frd_size;
+ file_offsets[0] += flat_file->blocklens[j] - frd_size;
+ } else {
+ if (i == (file_list_count - 1)) {
+ file_lengths[i] = end_frd_size;
+ if (flat_file->blocklens[j] == end_frd_size)
+ frd_size = flat_file->blocklens[(j + 1) % flat_file->count];
+ else {
+ frd_size = flat_file->blocklens[j] - end_frd_size;
+ j--;
+ }
+ } else
+ file_lengths[i] = flat_file->blocklens[j];
+ }
+ if (j < flat_file->count - 1)
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ } /* for (i=0; i<file_list_count; i++) */
+
+ err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
+ mem_offsets, PVFS_BYTE, &mem_req);
+ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
+ file_offsets, PVFS_BYTE, &file_req);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
+ PVFS_BOTTOM, mem_req, &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_sys_read", 0);
+ }
+ /* --END ERROR HANDLING-- */
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ total_bytes_read += resp_io.total_completed;
+ size_read += new_buffer_read;
+ start_k = k;
+ start_j = j;
+ } /* while (size_read < bufsize) */
+ ADIOI_Free(mem_offsets);
+ ADIOI_Free(mem_lengths);
+ }
+ /* Other ADIO routines will convert absolute bytes into counts of datatypes */
+ /* when incrementing fp_ind, need to also take into account the file type:
+ * consider an N-element 1-d subarray with a lb and ub: (|---xxxxx-----|
+ * if we wrote N elements, offset needs to point at beginning of type, not
+ * at empty region at offset N+1)
+ *
+ * As we discussed on mpich-discuss in may/june 2009, the code below might
+ * look weird, but by putting fp_ind at the last byte written, the next
+ * time we run through the strided code we'll update the fp_ind to the
+ * right location. */
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind = file_offsets[file_list_count - 1] + file_lengths[file_list_count - 1];
+ }
+
+ ADIOI_Free(file_offsets);
+ ADIOI_Free(file_lengths);
+
+ if (err_flag == 0)
+ *error_code = MPI_SUCCESS;
+
+ error_state:
+ fd->fp_sys_posn = -1; /* no longer valid */
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+ /* This is a temporary way of filling in status. The right way is to
+ * keep track of how much data was actually read and placed in buf
+ * by ADIOI_BUFFERED_READ. */
+#endif
+
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_resize.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_resize.c
new file mode 100644
index 0000000000000000000000000000000000000000..5dfe5101ce9dcd864f1fd20f3dfb8dbc2220df95
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_resize.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+#include "ad_pvfs2_common.h"
+
+/* as with ADIOI_PVFS2_Flush, implement the resize operation in a scalable
+ * manner. one process does the work, then broadcasts the result to everyone
+ * else. fortunately, this operation is defined to be collective */
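+/* The worker broadcasts the int result of PVFS_sys_truncate and every
+ * other rank receives it through the same MPI_Bcast, so all ranks
+ * report the same success or failure code. */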
+void ADIOI_PVFS2_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
+{
+ int ret, rank;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ static char myname[] = "ADIOI_PVFS2_RESIZE";
+
+ *error_code = MPI_SUCCESS;
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ MPI_Comm_rank(fd->comm, &rank);
+
+ /* We designate one node in the communicator to be an 'io_worker' in
+ * ADIO_Open. This node can perform operations on files and then
+ * inform the other nodes of the result */
+
+ /* MPI-IO semantics treat conflicting MPI_File_set_size requests the
+ * same as conflicting write requests. Thus, a resize from one
+ * process does not have to be visible to the other processes until a
+ * synchronization point is reached */
+
+ if (rank == fd->hints->ranklist[0]) {
+ ret = PVFS_sys_truncate(pvfs_fs->object_ref, size, &(pvfs_fs->credentials));
+ MPI_Bcast(&ret, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
+ } else {
+ MPI_Bcast(&ret, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
+ }
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_truncate", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+}
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_write.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_write.c
new file mode 100644
index 0000000000000000000000000000000000000000..7cac61d34789061d360fa16e03168a4b03c3904a
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_write.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_pvfs2.h"
+#include "adio_extern.h"
+#include "ad_pvfs2_io.h"
+#include "ad_pvfs2_common.h"
+
+void ADIOI_PVFS2_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ int ret;
+ MPI_Count datatype_size, len;
+ PVFS_Request file_req, mem_req;
+ PVFS_sysresp_io resp_io;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ static char myname[] = "ADIOI_PVFS2_WRITECONTIG";
+
+ if (count == 0) {
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status)
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ MPI_Type_size_x(datatype, &datatype_size);
+ len = datatype_size * count;
+
+ ret = PVFS_Request_contiguous(len, PVFS_BYTE, &mem_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_Request_contiguous (memory)", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ ret = PVFS_Request_contiguous(len, PVFS_BYTE, &file_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_Request_contiguous (file)", 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, offset, (void *) buf,
+ mem_req, &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_write", 0);
+ goto fn_exit;
+ }
+ /* --END ERROR HANDLING-- */
+
+ fd->fp_sys_posn = offset + (int) resp_io.total_completed;
+ } else {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, fd->fp_ind, (void *) buf,
+ mem_req, &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (ret != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(ret),
+ "Error in PVFS_sys_write", 0);
+ goto fn_exit;
+ }
+ /* --END ERROR HANDLING-- */
+ fd->fp_ind += (int) resp_io.total_completed;
+ fd->fp_sys_posn = fd->fp_ind;
+ }
+#ifdef HAVE_STATUS_SET_BYTES
+ if (status)
+ MPIR_Status_set_bytes(status, datatype, resp_io.total_completed);
+#endif
+ *error_code = MPI_SUCCESS;
+ fn_exit:
+ PVFS_Request_free(&file_req);
+ PVFS_Request_free(&mem_req);
+ return;
+}
+
+int ADIOI_PVFS2_WriteStridedListIO(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ return ADIOI_PVFS2_StridedListIO(fd, (void *) buf, count,
+ datatype, file_ptr_type, offset, status, error_code, WRITE);
+}
+
+int ADIOI_PVFS2_WriteStridedDtypeIO(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ return ADIOI_PVFS2_StridedDtypeIO(fd, (void *) buf, count,
+ datatype, file_ptr_type, offset, status, error_code, WRITE);
+}
+
+
+void ADIOI_PVFS2_WriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ /* four ways (to date) that we can carry out strided i/o accesses:
+ * - naive posix
+ * - 'true' Datatype (from avery)
+ * - new List I/O (from avery)
+ * - classic List I/O (the one that's always been in ROMIO)
+ * I imagine we'll keep Datatype as an optional optimization, and after a
+ * release or two promote it to the default
+ */
+
+ /* a lot of near-duplication from ADIOI_PVFS2_ReadStrided: for
+ * debugging/testing it's helpful to be able to turn on and off these
+ * optimizations separately for the read and write cases */
+ int ret = -1;
+ if (fd->hints->fs_hints.pvfs2.posix_write == ADIOI_HINT_ENABLE) {
+ ADIOI_GEN_WriteStrided_naive(fd, buf, count,
+ datatype, file_ptr_type, offset, status, error_code);
+ return;
+ }
+ if (fd->hints->fs_hints.pvfs2.dtype_write == ADIOI_HINT_ENABLE) {
+ ret = ADIOI_PVFS2_WriteStridedDtypeIO(fd, buf, count,
+ datatype, file_ptr_type, offset, status, error_code);
+
+ /* Fall back to list I/O if datatype I/O didn't work */
+ if (ret != 0) {
+ fprintf(stderr, "Falling back to list I/O since datatype I/O failed\n");
+ ret = ADIOI_PVFS2_WriteStridedListIO(fd, buf, count,
+ datatype, file_ptr_type,
+ offset, status, error_code);
+ }
+ return;
+ }
+ /* Use the new list I/O when the hint enables it */
+ if (fd->hints->fs_hints.pvfs2.listio_write == ADIOI_HINT_ENABLE) {
+ ret = ADIOI_PVFS2_WriteStridedListIO(fd, buf, count, datatype,
+ file_ptr_type, offset, status, error_code);
+ return;
+ }
+
+ /* Fall back to classic list I/O when no hint selected another method (the base case) */
+ ADIOI_PVFS2_OldWriteStrided(fd, buf, count, datatype,
+ file_ptr_type, offset, status, error_code);
+ return;
+}
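+
+/* Illustrative sketch (not part of the driver): steering the strategy choice
+ * above through hints at open time. The hint key mirrors the
+ * fs_hints.pvfs2.dtype_write field consulted here; treat the exact key name
+ * as an assumption to verify against the PVFS2 hint-processing code. */
+#if 0
+static void example_enable_dtype_write(MPI_Comm comm, MPI_File * fh)
+{
+ MPI_Info info;
+ MPI_Info_create(&info);
+ MPI_Info_set(info, "romio_pvfs2_dtype_write", "enable"); /* assumed key */
+ MPI_File_open(comm, "pvfs2:/mnt/pvfs2/example",
+ MPI_MODE_CREATE | MPI_MODE_WRONLY, info, fh);
+ MPI_Info_free(&info);
+}
+#endif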
diff --git a/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_write_list_classic.c b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_write_list_classic.c
new file mode 100644
index 0000000000000000000000000000000000000000..8fe5c1de6c3f5b514dcc2dab60c8c0c0d583d428
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_pvfs2/ad_pvfs2_write_list_classic.c
@@ -0,0 +1,919 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+#include "ad_pvfs2.h"
+
+#include "ad_pvfs2_common.h"
+
+void ADIOI_PVFS2_OldWriteStrided(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ /* as with all the other WriteStrided functions, offset is in units of
+ * etype relative to the filetype */
+
+ /* Since PVFS2 does not support file locking, we can't do buffered writes
+ * the way we do on Unix */
+
+ ADIOI_Flatlist_node *flat_buf, *flat_file;
+ int i, j, k, bwr_size, fwr_size = 0, st_index = 0;
+ int sum, n_etypes_in_filetype, size_in_filetype;
+ MPI_Count bufsize;
+ int n_filetypes, etype_in_filetype;
+ ADIO_Offset abs_off_in_filetype = 0;
+ MPI_Count filetype_size, etype_size, buftype_size;
+ MPI_Aint lb, filetype_extent, buftype_extent;
+ int buf_count, buftype_is_contig, filetype_is_contig;
+ ADIO_Offset off, disp, start_off, initial_off;
+ int flag, st_fwr_size, st_n_filetypes;
+ int err_flag = 0;
+
+ int mem_list_count, file_list_count;
+ PVFS_size *mem_offsets;
+ int64_t *file_offsets;
+ int *mem_lengths;
+ int32_t *file_lengths;
+ int total_blks_to_write;
+
+ int max_mem_list, max_file_list;
+
+ int b_blks_wrote;
+ int f_data_wrote;
+ int size_wrote = 0, n_write_lists, extra_blks;
+
+ int end_bwr_size, end_fwr_size;
+ int start_k, start_j, new_file_write, new_buffer_write;
+ int start_mem_offset;
+ PVFS_Request mem_req, file_req;
+ ADIOI_PVFS2_fs *pvfs_fs;
+ PVFS_sysresp_io resp_io;
+ MPI_Offset total_bytes_written = 0;
+ static char myname[] = "ADIOI_PVFS2_WRITESTRIDED";
+
+ /* note: don't increase this: several parts of PVFS2 now
+ * assume this limit */
+#define MAX_ARRAY_SIZE 64
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (fd->atomicity) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_ARG,
+ "Atomic noncontiguous writes are not supported by PVFS2",
+ 0);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+ /* the HDF5 tests showed a bug in this list processing code (see many many
+ * lines down below). We added a workaround, but common HDF5 file types
+ * are actually contiguous and do not need the expensive workaround */
+ if (!filetype_is_contig) {
+ flat_file = ADIOI_Flatten_and_find(fd->filetype);
+ if (flat_file->count == 1 && !buftype_is_contig)
+ filetype_is_contig = 1;
+ }
+
+ MPI_Type_size_x(fd->filetype, &filetype_size);
+ if (!filetype_size) {
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, 0);
+#endif
+ *error_code = MPI_SUCCESS;
+ return;
+ }
+
+ MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
+ MPI_Type_size_x(datatype, &buftype_size);
+ MPI_Type_get_extent(datatype, &lb, &buftype_extent);
+ etype_size = fd->etype_size;
+
+ bufsize = buftype_size * count;
+
+ pvfs_fs = (ADIOI_PVFS2_fs *) fd->fs_ptr;
+
+ if (!buftype_is_contig && filetype_is_contig) {
+
+/* noncontiguous in memory, contiguous in file. */
+ int64_t file_offset;
+ int32_t file_length;
+
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+ off = fd->disp + etype_size * offset;
+ } else
+ off = fd->fp_ind;
+
+ file_list_count = 1;
+ file_offset = off;
+ file_length = 0;
+ total_blks_to_write = count * flat_buf->count;
+ b_blks_wrote = 0;
+
+ /* allocate arrays according to max usage */
+ if (total_blks_to_write > MAX_ARRAY_SIZE)
+ mem_list_count = MAX_ARRAY_SIZE;
+ else
+ mem_list_count = total_blks_to_write;
+ mem_offsets = (PVFS_size *) ADIOI_Malloc(mem_list_count * sizeof(PVFS_size));
+ mem_lengths = (int *) ADIOI_Malloc(mem_list_count * sizeof(int));
+
+ j = 0;
+ /* step through each block in memory, filling memory arrays */
+ while (b_blks_wrote < total_blks_to_write) {
+ for (i = 0; i < flat_buf->count; i++) {
+ mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] =
+ /* TODO: fix this warning by casting to an integer that's
+ * the same size as a char * and /then/ casting to
+ * PVFS_size */
+ ((PVFS_size) buf + j * buftype_extent + flat_buf->indices[i]);
+ mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] = flat_buf->blocklens[i];
+ file_length += flat_buf->blocklens[i];
+ b_blks_wrote++;
+ if (!(b_blks_wrote % MAX_ARRAY_SIZE) || (b_blks_wrote == total_blks_to_write)) {
+
+ /* in the case of the last write list call,
+ * adjust mem_list_count */
+ if (b_blks_wrote == total_blks_to_write) {
+ mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE;
+ /* in case last write list call fills max arrays */
+ if (!mem_list_count)
+ mem_list_count = MAX_ARRAY_SIZE;
+ }
+ err_flag = PVFS_Request_hindexed(mem_list_count,
+ mem_lengths, mem_offsets, PVFS_BYTE, &mem_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_Request_hindexed (memory)",
+ 0);
+ break;
+ }
+ /* --END ERROR HANDLING-- */
+
+ err_flag = PVFS_Request_contiguous(file_length, PVFS_BYTE, &file_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_Request_contiguous (file)",
+ 0);
+ break;
+ }
+ /* --END ERROR HANDLING-- */
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req,
+ file_offset, PVFS_BOTTOM,
+ mem_req, &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ total_bytes_written += resp_io.total_completed;
+
+ /* in the case of error or the last write list call,
+ * leave here */
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_sys_write", 0);
+ break;
+ }
+ /* --END ERROR HANDLING-- */
+ if (b_blks_wrote == total_blks_to_write)
+ break;
+
+ file_offset += file_length;
+ file_length = 0;
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ }
+ } /* for (i=0; i<flat_buf->count; i++) */
+ j++;
+ } /* while (b_blks_wrote < total_blks_to_write) */
+ ADIOI_Free(mem_offsets);
+ ADIOI_Free(mem_lengths);
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind += total_bytes_written;
+
+ if (!err_flag)
+ *error_code = MPI_SUCCESS;
+
+ fd->fp_sys_posn = -1; /* mark the system file position as unknown */
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+/* This is a temporary way of filling in status. The right way is to
+ keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
+#endif
+
+ return;
+ }
+
+
+
+
+ /* if (!buftype_is_contig && filetype_is_contig) */
+ /* already know that file is noncontiguous from above */
+ /* noncontiguous in file */
+ /* filetype already flattened in ADIO_Open */
+ flat_file = ADIOI_Flatten_and_find(fd->filetype);
+
+ disp = fd->disp;
+ initial_off = offset;
+
+ /* for each case - ADIO_Individual pointer or explicit, find offset
+ * (file offset in bytes), n_filetypes (how many filetypes into file
+ * to start), fwr_size (remaining amount of data in present file
+ * block), and st_index (start point in terms of blocks in starting
+ * filetype) */
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind; /* in bytes */
+ n_filetypes = -1;
+ flag = 0;
+ while (!flag) {
+ n_filetypes++;
+ for (i = 0; i < flat_file->count; i++) {
+ if (disp + flat_file->indices[i] +
+ ((ADIO_Offset) n_filetypes) * filetype_extent +
+ flat_file->blocklens[i] >= offset) {
+ st_index = i;
+ fwr_size = disp + flat_file->indices[i] +
+ ((ADIO_Offset) n_filetypes) * filetype_extent
+ + flat_file->blocklens[i] - offset;
+ flag = 1;
+ break;
+ }
+ }
+ } /* while (!flag) */
+ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */
+ else {
+ n_etypes_in_filetype = filetype_size / etype_size;
+ n_filetypes = (int) (offset / n_etypes_in_filetype);
+ etype_in_filetype = (int) (offset % n_etypes_in_filetype);
+ size_in_filetype = etype_in_filetype * etype_size;
+
+ sum = 0;
+ for (i = 0; i < flat_file->count; i++) {
+ sum += flat_file->blocklens[i];
+ if (sum > size_in_filetype) {
+ st_index = i;
+ fwr_size = sum - size_in_filetype;
+ abs_off_in_filetype = flat_file->indices[i] +
+ size_in_filetype - (sum - flat_file->blocklens[i]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ offset = disp + ((ADIO_Offset) n_filetypes) * filetype_extent + abs_off_in_filetype;
+ } /* else [file_ptr_type != ADIO_INDIVIDUAL] */
+
+ start_off = offset;
+ st_fwr_size = fwr_size;
+ st_n_filetypes = n_filetypes;
+
+ if (buftype_is_contig && !filetype_is_contig) {
+
+/* contiguous in memory, noncontiguous in file. should be the most
+ common case. */
+
+ int mem_length;
+ intptr_t mem_offset;
+
+ i = 0;
+ j = st_index;
+ off = offset;
+ n_filetypes = st_n_filetypes;
+
+ mem_list_count = 1;
+
+ /* determine how many blocks in file to write */
+ f_data_wrote = MPL_MIN(st_fwr_size, bufsize);
+ total_blks_to_write = 1;
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ while (f_data_wrote < bufsize) {
+ f_data_wrote += flat_file->blocklens[j];
+ total_blks_to_write++;
+ if (j < (flat_file->count - 1))
+ j++;
+ else
+ j = 0;
+ }
+
+ j = st_index;
+ n_filetypes = st_n_filetypes;
+ n_write_lists = total_blks_to_write / MAX_ARRAY_SIZE;
+ extra_blks = total_blks_to_write % MAX_ARRAY_SIZE;
+
+ mem_offset = (intptr_t) buf;
+ mem_length = 0;
+
+ /* if at least one full writelist, allocate file arrays
+ * at max array size and don't free until very end */
+ if (n_write_lists) {
+ file_offsets = (int64_t *) ADIOI_Malloc(MAX_ARRAY_SIZE * sizeof(int64_t));
+ file_lengths = (int32_t *) ADIOI_Malloc(MAX_ARRAY_SIZE * sizeof(int32_t));
+ }
+ /* if there's no full writelist allocate file arrays according
+ * to needed size (extra_blks) */
+ else {
+ file_offsets = (int64_t *) ADIOI_Malloc(extra_blks * sizeof(int64_t));
+ file_lengths = (int32_t *) ADIOI_Malloc(extra_blks * sizeof(int32_t));
+ }
+
+ /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
+ for (i = 0; i < n_write_lists; i++) {
+ file_list_count = MAX_ARRAY_SIZE;
+ if (!i) {
+ file_offsets[0] = offset;
+ file_lengths[0] = st_fwr_size;
+ mem_length = st_fwr_size;
+ }
+ for (k = 0; k < MAX_ARRAY_SIZE; k++) {
+ if (i || k) {
+ file_offsets[k] = disp +
+ ((ADIO_Offset) n_filetypes) * filetype_extent + flat_file->indices[j];
+ file_lengths[k] = flat_file->blocklens[j];
+ mem_length += file_lengths[k];
+ }
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
+
+ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
+ file_offsets, PVFS_BYTE, &file_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_Request_hindexed (file)", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+
+ err_flag = PVFS_Request_contiguous(mem_length, PVFS_BYTE, &mem_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_Request_contiguous (memory)", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
+ (void *) mem_offset, mem_req,
+ &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_sys_write", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+ total_bytes_written += resp_io.total_completed;
+
+ mem_offset += mem_length;
+ mem_length = 0; /* reset the accumulated length for the next write list */
+ PVFS_Request_free(&file_req);
+ PVFS_Request_free(&mem_req);
+
+ } /* for (i=0; i<n_write_lists; i++) */
+
+ /* for file arrays smaller than MAX_ARRAY_SIZE (the last write list) */
+ if (extra_blks) {
+ file_list_count = extra_blks;
+ if (!i) {
+ file_offsets[0] = offset;
+ file_lengths[0] = MPL_MIN(st_fwr_size, bufsize);
+ }
+ for (k = 0; k < extra_blks; k++) {
+ if (i || k) {
+ file_offsets[k] = disp +
+ ((ADIO_Offset) n_filetypes) * filetype_extent + flat_file->indices[j];
+ if (k == (extra_blks - 1)) {
+ file_lengths[k] = bufsize - (int32_t) mem_length
+ - mem_offset + (int32_t) buf;
+ } else
+ file_lengths[k] = flat_file->blocklens[j];
+ } /* if (i || k) */
+ mem_length += file_lengths[k];
+ if (j < (flat_file->count - 1))
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ } /* for (k=0; k<extra_blks; k++) */
+
+ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
+ file_offsets, PVFS_BYTE, &file_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_Request_hindexed (file)", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+
+ err_flag = PVFS_Request_contiguous(mem_length, PVFS_BYTE, &mem_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_Request_contiguous (memory)", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
+ (void *) mem_offset, mem_req,
+ &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_sys_write", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+ total_bytes_written += resp_io.total_completed;
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ }
+ } else {
+ /* noncontiguous in memory as well as in file */
+
+ flat_buf = ADIOI_Flatten_and_find(datatype);
+
+ size_wrote = 0;
+ n_filetypes = st_n_filetypes;
+ fwr_size = st_fwr_size;
+ bwr_size = flat_buf->blocklens[0];
+ buf_count = 0;
+ start_mem_offset = 0;
+ start_k = k = 0;
+ start_j = st_index;
+ max_mem_list = 0;
+ max_file_list = 0;
+
+ /* run through and find max_file_list and max_mem_list so that you
+ * can allocate the file and memory arrays less than MAX_ARRAY_SIZE
+ * if possible */
+
+ while (size_wrote < bufsize) {
+ k = start_k;
+ new_buffer_write = 0;
+ mem_list_count = 0;
+ while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize - size_wrote)) {
+ /* find mem_list_count and file_list_count such that both are
+ * less than MAX_ARRAY_SIZE, the sum of their lengths are
+ * equal, and the sum of all the data written and data to be
+ * written in the next immediate write list is less than
+ * bufsize */
+ if (mem_list_count) {
+ if ((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) {
+ end_bwr_size = new_buffer_write +
+ flat_buf->blocklens[k] - (bufsize - size_wrote);
+ new_buffer_write = bufsize - size_wrote;
+ } else {
+ new_buffer_write += flat_buf->blocklens[k];
+ end_bwr_size = flat_buf->blocklens[k];
+ }
+ } else {
+ if (bwr_size > (bufsize - size_wrote)) {
+ new_buffer_write = bufsize - size_wrote;
+ bwr_size = new_buffer_write;
+ } else
+ new_buffer_write = bwr_size;
+ }
+ mem_list_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
+ * (new_buffer_write < bufsize-size_wrote)) */
+ j = start_j;
+ new_file_write = 0;
+ file_list_count = 0;
+ while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) {
+ if (file_list_count) {
+ if ((new_file_write + flat_file->blocklens[j]) > new_buffer_write) {
+ end_fwr_size = new_buffer_write - new_file_write;
+ new_file_write = new_buffer_write;
+ j--;
+ } else {
+ new_file_write += flat_file->blocklens[j];
+ end_fwr_size = flat_file->blocklens[j];
+ }
+ } else {
+ if (fwr_size > new_buffer_write) {
+ new_file_write = new_buffer_write;
+ fwr_size = new_file_write;
+ } else
+ new_file_write = fwr_size;
+ }
+ file_list_count++;
+ if (j < (flat_file->count - 1))
+ j++;
+ else
+ j = 0;
+
+ k = start_k;
+ if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) {
+ new_buffer_write = 0;
+ mem_list_count = 0;
+ while (new_buffer_write < new_file_write) {
+ if (mem_list_count) {
+ if ((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) {
+ end_bwr_size = new_file_write - new_buffer_write;
+ new_buffer_write = new_file_write;
+ k--;
+ } else {
+ new_buffer_write += flat_buf->blocklens[k];
+ end_bwr_size = flat_buf->blocklens[k];
+ }
+ } else {
+ new_buffer_write = bwr_size;
+ if (bwr_size > (bufsize - size_wrote)) {
+ new_buffer_write = bufsize - size_wrote;
+ bwr_size = new_buffer_write;
+ }
+ }
+ mem_list_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* while (new_buffer_write < new_file_write) */
+ } /* if ((new_file_write < new_buffer_write) &&
+ * (file_list_count == MAX_ARRAY_SIZE)) */
+ } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
+ * (new_buffer_write < bufsize-size_wrote)) */
+
+ /* fakes filling the writelist arrays of lengths found above */
+ k = start_k;
+ j = start_j;
+ for (i = 0; i < mem_list_count; i++) {
+ if (i) {
+ if (i == (mem_list_count - 1)) {
+ if (flat_buf->blocklens[k] == end_bwr_size)
+ bwr_size = flat_buf->blocklens[(k + 1) % flat_buf->count];
+ else {
+ bwr_size = flat_buf->blocklens[k] - end_bwr_size;
+ k--;
+ buf_count--;
+ }
+ }
+ }
+ buf_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* for (i=0; i<mem_list_count; i++) */
+
+ for (i = 0; i < file_list_count; i++) {
+ if (i) {
+ if (i == (file_list_count - 1)) {
+ if (flat_file->blocklens[j] == end_fwr_size)
+ fwr_size = flat_file->blocklens[(j + 1) % flat_file->count];
+ else {
+ fwr_size = flat_file->blocklens[j] - end_fwr_size;
+ j--;
+ }
+ }
+ }
+ if (j < flat_file->count - 1)
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ } /* for (i=0; i<file_list_count; i++) */
+
+ size_wrote += new_buffer_write;
+ start_k = k;
+ start_j = j;
+ if (max_mem_list < mem_list_count)
+ max_mem_list = mem_list_count;
+ if (max_file_list < file_list_count)
+ max_file_list = file_list_count;
+ } /* while (size_wrote < bufsize) */
+
+ /* one last check before we actually carry out the operation: this code
+ * cannot handle a noncontiguous file type whose pieces are all larger
+ * than the sum of the memory type lengths (or vice versa). In those
+ * cases, give up and fall back to the naive strided write. */
+ if (((file_list_count == 1) &&
+ (new_file_write < flat_file->blocklens[0])) ||
+ ((mem_list_count == 1) &&
+ (new_buffer_write < flat_buf->blocklens[0])) ||
+ ((file_list_count == MAX_ARRAY_SIZE) &&
+ (new_file_write < flat_buf->blocklens[0])) ||
+ ((mem_list_count == MAX_ARRAY_SIZE) && (new_buffer_write < flat_file->blocklens[0]))) {
+ ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype,
+ file_ptr_type, initial_off, status, error_code);
+ return;
+ }
+
+
+ mem_offsets = (PVFS_size *) ADIOI_Malloc(max_mem_list * sizeof(PVFS_size));
+ mem_lengths = (int *) ADIOI_Malloc(max_mem_list * sizeof(int));
+ file_offsets = (int64_t *) ADIOI_Malloc(max_file_list * sizeof(int64_t));
+ file_lengths = (int32_t *) ADIOI_Malloc(max_file_list * sizeof(int32_t));
+
+ size_wrote = 0;
+ n_filetypes = st_n_filetypes;
+ fwr_size = st_fwr_size;
+ bwr_size = flat_buf->blocklens[0];
+ buf_count = 0;
+ start_mem_offset = 0;
+ start_k = k = 0;
+ start_j = st_index;
+
+ /* this section calculates mem_list_count and file_list_count
+ * and also finds the possibly odd sized last array elements
+ * in new_fwr_size and new_bwr_size */
+
+ while (size_wrote < bufsize) {
+ k = start_k;
+ new_buffer_write = 0;
+ mem_list_count = 0;
+ while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize - size_wrote)) {
+ /* find mem_list_count and file_list_count such that both are
+ * less than MAX_ARRAY_SIZE, the sum of their lengths are
+ * equal, and the sum of all the data written and data to be
+ * written in the next immediate write list is less than
+ * bufsize */
+ if (mem_list_count) {
+ if ((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) {
+ end_bwr_size = new_buffer_write +
+ flat_buf->blocklens[k] - (bufsize - size_wrote);
+ new_buffer_write = bufsize - size_wrote;
+ } else {
+ new_buffer_write += flat_buf->blocklens[k];
+ end_bwr_size = flat_buf->blocklens[k];
+ }
+ } else {
+ if (bwr_size > (bufsize - size_wrote)) {
+ new_buffer_write = bufsize - size_wrote;
+ bwr_size = new_buffer_write;
+ } else
+ new_buffer_write = bwr_size;
+ }
+ mem_list_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
+ * (new_buffer_write < bufsize-size_wrote)) */
+ j = start_j;
+ new_file_write = 0;
+ file_list_count = 0;
+ while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) {
+ if (file_list_count) {
+ if ((new_file_write + flat_file->blocklens[j]) > new_buffer_write) {
+ end_fwr_size = new_buffer_write - new_file_write;
+ new_file_write = new_buffer_write;
+ j--;
+ } else {
+ new_file_write += flat_file->blocklens[j];
+ end_fwr_size = flat_file->blocklens[j];
+ }
+ } else {
+ if (fwr_size > new_buffer_write) {
+ new_file_write = new_buffer_write;
+ fwr_size = new_file_write;
+ } else
+ new_file_write = fwr_size;
+ }
+ file_list_count++;
+ if (j < (flat_file->count - 1))
+ j++;
+ else
+ j = 0;
+
+ k = start_k;
+ if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) {
+ new_buffer_write = 0;
+ mem_list_count = 0;
+ while (new_buffer_write < new_file_write) {
+ if (mem_list_count) {
+ if ((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) {
+ end_bwr_size = new_file_write - new_buffer_write;
+ new_buffer_write = new_file_write;
+ k--;
+ } else {
+ new_buffer_write += flat_buf->blocklens[k];
+ end_bwr_size = flat_buf->blocklens[k];
+ }
+ } else {
+ new_buffer_write = bwr_size;
+ if (bwr_size > (bufsize - size_wrote)) {
+ new_buffer_write = bufsize - size_wrote;
+ bwr_size = new_buffer_write;
+ }
+ }
+ mem_list_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* while (new_buffer_write < new_file_write) */
+ } /* if ((new_file_write < new_buffer_write) &&
+ * (file_list_count == MAX_ARRAY_SIZE)) */
+ } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
+ * (new_buffer_write < bufsize-size_wrote)) */
+
+ /* fills the allocated writelist arrays */
+ k = start_k;
+ j = start_j;
+ for (i = 0; i < mem_list_count; i++) {
+ /* TODO: fix this warning by casting to an integer that's the
+ * same size as a char * and /then/ casting to PVFS_size */
+ mem_offsets[i] = ((PVFS_size) buf + buftype_extent *
+ (buf_count / flat_buf->count) + (int) flat_buf->indices[k]);
+
+ if (!i) {
+ mem_lengths[0] = bwr_size;
+ mem_offsets[0] += flat_buf->blocklens[k] - bwr_size;
+ } else {
+ if (i == (mem_list_count - 1)) {
+ mem_lengths[i] = end_bwr_size;
+ if (flat_buf->blocklens[k] == end_bwr_size)
+ bwr_size = flat_buf->blocklens[(k + 1) % flat_buf->count];
+ else {
+ bwr_size = flat_buf->blocklens[k] - end_bwr_size;
+ k--;
+ buf_count--;
+ }
+ } else {
+ mem_lengths[i] = flat_buf->blocklens[k];
+ }
+ }
+ buf_count++;
+ k = (k + 1) % flat_buf->count;
+ } /* for (i=0; i<mem_list_count; i++) */
+
+ for (i = 0; i < file_list_count; i++) {
+ file_offsets[i] = disp + flat_file->indices[j] +
+ ((ADIO_Offset) n_filetypes) * filetype_extent;
+ if (!i) {
+ file_lengths[0] = fwr_size;
+ file_offsets[0] += flat_file->blocklens[j] - fwr_size;
+ } else {
+ if (i == (file_list_count - 1)) {
+ file_lengths[i] = end_fwr_size;
+ if (flat_file->blocklens[j] == end_fwr_size)
+ fwr_size = flat_file->blocklens[(j + 1) % flat_file->count];
+ else {
+ fwr_size = flat_file->blocklens[j] - end_fwr_size;
+ j--;
+ }
+ } else
+ file_lengths[i] = flat_file->blocklens[j];
+ }
+ if (j < flat_file->count - 1)
+ j++;
+ else {
+ j = 0;
+ n_filetypes++;
+ }
+ } /* for (i=0; i<file_list_count; i++) */
+
+ err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
+ mem_offsets, PVFS_BYTE, &mem_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_Request_hindexed (memory)", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+
+ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
+ file_offsets, PVFS_BYTE, &file_req);
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_Request_hindexed (file)", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
+ PVFS_BOTTOM, mem_req, &(pvfs_fs->credentials), &resp_io);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ /* --BEGIN ERROR HANDLING-- */
+ if (err_flag != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ ADIOI_PVFS2_error_convert(err_flag),
+ "Error in PVFS_sys_write", 0);
+ goto error_state;
+ }
+ /* --END ERROR HANDLING-- */
+
+ size_wrote += new_buffer_write;
+ total_bytes_written += resp_io.total_completed;
+ start_k = k;
+ start_j = j;
+ PVFS_Request_free(&mem_req);
+ PVFS_Request_free(&file_req);
+ } /* while (size_wrote < bufsize) */
+ ADIOI_Free(mem_offsets);
+ ADIOI_Free(mem_lengths);
+ }
+ /* when incrementing fp_ind, need to also take into account the file type:
+ * consider an N-element 1-d subarray with a lb and ub: (|---xxxxx-----|
+ * if we wrote N elements, offset needs to point at beginning of type, not
+ * at empty region at offset N+1).
+ *
+ * As we discussed on mpich-discuss in May/June 2009, the code below might
+ * look weird, but by putting fp_ind at the last byte written, the next
+ * time we run through the strided code we'll update the fp_ind to the
+ * right location. */
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind = file_offsets[file_list_count - 1] + file_lengths[file_list_count - 1];
+ }
+ ADIOI_Free(file_offsets);
+ ADIOI_Free(file_lengths);
+
+ *error_code = MPI_SUCCESS;
+
+ error_state:
+ fd->fp_sys_posn = -1; /* mark the system file position as unknown */
+
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bufsize);
+/* This is a temporary way of filling in status. The right way is to
+ keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
+#endif
+
+}
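+
+/* Worked example (for reference) of the explicit-offset arithmetic above:
+ * with etype_size = 4 and a flattened filetype of two blocks
+ * {8 bytes at index 0, 4 bytes at index 16} (filetype_size = 12,
+ * filetype_extent = 20), an offset of 5 etypes gives
+ * n_etypes_in_filetype = 3, n_filetypes = 1, etype_in_filetype = 2 and
+ * size_in_filetype = 8; the scan stops at the second block with
+ * fwr_size = 4 and abs_off_in_filetype = 16, so the absolute offset is
+ * disp + 1 * 20 + 16 = disp + 36. */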
diff --git a/3rd-party/romio341/adio/ad_quobytefs/Makefile.mk b/3rd-party/romio341/adio/ad_quobytefs/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..2916fd0bf6a87ccab258b09070ee9c550f245a0c
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/Makefile.mk
@@ -0,0 +1,25 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_QUOBYTEFS
+
+noinst_HEADERS += adio/ad_quobytefs/ad_quobytefs.h \
+ adio/ad_quobytefs/ad_quobytefs_internal.h
+
+romio_other_sources += \
+ adio/ad_quobytefs/ad_quobytefs.c \
+ adio/ad_quobytefs/ad_quobytefs_open.c \
+ adio/ad_quobytefs/ad_quobytefs_close.c \
+ adio/ad_quobytefs/ad_quobytefs_write.c \
+ adio/ad_quobytefs/ad_quobytefs_flush.c \
+ adio/ad_quobytefs/ad_quobytefs_fcntl.c \
+ adio/ad_quobytefs/ad_quobytefs_read.c \
+ adio/ad_quobytefs/ad_quobytefs_resize.c \
+ adio/ad_quobytefs/ad_quobytefs_delete.c \
+ adio/ad_quobytefs/ad_quobytefs_aio.c \
+ adio/ad_quobytefs/ad_quobytefs_setlock.c \
+ adio/ad_quobytefs/ad_quobytefs_internal.c
+
+endif BUILD_AD_QUOBYTEFS
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cc53bd79a939a39c8d26dae9330a92669c77f
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "ad_quobytefs.h"
+
+#include "adioi.h"
+#include <stdio.h>
+
+struct ADIOI_Fns_struct ADIO_QUOBYTEFS_operations = {
+ ADIOI_QUOBYTEFS_Open, /* Open */
+ ADIOI_GEN_OpenColl, /* OpenColl */
+ ADIOI_QUOBYTEFS_ReadContig, /* ReadContig */
+ ADIOI_QUOBYTEFS_WriteContig, /* WriteContig */
+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+ ADIOI_QUOBYTEFS_Fcntl, /* Fcntl */
+ ADIOI_GEN_SetInfo, /* SetInfo */
+ ADIOI_GEN_ReadStrided, /* ReadStrided */
+ ADIOI_GEN_WriteStrided, /* WriteStrided */
+ ADIOI_QUOBYTEFS_Close, /* Close */
+#ifdef ROMIO_HAVE_WORKING_AIO
+ ADIOI_QUOBYTEFS_IreadContig, /* IreadContig */
+ ADIOI_QUOBYTEFS_IwriteContig, /* IwriteContig */
+#else
+ ADIOI_FAKE_IreadContig, /* IreadContig */
+ ADIOI_FAKE_IwriteContig, /* IwriteContig */
+#endif
+ ADIOI_GEN_IODone, /* ReadDone */
+ ADIOI_GEN_IODone, /* WriteDone */
+ ADIOI_GEN_IOComplete, /* ReadComplete */
+ ADIOI_GEN_IOComplete, /* WriteComplete */
+ ADIOI_GEN_IreadStrided, /* IreadStrided */
+ ADIOI_GEN_IwriteStrided, /* IwriteStrided */
+ ADIOI_QUOBYTEFS_Flush, /* Flush */
+ ADIOI_QUOBYTEFS_Resize, /* Resize */
+ ADIOI_QUOBYTEFS_Delete, /* Delete */
+ ADIOI_GEN_Feature, /* Features */
+ "QUOBYTEFS:ROMIO driver for quobyte file system",
+ ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+ ADIOI_GEN_IwriteStridedColl, /* IwriteStridedColl */
+ ADIOI_QUOBYTEFS_SetLock /* SetLock */
+};
+
+static char *extract_registry(const char *filename, int *error_code)
+{
+ /* input: //registry.address/[volume/]path
+ * output: registry.address */
+ static char myname[] = "extract_registry";
+ const char *prefix = "//";
+ int prefix_size = strlen(prefix);
+ if (strncmp(filename, prefix, prefix_size) != 0) {
+ /* not a "//registry/..." style path: reject it */
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, myname,
+ __LINE__, MPI_ERR_NAME, "Invalid uri", 0);
+ return NULL;
+ }
+ char *extract_filename = (char *) filename + prefix_size;
+ char *tmp = strchr(extract_filename, '/');
+ char *registry = NULL;
+
+ if (tmp != NULL && tmp > extract_filename) {
+ size_t length = tmp - extract_filename;
+ registry = strndup(extract_filename, length);
+ *error_code = MPI_SUCCESS;
+ return registry;
+ } else {
+ *error_code = ADIOI_Err_create_code(myname, filename, EINVAL);
+ }
+ return NULL;
+}
+
+void ADIOI_QUOBYTEFS_CreateAdapter(const char *filename, int *error_code)
+{
+ static char myname[] = "ADIOI_QUOBYTEFS_CreateAdapter";
+
+ char *registry = extract_registry(filename, error_code);
+ if (registry == NULL || *error_code != MPI_SUCCESS) {
+ return;
+ }
+ /* the name is "adio_" followed by up to eight hex digits of the rank */
+ const char process_name[] = "adio_ffffffff";
+ char name_buffer[sizeof(process_name)];
+ int rank;
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ snprintf(name_buffer, sizeof(name_buffer), "adio_%x", rank);
+ quobyte_set_process_name(name_buffer);
+ int create_status = quobyte_create_adapter(registry);
+ MPL_external_free(registry);
+ if (create_status != EISCONN && create_status != 0) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, myname,
+ __LINE__, MPI_ERR_IO,
+ "Could not create quobyte adapter", 0);
+ return;
+ }
+ global_quobyte_io_context = -1;
+}
+
+void ADIOI_QUOBYTEFS_DestroyAdapter()
+{
+ /* TODO(alexey): placeholder for adapter destruction, to be filled in
+ * once it works as expected */
+}
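+
+/* For reference, expected extract_registry() behavior on hypothetical
+ * inputs:
+ * "//registry.example:7861/volume/file" -> "registry.example:7861"
+ * "//registry.example/file" -> "registry.example"
+ * "no/prefix" -> NULL, *error_code set */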
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs.h b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..3efff45379a1057c8d2b62491dad1622e133f933
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs.h
@@ -0,0 +1,69 @@
+/**
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ *
+ * The main purpose of implementing the ADIO interface for Quobyte is to
+ * avoid calling the kernel for I/O operations. Using default MPI, a user
+ * would mount a quobyte volume in userspace and then use regular unix file
+ * paths as input for MPI-IO.
+ *
+ * This results in the following calls:
+ * MPI-IO <-> UFS (kernel) <-> quobyte client (userspace)
+ *
+ * Using Quobyte ADIO interface we will call the quobyte client directly
+ * through our API library:
+ * MPI-IO <-> quobyte client (userspace)
+ *
+ * This enables Quobyte users to take full advantage of the Quobyte
+ * distributed filesystem.
+ *
+ * This implementation of the ADIO interface is based on the available UFS
+ * ADIO implementation, committing minimal changes necessary for the
+ * compatibility with the Quobyte filesystem.
+ */
+
+#ifndef AD_QUOBYTEFS_H_INCLUDED
+#define AD_QUOBYTEFS_H_INCLUDED
+
+#include "adio.h"
+#include <quobyte.h>
+#include <stdbool.h>
+#include <string.h>
+
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+
+extern int global_quobyte_io_context;
+
+void ADIOI_QUOBYTEFS_CreateAdapter(const char *registry, int *error_code);
+void ADIOI_QUOBYTEFS_DestroyAdapter(void) __attribute__ ((destructor));
+
+void ADIOI_QUOBYTEFS_Open(ADIO_File fd, int *error_code);
+void ADIOI_QUOBYTEFS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_QUOBYTEFS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code);
+void ADIOI_QUOBYTEFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int
+ *error_code);
+void ADIOI_QUOBYTEFS_Close(ADIO_File fd, int *error_code);
+void ADIOI_QUOBYTEFS_Flush(ADIO_File fd, int *error_code);
+void ADIOI_QUOBYTEFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
+void ADIOI_QUOBYTEFS_Delete(const char *path, int *error_code);
+int ADIOI_QUOBYTEFS_SetLock(ADIO_File fd, int cmd, int type, ADIO_Offset offset, int whence,
+ ADIO_Offset len);
+int ADIOI_QUOBYTEFS_aio(ADIO_File fd, void *buf, int count, MPI_Datatype type,
+ ADIO_Offset offset, int wr, MPI_Request * request);
+int ADIOI_QUOBYTEFS_aio_free_fn(void *extra_state);
+int ADIOI_QUOBYTEFS_aio_poll_fn(void *extra_state, MPI_Status * status);
+int ADIOI_QUOBYTEFS_aio_wait_fn(int count, void **array_of_states, double timeout,
+ MPI_Status * status);
+void ADIOI_QUOBYTEFS_IreadContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype,
+ int file_ptr_type, ADIO_Offset offset, MPI_Request * request,
+ int *error_code);
+void ADIOI_QUOBYTEFS_IwriteContig(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype,
+ int file_ptr_type, ADIO_Offset offset, ADIO_Request * request,
+ int *error_code);
+#endif /* AD_QUOBYTEFS_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_aio.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_aio.c
new file mode 100644
index 0000000000000000000000000000000000000000..ba31bc7220ba495ec3636df2e91592b4079f01bf
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_aio.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#define READ 0
+#define WRITE 1
+#define QUOBYTE_CONCURRENT_REQS 8
+
+#include "ad_quobytefs.h"
+#include "mpiu_greq.h"
+
+static int ADIOI_QUOBYTEFS_greq_class = 0;
+int global_quobyte_io_context;
+
+int ADIOI_QUOBYTEFS_aio_free_fn(void *extra_state);
+int ADIOI_QUOBYTEFS_aio_poll_fn(void *extra_state, MPI_Status * status);
+int ADIOI_QUOBYTEFS_aio_wait_fn(int count, void **array_of_states, double timeout,
+ MPI_Status * status);
+
+static void quobyte_io_event_finished(void *event, int ret)
+{
+ struct quobyte_io_event *aio_event = (struct quobyte_io_event *) event;
+ aio_event->result = ret;
+ if (ret >= 0) {
+ aio_event->errorcode = 0;
+ } else {
+ aio_event->errorcode = EIO;
+ }
+}
+
+int ADIOI_QUOBYTEFS_aio(ADIO_File fd, void *buf, int count, MPI_Datatype type,
+ ADIO_Offset offset, int wr, MPI_Request * request)
+{
+
+ int err = -1;
+ static char myname[] = "ADIOI_QUOBYTEFS_aio";
+ struct quobyte_iocb *aiocbp = NULL;
+ struct quobyte_io_event *quobyte_aio = NULL;
+ ADIOI_AIO_Request *aio_req = NULL;
+ MPI_Count len, typesize;
+
+ MPI_Type_size_x(type, &typesize);
+ len = count * typesize;
+ if (global_quobyte_io_context == -1) {
+ global_quobyte_io_context = quobyte_aio_setup(QUOBYTE_CONCURRENT_REQS);
+ }
+ aio_req = (ADIOI_AIO_Request *) ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);
+ aiocbp = (struct quobyte_iocb *) ADIOI_Calloc(sizeof(struct quobyte_iocb), 1);
+ quobyte_aio = (struct quobyte_io_event *) ADIOI_Calloc(sizeof(struct quobyte_io_event), 1);
+ if (wr == WRITE) {
+ aiocbp->op_code = QB_WRITE;
+ } else {
+ aiocbp->op_code = QB_READ;
+ }
+ aiocbp->io_context = global_quobyte_io_context;
+ aiocbp->file_handle = fd->file_handle;
+ aiocbp->buffer = buf;
+ aiocbp->offset = offset;
+ aiocbp->length = len;
+ quobyte_aio->iocb = aiocbp;
+ quobyte_aio->errorcode = EINPROGRESS;
+ quobyte_aio->result = -1;
+ aio_req->qaiocbp = quobyte_aio;
+
+ err =
+ quobyte_aio_submit_with_callback(global_quobyte_io_context, aiocbp,
+ quobyte_io_event_finished, quobyte_aio);
+ if (err == -1) {
+ ADIOI_Free(aio_req);
+ ADIOI_Free(aiocbp);
+ ADIOI_Free(quobyte_aio); /* also release the event to avoid a leak */
+ return MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, myname,
+ __LINE__, MPI_ERR_IO,
+ "Quobyte failed to submit aio context", 0);
+ }
+
+ if (ADIOI_QUOBYTEFS_greq_class == 0) {
+ MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn,
+ ADIOI_QUOBYTEFS_aio_free_fn, MPIU_Greq_cancel_fn,
+ ADIOI_QUOBYTEFS_aio_poll_fn, ADIOI_QUOBYTEFS_aio_wait_fn,
+ &ADIOI_QUOBYTEFS_greq_class);
+ }
+ MPIX_Grequest_class_allocate(ADIOI_QUOBYTEFS_greq_class, aio_req, request);
+ memcpy(&(aio_req->req), request, sizeof(MPI_Request));
+ return 0;
+}
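+
+/* Note on the generalized-request plumbing above: the class is created once
+ * with query/free/cancel callbacks plus the poll and wait functions defined
+ * below; each submission then allocates a request from that class, and
+ * MPI_Test/MPI_Wait drive ADIOI_QUOBYTEFS_aio_poll_fn until
+ * quobyte_io_event_finished() has recorded a result for the event. */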
+
+void ADIOI_QUOBYTEFS_IreadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, MPI_Request * request, int *error_code)
+{
+ MPI_Count len, typesize;
+ int aio_errno = 0;
+ static char myname[] = "ADIOI_QUOBYTEFS_IREADCONTIG";
+
+ MPI_Type_size_x(datatype, &typesize);
+ len = count * typesize;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ offset = fd->fp_ind;
+ aio_errno = ADIOI_QUOBYTEFS_aio(fd, buf, count, datatype, offset, READ, request);
+
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (aio_errno != 0) {
+ MPIO_ERR_CREATE_CODE_ERRNO(myname, aio_errno, error_code);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind += len;
+
+ fd->fp_sys_posn = -1;
+ *error_code = MPI_SUCCESS;
+}
+
+void ADIOI_QUOBYTEFS_IwriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Request * request, int *error_code)
+{
+ MPI_Count len, typesize;
+ int aio_errno = 0;
+ static char myname[] = "ADIOI_QUOBYTEFS_IWRITECONTIG";
+
+ MPI_Type_size_x(datatype, &typesize);
+ len = count * typesize;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ offset = fd->fp_ind;
+ /* Cast away the const'ness of 'buf' as ADIOI_QUOBYTEFS_aio is used for
+ * both read and write calls */
+ aio_errno = ADIOI_QUOBYTEFS_aio(fd, (char *) buf, count, datatype, offset, WRITE, request);
+
+ /* --BEGIN ERROR HANDLING-- */
+ if (aio_errno != 0) {
+ MPIO_ERR_CREATE_CODE_ERRNO(myname, aio_errno, error_code);
+ return;
+ }
+ /* --END ERROR HANDLING-- */
+
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind += len;
+
+ fd->fp_sys_posn = -1;
+
+ *error_code = MPI_SUCCESS;
+}
+
+
+int ADIOI_QUOBYTEFS_aio_free_fn(void *extra_state)
+{
+ ADIOI_AIO_Request *aio_req;
+ aio_req = (ADIOI_AIO_Request *) extra_state;
+
+ if (aio_req != NULL && aio_req->qaiocbp != NULL && aio_req->qaiocbp->iocb != NULL) {
+
+ ADIOI_Free(aio_req->qaiocbp->iocb);
+ ADIOI_Free(aio_req->qaiocbp);
+ ADIOI_Free(aio_req);
+ } else {
+ return MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
+ "ADIOI_QUOBYTEFS_aio_free_fn",
+ __LINE__, MPI_ERR_IO, "Quobyte aio destruction failed", 0);
+ }
+
+ return MPI_SUCCESS;
+}
+
+int ADIOI_QUOBYTEFS_aio_poll_fn(void *extra_state, MPI_Status * status)
+{
+ ADIOI_AIO_Request *aio_req;
+ int errcode = MPI_SUCCESS;
+
+ aio_req = (ADIOI_AIO_Request *) extra_state;
+
+ if (aio_req != NULL && aio_req->qaiocbp != NULL) {
+ if (aio_req->qaiocbp->errorcode == 0) {
+ aio_req->nbytes = aio_req->qaiocbp->result;
+ errcode = MPI_Grequest_complete(aio_req->req);
+ if (errcode != MPI_SUCCESS) {
+ errcode = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ "ADIOI_QUOYBTEFS_aio_poll_fn", __LINE__,
+ MPI_ERR_IO, "**mpi_grequest_complete", 0);
+ }
+ } else {
+ if (aio_req->qaiocbp->errorcode == EIO) {
+ errcode = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ "ADIOI_QUOYBTEFS_aio_poll_fn", __LINE__,
+ MPI_ERR_IO, "Quobyte aio failed", 0);
+ }
+ }
+ }
+ return errcode;
+}
+
+/* wait for multiple requests to complete */
+int ADIOI_QUOBYTEFS_aio_wait_fn(int count, void **array_of_states, double timeout,
+ MPI_Status * status)
+{
+ ADIOI_AIO_Request **aio_reqlist;
+ struct quobyte_io_event **events =
+ (struct quobyte_io_event **) ADIOI_Calloc(sizeof(struct quobyte_io_event *), count);
+ int i = 0;
+ int errcode = MPI_SUCCESS;
+ int num_in_progress = 0;
+ aio_reqlist = (ADIOI_AIO_Request **) array_of_states;
+
+ while (i < count && aio_reqlist[i] != NULL) {
+ struct quobyte_io_event *current_event = aio_reqlist[i]->qaiocbp;
+ if (current_event->errorcode == EINPROGRESS) {
+ events[i] = current_event;
+ num_in_progress++;
+ } else {
+ errcode = MPI_Grequest_complete(aio_reqlist[i]->req);
+ }
+ i++;
+ }
+
+ i = 0;
+
+ double start_time = MPI_Wtime();
+ int no_timeout = timeout > 0 ? 0 : 1; /* when timeout is <= 0 the loop will run until all events are done */
+ while (num_in_progress > 0 && (no_timeout || MPI_Wtime() - start_time < timeout)) {
+ if (events[i] != NULL && events[i]->errorcode != EINPROGRESS) {
+ errcode = MPI_Grequest_complete(aio_reqlist[i]->req);
+ events[i] = NULL;
+ num_in_progress--;
+ }
+ i++;
+ if (i >= count) { /* wrap around without reading past the end of events[] */
+ i = 0;
+ }
+ }
+ ADIOI_Free(events);
+
+ if (errcode != MPI_SUCCESS) {
+ errcode = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ "ADIOI_QUOBYTEFS_aio_wait_fn",
+ __LINE__, MPI_ERR_IO, "**mpi_grequest_complete", 0);
+ }
+ return errcode;
+}
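+
+/* Illustrative usage sketch (not part of the driver): the nonblocking path
+ * from user code. MPI_Wait drives the generalized request, which polls the
+ * quobyte_io_event until the completion callback fires. The file handle is
+ * assumed to be open on a Quobyte path. */
+#if 0
+static void example_nonblocking_write(MPI_File fh, const char *data, int n)
+{
+ MPI_Request req;
+ MPI_Status st;
+ /* lands in ADIOI_QUOBYTEFS_IwriteContig -> ADIOI_QUOBYTEFS_aio */
+ MPI_File_iwrite_at(fh, 0, data, n, MPI_CHAR, &req);
+ MPI_Wait(&req, &st);
+}
+#endif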
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_close.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_close.c
new file mode 100644
index 0000000000000000000000000000000000000000..1e93a7dd8abc60b32e0bcb9826812dc6e0d1a83c
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_close.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+
+#include "ad_quobytefs.h"
+
+void ADIOI_QUOBYTEFS_Close(ADIO_File fd, int *error_code)
+{
+ static char myname[] = "ADIOI_QUOBYTEFS_CLOSE";
+ struct quobyte_fh *file_handle = fd->file_handle;
+ if (file_handle != NULL) {
+ if (quobyte_close(file_handle)) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO, "**io",
+ "Quobyte failed to close the file: %s",
+ strerror(errno));
+ return;
+ }
+ *error_code = MPI_SUCCESS;
+ } else {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, myname,
+ __LINE__, MPI_ERR_IO, "Quobyte file header is null", 0);
+ return;
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_delete.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_delete.c
new file mode 100644
index 0000000000000000000000000000000000000000..dfd2396400d5c23ca5e3b47176526ad2f0ecfb60
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_delete.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+
+#include "ad_quobytefs.h"
+#include "ad_quobytefs_internal.h"
+
+void ADIOI_QUOBYTEFS_Delete(const char *path, int *error_code)
+{
+ ADIOI_QUOBYTEFS_CreateAdapter(path, error_code);
+ static char myname[] = "ADIOI_QUOBYTEFS_DELETE";
+
+ if (quobyte_unlink(ADIOI_QUOBYTEFSI_GetVolumeAndPath(path))) {
+ *error_code = ADIOI_Err_create_code(myname, path, errno);
+ return;
+ }
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_fcntl.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_fcntl.c
new file mode 100644
index 0000000000000000000000000000000000000000..325842a6bb01b7f23b1cc3fdc84e49479d298d5d
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_fcntl.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+
+#include "ad_quobytefs.h"
+#include "ad_quobytefs_internal.h"
+
+void ADIOI_QUOBYTEFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code)
+{
+ static char myname[] = "ADIOI_QUOBYTEFS_FCNTL";
+ struct stat file_stat;
+ const char *filepath = ADIOI_QUOBYTEFSI_GetVolumeAndPath(fd->filename);
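+ /* GetVolumeAndPath returns a pointer into fd->filename just past the
+ * "//registry" prefix; if it did not advance, no volume/path was given */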
+ if (!(filepath > fd->filename)) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, EINVAL);
+ return;
+ }
+
+ switch (flag) {
+ case ADIO_FCNTL_GET_FSIZE:
+ if (quobyte_getattr(filepath, &file_stat)) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
+ return;
+ }
+ fcntl_struct->fsize = file_stat.st_size;
+ if (fcntl_struct->fsize == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_IO, "**io", "**io %s", strerror(errno));
+ } else {
+ *error_code = MPI_SUCCESS;
+ }
+ break;
+
+ case ADIO_FCNTL_SET_DISKSPACE:
+ ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
+ break;
+
+ case ADIO_FCNTL_SET_ATOMICITY:
+ fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
+ *error_code = MPI_SUCCESS;
+ break;
+
+ default:
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+ MPIR_ERR_RECOVERABLE,
+ myname, __LINE__,
+ MPI_ERR_ARG, "**flag", "**flag %d", flag);
+ }
+}
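+
+/* For reference: MPI_File_get_size resolves to the ADIO_FCNTL_GET_FSIZE case
+ * above, MPI_File_preallocate to ADIO_FCNTL_SET_DISKSPACE, and
+ * MPI_File_set_atomicity to ADIO_FCNTL_SET_ATOMICITY. */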
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_flush.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_flush.c
new file mode 100644
index 0000000000000000000000000000000000000000..feb2499b38399742dd154d87a0275fc5ff3d2777
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_flush.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+
+#include "ad_quobytefs.h"
+
+void ADIOI_QUOBYTEFS_Flush(ADIO_File fd, int *error_code)
+{
+ int err;
+ static char myname[] = "ADIOI_QUOBYTEFS_FLUSH";
+
+ /* the deferred-open optimization may mean that a file has not been opened
+ * on this processor */
+ if (fd->is_open > 0) {
+ /* alexey: the original uses fsync although it is named flush */
+ err = quobyte_fsync(fd->file_handle);
+ if (err == -1) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ myname, __LINE__, MPI_ERR_IO,
+ "**io", "**io %s", strerror(errno));
+ return;
+ }
+ }
+ *error_code = MPI_SUCCESS;
+}
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_internal.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_internal.c
new file mode 100644
index 0000000000000000000000000000000000000000..655b4830fbd1bd8b4bd2d91a1a711d54f5e858ee
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_internal.c
@@ -0,0 +1,12 @@
+#include "ad_quobytefs_internal.h"
+
+const char *ADIOI_QUOBYTEFSI_GetVolumeAndPath(const char *filename)
+{
+ const char *path;
+ if (strlen(filename) > 1 && !strncmp(filename, "//", 2)) {
+ path = strchr(filename + 2, '/');
+ if (path == NULL) {
+ /* no '/' after the registry part; hand back the original string
+ * so callers can flag the malformed path instead of crashing */
+ path = filename;
+ }
+ } else {
+ path = filename;
+ }
+ return path;
+}
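+
+/* Examples of the mapping (hypothetical names):
+ * "//registry.example/volume/file" -> "/volume/file"
+ * "plain/unix/path" -> "plain/unix/path"
+ * The result points into the caller's string; nothing is copied. */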
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_internal.h b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..580deeb4736ada726832bf87fb0ff556c17b5a17
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_internal.h
@@ -0,0 +1,8 @@
+#ifndef AD_QUOBYTEFS_INTERNAL_H_INCLUDED
+#define AD_QUOBYTEFS_INTERNAL_H_INCLUDED
+
+#include <string.h>
+
+const char *ADIOI_QUOBYTEFSI_GetVolumeAndPath(const char *filename);
+
+#endif /* AD_QUOBYTEFS_INTERNAL_H_INCLUDED */
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_open.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_open.c
new file mode 100644
index 0000000000000000000000000000000000000000..c7dcbcb699175e62c31bc0ab9c18d1f30e6f4e27
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_open.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+
+#include "ad_quobytefs.h"
+#include "ad_quobytefs_internal.h"
+
+void ADIOI_QUOBYTEFS_Open(ADIO_File fd, int *error_code)
+{
+ ADIOI_QUOBYTEFS_CreateAdapter(fd->filename, error_code);
+ int perm, old_mask, amode;
+ static char myname[] = "ADIOI_QUOBYTEFS_OPEN";
+ /* shortest possible path "//A/B" */
+ if (strlen(fd->filename) < 5) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, EINVAL);
+ return;
+ }
+ const char *filepath = ADIOI_QUOBYTEFSI_GetVolumeAndPath(fd->filename);
+ if (strlen(filepath) == 0) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, EINVAL);
+ return;
+ }
+ if (fd->perm == ADIO_PERM_NULL) {
+ old_mask = umask(022);
+ umask(old_mask);
+ perm = old_mask ^ 0666;
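+ /* e.g., the common umask 022 yields perm = 0666 ^ 022 = 0644 (rw-r--r--) */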
+ } else
+ perm = fd->perm;
+
+ amode = 0;
+ if (fd->access_mode & ADIO_CREATE)
+ amode = amode | O_CREAT;
+ if (fd->access_mode & ADIO_RDONLY)
+ amode = amode | O_RDONLY;
+ if (fd->access_mode & ADIO_WRONLY)
+ amode = amode | O_WRONLY;
+ if (fd->access_mode & ADIO_RDWR)
+ amode = amode | O_RDWR;
+ if (fd->access_mode & ADIO_EXCL)
+ amode = amode | O_EXCL;
+
+ /* romio tests expect write then read without flush */
+ amode = amode | O_DIRECT;
+
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
+#endif
+ fd->file_handle = quobyte_open(filepath, amode, perm);
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
+#endif
+ fd->fd_direct = -1;
+
+ if ((fd->file_handle != NULL) && (fd->access_mode & ADIO_APPEND)) {
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
+#endif
+ struct stat file_stat;
+ if (quobyte_fstat(fd->file_handle, &file_stat)) {
+ *error_code = ADIOI_Err_create_code(myname, filepath, errno);
+ quobyte_close(fd->file_handle); /* don't leak the open handle */
+ fd->file_handle = NULL;
+ return;
+ }
+ fd->fp_ind = fd->fp_sys_posn = file_stat.st_size;
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
+#endif
+ }
+ if (fd->file_handle == NULL) {
+ *error_code = ADIOI_Err_create_code(myname, filepath, errno);
+ } else {
+ *error_code = MPI_SUCCESS;
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_read.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_read.c
new file mode 100644
index 0000000000000000000000000000000000000000..ae03a35ce16d248fe5da433cedbbe952d6a86d44
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_read.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+
+#include "ad_quobytefs.h"
+
+void ADIOI_QUOBYTEFS_ReadContig(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ MPI_Count datatype_size;
+ ADIO_Offset bytes_transfered;
+
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5034, 0, NULL);
+#endif
+ MPI_Type_size_x(datatype, &datatype_size);
+ bytes_transfered = datatype_size * (ADIO_Offset) count;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind;
+ }
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
+#endif
+ int bytes_read = quobyte_read(fd->file_handle, buf, offset, bytes_transfered);
+ if (bytes_read < 0) {
+ /* report the failure rather than advancing the file pointers */
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+ "ADIOI_QUOBYTEFS_READCONTIG", __LINE__,
+ MPI_ERR_IO, "Quobyte failed to read", 0);
+ return;
+ }
+ fd->fp_sys_posn = offset + bytes_read;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += bytes_read;
+ }
+#ifdef HAVE_STATUS_SET_BYTES
+ /* report the bytes actually read, which may be fewer than requested */
+ MPIR_Status_set_bytes(status, datatype, bytes_read);
+#endif
+ *error_code = MPI_SUCCESS;
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5035, 0, NULL);
+#endif
+}
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_resize.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_resize.c
new file mode 100644
index 0000000000000000000000000000000000000000..7e10b74a7cc9734ed903117a9a1f5a375531c7af
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_resize.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+
+#include "ad_quobytefs.h"
+
+void ADIOI_QUOBYTEFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
+{
+ static char myname[] = "ADIOI_QUOBYTEFS_RESIZE";
+ struct quobyte_fh *file_handle = fd->file_handle;
+ int err, rank;
+
+ if (file_handle != NULL) {
+
+ MPI_Comm_rank(fd->comm, &rank);
+ /* first aggregator performs ftruncate() */
+ if (rank == fd->hints->ranklist[0]) {
+ ADIOI_Assert(size == (off_t) size);
+ err = quobyte_ftruncate(fd->file_handle, (off_t) size);
+ }
+
+ /* bcast return value */
+ MPI_Bcast(&err, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
+ if (err == -1) {
+ *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
+ return;
+ }
+ *error_code = MPI_SUCCESS;
+ } else {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, myname,
+ __LINE__, MPI_ERR_IO, "Quobyte file header is null", 0);
+ return;
+ }
+}
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_setlock.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_setlock.c
new file mode 100644
index 0000000000000000000000000000000000000000..526c435f521be84a304ed91aee85de3e3d51abe3
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_setlock.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+#include "lock_internal.h"
+
+#include "ad_quobytefs.h"
+
+int ADIOI_QUOBYTEFS_SetLock(ADIO_File fd, int cmd, int type, ADIO_Offset offset, int whence,
+ ADIO_Offset len)
+{
+ struct flock lock;
+ lock.l_type = type;
+ lock.l_whence = whence;
+ lock.l_start = offset;
+ lock.l_len = len;
+
+ int err, save_errno, err_count; /* save previous errno in case we recover from retryable errors */
+ errno = 0;
+ err_count = 0;
+ save_errno = errno;
+
+ do {
+ err = quobyte_lock(fd->file_handle, cmd, &lock);
+ } while (err && ((errno == EINTR) || ((errno == EINPROGRESS) && (++err_count < 10000))));
+
+ if (!err) /* on failure keep errno for reporting (e.g., EBADF); otherwise */
+ errno = save_errno; /* restore previous errno in case we recovered from retryable errors */
+
+ if (err && (errno != EBADF)) {
+ FPRINTF(stderr,
+ "File locking failed in ADIOI_QUOBYTEFS_SetLock(fd %X,cmd %s/%X,type %s/%X,whence "
+ "%X) with return value %X and errno %X.\n",
+ fd->fd_sys, ADIOI_GEN_flock_cmd_to_string(cmd), cmd,
+ ADIOI_GEN_flock_type_to_string(type), type, whence, err, errno);
+ perror("ADIOI_QUOBYTEFS_SetLock:");
+ FPRINTF(stderr, "ADIOI_QUOBYTEFS_SetLock:offset %llu, length %llu\n",
+ (unsigned long long) offset, (unsigned long long) len);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ return MPI_ERR_IO;
+ }
+ return MPI_SUCCESS;
+}
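+
+/*
+ * Editor's sketch (hypothetical caller, not part of the original change):
+ * ROMIO takes and releases byte-range locks through this entry point
+ * roughly as follows; the range is illustrative.
+ */
+#if 0
+static void example_lock(ADIO_File fd)
+{
+ /* blocking write lock on the first 4 KiB, then unlock */
+ ADIOI_QUOBYTEFS_SetLock(fd, F_SETLKW, F_WRLCK, 0, SEEK_SET, 4096);
+ /* ... read-modify-write the locked range ... */
+ ADIOI_QUOBYTEFS_SetLock(fd, F_SETLKW, F_UNLCK, 0, SEEK_SET, 4096);
+}
+#endif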
diff --git a/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_write.c b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_write.c
new file mode 100644
index 0000000000000000000000000000000000000000..91ae755eb0da94ebf8ea3ed852420cea4a7682f8
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_quobytefs/ad_quobytefs_write.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+
+#include "adio.h"
+
+#ifdef AGGREGATION_PROFILE
+#include "mpe.h"
+#endif
+
+#include "ad_quobytefs.h"
+
+void ADIOI_QUOBYTEFS_WriteContig(ADIO_File fd, const void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status * status, int *error_code)
+{
+ MPI_Count datatype_size;
+ ADIO_Offset bytes_transfered;
+ char *buffer_pointer = (char *) buf;
+ static char myname[] = "ADIOI_QUOBYTEFS_WRITECONTIG";
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5036, 0, NULL);
+#endif
+ MPI_Type_size_x(datatype, &datatype_size);
+ bytes_transfered = datatype_size * (ADIO_Offset) count;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ offset = fd->fp_ind;
+ }
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
+#endif
+ if (quobyte_write(fd->file_handle, buffer_pointer, offset, bytes_transfered,
+ false /* sync flag: no synchronous write-through */) != bytes_transfered) {
+ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, myname,
+ __LINE__, MPI_ERR_IO, "Quobyte failed to write", 0);
+ return;
+ }
+#ifdef ADIOI_MPE_LOGGING
+ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
+#endif
+ fd->fp_sys_posn = offset + bytes_transfered;
+
+ if (file_ptr_type == ADIO_INDIVIDUAL) {
+ fd->fp_ind += bytes_transfered;
+ }
+#ifdef HAVE_STATUS_SET_BYTES
+ MPIR_Status_set_bytes(status, datatype, bytes_transfered);
+#endif
+ *error_code = MPI_SUCCESS;
+#ifdef AGGREGATION_PROFILE
+ MPE_Log_event(5037, 0, NULL);
+#endif
+}
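+
+/*
+ * Editor's note (sketch, not part of the original change): the two
+ * file_ptr_type modes above correspond to the two user-visible entry
+ * points; the buffer, counts, and offset are illustrative.
+ */
+#if 0
+static void example_write(MPI_File fh, const char *buf)
+{
+ MPI_Status st;
+ /* individual file pointer: driver sees ADIO_INDIVIDUAL, offset = fp_ind */
+ MPI_File_write(fh, buf, 4096, MPI_BYTE, &st);
+ /* explicit offset: driver sees ADIO_EXPLICIT_OFFSET, offset as given */
+ MPI_File_write_at(fh, 8192, buf, 4096, MPI_BYTE, &st);
+}
+#endif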
diff --git a/3rd-party/romio341/adio/ad_testfs/Makefile.mk b/3rd-party/romio341/adio/ad_testfs/Makefile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..342fa56c256c1e61dba82ca73515eac5c8f98d71
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_testfs/Makefile.mk
@@ -0,0 +1,29 @@
+##
+## Copyright (C) by Argonne National Laboratory
+## See COPYRIGHT in top-level directory
+##
+
+if BUILD_AD_TESTFS
+
+noinst_HEADERS += adio/ad_testfs/ad_testfs.h
+
+romio_other_sources += \
+ adio/ad_testfs/ad_testfs_close.c \
+ adio/ad_testfs/ad_testfs_read.c \
+ adio/ad_testfs/ad_testfs_rdcoll.c \
+ adio/ad_testfs/ad_testfs_wrcoll.c \
+ adio/ad_testfs/ad_testfs_open.c \
+ adio/ad_testfs/ad_testfs_write.c \
+ adio/ad_testfs/ad_testfs_done.c \
+ adio/ad_testfs/ad_testfs_fcntl.c \
+ adio/ad_testfs/ad_testfs_iread.c \
+ adio/ad_testfs/ad_testfs_iwrite.c \
+ adio/ad_testfs/ad_testfs_wait.c \
+ adio/ad_testfs/ad_testfs_flush.c \
+ adio/ad_testfs/ad_testfs_seek.c \
+ adio/ad_testfs/ad_testfs_resize.c \
+ adio/ad_testfs/ad_testfs_hints.c \
+ adio/ad_testfs/ad_testfs_delete.c \
+ adio/ad_testfs/ad_testfs.c
+
+endif BUILD_AD_TESTFS
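+
+## Editor's note (sketch): BUILD_AD_TESTFS is an Automake conditional set by
+## ROMIO's configure when the testfs driver is selected, e.g.:
+##
+##   ./configure --with-file-system=testfs+ufs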
diff --git a/3rd-party/romio341/adio/ad_testfs/ad_testfs.c b/3rd-party/romio341/adio/ad_testfs/ad_testfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..5e6a5a1df143124b8a080e421ceebb583e65e7de
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_testfs/ad_testfs.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#include "ad_testfs.h"
+
+/* adioi.h has the ADIOI_Fns_struct define */
+#include "adioi.h"
+
+struct ADIOI_Fns_struct ADIO_TESTFS_operations = {
+ ADIOI_TESTFS_Open, /* Open */
+ ADIOI_GEN_OpenColl, /* OpenColl */
+ ADIOI_TESTFS_ReadContig, /* ReadContig */
+ ADIOI_TESTFS_WriteContig, /* WriteContig */
+ ADIOI_TESTFS_ReadStridedColl, /* ReadStridedColl */
+ ADIOI_TESTFS_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_TESTFS_SeekIndividual, /* SeekIndividual */
+ ADIOI_TESTFS_Fcntl, /* Fcntl */
+ ADIOI_TESTFS_SetInfo, /* SetInfo */
+ ADIOI_TESTFS_ReadStrided, /* ReadStrided */
+ ADIOI_TESTFS_WriteStrided, /* WriteStrided */
+ ADIOI_TESTFS_Close, /* Close */
+ ADIOI_TESTFS_IreadContig, /* IreadContig */
+ ADIOI_TESTFS_IwriteContig, /* IwriteContig */
+ ADIOI_TESTFS_ReadDone, /* ReadDone */
+ ADIOI_TESTFS_WriteDone, /* WriteDone */
+ ADIOI_TESTFS_ReadComplete, /* ReadComplete */
+ ADIOI_TESTFS_WriteComplete, /* WriteComplete */
+ ADIOI_TESTFS_IreadStrided, /* IreadStrided */
+ ADIOI_TESTFS_IwriteStrided, /* IwriteStrided */
+ ADIOI_TESTFS_Flush, /* Flush */
+ ADIOI_TESTFS_Resize, /* Resize */
+ ADIOI_TESTFS_Delete, /* Delete */
+ ADIOI_GEN_Feature, /* Features */
+ "TESTFS: the logging-only file system",
+ ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
+ ADIOI_GEN_IwriteStridedColl, /* IwriteStridedColl */
+#if defined(F_SETLKW64)
+ ADIOI_GEN_SetLock64 /* SetLock */
+#else
+ ADIOI_GEN_SetLock /* SetLock */
+#endif
+};
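+
+/*
+ * Editor's sketch (dispatch assumed per adioi.h): ROMIO calls drivers only
+ * through this per-file function table, so a contiguous write lands here
+ * roughly as
+ *
+ *   (*fd->fns->ADIOI_xxx_WriteContig)(fd, buf, count, datatype,
+ *                                     file_ptr_type, offset, status,
+ *                                     &error_code);
+ *
+ * A driver only needs to fill this struct and can fall back on the generic
+ * ADIOI_GEN_* entries (as TESTFS does for OpenColl and Features).
+ */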
diff --git a/3rd-party/romio341/adio/ad_testfs/ad_testfs.h b/3rd-party/romio341/adio/ad_testfs/ad_testfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..0777da52959fe608d2c5a8e652b2bf12f25b4b4e
--- /dev/null
+++ b/3rd-party/romio341/adio/ad_testfs/ad_testfs.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ * See COPYRIGHT in top-level directory
+ */
+
+#ifndef AD_TESTFS_H_INCLUDED
+#define AD_TESTFS_H_INCLUDED
+
+#include "adio.h"
+#include <sys/types.h>
+#include <unistd.h>