diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/humanevalx/humanevalx_gen_0_shot.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/humanevalx/humanevalx_gen_0_shot.py
new file mode 100644
index 0000000000000000000000000000000000000000..16382fd6a48aacba9f6fe3b6b5588484b03f3239
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/humanevalx/humanevalx_gen_0_shot.py
@@ -0,0 +1,42 @@
+from ais_bench.benchmark.openicl.icl_prompt_template import PromptTemplate
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.datasets import HumanevalXDataset, HumanevalXEvaluator
+
+humanevalx_reader_cfg = dict(
+    input_columns=['prompt'], output_column='declaration', train_split='test')
+
+humanevalx_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, batch_size=16))
+
+humanevalx_eval_cfg_dict = {
+    lang: dict(
+        evaluator=dict(
+            type=HumanevalXEvaluator,
+
+            language=lang,
+            ip_address=
+            'localhost',  # replace with your code_eval_server ip_address and port
+            port=5001),
+        pred_role='BOT')
+    for lang in ['python', 'cpp', 'go', 'java', 'js']  # rust is not supported yet
+}
+
+# Please download the needed `xx.jsonl.gz` files from
+# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
+# and move them into the `data/humanevalx/` folder
+humanevalx_datasets = [
+    dict(
+        type=HumanevalXDataset,
+        abbr=f'humanevalx-{lang}',
+        language=lang,
+        path='ais_bench/datasets/humanevalx',
+        reader_cfg=humanevalx_reader_cfg,
+        infer_cfg=humanevalx_infer_cfg,
+        eval_cfg=humanevalx_eval_cfg_dict[lang])
+    for lang in ['python', 'cpp', 'go', 'java', 'js']
+]
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
index cb9e7b4c35c9a8402f8eefea48f2237d412f818a..9934cb8db588d4f165db9678ae1bc50a7aa51337 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
@@ -20,4 +20,6 @@ from .mbpp import *  # noqa: F401,F403
 from .agieval import *  # noqa: F401, F403
 from .hellaswag import *  # noqa: F401, F403
 from .triviaqa import *  # noqa: F401, F403
-from .cmmlu import *  # noqa: F401, F403
\ No newline at end of file
+from .cmmlu import *  # noqa: F401, F403
+from .humanevalx import humanevalx, humaneval_x_eval, humaneval_x_utils
+from .humanevalx.humanevalx import *
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/README.md b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1affb959844843497c2f8871acdc6c68b6455035
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/README.md
@@ -0,0 +1,73 @@
+# HumanEval_X dataset environment README
+
+
+## Installing the `HumanEval_X` environment
+Testing HumanEval_X requires the Go, JS and Java language toolchains. Install them as follows:
+
+
+1. First configure a proxy for the server according to its network segment.
+
+2. Download the language packages:
+
+    Language packages for the NPU environment:
+    go   https://go.dev/dl/go1.18.4.linux-arm64.tar.gz
+    node https://nodejs.org/download/release/v16.14.0/node-v16.14.0-linux-arm64.tar.gz
+    java https://download.oracle.com/java/18/archive/jdk-18.0.2.1_linux-aarch64_bin.tar.gz
+
+    Language packages for the GPU environment:
+    go   https://go.dev/dl/go1.18.4.linux-amd64.tar.gz
+    node https://nodejs.org/download/release/v16.14.0/node-v16.14.0-linux-x64.tar.gz
+    java https://download.oracle.com/java/18/archive/jdk-18.0.2.1_linux-x64_bin.tar.gz
+
+3. Extract and install:
+
+    NPU installation steps:
+
+    # Install npm: the npm download may fail if the proxy network is poor; switch proxies or retry later
+    apt-get update
+    apt-get install -y npm
+
+    # Install Go:
+    tar -zxf go1.18.4.linux-arm64.tar.gz -C /usr/local
+    export PATH=/bin:/usr/local/go/bin:$PATH
+
+    # Install node:
+    mkdir -p /usr/local/lib/nodejs
+    tar -zxf node-v16.14.0-linux-arm64.tar.gz -C /usr/local/lib/nodejs
+    mv /usr/local/lib/nodejs/node-v16.14.0-linux-arm64 /usr/local/lib/nodejs/node
+
+    # Install the JS dependencies:
+    npm config set strict-ssl false
+    npm install -g js-md5@0.7.3
+    export PATH=/usr/local/lib/nodejs/node/bin:$PATH
+    export NODE_PATH=/usr/local/lib/node_modules
+
+    # Install Java:
+    mkdir /usr/java
+    tar -zxf jdk-18.0.2.1_linux-aarch64_bin.tar.gz -C /usr/java
+    JAVA_HOME=/usr/java/jdk-18.0.2.1
+    update-alternatives --install /usr/bin/java java $JAVA_HOME/bin/java 20000
+    update-alternatives --install /usr/bin/javac javac $JAVA_HOME/bin/javac 20000
+
+    GPU: same as the NPU steps; only replace the package names above with the corresponding GPU (x64) packages.
+
+4. Check that the languages were installed successfully:
+
+    go version      # check go
+    js --version    # check js
+    java --version  # check java
+    npm -v          # check npm
+
+If each command prints a version, the download and installation succeeded and the accuracy test can be run.
+If a version is not printed, re-download or install the corresponding language separately.
+
+
+## Additional notes on the `HumanEval_X` environment
+
+1. If the accuracy result for some language is 0.0, check whether that language toolchain finished downloading and whether its environment variables were exported.
+
+2. Go and Java take a long time to execute. If the results for these two languages fluctuate, check whether the timeout setting in model_test.py is too small.
+
+3. The Go environment variables must be exported again each time you enter the environment; re-check the language versions.
+
+
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/cpp/evaluation/test.cpp b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/cpp/evaluation/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76bd7d57832c29c152763a974f079171b50ab728
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/cpp/evaluation/test.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
+separate those group into separate strings and return the vector of those.
+Separate groups are balanced (each open brace is properly closed) and not nested within each other
+Ignore any spaces in the input string.
+>>> separate_paren_groups("( ) (( )) (( )( ))")
+{"()", "(())", "(()())"}
+*/
+#include <vector>
+#include <string>
+#include <iostream>
+
+namespace {
+
+    std::vector<std::string> separate_paren_groups(std::string paren_string)
+    {
+        std::vector<std::string> all_parens;
+        std::string current_paren;
+        int level = 0;
+        char chr;
+        int i;
+
+        for (i = 0; i < paren_string.length(); i++) {
+            chr = paren_string[i];
+            if (chr == '(') {
+                level += 1;
+                current_paren += chr;
+            }
+            if (chr == ')') {
+                level -= 1;
+                current_paren += chr;
+                if (level == 0) {
+                    all_parens.push_back(current_paren);
+                    current_paren = "";
+                }
+            }
+        }
+        return all_parens;
+    }
+
+#undef NDEBUG
+#include <cassert>
+
+    static bool g_issame(std::vector<std::string> a, std::vector<std::string> b)
+    {
+        if (a.size() != b.size()) return false;
+        for (int i = 0; i < a.size(); i++) {
+            if (a[i] != b[i]) return false;
+        }
+        return true;
+    }
+}  // namespace
+
+int main()
+{
+    assert(g_issame(separate_paren_groups("(()()) ((())) () ((())()())"), {"(()())", "((()))", "()", "((())()())"}));
+    assert(g_issame(separate_paren_groups("() (()) ((())) (((())))"), {"()", "(())", "((()))", "(((())))"}));
+    assert(g_issame(separate_paren_groups("(()(())((())))"), {"(()(())((())))"}));
+    assert(g_issame(separate_paren_groups("( ) (( )) (( )( ))"), {"()", "(())", "(()())"}));
+}
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/file_utils.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9255a24cf9395e2b75dd52e33e09c6748476f0d4
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/file_utils.py
@@ -0,0 +1,181 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
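+# Added note: file_utils.py wraps the raw os file APIs with defensive checks (path length,
+# symlinks, file size, owner and group/other write permission) before a file is opened,
+# listed, read or chmod-ed. Illustrative usage, with a hypothetical path and the default
+# limits defined below:
+#     with safe_open("outputs/predictions.jsonl", mode="r", encoding="utf-8") as f:
+#         lines = safe_readlines(f)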
+from functools import reduce +import os +import stat + +MAX_PATH_LENGTH = 4096 +MAX_FILE_SIZE = 100 * 1024 * 1024 +MAX_FILENUM_PER_DIR = 1024 +MAX_LINENUM_PER_FILE = 10 * 1024 * 1024 + +FLAG_OS_MAP = { + 'r': os.O_RDONLY, 'r+': os.O_RDWR, + 'w': os.O_CREAT | os.O_TRUNC | os.O_WRONLY, + 'w+': os.O_CREAT | os.O_TRUNC | os.O_RDWR, + 'a': os.O_CREAT | os.O_APPEND | os.O_WRONLY, + 'a+': os.O_CREAT | os.O_APPEND | os.O_RDWR, + 'x': os.O_CREAT | os.O_EXCL, + "b": getattr(os, "O_BINARY", 0) +} + + +def safe_open(file_path: str, mode='r', encoding=None, permission_mode=0o600, is_exist_ok=True, **kwargs): + """ + :param file_path: 文件路径 + :param mode: 文件打开模式 + :param encoding: 文件编码方式 + :param permission_mode: 文件权限模式 + :param is_exist_ok: 是否允许文件存在 + :param max_path_length: 文件路径最大长度 + :param max_file_size: 文件最大大小,单位: 字节, 默认值10MB + :param check_link: 是否校验软链接 + :param kwargs: + :return: + """ + max_path_length = kwargs.get('max_path_length', MAX_PATH_LENGTH) + max_file_size = kwargs.get('max_file_size', MAX_FILE_SIZE) + check_link = kwargs.get('check_link', False) + + file_path = standardize_path(file_path, max_path_length, check_link) + check_file_safety(file_path, mode, is_exist_ok, max_file_size) + + flags = [] + for item in list(mode): + if item == "+" and flags: + flags[-1] = f"{flags[-1]}+" + continue + flags.append(item) + flags = [FLAG_OS_MAP.get(mode, os.O_RDONLY) for mode in flags] + total_flag = reduce(lambda a, b: a | b, flags) + + return os.fdopen(os.open(file_path, total_flag, permission_mode), + mode, encoding=encoding) + + +def standardize_path(path: str, max_path_length=MAX_PATH_LENGTH, check_link=True): + """ + check path + param: path + return: data real path after check + """ + check_path_is_none(path) + check_path_length_lt(path, max_path_length) + if check_link: + check_path_is_link(path) + path = os.path.realpath(path) + return path + + +def is_path_exists(path: str): + return os.path.exists(path) + + +def check_path_is_none(path: str): + if path is None: + raise TypeError("The file path should not be None.") + + +def check_path_is_link(path: str): + if os.path.islink(os.path.normpath(path)): + raise ValueError("The path should not be a symbolic link file. " + f"Please check the input path:{path}.") + + +def check_path_length_lt(path: str, max_path_length=MAX_PATH_LENGTH): + path_length = path.__len__() + if path_length > max_path_length: + raise ValueError(f"The length of path should not be greater than {max_path_length}, but got {path_length}. " + f"Please check the input path within the valid length range:{path[:max_path_length]}.") + + +def check_file_size_lt(path: str, max_file_size=MAX_FILE_SIZE): + file_size = os.path.getsize(path) + if file_size > max_file_size: + raise ValueError(f"The size of file should not be greater than {max_file_size}, but got {file_size}. " + f"Please check the input path:{path}.") + + +def check_owner(path: str): + """ + check the path owner + param: the input path + """ + path_stat = os.stat(path) + path_owner, path_gid = path_stat.st_uid, path_stat.st_gid + cur_uid = os.geteuid() + cur_gid = os.getgid() + if not (cur_uid == 0 or cur_uid == path_owner or path_gid == cur_gid): + raise PermissionError(f"The current user does not have permission to access the path:{path}. " + "Because he is not root or the path owner, " + "and not in the same user group with the path owner. 
" + "Please check and make sure to satisfy at least one of the conditions above.") + + +def check_other_write_permission(file_path: str): + """ + check if the specified file is writable by others who are neither the owner nor in the group + param: the path to the file to be checked + """ + # Get the status of the file + file_stat = os.stat(file_path) + # Get the mode (permission) of the file + mode = file_stat.st_mode + # check the write permission for others + if mode & stat.S_IWOTH: + raise PermissionError("The file should not be writable by others who are neither the owner nor in the group. " + f"Please check the input path:{file_path}, and change mode to {mode & ~stat.S_IWOTH}.") + + +def check_path_permission(file_path: str, is_internal_file=False): + check_inputfiles_permission = os.getenv("MINDIE_CHECK_INPUTFILES_PERMISSION", "1") != "0" + check_permission_flag = is_internal_file or check_inputfiles_permission + if check_permission_flag: + check_owner(file_path) + check_other_write_permission(file_path) + + +def check_file_safety(file_path: str, mode='r', is_exist_ok=True, + max_file_size=MAX_FILE_SIZE, is_check_file_size=True): + if is_path_exists(file_path): + if not is_exist_ok: + raise FileExistsError("The file is expected not to exist, but it already does. " + f"Please check the input path:{file_path}.") + if is_check_file_size: + check_file_size_lt(file_path, max_file_size) + file_dir = file_path + else: + if mode == 'r' or mode == 'r+': + raise FileNotFoundError("The file is expected to exist, but it does not. " + f"Please check the input path:{file_path}.") + file_dir = os.path.dirname(file_path) + + check_path_permission(file_dir) + + +def safe_listdir(file_path: str, max_file_num=MAX_FILENUM_PER_DIR): + filenames = os.listdir(file_path) + file_num = len(filenames) + if file_num > max_file_num: + raise ValueError(f"The file num in dir is {file_num}, which exceeds the limit {max_file_num}. " + f"Please check the input path:{file_path}.") + return filenames + + +def safe_chmod(file_path: str, permission_mode): + standard_path = standardize_path(file_path) + check_path_permission(standard_path) + os.chmod(file_path, permission_mode) + + +def has_owner_write_permission(file_path: str): + st = os.stat(file_path) + return st.st_mode & stat.S_IWUSR + + +def safe_readlines(file_obj, max_line_num=MAX_LINENUM_PER_FILE): + lines = file_obj.readlines() + line_num = len(lines) + if line_num > max_line_num: + raise ValueError(f"The file line num is {line_num}, which exceeds the limit {max_line_num}. " + f"Please check the input file:{file_obj.name}.") + return lines diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.mod b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.mod new file mode 100644 index 0000000000000000000000000000000000000000..5fcb8d8692375bf8b6380956ac18922f4189aab8 --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.mod @@ -0,0 +1,28 @@ +// Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module humanEval + +go 1.18 + +require ( + github.com/go-openapi/inflect v0.19.0 + github.com/stretchr/testify v1.8.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.sum b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.sum new file mode 100644 index 0000000000000000000000000000000000000000..d88557079e79dba9786aee16537cd1c35c2fbc1e --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-openapi/inflect v0.19.0 h1:9jCH9scKIbHeV9m12SmPilScz6krDxKRasNNSNPXu/4= +github.com/go-openapi/inflect v0.19.0/go.mod h1:lHpZVlpIQqLyKwJ4N+YSc9hchQy/i12fJykb83CRBH4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/vendor.tar.gz b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/vendor.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..6d6c649e558abd01c23488bb906c2d4f3e805e10 Binary files /dev/null and b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/vendor.tar.gz differ diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_eval.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_eval.py new file mode 100644 index 
0000000000000000000000000000000000000000..4cf9bdb472ee42e3c02a3e8e98e27506b7a39577 --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_eval.py @@ -0,0 +1,257 @@ +# Copyright Huawei Technologies Co., Ltd. 2024. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distsafe_openributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Counter, Tuple, Iterable, Dict + +from dataclasses import dataclass +import regex +import numpy as np +from tqdm.auto import tqdm + +from .file_utils import safe_open +from .humaneval_x_utils import read_dataset, IMPORT_HELPER, estimate_pass_at_k, check_correctness + +LANGUAGE_NAME = { + "cpp": "CPP", + "go": "Go", + "java": "Java", + "js": "JavaScript", + "python": "Python", +} + +COMPLETION_ID_KEY = "completion_id" +TEST_CODE_KEY = "test_code" +TASK_ID_KEY = "task_id" + + +@dataclass +class EvalConfig: + input_file: str = None + tmp_dir: str = "./" + n_workers: int = 32 + timeout: float = 500.0 + problem_file: str = "./benchmark/ais_bench//datasets/humanevalx/humanevalx_python.jsonl.gz" + out_dir: str = None + k: Tuple[int, int, int] = (1, 10, 100) + test_groundtruth: bool = False + example_test: bool = False + go_dir: str = None + + +def process_humaneval_test(sample, problems, example_test=False): + task_id = sample["task_id"] + language = task_id.split("/")[0].lower() + example_test_key = "example_test" + + prompt = sample["prompt"] + if example_test and example_test_key in problems[task_id] and problems[task_id][example_test_key] != "": + test = problems[task_id][example_test_key] + else: + test = problems[task_id]["test"] + + code = sample["generation"] + + # Pre-process for different languages + if language == "python": + code_ = [] + for line in code.split("\n"): + if (len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t'): + break + code_.append(line) + code = "\n".join(code_) + test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n" + test_string = test_setup + prompt + code + "\n" + test + "\n" + elif language == "cpp": + test_set_up = "" + for s in IMPORT_HELPER["cpp"]: + if s not in prompt: + test_set_up += s + "\n" + test_string = test_set_up + "\n" + prompt + code + "\n" + test + elif language == "java": + test_string = prompt + code + "\n" + test + elif language == "js" or language == "javascript": + test_string = prompt + code + "\n" + test + elif language == "go": + import_string = problems[task_id]["import"] + prompt = prompt.replace(import_string, "") + if example_test and example_test_key in problems[task_id]: + test = problems[task_id][example_test_key] + else: + test = problems[task_id]["test"] + test_setup = problems[task_id]["test_setup"] + other_pkgs = [] + for pkg in IMPORT_HELPER["go"]: + if pkg not in test_setup: + p = pkg.split("/")[-1] + if p + "." 
in code: + other_pkgs.append(f"\"{pkg}\"") + if other_pkgs: + import_other_pkgs = "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")" + test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test + else: + test_string = test_setup + "\n" + prompt + code + "\n" + test + elif language == "rust": + main = "\nfn main(){ \n } \n" + declaration = problems[task_id]["declaration"] + test_string = main + declaration + prompt + code + test + + return test_string + + +def stream_jsonl_all(filename: str) -> Iterable[Dict]: + results = [] + fp = safe_open(filename, "r") + for line in fp: + if any(not x.isspace() for x in line): + results.append(json.loads(line)) + fp.close() + + return results + + +def evaluate_functional_correctness(config: EvalConfig): + completion_id_key = COMPLETION_ID_KEY + test_code_key = TEST_CODE_KEY + task_id_key = TASK_ID_KEY + + if config.example_test: + pass + + problems = read_dataset(config.problem_file, + dataset_type="humaneval") + sample_jsonl = stream_jsonl_all(config.input_file) + + if config.example_test: + suffix = "_example_test.jsonl" + else: + suffix = "_results.jsonl" + if config.out_dir is not None: + if not os.path.exists(config.out_dir): + os.makedirs(config.out_dir) + out_file = os.path.join(config.out_dir, config.input_file.split('/')[-1].replace(".jsonl", suffix)) + else: + out_file = os.path.join(config.input_file.replace(".jsonl", suffix)) + + if "/codegeex/benchmark/humaneval-x/" in config.input_file: + config.test_groundtruth = True + + if "-to-" in config.input_file: + translation_mode = True + else: + translation_mode = False + + with ThreadPoolExecutor(max_workers=config.n_workers) as executor: + futures = [] + completion_id = Counter() + n_samples = 0 + results = defaultdict(list) + + if config.test_groundtruth: + for sample in tqdm(problems.values()): + task_id = sample[task_id_key] + lang = task_id.split("/")[0].lower() + if lang == "javascript": + lang = "js" + tmp_dir_ = os.path.join(config.tmp_dir, lang, "evaluation") + sample["generation"] = sample["canonical_solution"] + sample[test_code_key] = process_humaneval_test(sample, problems, config.example_test) + if sample[test_code_key] is None: + print(f"Skipping task {task_id} due to missing test code.") # 跳过的任务 + continue + config_dict = { + "language_type": lang, + "timeout": config.timeout, + "tmp_dir": tmp_dir_, + "completion_id": completion_id[task_id], + "go_dir": config.go_dir + } + args = (task_id, sample, config_dict) + future = executor.submit(check_correctness, *args) + futures.append(future) + completion_id[task_id] += 1 + n_samples += 1 + else: + for sample in tqdm(sample_jsonl): + task_id = sample[task_id_key] + + lang = task_id.split("/")[0].lower() + if translation_mode: + task_id = sample[task_id_key].split("/")[-1] + lang = regex.findall("-to-.*-", config.input_file)[0].split("-to-")[-1].rstrip("-") + for language in LANGUAGE_NAME: + if language in lang: + lang = language + break + task_id = f"{LANGUAGE_NAME[lang]}/{task_id}" + if lang == "javascript": + lang = "js" + tmp_dir_ = os.path.join(config.tmp_dir, lang, "evaluation") + sample[task_id_key] = task_id + sample[test_code_key] = process_humaneval_test(sample, problems, config.example_test) + if sample[test_code_key] is None: + continue + if completion_id_key in sample: + completion_id_ = sample[completion_id_key] + else: + completion_id_ = completion_id[task_id] + config_dict = { + "language_type": lang, + "timeout": config.timeout, + "tmp_dir": tmp_dir_, + "completion_id": 
completion_id_, + "go_dir": config.go_dir + } + args = (task_id, sample, config_dict) + future = executor.submit(check_correctness, *args) + futures.append(future) + completion_id[task_id] += 1 + n_samples += 1 + + if len(completion_id) == len(problems): + evaluate_pass_at_k = True + else: + evaluate_pass_at_k = False + + for future in tqdm(as_completed(futures), total=len(futures)): + result = future.result() + results[result[task_id_key]].append((result[completion_id_key], result)) + + # Calculate accuracy + total, correct, details = [], [], [] + for result in results.values(): + for r in result: + passed = r[1].get('passed', False) + total.append(1) + correct.append(1 if passed else 0) + details.append({'task_id': r[0], 'passed': passed, 'result': r[1]}) + + accuracy = 100 * sum(correct) / sum(total) if total else 0 + + result = {'accuracy': accuracy, 'details': details} + + fp = safe_open(out_file, 'w') + for res in results.values(): + for r in res: + fp.write(json.dumps(r[1], indent=4) + "\n") + fp.close() + + with safe_open(out_file, "ab") as fp: + fp.write((json.dumps(result) + "\n").encode('utf-8')) + + return result # Only return the required result + diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_utils.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..592b66cb5ef673383c5ebd1876d672c7a4b31b3e --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_utils.py @@ -0,0 +1,816 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
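+# Added note: humaneval_x_utils.py collects the helpers shared by the HumanEval-X evaluator:
+# reading the gzipped problem files, language-tag/prompt handling, generation cleanup, and
+# check_correctness(), which writes each generated program into a temporary directory and
+# runs it (Python via importlib, go/js/java/cpp via subprocess) under a timeout inside a
+# separate multiprocessing.Process.
+# Illustrative pass@k usage (hypothetical counts): estimate_pass_at_k([5, 5], [2, 0], k=1)
+# returns, per task, the probability that at least one of k sampled completions passes.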
+ +import contextlib +import faulthandler +import json +import io +import itertools +import multiprocessing +import os +import sys +import shutil +import platform +import random +import signal +import subprocess +import tempfile +import builtins +import resource +import gzip +import json +from typing import Iterable, Dict +import importlib.util +from typing import Iterable, Dict, Union, List, Optional +import torch +import numpy as np +from torch.utils.data import Dataset +from transformers.generation.stopping_criteria import StoppingCriteria + +from .file_utils import safe_open + +# from atb_llm.utils.log.logging import logger + +LANGUAGE_TAG = { + "c": "// language: C", + "c++": "// language: C++", + "cpp": "// language: C++", + "c#": "// language: C#", + "csharp": "// language: C#", + "css": "/* language: CSS */", + "cuda": "// language: Cuda", + "dart": "// language: Dart", + "lua": "// language: Lua", + "objectivec": "// language: Objective-C", + "objective-c": "// language: Objective-C", + "objective-c++": "// language: Objective-C++", + "python": "# language: Python", + "perl": "# language: Perl", + "prolog": "% language: Prolog", + "swift": "// language: swift", + "lisp": "; language: Lisp", + "java": "// language: Java", + "scala": "// language: Scala", + "tex": "% language: TeX", + "vue": "", + "markdown": "", + "html": "", + "php": "// language: PHP", + "js": "// language: JavaScript", + "javascript": "// language: JavaScript", + "typescript": "// language: TypeScript", + "go": "// language: Go", + "shell": "# language: Shell", + "rust": "// language: Rust", + "sql": "-- language: SQL", + "kotlin": "// language: Kotlin", + "vb": "' language: Visual Basic", + "ruby": "# language: Ruby", + "pascal": "// language: Pascal", + "r": "# language: R", + "fortran": "!language: Fortran", + "lean": "-- language: Lean", + "matlab": "% language: Matlab", + "delphi": "{language: Delphi}", + "scheme": "; language: Scheme", + "basic": "' language: Basic", + "assembly": "; language: Assembly", + "groovy": "// language: Groovy", + "abap": "* language: Abap", + "gdscript": "# language: GDScript", + "haskell": "-- language: Haskell", + "julia": "# language: Julia", + "elixir": "# language: Elixir", + "excel": "' language: Excel", + "clojure": "; language: Clojure", + "actionscript": "// language: ActionScript", + "solidity": "// language: Solidity", + "powershell": "# language: PowerShell", + "erlang": "% language: Erlang", + "cobol": "// language: Cobol", +} + +IMPORT_HELPER = { + "python": [ + "import math", + "import re", + "import sys", + "import copy", + "import datetime", + "import itertools", + "import collections", + "import heapq", + "import statistics", + "import functools", + "import hashlib", + "import numpy", + "import numpy as np", + "import string", + "from typing import *", + "from collections import *", + ], + "go": [ + "math", + "strings", + "fmt", + "strconv", + "time", + "bytes", + "regexp", + "sort", + "math/rand", + "crypto/md5", + ], + "cpp": [ + "#include", + "#include", + "#include", + "#include", + "#include", + "#include", + "#include", + "#include", + "#include", + ], +} + + +class StoppingCriteriaWithHumanEvalX(StoppingCriteria): + def __init__(self, lang: str = None, original_input_len: int = None, tokenizer=None): + self.lang = lang + self.original_input_len = original_input_len + self.tokenizer = tokenizer + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + output_ids = input_ids[0] + if 
output_ids[-1].detach().cpu().numpy() in [self.tokenizer.eos_token_id]: + return True + text = self.tokenizer.decode(output_ids[self.original_input_len:], skip_special_tokens=False) + return is_code_generation_finished( + text, + language_type=self.lang, + dataset="humaneval", + ) + + +class HumanEvalXDataset(Dataset): + def __init__(self, task_dict): + self.task_dict = task_dict + self.keys = list(task_dict.keys()) + + def __len__(self): + return len(self.keys) + + def __getitem__(self, index): + return self.task_dict[self.keys[index]] + + +def estimate_pass_at_k( + num_samples: Union[int, List[int], np.ndarray], + num_correct: Union[List[int], np.ndarray], + k: int +) -> np.ndarray: + """ + Estimates pass@k of each problem and returns them in an array. + """ + + def estimator(n: int, c: int, k: int) -> float: + """ + Calculates 1 - comb(n - c, k) / comb(n, k). + """ + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + if isinstance(num_samples, int): + num_samples_it = itertools.repeat(num_samples, len(num_correct)) + else: + if len(num_samples) != len(num_correct): + raise RuntimeError("The lengths of num_samples and num_correct do not match.") + num_samples_it = iter(num_samples) + + return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) + + +def stream_jsonl(filename: str) -> Iterable[Dict]: + """ + Parses each jsonl line and yields it as a dictionary, assuming the file is gzipped if needed. + """ + + if filename.endswith('.gz'): + with gzip.open(filename, 'rt', encoding='utf-8') as fp: + for line in fp: + if any(not x.isspace() for x in line): + yield json.loads(line) + else: + with open(filename, 'r', encoding='utf-8') as fp: + for line in fp: + if any(not x.isspace() for x in line): + yield json.loads(line) + + +def read_dataset( + data_file: str = None, + dataset_type: str = "humaneval", + num_shot=None, +) -> Dict: + if num_shot is not None: + pass + if "humaneval" in dataset_type.lower(): + if data_file is None: + current_path = os.path.dirname(os.path.abspath(__file__)) + data_file = os.path.join(current_path, "..", "humaneval-x", "python", "data", "humaneval_python.jsonl.gz") + dataset = {task["task_id"]: task for task in stream_jsonl(data_file)} + else: + raise f"Dataset: {dataset_type} not supported." + + return dataset + + +def process_extra_prompt(prompt: str, language_type: str = None) -> str: + """ + Processes the extra prompt. + """ + language = language_type.lower() + if language in LANGUAGE_TAG: + extra_prompt = LANGUAGE_TAG[language] + "\n" + else: + extra_prompt = "" + + return extra_prompt + prompt + + +def is_code_generation_finished( + code: str, + language_type: str = None, + dataset: str = None, +): + """ + Checks whether the generated code is finished. 
+ """ + if language_type is None or dataset is None: + return False + + if "humaneval" in dataset.lower(): + if language_type.lower() == "python": + for line in code.split("\n"): + if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t': + return True + end_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint"] + for w in end_words: + if w in code: + return True + elif language_type.lower() == "java": + if code.count("{") + 1 == code.count("}"): + return True + elif language_type.lower() == "go": + if code.count("{") + 1 == code.count("}"): + return True + elif language_type.lower() == "js": + if code.count("{") + 1 == code.count("}"): + return True + elif language_type.lower() == "cpp": + if code.count("{") + 1 == code.count("}"): + return True + + return False + + +def cleanup_code( + code: str, + language_type: str = None, + dataset: str = None, +): + """ + Cleans up the generated code. + """ + if language_type is None or dataset is None: + return code + + if "humaneval" in dataset.lower(): + if language_type.lower() == "python": + end_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint", "\nassert"] + for w in end_words: + if w in code: + code = code[:code.rfind(w)] + elif language_type.lower() == "java": + main_pos = code.find("public static void main") + if main_pos != -1: + code = code[:main_pos] + '}' + if '}' in code: + code = code[:code.rfind('}')] + '}' + if code.count('{') + 1 == code.count('}'): + code += "\n}" + elif language_type.lower() == "go": + end_words = ["\n//", "\nfunc main("] + for w in end_words: + if w in code: + code = code[:code.rfind(w)] + if '}' in code: + code = code[:code.rfind('}')] + '}' + elif language_type.lower() == "cpp": + if '}' in code: + code = code[:code.rfind('}')] + '}' + elif language_type.lower() == "js": + if '}' in code: + code = code[:code.rfind('}')] + '}' + + return code + + +class TimeoutException(Exception): + pass + + +class WriteOnlyStringIO(io.StringIO): + """ StringIO that throws an exception when it's read from """ + + def read(self, *args, **kwargs): + raise IOError + + def readline(self, *args, **kwargs): + raise IOError + + def readlines(self, *args, **kwargs): + raise IOError + + def readable(self, *args, **kwargs): + """ Returns True if the IO object can be read. 
""" + return False + + +class RedirectStdin: + def __init__(self, new_target): + self._new_target = new_target + self._old_target = None + + def __enter__(self): + self._old_target = sys.stdin + sys.stdin = self._new_target + return self._new_target + + def __exit__(self, exc_type, exc_value, traceback): + sys.stdin = self._old_target + + +@contextlib.contextmanager +def time_limit(seconds: float): + def signal_handler(signum, frame): + raise TimeoutException("Timed out!") + + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + + +@contextlib.contextmanager +def chdir(root): + if root == ".": + yield + return + cwd = os.getcwd() + os.chdir(root) + try: + yield + except BaseException as exc: + raise exc + finally: + os.chdir(cwd) + + +@contextlib.contextmanager +def create_tempdir(): + original_unlink = os.unlink + original_remove = os.remove + + def get_safe_unlink(): + return getattr(shutil, '_orig_unlink', original_unlink) + + def get_safe_remove(): + return getattr(shutil, '_orig_remove', original_remove) + + try: + os.unlink = get_safe_unlink() + os.remove = get_safe_remove() + with tempfile.TemporaryDirectory() as dirname: + yield dirname + finally: + os.unlink = original_unlink + os.remove = original_remove + + +@contextlib.contextmanager +def swallow_io(): + stream = WriteOnlyStringIO() + with contextlib.redirect_stdout(stream): + with contextlib.redirect_stderr(stream): + with RedirectStdin(stream): + yield + + +def reliability_guard(maximum_memory_bytes: Optional[int] = None): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. 
+ """ + + original_unlink = os.unlink + original_remove = os.remove + try: + if maximum_memory_bytes is not None: + resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)) + resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)) + if not platform.uname().system == 'Darwin': + resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)) + except Exception as e: + pass + + faulthandler.disable() + + builtins.exit = None + builtins.quit = None + + os.environ['OMP_NUM_THREADS'] = '1' + + os.kill = None + os.system = None + os.putenv = None + os.remove = None + os.removedirs = None + os.rmdir = None + os.fchdir = None + os.setuid = None + os.fork = None + os.forkpty = None + os.killpg = None + os.rename = None + os.renames = None + os.truncate = None + os.replace = None + os.unlink = None + os.fchmod = None + os.fchown = None + os.chmod = None + os.chown = None + os.chroot = None + os.fchdir = None + os.lchflags = None + os.lchmod = None + os.lchown = None + os.getcwd = None + os.chdir = None + + shutil.move = None + shutil.chown = None + + subprocess.Popen = None # type: ignore + + __builtins__['help'] = None + + sys.modules['ipdb'] = None + sys.modules['joblib'] = None + sys.modules['resource'] = None + sys.modules['psutil'] = None + + finally: + os.unlink = original_unlink + os.remove = original_remove + + +def check_correctness( + task_id: str, + sample: dict, + config: dict, +) -> Dict: + """ + Evaluates the functional correctness of a completion by running the test + suite provided in the problem. + """ + + language_type = config.get('language_type', 'python') + timeout = config.get('timeout', 300.0) + tmp_dir = config.get('tmp_dir', None) + completion_id = config.get('completion_id', None) + go_dir = config.get('go_dir', None) + + finish = "finish" + file = "file" + output = "output" + test_code = "test_code" + passed = "passed" + timed_out = "timed out" + tmp = "tmp" + test_cpp = "test.cpp" + + def unsafe_execute(tmp_dir): + random_id = random.uniform(1, 1000) + if "python" in language_type.lower(): + with create_tempdir() as tmp_dir: + + # These system calls are needed when cleaning up tempdir. + rmtree = shutil.rmtree + rmdir = os.rmdir + change_dir = os.chdir + + # Disable functionalities that can make destructive changes to the test. + reliability_guard() + code_file = os.path.join(tmp_dir, f"test_code_{random_id}.py") + + with open(code_file, 'w') as f: + f.write(sample[test_code]) + + exec_globals = {} + try: + with swallow_io(): + with time_limit(timeout): + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. + spec = importlib.util.spec_from_file_location("test_code_module", code_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + result.append(passed) + except TimeoutException: + result.append(timed_out) + except AssertionError: + result.append("failed: AssertionError") + except BaseException as e: + result.append(f"failed: {e}") + + # Needed for cleaning up. 
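+                # Restore the functions saved above: reliability_guard() disables os.rmdir and
+                # os.chdir, and TemporaryDirectory needs them to delete itself on exit.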
+ shutil.rmtree = rmtree + os.rmdir = rmdir + os.chdir = change_dir + + elif "go" in language_type.lower(): + if tmp_dir is None: + raise RuntimeError("Go should be evaluated in a dir where necessary module files installed.") + + if tmp not in tmp_dir: + tmp_dir = os.path.join(tmp_dir, tmp) + tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}") + tmp_dir = os.path.abspath(tmp_dir) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + os.chdir(tmp_dir) + files_in_tmp_dir = os.listdir(tmp_dir) + shutil.copytree(go_dir, tmp_dir, dirs_exist_ok=True) + files_in_tmp_dir = os.listdir(tmp_dir) + safe_open("main_test.go", 'w').write(sample[test_code]) + exec_result = None + try: + # '/usr/local/go/bin/go', 'test', '-timeout=6s', 'main_test.go' + go_executable = shutil.which("go") + if go_executable is None: + raise FileNotFoundError("Go executable not found in the system PATH.") + with time_limit(timeout): + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. + exec_result = subprocess.run([go_executable, "test", f"-timeout={timeout}s", "main_test.go"], + timeout=timeout, capture_output=True) + + if exec_result.returncode == 0: + result.append(passed) + else: + if exec_result.stderr: + try: + err = exec_result.stderr.decode() + except Exception: + err = exec_result.stderr + else: + try: + err = exec_result.stdout.decode() + except Exception: + err = exec_result.stdout + result.append(f"failed: {err}") + + except TimeoutException: + result.append(timed_out) + + tmp_dir = os.path.abspath(tmp_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + elif "js" in language_type.lower(): + + if tmp not in tmp_dir: + tmp_dir = os.path.join(tmp_dir, tmp) + tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}") + tmp_dir = os.path.abspath(tmp_dir) + + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + os.chdir(tmp_dir) + safe_open("test.js", 'w').write(sample[test_code]) + exec_result = None + try: + node_executable = shutil.which("node") + if node_executable is None: + raise FileNotFoundError("Node executable not found in the system PATH.") + with time_limit(timeout): + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. 
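+                    # Run the generated test with node; any stderr or stdout output is treated as a failure below.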
+ exec_result = subprocess.run([node_executable, "test.js"], timeout=timeout, capture_output=True) + + if exec_result.stderr.decode(): + err = exec_result.stderr.decode() + result.append(f"failed: {err}") + elif exec_result.stdout.decode(): + err = exec_result.stdout.decode() + result.append(f"failed: {err}") + else: + result.append(passed) + + except TimeoutException: + result.append(timed_out) + + tmp_dir = os.path.abspath(tmp_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + elif "cpp" in language_type.lower(): + + if tmp not in tmp_dir: + tmp_dir = os.path.join(tmp_dir, tmp) + tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}") + tmp_dir = os.path.abspath(tmp_dir) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + os.chdir(tmp_dir) + safe_open(test_cpp, 'w').write(sample[test_code]) + if "162" in task_id: + gpp_executable = shutil.which("g++") + compilation_result = subprocess.run([gpp_executable, "-std=c++11", test_cpp, "-lcrypto", "-lssl"], + timeout=10, + capture_output=True) + compilation_result = subprocess.run([gpp_executable, "-std=c++11", test_cpp, "-lcrypto", "-lssl"], + timeout=10, + capture_output=True) + else: + gpp_executable = shutil.which("g++") + compilation_result = subprocess.run([gpp_executable, "-std=c++11", test_cpp], timeout=timeout, + capture_output=True) + if compilation_result.returncode != 0: + if compilation_result.stderr: + err = compilation_result.stderr.decode() + else: + err = compilation_result.stdout.decode() + result.append(f"failed: compilation error: {err}") + else: + exec_result = None + try: + with time_limit(timeout): + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. 
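+                        # Execute the a.out binary produced by g++ in this temporary directory.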
+ exec_result = subprocess.run(["./a.out"], timeout=timeout, capture_output=True) + + if exec_result.returncode == 0: + result.append(passed) + else: + if exec_result.stderr: + try: + err = exec_result.stderr.decode() + except Exception: + err = exec_result.stderr + else: + try: + err = exec_result.stdout.decode() + except Exception: + err = exec_result.stdout + result.append(f"failed: {err}") + except TimeoutException: + result.append(timed_out) + + tmp_dir = os.path.abspath(tmp_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + elif "java" in language_type.lower(): + if tmp_dir is None: + raise RuntimeError("Java should be evaluated in a temporary dir.") + + if tmp not in tmp_dir: + tmp_dir = os.path.join(tmp_dir, tmp) + tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}") + tmp_dir = os.path.abspath(tmp_dir) + + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir, exist_ok=True) + os.chdir(tmp_dir) + with safe_open(os.path.join(tmp_dir, "Main.java"), mode='w', permission_mode=0o644) as f: + f.write(sample[test_code]) + + res = "failed: unknown error" + compile_returncode = -1 + for _ in range(5): + try: + javac_executable = shutil.which("javac") + if javac_executable is None: + raise FileNotFoundError("JavaC executable not found in the system PATH.") + compilation_result = subprocess.run([javac_executable, os.path.join(tmp_dir, "Main.java")], + timeout=5, capture_output=True) + compile_returncode = compilation_result.returncode + break + except subprocess.TimeoutExpired: + continue + if compile_returncode != 0: + res = "failed: compilation error" + else: + exec_result = None + try: + java_executable = shutil.which("java") + if java_executable is None: + raise FileNotFoundError("Java executable not found in the system PATH.") + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. 
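+                    # Run the compiled Main class with the temporary directory on the classpath.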
+ exec_result = subprocess.run([java_executable, '-cp', tmp_dir, 'Main'], + timeout=timeout, capture_output=True) + if exec_result.returncode == 0: + res = passed + elif exec_result.returncode == 1: + if "AssertionError" in exec_result.stderr.decode('unicode-escape'): + res = "failed: wrong answer" + else: + res = f"failed: {exec_result.stderr.decode()}" + except subprocess.TimeoutExpired: + res = "time out" + except BaseException as e: + res = f"failed: {e}" + result.append(res) + + tmp_dir = os.path.abspath(tmp_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + manager = multiprocessing.Manager() + result = manager.list() + + p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,)) + p.start() + p.join(timeout=timeout + 1) + if p.is_alive(): + p.kill() + + if not result: + result.append(timed_out) + + return { + "task_id": task_id, + "completion_id": completion_id, + test_code: sample[test_code], + "prompt": sample["prompt"], + "generation": sample["generation"], + "result": result[0], + passed: result[0] == passed, + finish: -1 if finish not in sample else sample[finish], + file: "" if file not in sample else sample[file], + output: [] if output not in sample else sample[output], + } diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humanevalx.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humanevalx.py new file mode 100644 index 0000000000000000000000000000000000000000..120bd7b9e36e5be8df8ca564905011554f7c5df3 --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humanevalx.py @@ -0,0 +1,253 @@ +import gzip +import json +import os +import os.path as osp +import re +import subprocess +import tempfile +import time +from shutil import copyfile +from typing import Dict, Iterable + +from datasets import Dataset + +from ais_bench.benchmark.openicl.icl_evaluator import BaseEvaluator +from ais_bench.benchmark.utils import get_data_path + +from ..base import BaseDataset +from ..humaneval import humaneval_postprocess_v2 +from .humaneval_x_eval import evaluate_functional_correctness, EvalConfig + +_LANGUAGE_NAME_DICT = { + 'cpp': 'CPP', + 'go': 'Go', + 'java': 'Java', + 'js': 'JavaScript', + 'python': 'Python', + 'rust': 'Rust', +} + + +class HumanevalXDataset(BaseDataset): + + @staticmethod + def load(path, language, **kwargs): + path = get_data_path(path, local_mode=True) + assert language in _LANGUAGE_NAME_DICT.keys(), ( + f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}') + file_path = osp.join(path, f'humanevalx_{language}.jsonl.gz') + dataset = HumanevalXDataset._stream_jsonl_all(file_path) + return Dataset.from_list(dataset) + + @staticmethod + def _stream_jsonl_all(filename: str) -> Iterable[Dict]: + results = [] + if filename.endswith('.gz'): + fp = gzip.open(open(filename, 'rb'), 'rt') + else: + fp = open(filename, 'r') + for line in fp: + if any(not x.isspace() for x in line): + results.append(json.loads(line)) + fp.close() + + return results + + +import jsonlines + + +def generate_predictions_from_file(file_path, language): + predictions = [] + with jsonlines.open(file_path) as reader: + for i, obj in enumerate(reader): + task_id = obj['task_id'] + generation = obj['generation'] + # Clean up the code if necessary + # cleaned_generation = _clean_up_code(generation, language, None) # Assuming 'refer' is not needed here + predictions.append({ + 'task_id': task_id, + 'generation': generation + }) + return 
predictions
+
+
+class HumanevalXEvaluator(BaseEvaluator):
+    """Evaluator for humanevalx.
+
+    Before you use this Evaluator, launch a code eval service according
+    to the readme of https://github.com/Ezra-Yu/code-evaluator.
+    Set `ip_address` and `port` according to your environment.
+
+    Args:
+        language (str): the programming language to evaluate.
+        ip_address (str): The IP address of the HumanevalX code evaluation service.
+            Refer to https://github.com/Ezra-Yu/code-evaluator to launch a
+            code evaluation service. Defaults to 'localhost'.
+        port (int): The port of the HumanevalX code evaluation service.
+            Defaults to 5000.
+        timeout (int): Maximum wait time when accessing the service.
+            Defaults to 100.
+
+    TODO: support 'k' of pass@k. default to use k = [1, 10, 100]
+    """
+
+    def __init__(self,
+                 language,
+                 ip_address='localhost',
+                 port='',
+                 retry=2,
+                 # timeout=600) -> None:
+                 timeout=6) -> None:
+        assert language in _LANGUAGE_NAME_DICT.keys(), (
+            f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}')
+        if language == 'rust':
+            timeout *= 10  # rust needs more time
+        self.language = language
+        self.ip_address = ip_address
+        self.port = port
+        self.retry = retry
+        self.timeout = timeout
+        super().__init__()
+
+    def score(self, predictions, references, test_set):
+        prompts = [item['prompt'] for item in test_set]
+        problem_file = f'benchmark/ais_bench/datasets/humanevalx/humanevalx_{self.language}.jsonl.gz'
+        # Get the absolute path
+        problem_file = os.path.abspath(problem_file)
+        import json
+        # Define the file paths
+        go_dir = f'./benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation'
+
+        go_dir = os.path.abspath(go_dir)
+
+        predictions = [{
+            'task_id':
+            f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
+            'generation':
+            _clean_up_code(pred, self.language, refer),
+            'prompt': f'{prompt}'
+
+        } for i, (pred, refer, prompt) in enumerate(zip(predictions, references, prompts))]
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_out_path = osp.join(tmp_dir,
+                                    f'humanevalx_{self.language}.json')
+            with open(tmp_out_path, 'w') as f:
+                for pred in predictions:
+                    f.write(json.dumps(pred) + '\n')
+
+            config = EvalConfig(input_file=tmp_out_path, timeout=self.timeout, problem_file=problem_file,
+                                go_dir=go_dir)
+
+            # Call the evaluation function to check functional correctness
+            result = evaluate_functional_correctness(config)
+
+            return result
+
+
+    def _code_eval_service(self, file_path):
+        if self.port:
+            eval_server_url = f'{self.ip_address}:{self.port}/evaluate'
+        else:
+            eval_server_url = f'{self.ip_address}/evaluate'
+        exec_result = subprocess.run([
+            'curl', '-X', 'POST', '-F', f'file=@{file_path}', '-F',
+            f'dataset=humanevalx/{self.language}', f'{eval_server_url}'
+        ],
+            timeout=self.timeout,
+            capture_output=True)
+
+        if exec_result.returncode == 0 and re.match(
+                "\"{.*:.*}\"", exec_result.stdout.decode('utf-8')):
+            return True, json.loads(exec_result.stdout.decode('utf-8'))
+        else:
+            if exec_result.stderr:
+                try:
+                    err = exec_result.stderr.decode()
+                except Exception:
+                    err = exec_result.stderr
+            else:
+                try:
+                    err = exec_result.stdout.decode()
+                except Exception:
+                    err = exec_result.stdout
+            return False, err
+
+
+def _clean_up_code(text: str, language_type: str, reference) -> str:
+    """Cleans up the generated code."""
+    try:
+        # for chatGLM related text
+        eval_text = eval(text)
+    except Exception:
+        pass
+    else:
+        if isinstance(eval_text, str):
+            text = eval_text
+    # extract code from code block
+    text = text.lstrip('\n')
+    if '```' in text:
+        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+        if len(blocks) == 0:
+            text = text.split('```')[1]  # fall back to default strategy
+        else:
+            text = blocks[0]  # fetch the first code block
+            if not text.startswith('\n'):  # in case starting with ```xxx
+                text = text[max(text.find('\n') + 1, 0):]
+    if language_type.lower() == 'python':
+        text = humaneval_postprocess_v2(text)
+        # we need to take care of the first line
+        # append extra space for first line for correct indentation
+        text = ' ' + text.lstrip()
+
+        text_splits = text.split('\n')
+        is_empty_line = False
+        ind_empty_line = None
+        for i, line in enumerate(text_splits):
+            if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
+                is_empty_line = True
+                ind_empty_line = i
+                break
+        if is_empty_line:
+            text = '\n'.join(text_splits[:ind_empty_line])
+        else:
+            end_words = [
+                '\ndef', '\nclass', '\n#', '\nassert', '\n"""', '\nprint',
+                '\nif', '\n\n\n'
+            ]
+            for w in end_words:
+                if w in text:
+                    text = text[:text.rfind(w)]
+    # strip the function head for all other languages
+    func_name = reference.strip().split('\n')[-1]
+    if func_name:
+        func_name = func_name.strip().strip('{')
+        if func_name in text:
+            text = '\n'.join(text[text.find(func_name):].split('\n')[1:])
+    if language_type.lower() == 'java':
+        main_pos = text.find('public static void main')
+        if main_pos != -1:
+            text = text[:main_pos] + '}'
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+        if text.count('{') + 1 == text.count('}'):
+            text += '\n}'
+    elif language_type.lower() == 'go':
+        if '\nfunc main(' in text:
+            text = text[:text.rfind('func main(')]
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+    elif language_type.lower() == 'cpp':
+        if '\nint main()' in text:
+            text = text[:text.rfind('int main()')]
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+    elif language_type.lower() == 'js':
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+    elif language_type.lower() == 'rust':
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+
+    return text
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
index 274464d26e5012c01a6f0c44732199590cc954ae..5dd7e9c6b7a6e3fdaf33071337d6e1205d6317da 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
@@ -91,4 +91,6 @@ with read_base():
 
     # winogrande
     from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_0_shot_chat_prompt import winogrande_datasets as winogrande_0_shot_chat
-    from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_5_shot_chat_prompt import winogrande_datasets as winogrande_5_shot_chat
\ No newline at end of file
+    from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_5_shot_chat_prompt import winogrande_datasets as winogrande_5_shot_chat
+
+    from ais_bench.benchmark.configs.datasets.humanevalx.humanevalx_gen_0_shot import humanevalx_datasets as humanevalx_0_shot_str
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e77484284ed541d25f62bb92ff4cf36f3f4aac5
--- /dev/null
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
index 274464d26e5012c01a6f0c44732199590cc954ae..5dd7e9c6b7a6e3fdaf33071337d6e1205d6317da 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
@@ -91,4 +91,6 @@ with read_base():
 
     # winogrande
     from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_0_shot_chat_prompt import winogrande_datasets as winogrande_0_shot_chat
-    from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_5_shot_chat_prompt import winogrande_datasets as winogrande_5_shot_chat
\ No newline at end of file
+    from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_5_shot_chat_prompt import winogrande_datasets as winogrande_5_shot_chat
+
+    from ais_bench.benchmark.configs.datasets.humanevalx.humanevalx_gen_0_shot import humanevalx_datasets as humanevalx_0_shot_str
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e77484284ed541d25f62bb92ff4cf36f3f4aac5
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py
@@ -0,0 +1,46 @@
+from mmengine.config import read_base
+from ais_bench.benchmark.models import VLLMCustomAPI
+from ais_bench.benchmark.partitioners import NaivePartitioner
+from ais_bench.benchmark.runners.local_api import LocalAPIRunner
+from ais_bench.benchmark.tasks import OpenICLInferTask
+
+with read_base():
+    # from ais_bench.benchmark.configs.datasets.collections.chat_medium import datasets
+    from ais_bench.benchmark.configs.summarizers.example import summarizer
+    from ais_bench.benchmark.configs.datasets.humanevalx.humanevalx_gen_0_shot import humanevalx_datasets  # import the dataset instances defined in the dataset config file
+
+datasets = [
+    *humanevalx_datasets,  # register the dataset instances
+]
+
+
+models = [
+    dict(
+        type=VLLMCustomAPI,  # inference backend; prefer one that supports the MindIE service. VLLMCustomAPIOld connects to the MindIE service (non-streaming), VLLMCustomAPI connects to services started by vLLM 0.6+ on GPU, and MindieStreamApi connects to the MindIE service (streaming)
+        abbr='mindie-vllm-api-humanevalx',
+        max_seq_len = 4096,
+        query_per_second = 1024,
+        rpm_verbose = False,
+        retry = 2,
+        host_ip = "90.91.56.32",  # change to the actual IP of your service
+        max_out_len = 1,
+
+        # change these to your local service
+        host_port = 9091,  # change to the actual port of your service
+        enable_ssl = False,
+        generation_kwargs = dict(
+            temperature = 0,
+            seed = 1,
+        )
+    )
+]
+
+
+infer = dict(partitioner=dict(type=NaivePartitioner),
+             runner=dict(
+                 type=LocalAPIRunner,
+                 max_num_workers=2,
+                 concurrent_users=2,
+                 task=dict(type=OpenICLInferTask)), )
+
+work_dir = 'outputs/api-mindie-vllm-humanevalx/'  # custom working directory; run results are written to a timestamped subdirectory under this path
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_run_diff_datasets.py b/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_run_diff_datasets.py
index bd6bcca0c5c274a9b091120e9153432b68e78e95..45630c39f09cffc990b4b27bcc031b6a763b12f6 100644
--- a/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_run_diff_datasets.py
+++ b/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_run_diff_datasets.py
@@ -20,6 +20,7 @@ DATASETS_CONFIGS_LIST = [
     "mgsm",
     "agieval",
     "cmmlu",
+    "humanevalx",
 ]
 
 class TestClass:
@@ -271,6 +272,48 @@ class TestClass:
         vis_md_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.md")
         assert os.path.exists(vis_md_path)
 
+
+    def test_vllm_api_all_qwen2_7b_humanevalx_0_shot(self, monkeypatch):  # test function name must be unique
+        fake_prediction = "112"  # fake inference output; the exact value is arbitrary
+        fake_time_str = "humanevalx_0_shot"  # fake timestamp; must not collide with other test cases
+        datasets_abbr_name = "humanevalx-"  # abbr prefix used in the dataset config under test
+        datasets_script_name = "humanevalx_gen_0_shot"  # file name of the dataset config under test
+        languages = ['python', 'cpp', 'go', 'java', 'js']
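+        # Test strategy: monkeypatch stubs out the vLLM API backend (model path
+        # lookup and generation) and the run timestamp, so main() runs the full
+        # infer/eval/summarize pipeline without a live service; afterwards the
+        # per-language prediction, result, and summary artifacts are checked.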
+
+        monkeypatch.setattr('sys.argv',
+                            ["ais_bench", "--models", "vllm_api_general", "--datasets", datasets_script_name,
+                             "--summarizer", "example", "--mode", "all", "-w", self.test_data_path])
+        monkeypatch.setattr("ais_bench.benchmark.models.vllm_custom_api.VLLMCustomAPI._get_service_model_path", lambda *arg: "qwen2")
+        monkeypatch.setattr("ais_bench.benchmark.models.vllm_custom_api.VLLMCustomAPI._generate", lambda *arg: fake_prediction)
+        monkeypatch.setattr("ais_bench.benchmark.cli.main.get_current_time_str", lambda *arg: fake_time_str)
+        main()
+
+        for lang in languages:
+            curr_datasets_abbr_name = datasets_abbr_name + lang
+            # check infer out
+            infer_outputs_json_path = os.path.join(self.test_data_path, f"{fake_time_str}/predictions/vllm-api-general/{curr_datasets_abbr_name}.json")
+            assert os.path.exists(infer_outputs_json_path)
+            with open(infer_outputs_json_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                assert data.get("0").get("prediction") == fake_prediction
+
+            # check eval out
+            results_json_path = os.path.join(self.test_data_path, f"{fake_time_str}/results/vllm-api-general/{curr_datasets_abbr_name}.json")
+            assert os.path.exists(results_json_path)
+            with open(results_json_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                assert data.get("accuracy") is not None
+
+            # check vis
+            vis_csv_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.csv")
+            assert os.path.exists(vis_csv_path)
+            vis_txt_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.txt")
+            assert os.path.exists(vis_txt_path)
+            vis_md_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.md")
+            assert os.path.exists(vis_md_path)
+
+
     def test_vllm_api_all_qwen2_7b_math500_0_shot(self, monkeypatch):
         fake_prediction = "11"
         fake_time_str = "math500_0_shot"
@@ -1297,4 +1340,7 @@ class TestClass:
         vis_txt_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.txt")
         assert os.path.exists(vis_txt_path)
         vis_md_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.md")
-        assert os.path.exists(vis_md_path)
\ No newline at end of file
+        assert os.path.exists(vis_md_path)
+
+