diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/humanevalx/humanevalx_gen_0_shot.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/humanevalx/humanevalx_gen_0_shot.py
new file mode 100644
index 0000000000000000000000000000000000000000..16382fd6a48aacba9f6fe3b6b5588484b03f3239
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/humanevalx/humanevalx_gen_0_shot.py
@@ -0,0 +1,42 @@
+from ais_bench.benchmark.openicl.icl_prompt_template import PromptTemplate
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.datasets import HumanevalXDataset, HumanevalXEvaluator
+
+humanevalx_reader_cfg = dict(
+    input_columns=['prompt'], output_column='declaration', train_split='test')
+
+humanevalx_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, batch_size=16))
+
+humanevalx_eval_cfg_dict = {
+    lang: dict(
+        evaluator=dict(
+            type=HumanevalXEvaluator,
+
+            language=lang,
+            ip_address=
+            'localhost',  # replace with your code_eval_server ip_address and port
+            port=5001),
+        pred_role='BOT')
+    for lang in ['python', 'cpp', 'go', 'java', 'js']  # rust is not supported yet
+}
+
+# Please download the needed `xx.jsonl.gz` files from
+# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
+# and move them into the `data/humanevalx/` folder
+humanevalx_datasets = [
+    dict(
+        type=HumanevalXDataset,
+        abbr=f'humanevalx-{lang}',
+        language=lang,
+        path='ais_bench/datasets/humanevalx',
+        reader_cfg=humanevalx_reader_cfg,
+        infer_cfg=humanevalx_infer_cfg,
+        eval_cfg=humanevalx_eval_cfg_dict[lang])
+    for lang in ['python', 'cpp', 'go', 'java', 'js']
+]
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
index cb9e7b4c35c9a8402f8eefea48f2237d412f818a..9934cb8db588d4f165db9678ae1bc50a7aa51337 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
@@ -20,4 +20,6 @@ from .mbpp import *  # noqa: F401,F403
 from .agieval import *  # noqa: F401, F403
 from .hellaswag import *  # noqa: F401, F403
 from .triviaqa import *  # noqa: F401, F403
-from .cmmlu import *  # noqa: F401, F403
\ No newline at end of file
+from .cmmlu import *  # noqa: F401, F403
+from .humanevalx import humanevalx, humaneval_x_eval, humaneval_x_utils
+from .humanevalx.humanevalx import *
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/README.md b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1affb959844843497c2f8871acdc6c68b6455035
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/README.md
@@ -0,0 +1,73 @@
+# HumanEval_X dataset environment README
+
+
+## Installing the `HumanEval_X` environment
+Testing HumanEval_X requires the Go, JS and Java language toolchains. Install them as follows:
+
+
+1. First configure a proxy for the server according to its network segment.
+
+2. Download the language packages:
+
+    Language packages for the NPU environment:
+    go   https://go.dev/dl/go1.18.4.linux-arm64.tar.gz
+    node https://nodejs.org/download/release/v16.14.0/node-v16.14.0-linux-arm64.tar.gz
+    java https://download.oracle.com/java/18/archive/jdk-18.0.2.1_linux-aarch64_bin.tar.gz
+
+    Language packages for the GPU environment:
+    go   https://go.dev/dl/go1.18.4.linux-amd64.tar.gz
+    node https://nodejs.org/download/release/v16.14.0/node-v16.14.0-linux-x64.tar.gz
+    java https://download.oracle.com/java/18/archive/jdk-18.0.2.1_linux-x64_bin.tar.gz
+
+3. Extract and install:
+
+    NPU installation steps:
+
+    # Install npm: the npm download may fail if the proxy network is poor; switch proxies or retry later
+    apt-get update
+    apt-get install -y npm
+
+    # Install Go:
+    tar -zxf go1.18.4.linux-arm64.tar.gz -C /usr/local
+    export PATH=/bin:/usr/local/go/bin:$PATH
+
+    # Install node:
+    mkdir -p /usr/local/lib/nodejs
+    tar -zxf node-v16.14.0-linux-arm64.tar.gz -C /usr/local/lib/nodejs
+    mv /usr/local/lib/nodejs/node-v16.14.0-linux-arm64 /usr/local/lib/nodejs/node
+
+    # Install the JS dependencies:
+    npm config set strict-ssl false
+    npm install -g js-md5@0.7.3
+    export PATH=/usr/local/lib/nodejs/node/bin:$PATH
+    export NODE_PATH=/usr/local/lib/node_modules
+
+    # Install Java:
+    mkdir /usr/java
+    tar -zxf jdk-18.0.2.1_linux-aarch64_bin.tar.gz -C /usr/java
+    JAVA_HOME=/usr/java/jdk-18.0.2.1
+    update-alternatives --install /usr/bin/java java $JAVA_HOME/bin/java 20000
+    update-alternatives --install /usr/bin/javac javac $JAVA_HOME/bin/javac 20000
+
+    GPU: same as the NPU steps; only replace the package names above with the corresponding GPU (x64) packages.
+
+4. Check that the languages were installed successfully:
+
+    go version      # check go
+    js --version    # check js
+    java --version  # check java
+    npm -v          # check npm
+
+If each command prints a version, the download and installation succeeded and the accuracy test can be run.
+If a version is not printed, re-download or install the corresponding language separately.
+
+
+## Additional notes on the `HumanEval_X` environment
+
+1. If the accuracy result for some language is 0.0, check whether that language toolchain finished downloading and whether its environment variables were exported.
+
+2. Go and Java take a long time to execute. If the results for these two languages fluctuate, check whether the timeout setting in model_test.py is too small.
+
+3. The Go environment variables must be exported again each time you enter the environment; re-check the language versions.
+
+
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/cpp/evaluation/test.cpp b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/cpp/evaluation/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76bd7d57832c29c152763a974f079171b50ab728
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/cpp/evaluation/test.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
+separate those group into separate strings and return the vector of those.
+Separate groups are balanced (each open brace is properly closed) and not nested within each other
+Ignore any spaces in the input string.
+>>> separate_paren_groups("( ) (( )) (( )( ))")
+{"()", "(())", "(()())"}
+*/
+#include <vector>
+#include <string>
+#include <iostream>
+
+namespace {
+
+    std::vector<std::string> separate_paren_groups(std::string paren_string)
+    {
+        std::vector<std::string> all_parens;
+        std::string current_paren;
+        int level = 0;
+        char chr;
+        int i;
+
+        for (i = 0; i < paren_string.length(); i++) {
+            chr = paren_string[i];
+            if (chr == '(') {
+                level += 1;
+                current_paren += chr;
+            }
+            if (chr == ')') {
+                level -= 1;
+                current_paren += chr;
+                if (level == 0) {
+                    all_parens.push_back(current_paren);
+                    current_paren = "";
+                }
+            }
+        }
+        return all_parens;
+    }
+
+#undef NDEBUG
+#include <cassert>
+
+    static bool g_issame(std::vector<std::string> a, std::vector<std::string> b)
+    {
+        if (a.size() != b.size()) return false;
+        for (int i = 0; i < a.size(); i++) {
+            if (a[i] != b[i]) return false;
+        }
+        return true;
+    }
+}  // namespace
+
+int main()
+{
+    assert(g_issame(separate_paren_groups("(()()) ((())) () ((())()())"), {"(()())", "((()))", "()", "((())()())"}));
+    assert(g_issame(separate_paren_groups("() (()) ((())) (((())))"), {"()", "(())", "((()))", "(((())))"}));
+    assert(g_issame(separate_paren_groups("(()(())((())))"), {"(()(())((())))"}));
+    assert(g_issame(separate_paren_groups("( ) (( )) (( )( ))"), {"()", "(())", "(()())"}));
+}
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/file_utils.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9255a24cf9395e2b75dd52e33e09c6748476f0d4
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/file_utils.py
@@ -0,0 +1,181 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
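+# Added note: file_utils.py wraps the raw os file APIs with defensive checks (path length,
+# symlinks, file size, owner and group/other write permission) before a file is opened,
+# listed, read or chmod-ed. Illustrative usage, with a hypothetical path and the default
+# limits defined below:
+#     with safe_open("outputs/predictions.jsonl", mode="r", encoding="utf-8") as f:
+#         lines = safe_readlines(f)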
+from functools import reduce +import os +import stat + +MAX_PATH_LENGTH = 4096 +MAX_FILE_SIZE = 100 * 1024 * 1024 +MAX_FILENUM_PER_DIR = 1024 +MAX_LINENUM_PER_FILE = 10 * 1024 * 1024 + +FLAG_OS_MAP = { + 'r': os.O_RDONLY, 'r+': os.O_RDWR, + 'w': os.O_CREAT | os.O_TRUNC | os.O_WRONLY, + 'w+': os.O_CREAT | os.O_TRUNC | os.O_RDWR, + 'a': os.O_CREAT | os.O_APPEND | os.O_WRONLY, + 'a+': os.O_CREAT | os.O_APPEND | os.O_RDWR, + 'x': os.O_CREAT | os.O_EXCL, + "b": getattr(os, "O_BINARY", 0) +} + + +def safe_open(file_path: str, mode='r', encoding=None, permission_mode=0o600, is_exist_ok=True, **kwargs): + """ + :param file_path: 文件路径 + :param mode: 文件打开模式 + :param encoding: 文件编码方式 + :param permission_mode: 文件权限模式 + :param is_exist_ok: 是否允许文件存在 + :param max_path_length: 文件路径最大长度 + :param max_file_size: 文件最大大小,单位: 字节, 默认值10MB + :param check_link: 是否校验软链接 + :param kwargs: + :return: + """ + max_path_length = kwargs.get('max_path_length', MAX_PATH_LENGTH) + max_file_size = kwargs.get('max_file_size', MAX_FILE_SIZE) + check_link = kwargs.get('check_link', False) + + file_path = standardize_path(file_path, max_path_length, check_link) + check_file_safety(file_path, mode, is_exist_ok, max_file_size) + + flags = [] + for item in list(mode): + if item == "+" and flags: + flags[-1] = f"{flags[-1]}+" + continue + flags.append(item) + flags = [FLAG_OS_MAP.get(mode, os.O_RDONLY) for mode in flags] + total_flag = reduce(lambda a, b: a | b, flags) + + return os.fdopen(os.open(file_path, total_flag, permission_mode), + mode, encoding=encoding) + + +def standardize_path(path: str, max_path_length=MAX_PATH_LENGTH, check_link=True): + """ + check path + param: path + return: data real path after check + """ + check_path_is_none(path) + check_path_length_lt(path, max_path_length) + if check_link: + check_path_is_link(path) + path = os.path.realpath(path) + return path + + +def is_path_exists(path: str): + return os.path.exists(path) + + +def check_path_is_none(path: str): + if path is None: + raise TypeError("The file path should not be None.") + + +def check_path_is_link(path: str): + if os.path.islink(os.path.normpath(path)): + raise ValueError("The path should not be a symbolic link file. " + f"Please check the input path:{path}.") + + +def check_path_length_lt(path: str, max_path_length=MAX_PATH_LENGTH): + path_length = path.__len__() + if path_length > max_path_length: + raise ValueError(f"The length of path should not be greater than {max_path_length}, but got {path_length}. " + f"Please check the input path within the valid length range:{path[:max_path_length]}.") + + +def check_file_size_lt(path: str, max_file_size=MAX_FILE_SIZE): + file_size = os.path.getsize(path) + if file_size > max_file_size: + raise ValueError(f"The size of file should not be greater than {max_file_size}, but got {file_size}. " + f"Please check the input path:{path}.") + + +def check_owner(path: str): + """ + check the path owner + param: the input path + """ + path_stat = os.stat(path) + path_owner, path_gid = path_stat.st_uid, path_stat.st_gid + cur_uid = os.geteuid() + cur_gid = os.getgid() + if not (cur_uid == 0 or cur_uid == path_owner or path_gid == cur_gid): + raise PermissionError(f"The current user does not have permission to access the path:{path}. " + "Because he is not root or the path owner, " + "and not in the same user group with the path owner. 
" + "Please check and make sure to satisfy at least one of the conditions above.") + + +def check_other_write_permission(file_path: str): + """ + check if the specified file is writable by others who are neither the owner nor in the group + param: the path to the file to be checked + """ + # Get the status of the file + file_stat = os.stat(file_path) + # Get the mode (permission) of the file + mode = file_stat.st_mode + # check the write permission for others + if mode & stat.S_IWOTH: + raise PermissionError("The file should not be writable by others who are neither the owner nor in the group. " + f"Please check the input path:{file_path}, and change mode to {mode & ~stat.S_IWOTH}.") + + +def check_path_permission(file_path: str, is_internal_file=False): + check_inputfiles_permission = os.getenv("MINDIE_CHECK_INPUTFILES_PERMISSION", "1") != "0" + check_permission_flag = is_internal_file or check_inputfiles_permission + if check_permission_flag: + check_owner(file_path) + check_other_write_permission(file_path) + + +def check_file_safety(file_path: str, mode='r', is_exist_ok=True, + max_file_size=MAX_FILE_SIZE, is_check_file_size=True): + if is_path_exists(file_path): + if not is_exist_ok: + raise FileExistsError("The file is expected not to exist, but it already does. " + f"Please check the input path:{file_path}.") + if is_check_file_size: + check_file_size_lt(file_path, max_file_size) + file_dir = file_path + else: + if mode == 'r' or mode == 'r+': + raise FileNotFoundError("The file is expected to exist, but it does not. " + f"Please check the input path:{file_path}.") + file_dir = os.path.dirname(file_path) + + check_path_permission(file_dir) + + +def safe_listdir(file_path: str, max_file_num=MAX_FILENUM_PER_DIR): + filenames = os.listdir(file_path) + file_num = len(filenames) + if file_num > max_file_num: + raise ValueError(f"The file num in dir is {file_num}, which exceeds the limit {max_file_num}. " + f"Please check the input path:{file_path}.") + return filenames + + +def safe_chmod(file_path: str, permission_mode): + standard_path = standardize_path(file_path) + check_path_permission(standard_path) + os.chmod(file_path, permission_mode) + + +def has_owner_write_permission(file_path: str): + st = os.stat(file_path) + return st.st_mode & stat.S_IWUSR + + +def safe_readlines(file_obj, max_line_num=MAX_LINENUM_PER_FILE): + lines = file_obj.readlines() + line_num = len(lines) + if line_num > max_line_num: + raise ValueError(f"The file line num is {line_num}, which exceeds the limit {max_line_num}. " + f"Please check the input file:{file_obj.name}.") + return lines diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.mod b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.mod new file mode 100644 index 0000000000000000000000000000000000000000..5fcb8d8692375bf8b6380956ac18922f4189aab8 --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.mod @@ -0,0 +1,28 @@ +// Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module humanEval + +go 1.18 + +require ( + github.com/go-openapi/inflect v0.19.0 + github.com/stretchr/testify v1.8.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.sum b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.sum new file mode 100644 index 0000000000000000000000000000000000000000..d88557079e79dba9786aee16537cd1c35c2fbc1e --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-openapi/inflect v0.19.0 h1:9jCH9scKIbHeV9m12SmPilScz6krDxKRasNNSNPXu/4= +github.com/go-openapi/inflect v0.19.0/go.mod h1:lHpZVlpIQqLyKwJ4N+YSc9hchQy/i12fJykb83CRBH4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/vendor.tar.gz b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/vendor.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..6d6c649e558abd01c23488bb906c2d4f3e805e10 Binary files /dev/null and b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation/vendor.tar.gz differ diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_eval.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_eval.py new file mode 100644 index 
0000000000000000000000000000000000000000..4cf9bdb472ee42e3c02a3e8e98e27506b7a39577 --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_eval.py @@ -0,0 +1,257 @@ +# Copyright Huawei Technologies Co., Ltd. 2024. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distsafe_openributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Counter, Tuple, Iterable, Dict + +from dataclasses import dataclass +import regex +import numpy as np +from tqdm.auto import tqdm + +from .file_utils import safe_open +from .humaneval_x_utils import read_dataset, IMPORT_HELPER, estimate_pass_at_k, check_correctness + +LANGUAGE_NAME = { + "cpp": "CPP", + "go": "Go", + "java": "Java", + "js": "JavaScript", + "python": "Python", +} + +COMPLETION_ID_KEY = "completion_id" +TEST_CODE_KEY = "test_code" +TASK_ID_KEY = "task_id" + + +@dataclass +class EvalConfig: + input_file: str = None + tmp_dir: str = "./" + n_workers: int = 32 + timeout: float = 500.0 + problem_file: str = "./benchmark/ais_bench//datasets/humanevalx/humanevalx_python.jsonl.gz" + out_dir: str = None + k: Tuple[int, int, int] = (1, 10, 100) + test_groundtruth: bool = False + example_test: bool = False + go_dir: str = None + + +def process_humaneval_test(sample, problems, example_test=False): + task_id = sample["task_id"] + language = task_id.split("/")[0].lower() + example_test_key = "example_test" + + prompt = sample["prompt"] + if example_test and example_test_key in problems[task_id] and problems[task_id][example_test_key] != "": + test = problems[task_id][example_test_key] + else: + test = problems[task_id]["test"] + + code = sample["generation"] + + # Pre-process for different languages + if language == "python": + code_ = [] + for line in code.split("\n"): + if (len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t'): + break + code_.append(line) + code = "\n".join(code_) + test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n" + test_string = test_setup + prompt + code + "\n" + test + "\n" + elif language == "cpp": + test_set_up = "" + for s in IMPORT_HELPER["cpp"]: + if s not in prompt: + test_set_up += s + "\n" + test_string = test_set_up + "\n" + prompt + code + "\n" + test + elif language == "java": + test_string = prompt + code + "\n" + test + elif language == "js" or language == "javascript": + test_string = prompt + code + "\n" + test + elif language == "go": + import_string = problems[task_id]["import"] + prompt = prompt.replace(import_string, "") + if example_test and example_test_key in problems[task_id]: + test = problems[task_id][example_test_key] + else: + test = problems[task_id]["test"] + test_setup = problems[task_id]["test_setup"] + other_pkgs = [] + for pkg in IMPORT_HELPER["go"]: + if pkg not in test_setup: + p = pkg.split("/")[-1] + if p + "." 
in code: + other_pkgs.append(f"\"{pkg}\"") + if other_pkgs: + import_other_pkgs = "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")" + test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test + else: + test_string = test_setup + "\n" + prompt + code + "\n" + test + elif language == "rust": + main = "\nfn main(){ \n } \n" + declaration = problems[task_id]["declaration"] + test_string = main + declaration + prompt + code + test + + return test_string + + +def stream_jsonl_all(filename: str) -> Iterable[Dict]: + results = [] + fp = safe_open(filename, "r") + for line in fp: + if any(not x.isspace() for x in line): + results.append(json.loads(line)) + fp.close() + + return results + + +def evaluate_functional_correctness(config: EvalConfig): + completion_id_key = COMPLETION_ID_KEY + test_code_key = TEST_CODE_KEY + task_id_key = TASK_ID_KEY + + if config.example_test: + pass + + problems = read_dataset(config.problem_file, + dataset_type="humaneval") + sample_jsonl = stream_jsonl_all(config.input_file) + + if config.example_test: + suffix = "_example_test.jsonl" + else: + suffix = "_results.jsonl" + if config.out_dir is not None: + if not os.path.exists(config.out_dir): + os.makedirs(config.out_dir) + out_file = os.path.join(config.out_dir, config.input_file.split('/')[-1].replace(".jsonl", suffix)) + else: + out_file = os.path.join(config.input_file.replace(".jsonl", suffix)) + + if "/codegeex/benchmark/humaneval-x/" in config.input_file: + config.test_groundtruth = True + + if "-to-" in config.input_file: + translation_mode = True + else: + translation_mode = False + + with ThreadPoolExecutor(max_workers=config.n_workers) as executor: + futures = [] + completion_id = Counter() + n_samples = 0 + results = defaultdict(list) + + if config.test_groundtruth: + for sample in tqdm(problems.values()): + task_id = sample[task_id_key] + lang = task_id.split("/")[0].lower() + if lang == "javascript": + lang = "js" + tmp_dir_ = os.path.join(config.tmp_dir, lang, "evaluation") + sample["generation"] = sample["canonical_solution"] + sample[test_code_key] = process_humaneval_test(sample, problems, config.example_test) + if sample[test_code_key] is None: + print(f"Skipping task {task_id} due to missing test code.") # 跳过的任务 + continue + config_dict = { + "language_type": lang, + "timeout": config.timeout, + "tmp_dir": tmp_dir_, + "completion_id": completion_id[task_id], + "go_dir": config.go_dir + } + args = (task_id, sample, config_dict) + future = executor.submit(check_correctness, *args) + futures.append(future) + completion_id[task_id] += 1 + n_samples += 1 + else: + for sample in tqdm(sample_jsonl): + task_id = sample[task_id_key] + + lang = task_id.split("/")[0].lower() + if translation_mode: + task_id = sample[task_id_key].split("/")[-1] + lang = regex.findall("-to-.*-", config.input_file)[0].split("-to-")[-1].rstrip("-") + for language in LANGUAGE_NAME: + if language in lang: + lang = language + break + task_id = f"{LANGUAGE_NAME[lang]}/{task_id}" + if lang == "javascript": + lang = "js" + tmp_dir_ = os.path.join(config.tmp_dir, lang, "evaluation") + sample[task_id_key] = task_id + sample[test_code_key] = process_humaneval_test(sample, problems, config.example_test) + if sample[test_code_key] is None: + continue + if completion_id_key in sample: + completion_id_ = sample[completion_id_key] + else: + completion_id_ = completion_id[task_id] + config_dict = { + "language_type": lang, + "timeout": config.timeout, + "tmp_dir": tmp_dir_, + "completion_id": 
completion_id_, + "go_dir": config.go_dir + } + args = (task_id, sample, config_dict) + future = executor.submit(check_correctness, *args) + futures.append(future) + completion_id[task_id] += 1 + n_samples += 1 + + if len(completion_id) == len(problems): + evaluate_pass_at_k = True + else: + evaluate_pass_at_k = False + + for future in tqdm(as_completed(futures), total=len(futures)): + result = future.result() + results[result[task_id_key]].append((result[completion_id_key], result)) + + # Calculate accuracy + total, correct, details = [], [], [] + for result in results.values(): + for r in result: + passed = r[1].get('passed', False) + total.append(1) + correct.append(1 if passed else 0) + details.append({'task_id': r[0], 'passed': passed, 'result': r[1]}) + + accuracy = 100 * sum(correct) / sum(total) if total else 0 + + result = {'accuracy': accuracy, 'details': details} + + fp = safe_open(out_file, 'w') + for res in results.values(): + for r in res: + fp.write(json.dumps(r[1], indent=4) + "\n") + fp.close() + + with safe_open(out_file, "ab") as fp: + fp.write((json.dumps(result) + "\n").encode('utf-8')) + + return result # Only return the required result + diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_utils.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..592b66cb5ef673383c5ebd1876d672c7a4b31b3e --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humaneval_x_utils.py @@ -0,0 +1,816 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
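+# Added note: humaneval_x_utils.py collects the helpers shared by the HumanEval-X evaluator:
+# reading the gzipped problem files, language-tag/prompt handling, generation cleanup, and
+# check_correctness(), which writes each generated program into a temporary directory and
+# runs it (Python via importlib, go/js/java/cpp via subprocess) under a timeout inside a
+# separate multiprocessing.Process.
+# Illustrative pass@k usage (hypothetical counts): estimate_pass_at_k([5, 5], [2, 0], k=1)
+# returns, per task, the probability that at least one of k sampled completions passes.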
+ +import contextlib +import faulthandler +import json +import io +import itertools +import multiprocessing +import os +import sys +import shutil +import platform +import random +import signal +import subprocess +import tempfile +import builtins +import resource +import gzip +import json +from typing import Iterable, Dict +import importlib.util +from typing import Iterable, Dict, Union, List, Optional +import torch +import numpy as np +from torch.utils.data import Dataset +from transformers.generation.stopping_criteria import StoppingCriteria + +from .file_utils import safe_open + +# from atb_llm.utils.log.logging import logger + +LANGUAGE_TAG = { + "c": "// language: C", + "c++": "// language: C++", + "cpp": "// language: C++", + "c#": "// language: C#", + "csharp": "// language: C#", + "css": "/* language: CSS */", + "cuda": "// language: Cuda", + "dart": "// language: Dart", + "lua": "// language: Lua", + "objectivec": "// language: Objective-C", + "objective-c": "// language: Objective-C", + "objective-c++": "// language: Objective-C++", + "python": "# language: Python", + "perl": "# language: Perl", + "prolog": "% language: Prolog", + "swift": "// language: swift", + "lisp": "; language: Lisp", + "java": "// language: Java", + "scala": "// language: Scala", + "tex": "% language: TeX", + "vue": "", + "markdown": "", + "html": "", + "php": "// language: PHP", + "js": "// language: JavaScript", + "javascript": "// language: JavaScript", + "typescript": "// language: TypeScript", + "go": "// language: Go", + "shell": "# language: Shell", + "rust": "// language: Rust", + "sql": "-- language: SQL", + "kotlin": "// language: Kotlin", + "vb": "' language: Visual Basic", + "ruby": "# language: Ruby", + "pascal": "// language: Pascal", + "r": "# language: R", + "fortran": "!language: Fortran", + "lean": "-- language: Lean", + "matlab": "% language: Matlab", + "delphi": "{language: Delphi}", + "scheme": "; language: Scheme", + "basic": "' language: Basic", + "assembly": "; language: Assembly", + "groovy": "// language: Groovy", + "abap": "* language: Abap", + "gdscript": "# language: GDScript", + "haskell": "-- language: Haskell", + "julia": "# language: Julia", + "elixir": "# language: Elixir", + "excel": "' language: Excel", + "clojure": "; language: Clojure", + "actionscript": "// language: ActionScript", + "solidity": "// language: Solidity", + "powershell": "# language: PowerShell", + "erlang": "% language: Erlang", + "cobol": "// language: Cobol", +} + +IMPORT_HELPER = { + "python": [ + "import math", + "import re", + "import sys", + "import copy", + "import datetime", + "import itertools", + "import collections", + "import heapq", + "import statistics", + "import functools", + "import hashlib", + "import numpy", + "import numpy as np", + "import string", + "from typing import *", + "from collections import *", + ], + "go": [ + "math", + "strings", + "fmt", + "strconv", + "time", + "bytes", + "regexp", + "sort", + "math/rand", + "crypto/md5", + ], + "cpp": [ + "#include", + "#include", + "#include", + "#include", + "#include", + "#include", + "#include", + "#include", + "#include", + ], +} + + +class StoppingCriteriaWithHumanEvalX(StoppingCriteria): + def __init__(self, lang: str = None, original_input_len: int = None, tokenizer=None): + self.lang = lang + self.original_input_len = original_input_len + self.tokenizer = tokenizer + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + output_ids = input_ids[0] + if 
output_ids[-1].detach().cpu().numpy() in [self.tokenizer.eos_token_id]: + return True + text = self.tokenizer.decode(output_ids[self.original_input_len:], skip_special_tokens=False) + return is_code_generation_finished( + text, + language_type=self.lang, + dataset="humaneval", + ) + + +class HumanEvalXDataset(Dataset): + def __init__(self, task_dict): + self.task_dict = task_dict + self.keys = list(task_dict.keys()) + + def __len__(self): + return len(self.keys) + + def __getitem__(self, index): + return self.task_dict[self.keys[index]] + + +def estimate_pass_at_k( + num_samples: Union[int, List[int], np.ndarray], + num_correct: Union[List[int], np.ndarray], + k: int +) -> np.ndarray: + """ + Estimates pass@k of each problem and returns them in an array. + """ + + def estimator(n: int, c: int, k: int) -> float: + """ + Calculates 1 - comb(n - c, k) / comb(n, k). + """ + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + if isinstance(num_samples, int): + num_samples_it = itertools.repeat(num_samples, len(num_correct)) + else: + if len(num_samples) != len(num_correct): + raise RuntimeError("The lengths of num_samples and num_correct do not match.") + num_samples_it = iter(num_samples) + + return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) + + +def stream_jsonl(filename: str) -> Iterable[Dict]: + """ + Parses each jsonl line and yields it as a dictionary, assuming the file is gzipped if needed. + """ + + if filename.endswith('.gz'): + with gzip.open(filename, 'rt', encoding='utf-8') as fp: + for line in fp: + if any(not x.isspace() for x in line): + yield json.loads(line) + else: + with open(filename, 'r', encoding='utf-8') as fp: + for line in fp: + if any(not x.isspace() for x in line): + yield json.loads(line) + + +def read_dataset( + data_file: str = None, + dataset_type: str = "humaneval", + num_shot=None, +) -> Dict: + if num_shot is not None: + pass + if "humaneval" in dataset_type.lower(): + if data_file is None: + current_path = os.path.dirname(os.path.abspath(__file__)) + data_file = os.path.join(current_path, "..", "humaneval-x", "python", "data", "humaneval_python.jsonl.gz") + dataset = {task["task_id"]: task for task in stream_jsonl(data_file)} + else: + raise f"Dataset: {dataset_type} not supported." + + return dataset + + +def process_extra_prompt(prompt: str, language_type: str = None) -> str: + """ + Processes the extra prompt. + """ + language = language_type.lower() + if language in LANGUAGE_TAG: + extra_prompt = LANGUAGE_TAG[language] + "\n" + else: + extra_prompt = "" + + return extra_prompt + prompt + + +def is_code_generation_finished( + code: str, + language_type: str = None, + dataset: str = None, +): + """ + Checks whether the generated code is finished. 
+ """ + if language_type is None or dataset is None: + return False + + if "humaneval" in dataset.lower(): + if language_type.lower() == "python": + for line in code.split("\n"): + if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t': + return True + end_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint"] + for w in end_words: + if w in code: + return True + elif language_type.lower() == "java": + if code.count("{") + 1 == code.count("}"): + return True + elif language_type.lower() == "go": + if code.count("{") + 1 == code.count("}"): + return True + elif language_type.lower() == "js": + if code.count("{") + 1 == code.count("}"): + return True + elif language_type.lower() == "cpp": + if code.count("{") + 1 == code.count("}"): + return True + + return False + + +def cleanup_code( + code: str, + language_type: str = None, + dataset: str = None, +): + """ + Cleans up the generated code. + """ + if language_type is None or dataset is None: + return code + + if "humaneval" in dataset.lower(): + if language_type.lower() == "python": + end_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint", "\nassert"] + for w in end_words: + if w in code: + code = code[:code.rfind(w)] + elif language_type.lower() == "java": + main_pos = code.find("public static void main") + if main_pos != -1: + code = code[:main_pos] + '}' + if '}' in code: + code = code[:code.rfind('}')] + '}' + if code.count('{') + 1 == code.count('}'): + code += "\n}" + elif language_type.lower() == "go": + end_words = ["\n//", "\nfunc main("] + for w in end_words: + if w in code: + code = code[:code.rfind(w)] + if '}' in code: + code = code[:code.rfind('}')] + '}' + elif language_type.lower() == "cpp": + if '}' in code: + code = code[:code.rfind('}')] + '}' + elif language_type.lower() == "js": + if '}' in code: + code = code[:code.rfind('}')] + '}' + + return code + + +class TimeoutException(Exception): + pass + + +class WriteOnlyStringIO(io.StringIO): + """ StringIO that throws an exception when it's read from """ + + def read(self, *args, **kwargs): + raise IOError + + def readline(self, *args, **kwargs): + raise IOError + + def readlines(self, *args, **kwargs): + raise IOError + + def readable(self, *args, **kwargs): + """ Returns True if the IO object can be read. 
""" + return False + + +class RedirectStdin: + def __init__(self, new_target): + self._new_target = new_target + self._old_target = None + + def __enter__(self): + self._old_target = sys.stdin + sys.stdin = self._new_target + return self._new_target + + def __exit__(self, exc_type, exc_value, traceback): + sys.stdin = self._old_target + + +@contextlib.contextmanager +def time_limit(seconds: float): + def signal_handler(signum, frame): + raise TimeoutException("Timed out!") + + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + + +@contextlib.contextmanager +def chdir(root): + if root == ".": + yield + return + cwd = os.getcwd() + os.chdir(root) + try: + yield + except BaseException as exc: + raise exc + finally: + os.chdir(cwd) + + +@contextlib.contextmanager +def create_tempdir(): + original_unlink = os.unlink + original_remove = os.remove + + def get_safe_unlink(): + return getattr(shutil, '_orig_unlink', original_unlink) + + def get_safe_remove(): + return getattr(shutil, '_orig_remove', original_remove) + + try: + os.unlink = get_safe_unlink() + os.remove = get_safe_remove() + with tempfile.TemporaryDirectory() as dirname: + yield dirname + finally: + os.unlink = original_unlink + os.remove = original_remove + + +@contextlib.contextmanager +def swallow_io(): + stream = WriteOnlyStringIO() + with contextlib.redirect_stdout(stream): + with contextlib.redirect_stderr(stream): + with RedirectStdin(stream): + yield + + +def reliability_guard(maximum_memory_bytes: Optional[int] = None): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. 
+ """ + + original_unlink = os.unlink + original_remove = os.remove + try: + if maximum_memory_bytes is not None: + resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)) + resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)) + if not platform.uname().system == 'Darwin': + resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)) + except Exception as e: + pass + + faulthandler.disable() + + builtins.exit = None + builtins.quit = None + + os.environ['OMP_NUM_THREADS'] = '1' + + os.kill = None + os.system = None + os.putenv = None + os.remove = None + os.removedirs = None + os.rmdir = None + os.fchdir = None + os.setuid = None + os.fork = None + os.forkpty = None + os.killpg = None + os.rename = None + os.renames = None + os.truncate = None + os.replace = None + os.unlink = None + os.fchmod = None + os.fchown = None + os.chmod = None + os.chown = None + os.chroot = None + os.fchdir = None + os.lchflags = None + os.lchmod = None + os.lchown = None + os.getcwd = None + os.chdir = None + + shutil.move = None + shutil.chown = None + + subprocess.Popen = None # type: ignore + + __builtins__['help'] = None + + sys.modules['ipdb'] = None + sys.modules['joblib'] = None + sys.modules['resource'] = None + sys.modules['psutil'] = None + + finally: + os.unlink = original_unlink + os.remove = original_remove + + +def check_correctness( + task_id: str, + sample: dict, + config: dict, +) -> Dict: + """ + Evaluates the functional correctness of a completion by running the test + suite provided in the problem. + """ + + language_type = config.get('language_type', 'python') + timeout = config.get('timeout', 300.0) + tmp_dir = config.get('tmp_dir', None) + completion_id = config.get('completion_id', None) + go_dir = config.get('go_dir', None) + + finish = "finish" + file = "file" + output = "output" + test_code = "test_code" + passed = "passed" + timed_out = "timed out" + tmp = "tmp" + test_cpp = "test.cpp" + + def unsafe_execute(tmp_dir): + random_id = random.uniform(1, 1000) + if "python" in language_type.lower(): + with create_tempdir() as tmp_dir: + + # These system calls are needed when cleaning up tempdir. + rmtree = shutil.rmtree + rmdir = os.rmdir + change_dir = os.chdir + + # Disable functionalities that can make destructive changes to the test. + reliability_guard() + code_file = os.path.join(tmp_dir, f"test_code_{random_id}.py") + + with open(code_file, 'w') as f: + f.write(sample[test_code]) + + exec_globals = {} + try: + with swallow_io(): + with time_limit(timeout): + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. + spec = importlib.util.spec_from_file_location("test_code_module", code_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + result.append(passed) + except TimeoutException: + result.append(timed_out) + except AssertionError: + result.append("failed: AssertionError") + except BaseException as e: + result.append(f"failed: {e}") + + # Needed for cleaning up. 
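+                # Restore the functions saved above: reliability_guard() disables os.rmdir and
+                # os.chdir, and TemporaryDirectory needs them to delete itself on exit.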
+ shutil.rmtree = rmtree + os.rmdir = rmdir + os.chdir = change_dir + + elif "go" in language_type.lower(): + if tmp_dir is None: + raise RuntimeError("Go should be evaluated in a dir where necessary module files installed.") + + if tmp not in tmp_dir: + tmp_dir = os.path.join(tmp_dir, tmp) + tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}") + tmp_dir = os.path.abspath(tmp_dir) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + os.chdir(tmp_dir) + files_in_tmp_dir = os.listdir(tmp_dir) + shutil.copytree(go_dir, tmp_dir, dirs_exist_ok=True) + files_in_tmp_dir = os.listdir(tmp_dir) + safe_open("main_test.go", 'w').write(sample[test_code]) + exec_result = None + try: + # '/usr/local/go/bin/go', 'test', '-timeout=6s', 'main_test.go' + go_executable = shutil.which("go") + if go_executable is None: + raise FileNotFoundError("Go executable not found in the system PATH.") + with time_limit(timeout): + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. + exec_result = subprocess.run([go_executable, "test", f"-timeout={timeout}s", "main_test.go"], + timeout=timeout, capture_output=True) + + if exec_result.returncode == 0: + result.append(passed) + else: + if exec_result.stderr: + try: + err = exec_result.stderr.decode() + except Exception: + err = exec_result.stderr + else: + try: + err = exec_result.stdout.decode() + except Exception: + err = exec_result.stdout + result.append(f"failed: {err}") + + except TimeoutException: + result.append(timed_out) + + tmp_dir = os.path.abspath(tmp_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + elif "js" in language_type.lower(): + + if tmp not in tmp_dir: + tmp_dir = os.path.join(tmp_dir, tmp) + tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}") + tmp_dir = os.path.abspath(tmp_dir) + + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + os.chdir(tmp_dir) + safe_open("test.js", 'w').write(sample[test_code]) + exec_result = None + try: + node_executable = shutil.which("node") + if node_executable is None: + raise FileNotFoundError("Node executable not found in the system PATH.") + with time_limit(timeout): + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. 
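+                    # Run the generated test with node; any stderr or stdout output is treated as a failure below.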
+ exec_result = subprocess.run([node_executable, "test.js"], timeout=timeout, capture_output=True) + + if exec_result.stderr.decode(): + err = exec_result.stderr.decode() + result.append(f"failed: {err}") + elif exec_result.stdout.decode(): + err = exec_result.stdout.decode() + result.append(f"failed: {err}") + else: + result.append(passed) + + except TimeoutException: + result.append(timed_out) + + tmp_dir = os.path.abspath(tmp_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + elif "cpp" in language_type.lower(): + + if tmp not in tmp_dir: + tmp_dir = os.path.join(tmp_dir, tmp) + tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}") + tmp_dir = os.path.abspath(tmp_dir) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + os.chdir(tmp_dir) + safe_open(test_cpp, 'w').write(sample[test_code]) + if "162" in task_id: + gpp_executable = shutil.which("g++") + compilation_result = subprocess.run([gpp_executable, "-std=c++11", test_cpp, "-lcrypto", "-lssl"], + timeout=10, + capture_output=True) + compilation_result = subprocess.run([gpp_executable, "-std=c++11", test_cpp, "-lcrypto", "-lssl"], + timeout=10, + capture_output=True) + else: + gpp_executable = shutil.which("g++") + compilation_result = subprocess.run([gpp_executable, "-std=c++11", test_cpp], timeout=timeout, + capture_output=True) + if compilation_result.returncode != 0: + if compilation_result.stderr: + err = compilation_result.stderr.decode() + else: + err = compilation_result.stdout.decode() + result.append(f"failed: compilation error: {err}") + else: + exec_result = None + try: + with time_limit(timeout): + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. 
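+                        # Execute the a.out binary produced by g++ in this temporary directory.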
+ exec_result = subprocess.run(["./a.out"], timeout=timeout, capture_output=True) + + if exec_result.returncode == 0: + result.append(passed) + else: + if exec_result.stderr: + try: + err = exec_result.stderr.decode() + except Exception: + err = exec_result.stderr + else: + try: + err = exec_result.stdout.decode() + except Exception: + err = exec_result.stdout + result.append(f"failed: {err}") + except TimeoutException: + result.append(timed_out) + + tmp_dir = os.path.abspath(tmp_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + elif "java" in language_type.lower(): + if tmp_dir is None: + raise RuntimeError("Java should be evaluated in a temporary dir.") + + if tmp not in tmp_dir: + tmp_dir = os.path.join(tmp_dir, tmp) + tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}") + tmp_dir = os.path.abspath(tmp_dir) + + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir, exist_ok=True) + os.chdir(tmp_dir) + with safe_open(os.path.join(tmp_dir, "Main.java"), mode='w', permission_mode=0o644) as f: + f.write(sample[test_code]) + + res = "failed: unknown error" + compile_returncode = -1 + for _ in range(5): + try: + javac_executable = shutil.which("javac") + if javac_executable is None: + raise FileNotFoundError("JavaC executable not found in the system PATH.") + compilation_result = subprocess.run([javac_executable, os.path.join(tmp_dir, "Main.java")], + timeout=5, capture_output=True) + compile_returncode = compilation_result.returncode + break + except subprocess.TimeoutExpired: + continue + if compile_returncode != 0: + res = "failed: compilation error" + else: + exec_result = None + try: + java_executable = shutil.which("java") + if java_executable is None: + raise FileNotFoundError("Java executable not found in the system PATH.") + # WARNING + # This program exists to execute untrusted model-generated code. Although + # it is highly unlikely that model-generated code will do something overtly + # malicious in response to this test suite, model-generated code may act + # destructively due to a lack of model capability or alignment. + # Users are strongly encouraged to sandbox this evaluation suite so that it + # does not perform destructive actions on their host or network. 
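+                    # Run the compiled Main class with the temporary directory on the classpath.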
+ exec_result = subprocess.run([java_executable, '-cp', tmp_dir, 'Main'], + timeout=timeout, capture_output=True) + if exec_result.returncode == 0: + res = passed + elif exec_result.returncode == 1: + if "AssertionError" in exec_result.stderr.decode('unicode-escape'): + res = "failed: wrong answer" + else: + res = f"failed: {exec_result.stderr.decode()}" + except subprocess.TimeoutExpired: + res = "time out" + except BaseException as e: + res = f"failed: {e}" + result.append(res) + + tmp_dir = os.path.abspath(tmp_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + manager = multiprocessing.Manager() + result = manager.list() + + p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,)) + p.start() + p.join(timeout=timeout + 1) + if p.is_alive(): + p.kill() + + if not result: + result.append(timed_out) + + return { + "task_id": task_id, + "completion_id": completion_id, + test_code: sample[test_code], + "prompt": sample["prompt"], + "generation": sample["generation"], + "result": result[0], + passed: result[0] == passed, + finish: -1 if finish not in sample else sample[finish], + file: "" if file not in sample else sample[file], + output: [] if output not in sample else sample[output], + } diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humanevalx.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humanevalx.py new file mode 100644 index 0000000000000000000000000000000000000000..120bd7b9e36e5be8df8ca564905011554f7c5df3 --- /dev/null +++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/humanevalx/humanevalx.py @@ -0,0 +1,253 @@ +import gzip +import json +import os +import os.path as osp +import re +import subprocess +import tempfile +import time +from shutil import copyfile +from typing import Dict, Iterable + +from datasets import Dataset + +from ais_bench.benchmark.openicl.icl_evaluator import BaseEvaluator +from ais_bench.benchmark.utils import get_data_path + +from ..base import BaseDataset +from ..humaneval import humaneval_postprocess_v2 +from .humaneval_x_eval import evaluate_functional_correctness, EvalConfig + +_LANGUAGE_NAME_DICT = { + 'cpp': 'CPP', + 'go': 'Go', + 'java': 'Java', + 'js': 'JavaScript', + 'python': 'Python', + 'rust': 'Rust', +} + + +class HumanevalXDataset(BaseDataset): + + @staticmethod + def load(path, language, **kwargs): + path = get_data_path(path, local_mode=True) + assert language in _LANGUAGE_NAME_DICT.keys(), ( + f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}') + file_path = osp.join(path, f'humanevalx_{language}.jsonl.gz') + dataset = HumanevalXDataset._stream_jsonl_all(file_path) + return Dataset.from_list(dataset) + + @staticmethod + def _stream_jsonl_all(filename: str) -> Iterable[Dict]: + results = [] + if filename.endswith('.gz'): + fp = gzip.open(open(filename, 'rb'), 'rt') + else: + fp = open(filename, 'r') + for line in fp: + if any(not x.isspace() for x in line): + results.append(json.loads(line)) + fp.close() + + return results + + +import jsonlines + + +def generate_predictions_from_file(file_path, language): + predictions = [] + with jsonlines.open(file_path) as reader: + for i, obj in enumerate(reader): + task_id = obj['task_id'] + generation = obj['generation'] + # Clean up the code if necessary + # cleaned_generation = _clean_up_code(generation, language, None) # Assuming 'refer' is not needed here + predictions.append({ + 'task_id': task_id, + 'generation': generation + }) + return 
predictions
+
+
+class HumanevalXEvaluator(BaseEvaluator):
+    """Evaluator for humanevalx.
+
+    Before you use this Evaluator, launch a code eval service according
+    to the readme of https://github.com/Ezra-Yu/code-evaluator.
+    Set `ip_address` and `port` according to your environment.
+
+    Args:
+        language (str): the programming language to evaluate.
+        ip_address (str): The IP address of the HumanevalX code evaluation service.
+            Refer to https://github.com/Ezra-Yu/code-evaluator to launch a
+            code evaluation service. Defaults to 'localhost'.
+        port (int): The port of the HumanevalX code evaluation service.
+            Defaults to 5000.
+        timeout (int): Maximum wait time when accessing the service.
+            Defaults to 100.
+
+    TODO: support 'k' of pass@k. default to use k = [1, 10, 100]
+    """
+
+    def __init__(self,
+                 language,
+                 ip_address='localhost',
+                 port='',
+                 retry=2,
+                 # timeout=600) -> None:
+                 timeout=6) -> None:
+        assert language in _LANGUAGE_NAME_DICT.keys(), (
+            f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}')
+        if language == 'rust':
+            timeout *= 10  # rust needs more time
+        self.language = language
+        self.ip_address = ip_address
+        self.port = port
+        self.retry = retry
+        self.timeout = timeout
+        super().__init__()
+
+    def score(self, predictions, references, test_set):
+        prompts = [item['prompt'] for item in test_set]
+        problem_file = f'benchmark/ais_bench/datasets/humanevalx/humanevalx_{self.language}.jsonl.gz'
+        # Get the absolute path
+        problem_file = os.path.abspath(problem_file)
+        import json
+        # Define the file paths
+        go_dir = f'./benchmark/ais_bench/benchmark/datasets/humanevalx/go/evaluation'
+
+        go_dir = os.path.abspath(go_dir)
+
+        predictions = [{
+            'task_id':
+            f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
+            'generation':
+            _clean_up_code(pred, self.language, refer),
+            'prompt': f'{prompt}'
+
+        } for i, (pred, refer, prompt) in enumerate(zip(predictions, references, prompts))]
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_out_path = osp.join(tmp_dir,
+                                    f'humanevalx_{self.language}.json')
+            with open(tmp_out_path, 'w') as f:
+                for pred in predictions:
+                    f.write(json.dumps(pred) + '\n')
+
+            config = EvalConfig(input_file=tmp_out_path, timeout=self.timeout, problem_file=problem_file,
+                                go_dir=go_dir)
+
+            # Call the evaluation function to check functional correctness
+            result = evaluate_functional_correctness(config)
+
+            return result
+
+
+    def _code_eval_service(self, file_path):
+        if self.port:
+            eval_server_url = f'{self.ip_address}:{self.port}/evaluate'
+        else:
+            eval_server_url = f'{self.ip_address}/evaluate'
+        exec_result = subprocess.run([
+            'curl', '-X', 'POST', '-F', f'file=@{file_path}', '-F',
+            f'dataset=humanevalx/{self.language}', f'{eval_server_url}'
+        ],
+            timeout=self.timeout,
+            capture_output=True)
+
+        if exec_result.returncode == 0 and re.match(
+                "\"{.*:.*}\"", exec_result.stdout.decode('utf-8')):
+            return True, json.loads(exec_result.stdout.decode('utf-8'))
+        else:
+            if exec_result.stderr:
+                try:
+                    err = exec_result.stderr.decode()
+                except Exception:
+                    err = exec_result.stderr
+            else:
+                try:
+                    err = exec_result.stdout.decode()
+                except Exception:
+                    err = exec_result.stdout
+            return False, err
+
+
+def _clean_up_code(text: str, language_type: str, reference) -> str:
+    """Cleans up the generated code."""
+    try:
+        # for chatGLM related text
+        eval_text = eval(text)
+    except Exception:
+        pass
+    else:
+        if isinstance(eval_text, str):
+            text = eval_text
+    # extract code from code block
+    text = text.lstrip('\n')
+    if '```' in text:
+        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+        if len(blocks) == 0:
+            text = text.split('```')[1]  # fall back to default strategy
+        else:
+            text = blocks[0]  # fetch the first code block
+            if not text.startswith('\n'):  # in case starting with ```xxx
+                text = text[max(text.find('\n') + 1, 0):]
+    if language_type.lower() == 'python':
+        text = humaneval_postprocess_v2(text)
+        # we need to take care of the first line
+        # append extra space for first line for correct indentation
+        text = ' ' + text.lstrip()
+
+        text_splits = text.split('\n')
+        is_empty_line = False
+        ind_empty_line = None
+        for i, line in enumerate(text_splits):
+            if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
+                is_empty_line = True
+                ind_empty_line = i
+                break
+        if is_empty_line:
+            text = '\n'.join(text_splits[:ind_empty_line])
+        else:
+            end_words = [
+                '\ndef', '\nclass', '\n#', '\nassert', '\n"""', '\nprint',
+                '\nif', '\n\n\n'
+            ]
+            for w in end_words:
+                if w in text:
+                    text = text[:text.rfind(w)]
+    # strip the function head for all other languages
+    func_name = reference.strip().split('\n')[-1]
+    if func_name:
+        func_name = func_name.strip().strip('{')
+        if func_name in text:
+            text = '\n'.join(text[text.find(func_name):].split('\n')[1:])
+    if language_type.lower() == 'java':
+        main_pos = text.find('public static void main')
+        if main_pos != -1:
+            text = text[:main_pos] + '}'
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+        if text.count('{') + 1 == text.count('}'):
+            text += '\n}'
+    elif language_type.lower() == 'go':
+        if '\nfunc main(' in text:
+            text = text[:text.rfind('func main(')]
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+    elif language_type.lower() == 'cpp':
+        if '\nint main()' in text:
+            text = text[:text.rfind('int main()')]
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+    elif language_type.lower() == 'js':
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+    elif language_type.lower() == 'rust':
+        if '}' in text:
+            text = text[:text.rfind('}')] + '}'
+
+    return text
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
index 274464d26e5012c01a6f0c44732199590cc954ae..5dd7e9c6b7a6e3fdaf33071337d6e1205d6317da 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
@@ -91,4 +91,6 @@ with read_base():
 
     # winogrande
     from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_0_shot_chat_prompt import winogrande_datasets as winogrande_0_shot_chat
-    from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_5_shot_chat_prompt import winogrande_datasets as winogrande_5_shot_chat
\ No newline at end of file
+    from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_5_shot_chat_prompt import winogrande_datasets as winogrande_5_shot_chat
+
+    from ais_bench.benchmark.configs.datasets.humanevalx.humanevalx_gen_0_shot import humanevalx_datasets as humanevalx_0_shot_str
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e77484284ed541d25f62bb92ff4cf36f3f4aac5
--- /dev/null
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
index 274464d26e5012c01a6f0c44732199590cc954ae..5dd7e9c6b7a6e3fdaf33071337d6e1205d6317da 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/all_dataset_configs.py
@@ -91,4 +91,6 @@ with read_base():
 
     # winogrande
     from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_0_shot_chat_prompt import winogrande_datasets as winogrande_0_shot_chat
-    from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_5_shot_chat_prompt import winogrande_datasets as winogrande_5_shot_chat
\ No newline at end of file
+    from ais_bench.benchmark.configs.datasets.winogrande.winogrande_gen_5_shot_chat_prompt import winogrande_datasets as winogrande_5_shot_chat
+
+    from ais_bench.benchmark.configs.datasets.humanevalx.humanevalx_gen_0_shot import humanevalx_datasets as humanevalx_0_shot_str
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e77484284ed541d25f62bb92ff4cf36f3f4aac5
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/configs/api_examples/infer_api_minidie_vllm_humanevalx.py
@@ -0,0 +1,46 @@
+from mmengine.config import read_base
+from ais_bench.benchmark.models import VLLMCustomAPI
+from ais_bench.benchmark.partitioners import NaivePartitioner
+from ais_bench.benchmark.runners.local_api import LocalAPIRunner
+from ais_bench.benchmark.tasks import OpenICLInferTask
+
+with read_base():
+    # from ais_bench.benchmark.configs.datasets.collections.chat_medium import datasets
+    from ais_bench.benchmark.configs.summarizers.example import summarizer
+    from ais_bench.benchmark.configs.datasets.humanevalx.humanevalx_gen_0_shot import humanevalx_datasets  # import the dataset instances defined in the dataset config file
+
+datasets = [
+    *humanevalx_datasets,  # register the dataset instances
+]
+
+
+models = [
+    dict(
+        type=VLLMCustomAPI,  # inference backend; prefer one that supports the MindIE service. VLLMCustomAPIOld connects to the MindIE service (non-streaming), VLLMCustomAPI connects to services started by vLLM 0.6+ on GPU, and MindieStreamApi connects to the MindIE service (streaming)
+        abbr='mindie-vllm-api-humanevalx',
+        max_seq_len = 4096,
+        query_per_second = 1024,
+        rpm_verbose = False,
+        retry = 2,
+        host_ip = "90.91.56.32",  # change to the actual IP of your service
+        max_out_len = 1,
+
+        # change these to your local service
+        host_port = 9091,  # change to the actual port of your service
+        enable_ssl = False,
+        generation_kwargs = dict(
+            temperature = 0,
+            seed = 1,
+        )
+    )
+]
+
+
+infer = dict(partitioner=dict(type=NaivePartitioner),
+             runner=dict(
+                 type=LocalAPIRunner,
+                 max_num_workers=2,
+                 concurrent_users=2,
+                 task=dict(type=OpenICLInferTask)), )
+
+work_dir = 'outputs/api-mindie-vllm-humanevalx/'  # custom working directory; run results are written to a timestamped subdirectory under this path
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_run_diff_datasets.py b/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_run_diff_datasets.py
index bd6bcca0c5c274a9b091120e9153432b68e78e95..45630c39f09cffc990b4b27bcc031b6a763b12f6 100644
--- a/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_run_diff_datasets.py
+++ b/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_run_diff_datasets.py
@@ -20,6 +20,7 @@ DATASETS_CONFIGS_LIST = [
     "mgsm",
     "agieval",
     "cmmlu",
+    "humanevalx",
 ]
 
 class TestClass:
@@ -271,6 +272,48 @@ class TestClass:
         vis_md_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.md")
         assert os.path.exists(vis_md_path)
 
+
+    def test_vllm_api_all_qwen2_7b_humanevalx_0_shot(self, monkeypatch):  # test function name must be unique
+        fake_prediction = "112"  # fake inference output; the exact value is arbitrary
+        fake_time_str = "humanevalx_0_shot"  # fake timestamp; must not collide with other test cases
+        datasets_abbr_name = "humanevalx-"  # abbr prefix used in the dataset config under test
+        datasets_script_name = "humanevalx_gen_0_shot"  # file name of the dataset config under test
+        languages = ['python', 'cpp', 'go', 'java', 'js']
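+        # Test strategy: monkeypatch stubs out the vLLM API backend (model path
+        # lookup and generation) and the run timestamp, so main() runs the full
+        # infer/eval/summarize pipeline without a live service; afterwards the
+        # per-language prediction, result, and summary artifacts are checked.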
+
+        monkeypatch.setattr('sys.argv',
+                            ["ais_bench", "--models", "vllm_api_general", "--datasets", datasets_script_name,
+                             "--summarizer", "example", "--mode", "all", "-w", self.test_data_path])
+        monkeypatch.setattr("ais_bench.benchmark.models.vllm_custom_api.VLLMCustomAPI._get_service_model_path", lambda *arg: "qwen2")
+        monkeypatch.setattr("ais_bench.benchmark.models.vllm_custom_api.VLLMCustomAPI._generate", lambda *arg: fake_prediction)
+        monkeypatch.setattr("ais_bench.benchmark.cli.main.get_current_time_str", lambda *arg: fake_time_str)
+        main()
+
+        for lang in languages:
+            curr_datasets_abbr_name = datasets_abbr_name + lang
+            # check infer out
+            infer_outputs_json_path = os.path.join(self.test_data_path, f"{fake_time_str}/predictions/vllm-api-general/{curr_datasets_abbr_name}.json")
+            assert os.path.exists(infer_outputs_json_path)
+            with open(infer_outputs_json_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                assert data.get("0").get("prediction") == fake_prediction
+
+            # check eval out
+            results_json_path = os.path.join(self.test_data_path, f"{fake_time_str}/results/vllm-api-general/{curr_datasets_abbr_name}.json")
+            assert os.path.exists(results_json_path)
+            with open(results_json_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                assert data.get("accuracy") is not None
+
+            # check vis
+            vis_csv_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.csv")
+            assert os.path.exists(vis_csv_path)
+            vis_txt_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.txt")
+            assert os.path.exists(vis_txt_path)
+            vis_md_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.md")
+            assert os.path.exists(vis_md_path)
+
+
     def test_vllm_api_all_qwen2_7b_math500_0_shot(self, monkeypatch):
         fake_prediction = "11"
         fake_time_str = "math500_0_shot"
@@ -1297,4 +1340,7 @@ class TestClass:
         vis_txt_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.txt")
         assert os.path.exists(vis_txt_path)
         vis_md_path = os.path.join(self.test_data_path, f"{fake_time_str}/summary/summary_{fake_time_str}.md")
-        assert os.path.exists(vis_md_path)
\ No newline at end of file
+        assert os.path.exists(vis_md_path)
+
+