From f28a7e6f314a7aa8ce9552e8fc5a8c1b556f5a8a Mon Sep 17 00:00:00 2001
From: lijiaojiao
Date: Mon, 30 Jun 2025 20:23:09 +0800
Subject: [PATCH] [feature][AISBench] Multi-turn conversation performance evaluation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../ais_bench/benchmark/clients/__init__.py   |   3 +-
 .../benchmark/clients/base_client.py          |   4 +-
 .../openai_chat_stream_sglang_client.py       | 108 ++++++++++
 .../configs/datasets/sharegpt/sharegpt_gen.py |  36 ++++
 .../vllm_api_stream_chat_multiturn.py         |  27 +++
 .../ais_bench/benchmark/datasets/__init__.py  |   1 +
 .../ais_bench/benchmark/datasets/sharegpt.py  | 100 +++++++++
 .../ais_bench/benchmark/models/__init__.py    |   3 +-
 .../models/vllm_custom_api_chat_multiturn.py  | 193 ++++++++++++++++++
 .../benchmark/openicl/icl_prompt_template.py  |   5 +
 .../ais_bench/benchmark/runners/local_api.py  |   7 +-
 .../ais_bench/benchmark/tasks/openicl_perf.py |   7 +-
 .../ais_bench/benchmark/utils/build.py        |   2 +-
 .../tests/ST/test_datasets/test_sharegpt.py   |  91 +++++++++
 14 files changed, 577 insertions(+), 10 deletions(-)
 create mode 100644 ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/openai_chat_stream_sglang_client.py
 create mode 100644 ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/sharegpt/sharegpt_gen.py
 create mode 100644 ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat_multiturn.py
 create mode 100644 ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/sharegpt.py
 create mode 100644 ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/models/vllm_custom_api_chat_multiturn.py
 create mode 100644 ais-bench_workload/experimental_tools/benchmark/tests/ST/test_datasets/test_sharegpt.py

diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/__init__.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/__init__.py
index d4c391a5..be54c52f 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/__init__.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/__init__.py
@@ -8,4 +8,5 @@ from ais_bench.benchmark.clients.tgi_text_client import TGITextClient
 from ais_bench.benchmark.clients.triton_text_client import TritonTextClient
 from ais_bench.benchmark.clients.openai_text_client import OpenAITextClient
 from ais_bench.benchmark.clients.openai_chat_text_client import OpenAIChatTextClient
-from ais_bench.benchmark.clients.vllm_text_client import VLLMTextClient
\ No newline at end of file
+from ais_bench.benchmark.clients.vllm_text_client import VLLMTextClient
+from ais_bench.benchmark.clients.openai_chat_stream_sglang_client import OpenAIChatStreamSglangClient
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/base_client.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/base_client.py
index 2cabfca3..6d23576d 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/base_client.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/base_client.py
@@ -179,7 +179,9 @@ class BaseClient(ABC):
                 raise_error(f"Error processing stream response: {e}", self.lock, self.request_counter)
             except HTTPError as e:
                 raise_error(f"HTTP error during stream response processing: {e}.", self.lock, self.request_counter)
-
+            except Exception as e:
+                raise_error(f"Other error during stream response processing: {e}.", self.lock, self.request_counter)
+
         self.rev_count()
         self.update_request_time(inputs, start_time)
         return "".join(response)
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/openai_chat_stream_sglang_client.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/openai_chat_stream_sglang_client.py
new file mode 100644
index 00000000..158e9eb6
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/clients/openai_chat_stream_sglang_client.py
@@ -0,0 +1,108 @@
+import json
+import time
+import re
+from abc import abstractmethod, ABC
+
+from ais_bench.benchmark.clients.base_client import BaseStreamClient, _stream_data_split
+from ais_bench.benchmark.utils import MiddleData
+from ais_bench.benchmark.registry import CLIENTS
+from ais_bench.benchmark.utils.valid_global_consts import valid_max_chunk_size
+
+
+@CLIENTS.register_module()
+class OpenAIChatStreamSglangClient(BaseStreamClient, ABC):
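+    # Note (editorial comment): an SGLang service may pack several SSE "data:" events
+    # into one read chunk; preprocess_cur_line below merges such blocks into a single
+    # JSON payload (concatenating "choices" and keeping the latest "usage") before parsing.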
+    def preprocess_cur_line(self, cur_line: str) -> str:
+        if "\ndata" in cur_line:
+            end_ix = cur_line.find("data: [DONE]")
+            cur_line = cur_line if end_ix < 0 else cur_line[:end_ix]
+            data_blocks = cur_line.strip().split('\n\n')
+            print(f"{data_blocks=}")
+            merged_data = None
+            for block in data_blocks:
+                # strip the "data: " prefix and parse the JSON payload
+                json_str = block.replace('data: ', '')
+                data = json.loads(json_str)
+
+                # initialize merged_data with the first block
+                if merged_data is None:
+                    merged_data = data
+                else:
+                    # merge the choices of subsequent blocks
+                    merged_data['choices'].extend(data['choices'])
+                    if data.get("usage"):
+                        merged_data["usage"] = data["usage"]
+            print(f"{merged_data=}")
+            return json.dumps(merged_data)
+        else:
+            end_ix = cur_line.find("data: [DONE]")
+            return cur_line if end_ix < 0 else cur_line[:end_ix]
+
+    def construct_request_body(
+        self,
+        inputs: list,
+        parameters: dict = None,
+    ) -> dict:
+        data = dict(
+            stream = True,
+            messages = inputs,
+        )
+        data = data | parameters
+        data["stream_options"] = {"include_usage": True}
+        return data
+
+    def process_stream_line(self, json_content: dict) -> dict:
+        response = {}
+        generated_text = ""
+        for item in json_content["choices"]:
+            if item["delta"]["content"]:  # content may be null in the sglang service
+                generated_text += item["delta"]["content"]
+        if generated_text:
+            response.update({"generated_text": generated_text})
+            if self.do_performance:
+                response.update({"token_str": generated_text})
+        if json_content.get("usage"):
+            response.update({"completion_tokens": json_content["usage"]["completion_tokens"]})
+        return response
+
+    def update_middle_data(self, res: dict, inputs: MiddleData):
+        generated_text = res.get("generated_text", "")
+        if generated_text:
+            inputs.output += generated_text
+            inputs.num_generated_chars = len(inputs.output)
+        prefill_time = res.get("prefill_time")
+        if prefill_time:
+            inputs.prefill_latency = prefill_time
+        decode_time = res.get("decode_time")
+        if decode_time:
+            inputs.decode_cost.append(decode_time)
+        chunk_time_point = res.get("chunk_time_point")
+        if chunk_time_point:
+            inputs.chunk_time_point_list.append(chunk_time_point)
+        if res.get("completion_tokens"):
+            inputs.num_generated_tokens = res.get("completion_tokens")
+        return generated_text
+
+    def process_response(self, response, last_time_point):
+        time_name = "prefill_time"
+        for byte_line in response.stream(amt=valid_max_chunk_size()):
+
+            if byte_line == b"\n":
+                print(f"{byte_line=}")
+                continue
+            cur_line = self.preprocess_cur_line(byte_line.decode())
+            try:
+                for json_content in _stream_data_split(cur_line):
+                    cur_time_point = time.perf_counter()
+                    response_dict = self.process_stream_line(json_content)
+                    if not response_dict.get("generated_text") and not response_dict.get("completion_tokens"):  # the first chunk may carry neither text nor usage; skip it
+                        continue
+                    if time_name not in response_dict.keys():
+                        response_dict[time_name] = (
+                            cur_time_point - last_time_point
+                        ) * 1000
+                    response_dict["chunk_time_point"] = cur_time_point * 1000
+                    yield response_dict
+                    time_name = "decode_time"
+                    last_time_point = time.perf_counter()
+            except Exception as error:
+                raise ValueError(f"[StreamResponseError] {error}! Raw server response: {cur_line}")
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/sharegpt/sharegpt_gen.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/sharegpt/sharegpt_gen.py
new file mode 100644
index 00000000..ec265d21
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/datasets/sharegpt/sharegpt_gen.py
@@ -0,0 +1,36 @@
+from ais_bench.benchmark.openicl.icl_prompt_template import PromptTemplate
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.datasets import ShareGPTDataset, ShareGPTEvaluator, math_postprocess_v2
+
+
+sharegpt_reader_cfg = dict(
+    input_columns=['human'],
+    output_column='gpt'
+)
+
+
+sharegpt_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={'type': 'conversations', 'prompt': "human"}
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+sharegpt_eval_cfg = dict(
+    evaluator=dict(type=ShareGPTEvaluator)
+)
+
+sharegpt_datasets = [
+    dict(
+        abbr='sharegpt',
+        type=ShareGPTDataset,
+        disable_shuffle=True,
+        path='aisbench/datasets/sharegpt/ShareGPT_V3_unfiltered_cleaned_split.json',  # dataset path; a relative path is resolved against the source root, absolute paths are also supported
+        reader_cfg=sharegpt_reader_cfg,
+        infer_cfg=sharegpt_infer_cfg,
+        eval_cfg=sharegpt_eval_cfg
+    )
+]
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat_multiturn.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat_multiturn.py
new file mode 100644
index 00000000..7400c611
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat_multiturn.py
@@ -0,0 +1,27 @@
+from ais_bench.benchmark.models import VllmMultiturnAPIChatStream
+from ais_bench.benchmark.clients import OpenAIChatStreamClient, OpenAIChatStreamSglangClient
+
+models = [
+    dict(
+        attr="service",
+        type=VllmMultiturnAPIChatStream,
+        abbr='vllm-multiturn-api-chat-stream',
+        path="",
+        model="",
+        request_rate = 0,
+        retry = 2,
+        host_ip = "localhost",
+        host_port = 8080,
+        max_out_len = 512,
+        batch_size=1,
+        trust_remote_code=False,
+        custom_client=dict(type=OpenAIChatStreamClient),  # use OpenAIChatStreamSglangClient when targeting an SGLang service
+        generation_kwargs = dict(
+            temperature = 0.5,
+            top_k = 10,
+            top_p = 0.95,
+            seed = None,
+            repetition_penalty = 1.03,
+        )
+    )
+]
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
index 68a6a5df..1cc5cc2c 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/__init__.py
@@ -33,3 +33,4 @@ from ais_bench.benchmark.datasets.lambada import *  # noqa: F401, F403
 from ais_bench.benchmark.datasets.lcsts import *  # noqa: F401, F403
 from ais_bench.benchmark.datasets.siqa import *  # noqa: F401, F403
 from ais_bench.benchmark.datasets.xsum import *  # noqa: F401, F403
+from ais_bench.benchmark.datasets.sharegpt import *  # noqa: F401, F403
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/sharegpt.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/sharegpt.py
new file mode 100644
index 00000000..acbf6c51
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/datasets/sharegpt.py
@@ -0,0 +1,100 @@
+import json
+import os
+import re
+from os import environ
+import random
+from pathlib import Path
+
+from datasets import Dataset, DatasetDict
+
+from ais_bench.benchmark.openicl import BaseEvaluator
+from ais_bench.benchmark.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+from ais_bench.benchmark.utils import get_data_path
+from ais_bench.benchmark.utils.logging import get_logger
+from ais_bench.benchmark.utils.tokenizer import HuggingfaceTokenizer
+
+from .base import BaseDataset
+
+MIN_PROMPT_LEN = 4
+MIN_OUTPUT_LEN = 4
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+@LOAD_DATASET.register_module()
+class ShareGPTDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, disable_shuffle, **kwargs):
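+        # Expected record layout (assumption, inferred from the parsing below): each entry looks like
+        # {"id": ..., "conversations": [{"from": "human", "value": ...}, {"from": "gpt", "value": ...}, ...]}
+        # and is regrouped into parallel "human"/"gpt" turn lists for the multi-turn inferencer.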
+        tokenizer_path = kwargs.get("model_path", None)
+        try:
+            tokenizer = HuggingfaceTokenizer(tokenizer_path)
+        except Exception as e:
+            raise ValueError(f"Load tokenizer failed: {e}. Please check your model path in the api configs!")
+        path = get_data_path(path, local_mode=True)
+        with open(path) as f:
+            dataset = json.load(f)
+        # Filter out the conversations with less than 2 turns.
+        dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+        cnt_turn = 0
+        logger = get_logger()
+        new_dataset = []
+        for data in dataset:
+            if len(data["conversations"]) % 2 != 0:
+                continue
+            if data["conversations"][0]["from"] != "human":
+                continue
+            chat = {"human": [], "gpt": []}
+            chat['id'] = data['id']
+            total_len = len(data["conversations"])
+            cnt_turn += total_len
+            for i in range(0, total_len, 2):
+                # one human turn followed by one assistant turn
+                chat['human'].append(data["conversations"][i]["value"])
+                try:
+                    output_len = len(tokenizer.encode(data["conversations"][i + 1]["value"]))
+                except Exception:
+                    output_len = None
+                chat['gpt'].append({"data": data["conversations"][i + 1]["value"], "output_len": output_len})
+            new_dataset.append(chat)
+        logger.info(f"Number of conversations: {len(dataset)}; Number of requests: {cnt_turn // 2}")
+        if not disable_shuffle:
+            # Shuffle the dataset.
+            random.shuffle(new_dataset)
+
+        return Dataset.from_list(new_dataset)
+
+class ShareGPTEvaluator(BaseEvaluator):
+
+    def find_choice(self, result):
+        choose_map = {
+            "A": "laughter",
+            "B": "sigh",
+            "C": "cough",
+            "D": "throatclearing",
+            "E": "sneeze",
+            "F": "sniff"
+        }
+        if result in choose_map.keys():
+            return choose_map[result]
+        else:
+            return ""
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for i, j in zip(predictions, references):
+            detail = {'pred': i, 'answer': j, 'correct': False}
+            if len(i) > 1:
+                i = self.find_choice(i[0])
+            count += 1
+            if i == j:
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/models/__init__.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/models/__init__.py
index 1bc06d51..fb0f756b 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/models/__init__.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/models/__init__.py
@@ -7,4 +7,5 @@ from ais_bench.benchmark.models.mindie_llm_api import MindieLLMModel
 from ais_bench.benchmark.models.huggingface import HuggingFace, HuggingFaceCausalLM
 from ais_bench.benchmark.models.huggingface_above_v4_33 import HuggingFaceBaseModel, HuggingFacewithChatTemplate
 from ais_bench.benchmark.models.tgi_api import TGICustomAPI, TGICustomAPIStream
-from ais_bench.benchmark.models.triton_api import TritonCustomAPI, TritonCustomAPIStream
\ No newline at end of file
+from ais_bench.benchmark.models.triton_api import TritonCustomAPI, TritonCustomAPIStream
+from ais_bench.benchmark.models.vllm_custom_api_chat_multiturn import VllmMultiturnAPIChatStream
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/models/vllm_custom_api_chat_multiturn.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/models/vllm_custom_api_chat_multiturn.py
new file mode 100644
index 00000000..c7864d56
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/models/vllm_custom_api_chat_multiturn.py
@@ -0,0 +1,193 @@
+import os
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, List, Optional, Union, Tuple
+
+from tqdm import tqdm
+
+from openai import OpenAI
+
+from ais_bench.benchmark.registry import MODELS
+from ais_bench.benchmark.utils.prompt import PromptList, is_mm_prompt
+
+from ais_bench.benchmark.models.base_api import BaseAPIModel, handle_synthetic_input
+from ais_bench.benchmark.models.performance_api import PerformanceAPIModel
+from ais_bench.benchmark.clients import OpenAIChatStreamClient, OpenAIChatTextClient, OpenAIChatStreamSglangClient
+from ais_bench.benchmark.utils.results import MiddleData
+from ais_bench.benchmark.utils.build import build_client_from_cfg
+
+PromptType = Union[PromptList, str, dict]
+
+
+@MODELS.register_module()
+class VllmMultiturnAPIChatStream(PerformanceAPIModel):
+    """Multi-turn model wrapper for OpenAI-compatible chat services (vLLM/SGLang).
+
+    Args:
+        max_seq_len (int): The maximum allowed sequence length of a model.
+            Note that the length of prompt + generated tokens shall not exceed
+            this value. Defaults to 4096.
+        request_rate (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 1.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case meta instructions have to be
+            injected or wrapped around the prompt.
+        host_ip (str): The host ip of the custom service. Defaults to "localhost".
+        host_port (int): The host port of the custom service. Defaults to 8080.
+        enable_ssl (bool, optional): Whether to call the service over HTTPS. Defaults to False.
+    """
+
+    is_api: bool = True
+    is_chat_api: bool = True
+
+    def __init__(self,
+                 path,
+                 model: str = "",
+                 max_seq_len: int = 4096,
+                 request_rate: int = 1,
+                 rpm_verbose: bool = False,
+                 retry: int = 2,
+                 meta_template: Optional[Dict] = None,
+                 verbose: bool = False,
+                 host_ip: str = "localhost",
+                 host_port: int = 8080,
+                 enable_ssl: bool = False,
+                 custom_client = dict(type=OpenAIChatStreamClient),
+                 generation_kwargs: Optional[Dict] = None,
+                 trust_remote_code: bool = False):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         meta_template=meta_template,
+                         request_rate=request_rate,
+                         rpm_verbose=rpm_verbose,
+                         retry=retry,
+                         generation_kwargs=generation_kwargs,
+                         verbose=verbose,
+                         trust_remote_code=trust_remote_code)
+        self.host_ip = host_ip
+        self.host_port = host_port
+        self.enable_ssl = enable_ssl
+        self.base_url = self._get_base_url()
+        self.endpoint_url = os.path.join(self.base_url, "chat/completions")
+        self.model = model if model else self._get_service_model_path()
+        self.init_client(custom_client)
+        self.is_multi_modal = False
+
+    def init_client(self, custom_client):
+        if not isinstance(custom_client, dict):
+            self.logger.warning(f"Value of custom_client: {custom_client} is not a dict! Falling back to the default client.")
+            custom_client = dict(type=OpenAIChatStreamClient)
+        custom_client['url'] = self.endpoint_url
+        custom_client['retry'] = self.retry
+        self.client = build_client_from_cfg(custom_client)
+
+    def encode_input(self, prompt: list) -> Tuple[float, List[int]]:
+        """Encode a string into tokens, measuring processing time."""
+        prompt = prompt[:-1]
+        if not self.tokenizer:
+            self.logger.error("Tokenizer is not initialized.")
+            return 0.0, []
+
+        assert len(prompt) > 0 and isinstance(prompt[0], dict)
+        if "content" in prompt[0] and isinstance(prompt[0]['content'], list):
+            self.logger.warning("Input type: expected a string, got a list; InputTokens will be 0.")
+            return 0.0, []
+
+        messages = self.tokenizer.tokenizer.tokenizer_model.apply_chat_template(prompt, add_generation_prompt=True, tokenize=False)
+        time_start = time.perf_counter()
+        tokens = self.tokenizer.encode(messages)
+        time_cost = (time.perf_counter() - time_start) * 1000  # Convert to milliseconds
+        return time_cost, tokens
+
+    def _input_decode(self, tokens: List):
+        if not self.tokenizer:
+            self.logger.error("Tokenizer is not initialized.")
+            return []
+        return self.tokenizer.decode(tokens)
+
+    def prepare_input_data(self, inputs: list, data_id: int = -1) -> MiddleData:
+        """Prepare input data, tokenize if performance mode is enabled."""
+        rrid = uuid.uuid4().hex
+        cache_data = self.result_cache[rrid]
+        cache_data.data_id = data_id
+        cache_data.request_id = rrid
+        cache_data.input_data = inputs
+        return cache_data
+
+    def generate(self,
+                 inputs: List[PromptType],
+                 max_out_len: int = 512,
+                 **kwargs) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Args:
+            inputs (List[PromptType]): A list of strings or PromptDicts.
+                The PromptDict should be organized in AISBench's
+                API format.
+            max_out_len (int): The maximum length of the output.
+
+        Returns:
+            List[str]: A list of generated strings.
+        """
+        batch_size = kwargs.get("batch_size", len(inputs))
+        with ThreadPoolExecutor(max_workers=batch_size) as executor:
+            results = list(
+                tqdm(executor.map(self._generate, inputs,
+                                  [max_out_len] * len(inputs)),
+                     total=len(inputs),
+                     desc='Inferencing'))
+        return results
+
+    @handle_synthetic_input
+    def _generate(self, input: PromptType, max_out_len: int) -> str:
+        """Generate a result given a single input.
+
+        Args:
+            input (PromptType): A string or PromptDict.
+                The PromptDict should be organized in AISBench's
+                API format.
+            max_out_len (int): The maximum length of the output.
+
+        Returns:
+            str: The generated string.
+        """
+        if isinstance(input, dict):
+            data_id = input.get('data_id')
+            gold = input.get('gold')
+            input = input.get('prompt')
+            assert len(gold) == len(input)
+        else:
+            data_id = -1
+        if max_out_len <= 0:
+            return ''
+        history = []
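+        # Multi-turn protocol (editorial note, the server is assumed stateless): each turn
+        # sends the full accumulated history plus the new user message, and the assistant
+        # reply returned by the client is appended to the history before the next turn.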
+        for i in range(len(input)):
+            if history:  # throttle by request_rate between consecutive turns
+                self.acquire()
+            messages = [{'role': 'user', 'content': input[i]}]
+            messages = history + messages
+            history = messages
+
+            if gold[i].get("output_len"):
+                self.generation_kwargs.update({"max_tokens": gold[i].get("output_len")})
+            else:
+                self.generation_kwargs.update({"max_tokens": max_out_len})
+            self.generation_kwargs.update({"model": self.model})
+            cache_data = self.prepare_input_data(messages, data_id)
+            response = self.client.request(cache_data, self.generation_kwargs)
+            history.append({'role': 'assistant', 'content': response})
+            self.set_result(cache_data)
+
+        return ''.join(response)
+
+    def _get_base_url(self):
+        if self.enable_ssl:
+            return f"https://{self.host_ip}:{self.host_port}/v1/"
+        return f"http://{self.host_ip}:{self.host_port}/v1/"
+
+    def _get_service_model_path(self):
+        client = OpenAI(api_key="EMPTY", base_url=self.base_url)
+        return client.models.list().data[0].id
\ No newline at end of file
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/openicl/icl_prompt_template.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/openicl/icl_prompt_template.py
index b41e09ad..e76572f9 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/openicl/icl_prompt_template.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/openicl/icl_prompt_template.py
@@ -256,6 +256,11 @@ class PromptTemplate:
                 template.append({'type': 'text', 'text': self.template['prompt']})
             return template
 
+        # multi-turn conversations
+        elif isinstance(self.template, dict) and 'type' in self.template.keys() and self.template['type'] == 'conversations':
+            template = entry['human']
+            return template
+
         elif self.prompt_type == 'origin':
             # This if is only effective when you are using GenInferecner
             # with multi-label prompts.
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/runners/local_api.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/runners/local_api.py
index c779ae73..5a2fa4c8 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/runners/local_api.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/runners/local_api.py
@@ -16,7 +16,7 @@ from tqdm import tqdm
 from ais_bench.benchmark.registry import RUNNERS, TASKS
 from ais_bench.benchmark.tasks import OpenICLInferTask, OpenICLPerfTask, OpenICLInferMergedTask
 from ais_bench.benchmark.tasks.base import BaseTask
-from ais_bench.benchmark.utils import (build_dataset_from_cfg, build_synthetic_dataset_from_cfg,
+from ais_bench.benchmark.utils import (build_dataset_from_cfg, build_dataset_from_cfg_with_model_path,
                                        build_model_from_cfg, get_infer_output_path,
                                        get_logger, task_abbr_from_cfg)
 
@@ -38,8 +38,9 @@ def monkey_run_perf(self):
     self.model_cfg = model_cfg
     self.dataset_cfg = dataset_cfg
     self.infer_cfg = self.dataset_cfg["infer_cfg"]
-    if self.dataset_cfg.get('type', None) == "ais_bench.benchmark.datasets.SyntheticDataset":
-        self.dataset = build_synthetic_dataset_from_cfg(self.dataset_cfg, self.model_cfg)
+    if self.dataset_cfg.get('type', None) in ["ais_bench.benchmark.datasets.SyntheticDataset",
+                                              "ais_bench.benchmark.datasets.ShareGPTDataset"]:
+        self.dataset = build_dataset_from_cfg_with_model_path(self.dataset_cfg, self.model_cfg)
     else:
         self.dataset = build_dataset_from_cfg(self.dataset_cfg)
     self.build_inference()
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/tasks/openicl_perf.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/tasks/openicl_perf.py
index 67ab7fcb..19237c05 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/tasks/openicl_perf.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/tasks/openicl_perf.py
@@ -18,7 +18,7 @@ from ais_bench.benchmark.registry import (
 from ais_bench.benchmark.tasks.base import BaseTask
 from ais_bench.benchmark.utils import (
     build_dataset_from_cfg,
-    build_synthetic_dataset_from_cfg,
+    build_dataset_from_cfg_with_model_path,
     build_model_from_cfg,
     get_infer_output_path,
     get_perf_output_path,
@@ -110,8 +110,9 @@ class OpenICLPerfTask(BaseTask):
         self.model_cfg = model_cfg
         self.dataset_cfg = dataset_cfg
         self.infer_cfg = self.dataset_cfg["infer_cfg"]
-        if self.dataset_cfg.get('type', None) == "ais_bench.benchmark.datasets.SyntheticDataset":
-            self.dataset = build_synthetic_dataset_from_cfg(self.dataset_cfg, self.model_cfg)
+        if self.dataset_cfg.get('type', None) in ["ais_bench.benchmark.datasets.SyntheticDataset",
+                                                  "ais_bench.benchmark.datasets.ShareGPTDataset"]:
+            self.dataset = build_dataset_from_cfg_with_model_path(self.dataset_cfg, self.model_cfg)
         else:
             self.dataset = build_dataset_from_cfg(self.dataset_cfg)
         self.build_inference()
diff --git a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/utils/build.py b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/utils/build.py
index d3f99a36..7482ab0a 100644
--- a/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/utils/build.py
+++ b/ais-bench_workload/experimental_tools/benchmark/ais_bench/benchmark/utils/build.py
@@ -66,7 +66,7 @@ def build_dataset_from_cfg(dataset_cfg: ConfigDict):
     return LOAD_DATASET.build(dataset_cfg)
 
 
-def build_synthetic_dataset_from_cfg(dataset_cfg: ConfigDict, model_cfg: ConfigDict):
+def build_dataset_from_cfg_with_model_path(dataset_cfg: ConfigDict, model_cfg: ConfigDict):
     dataset_cfg = copy.deepcopy(dataset_cfg)
     dataset_cfg.pop('infer_cfg', None)
     dataset_cfg.pop('eval_cfg', None)
diff --git a/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_datasets/test_sharegpt.py b/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_datasets/test_sharegpt.py
new file mode 100644
index 00000000..039127fe
--- /dev/null
+++ b/ais-bench_workload/experimental_tools/benchmark/tests/ST/test_datasets/test_sharegpt.py
@@ -0,0 +1,91 @@
+import os
+import json
+import shutil
+import sys
+import pytest
+import pandas as pd
+from ais_bench.benchmark.cli.main import main
+
+
+class Response:
+    def __init__(self):
+        data = {'choices': [{"text": "11"}]}
+        self.data = f"{json.dumps(data)}".encode()
+
+
+class TestClass:
+    @classmethod
+    def setup_class(cls):
+        """
+        class level setup_class
+        """
+        cls.init(TestClass)
+
+    @classmethod
+    def teardown_class(cls):
+        print('\n ---class level teardown_class')
+
+    def init(self):
+        self.cur_dir = os.path.dirname(os.path.abspath(__file__))
+        self.test_data_path = os.path.abspath(os.path.join(self.cur_dir, "../testdatas"))
+        if os.path.exists(self.test_data_path):
+            shutil.rmtree(self.test_data_path)
+        os.makedirs(self.test_data_path)
+        self.perf_json_keys = ['Benchmark Duration', 'Total Requests', 'Failed Requests', 'Success Requests',
+                               'Concurrency', 'Max Concurrency', 'Request Throughput', 'Total Input Tokens',
+                               'Prefill Token Throughput', 'Total generated tokens', 'Input Token Throughput',
+                               'Output Token Throughput', 'Total Token Throughput']
+        self.perf_csv_headers = ['Performance Parameters', 'Average', 'Min', 'Max',
+                                 'Median', 'P75', 'P90', 'P99', 'N']
+        self.perf_csv_params = ['E2EL', 'TTFT', 'TPOT', 'InputTokens', 'OutputTokens', 'OutputTokenThroughput']
+
+    def test_vllm_api_stream_chat_multiturn_perf_sharegpt(self, monkeypatch):
+        fake_prediction = [{'id': 0, 'input_data': 'A A', 'input_token_id': [32, 362, 362],
+                            'output': ' A A A', 'output_token_id': [362, 362, 362],
+                            'prefill_latency': 56.9,
+                            'decode_token_latencies': [26.4, 28.4], 'last_decode_latency': 28.4,
+                            'decode_max_token_latency': 28.4, 'seq_latency': 2700.04,
+                            'input_tokens_len': 2, 'generate_tokens_len': 3,
+                            'generate_tokens_speed': 37.03, 'input_characters_len': 3,
+                            'generate_characters_len': 6, 'characters_per_token': 2.0,
+                            'prefill_batch_size': 0, 'decode_batch_size': [], 'queue_wait_time': [],
+                            'request_id': '591c69416c694a6ab3194a06d6e1ed17',
+                            'start_time': 1742952029.5993671, 'end_time': 1742952032.299417,
+                            'is_success': True, 'is_empty': False}]
+        fake_time_str = "sharegpt_fake_time"
+        datasets_abbr_name = "sharegptdataset"
+        datasets_script_name = "sharegpt_gen"
+
+        monkeypatch.setattr('sys.argv',
+                            ["ais_bench", "--models", "vllm_api_stream_chat_multiturn", "--datasets", datasets_script_name,
+                             "--mode", "perf", "-w", self.test_data_path])
+        monkeypatch.setattr("ais_bench.benchmark.openicl.icl_inferencer.icl_gen_perf_inferencer.GenPerfInferencer.inference_with_multi_process", lambda *arg, **xargs: fake_prediction)
+        monkeypatch.setattr("ais_bench.benchmark.tasks.openicl_perf.OpenICLPerfTask.set_performance_api", lambda *arg: True)
+        monkeypatch.setattr("ais_bench.benchmark.models.vllm_custom_api_chat_multiturn.VllmMultiturnAPIChatStream._get_service_model_path", lambda *arg: "qwen2")
+        monkeypatch.setattr("urllib3.PoolManager.request", lambda *args, **kwargs: Response())
+        monkeypatch.setattr("ais_bench.benchmark.cli.main.get_current_time_str", lambda *arg: fake_time_str)
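+        # With the inferencer, perf hooks, model lookup, HTTP layer and timestamp stubbed out
+        # above, main() is expected to exercise only ShareGPT dataset loading and perf-report
+        # generation, so no running vLLM/SGLang service is needed for this test.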
monkeypatch.setattr("ais_bench.benchmark.cli.main.get_current_time_str", lambda *arg: fake_time_str) + main() + + # check perf json + infer_outputs_json_path = os.path.join(self.test_data_path, f"{fake_time_str}/performances/vllm-multiturn-api-chat-stream/{datasets_abbr_name}.json") + assert os.path.exists(infer_outputs_json_path) + with open(infer_outputs_json_path, 'r') as file: + data = json.load(file) + assert isinstance(data, dict) + for key in self.perf_json_keys: + assert key in data + assert data['Total Requests']['total'] == len(fake_prediction) + + #check perf csv + infer_outputs_csv_path = os.path.join(self.test_data_path, f"{fake_time_str}/performances/vllm-multiturn-api-chat-stream/{datasets_abbr_name}.csv") + assert os.path.exists(infer_outputs_csv_path) + data = pd.read_csv(infer_outputs_csv_path) + for header in self.perf_csv_headers: + assert header in data.columns + first_column = data.iloc[:,0] + for param in self.perf_csv_params: + assert param in first_column.values + + assert data.loc[data['Performance Parameters'] == 'ITL', 'Max'].values[0] == str(fake_prediction[0]['decode_max_token_latency']) + ' ms' + assert data.loc[data['Performance Parameters'] == 'OutputTokenThroughput', 'Average'].values[0] == str(fake_prediction[0]['generate_tokens_speed']) + ' token/s' \ No newline at end of file -- Gitee