diff --git a/README.md b/README.md
index 3c7ed10f49be55e7d68ed97a068cc36487ef35ca..72ab8e0725d5727c371ad13387716666c691fcc4 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ python main.py gen-cases --config xxx/mcp-config.json
 ### 5. Run the test cases and validate the results
 ```bash
 # Run the validation command
-python main.py val-cases --config xxx/mcp-config.json --testpath ./logs/perf_mcp_2025-09-11T07-31-04-418670
+python main.py val-cases --config xxx/mcp-config.json --testpath ./logs/perf_mcp_2025-09-11T07-31-04-418670/testcases.json
 ```
 - `--testpath`: path to the testcases.json file generated in step 4;
 - Results: the execution results are saved back into the test-case directory produced in step 4.
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..23b4c04c5b325d38bd3280d5eb1388f064ccbbaf
--- /dev/null
+++ b/main.py
@@ -0,0 +1,67 @@
+import asyncio
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="MCP Server initialization args",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    subparsers = parser.add_subparsers(dest='command', required=True)
+
+    # gen-cases subcommand
+    gen_parser = subparsers.add_parser('gen-cases', help='Generate test cases')
+    gen_parser.add_argument(
+        "--config",
+        type=str,
+        default="./mcp-servers-perf.json",
+        help="Path to MCP Server config file"
+    )
+
+    # val-cases subcommand
+    val_parser = subparsers.add_parser('val-cases', help='Validate test cases')
+    val_parser.add_argument(
+        "--config",
+        type=str,
+        default="./mcp-servers-perf.json",
+        help="Path to MCP Server config file"
+    )
+    val_parser.add_argument(
+        "--testpath",
+        type=str,
+        default=".logs/perf_mcp_2025-09-12T06-43-29-026631/testcases.json",
+        help="Path to the generated testcases.json file"
+    )
+
+    # rep-cases (reporter) subcommand
+    rep_parser = subparsers.add_parser('rep-cases', help='Report testing results')
+    rep_parser.add_argument(
+        "--valpath",
+        type=str,
+        default=".logs/perf_mcp_2025-09-11T07-31-04-418670/validation_results.json",
+        help="Path to the validation results file"
+    )
+
+    return parser.parse_args()
+
+
+async def gen_cases(config_path):
+    from src.test_generator.TestGenerator import TestGenerator
+    generator = TestGenerator(config_path=config_path)
+    return await generator.run()
+
+
+async def val_cases(config_path, testcase_path):
+    from src.validator.Response_validator_withenv import ResponseValidator_withenv
+    validator = ResponseValidator_withenv(config_path=config_path, testcase_path=testcase_path)
+    return await validator.run()
+
+async def main():
+    args = parse_args()
+    if args.command == 'gen-cases':
+        await gen_cases(args.config)
+    elif args.command == 'val-cases':
+        await val_cases(args.config, args.testpath)
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file
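`parse_args` registers a `rep-cases` subcommand, but `main()` only dispatches `gen-cases` and `val-cases`, so `rep-cases` currently does nothing. A minimal sketch of how it could be wired up once a reporter module exists; the `src.reporter.Reporter` name, its constructor signature, and its async `run()` are assumptions, not part of this diff:

```python
# Hypothetical sketch only: src.reporter.Reporter is assumed, not included in this diff.
async def rep_cases(val_path):
    from src.reporter.Reporter import Reporter  # assumed module/class name
    reporter = Reporter(valpath=val_path)       # assumed constructor signature
    return await reporter.run()                 # assumed async run(), mirroring the other commands


async def main():
    args = parse_args()
    if args.command == 'gen-cases':
        await gen_cases(args.config)
    elif args.command == 'val-cases':
        await val_cases(args.config, args.testpath)
    elif args.command == 'rep-cases':
        await rep_cases(args.valpath)
```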
diff --git a/src/prompts/env_prompt.py b/src/prompts/env_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..44437b2a3c5a2d41fd80ab295573d320e9ba61b5
--- /dev/null
+++ b/src/prompts/env_prompt.py
@@ -0,0 +1,86 @@
+env_prompt = """You have received a debugging task: a test case has failed its validation because specific environmental requirements were not met.
+The specific reason is: {reason}. Your job is to write a Bash script that configures the environment to satisfy these special requirements.
+
+**Important Notes:**
+- The default environment already has Anaconda, Python, and the following common dependencies pre-installed:
+  {dependencies}
+  **Do not include redundant commands for these in your script.**
+- Carefully review the test case's description field to identify any additional environment setup needed beyond the standard baseline.
+- **Target Specifics**: Focus only on additional setup needed to address the failure, as indicated in the test case description. Examples include:
+  - Enabling/disabling services (e.g., stopping Docker if the test requires it to be unavailable)
+  - Creating required directories/files with specific structures or contents
+  - Installing/removing packages not in the pre-installed list
+- Minimal Steps: Only include the core commands needed to meet the special requirements—no extra configurations, optimizations, or redundant checks.
+- The target operating system is openEuler. Please ensure:
+  - Use yum or dnf instead of apt-get
+  - Employ commands suitable for the RHEL/CentOS family
+
+### Input Example
+Test case(s):
+{testcases}
+
+Test output and validation results:
+output: {output}
+validation_results: {validation_results}
+
+### Output Format
+Wrap your bash script inside the following code fence:
+```plaintext
+
+```
+"""
+
+not_pass_judge_prompt = """# Debugging Task Instructions
+
+You have received a debugging task for a test case that failed its validation rules. Please follow these steps to identify the root cause:
+
+### 1. Identify the Test Case Type
+- Before identification, check the `expect` field of the test case:
+  - If the value is `"success"`, the test case is **expected to succeed** (i.e., it is a normal/happy-path scenario).
+  - If the value is `"error"`, the test case is **intentionally designed to fail** (i.e., it validates error-handling logic).
+
+### 2. Confirm Environmental Requirements
+- Standard Testing Environment Baseline: The default environment already includes pre-installed Anaconda, Python, and the common dependencies
+{dependencies}
+- Check for Special Requirements: Read the test case's description field to determine whether it needs additional environmental setup beyond the standard baseline. Examples of additional setup include:
+  - Enabling/disabling services (e.g., stopping Docker if the test requires it to be unavailable)
+  - Creating required directories/files with specific structures or contents
+  - Installing/removing packages not in the pre-installed list
+
+### 3. Definition of Rule Types
+  - **contains**: Verify the response includes an **exact, static substring** (max 4 words/fragments).
+    - `value`: `"[exact_text_fragment]"` (wrap in double quotes; use single quotes inside for clarity, e.g., `" 'Python 3.11 installed' "`)
+  - **equals**: Verify the **entire response** matches an exact, fixed value (fixed numbers, short strings, booleans).
+    - `value`: For strings: `"[exact_string]"`; for numbers/booleans: `[exact_value]` (no quotes)
+  - **schema**: Validate JSON structure/data types (required fields, value types).
+    - `value`: `[valid_json_schema]`
+  - **llm**: For semantic validation requiring human-like judgment (e.g., summary accuracy).
+    - `value`: `Natural language specifying semantic criteria.`
+
+### 4. Analyze the Failure Cause
+- Review the `output` (actual result of the test) and `validation_results` (details of failed/passed rules):
+  - Judge whether the failure stems from **unmet special environmental requirements** (e.g., an unstopped Docker service, or an unconfigured simulation that the test depends on).
+  - Or whether the failure is caused by **issues with the validation rule itself** (e.g., incorrect expected values, or overly strict validation rules).
+- Base your judgment on the known standard testing environment (do not assume unstated configurations).
+
+### Input Example
+Test case(s):
+{testcases}
+
+Test output and validation results:
+output: {output}
+validation_results: {validation_results}
+
+### Output Format
+Provide a clear, direct answer using the following JSON format (do not include extra content):
+
+```json
+{{
+  "option": "a. Unmet special environmental requirement" or "b. Issue with the validation rule itself",
+  "reason": "Concise, specific explanation (e.g., 'The test requires an nginx process running on port 80, but the standard environment lacks this setup' or 'The validation rule expects Python 3.8, but the test uses Python 3.9—an unreasonable mismatch')"
+}}
+```
+"""
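`env_prompt` asks the model to return a Bash script wrapped in the `plaintext` code fence shown under Output Format. The consumer of this prompt is not included in this diff; a minimal sketch, assuming the template is filled with `str.format` and the script is pulled back out of the fence:

```python
# Sketch only: the real consumer (e.g., the env-repair step of the validator) is not in this diff.
import re

from src.prompts.env_prompt import env_prompt


def build_env_prompt(reason, dependencies, testcases, output, validation_results):
    # Fill the {placeholders} declared in the template above.
    return env_prompt.format(
        reason=reason,
        dependencies=dependencies,
        testcases=testcases,
        output=output,
        validation_results=validation_results,
    )


def extract_bash_script(llm_reply: str) -> str:
    # Pull the script out of the fenced block requested in Output Format.
    match = re.search(r"```(?:plaintext|bash)?\s*([\s\S]*?)\s*```", llm_reply)
    return match.group(1) if match else llm_reply.strip()
```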
diff --git a/src/prompts/eval_prompt.py b/src/prompts/eval_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4568621f62cdd62cd49dea4b261e8a273b7fe6f
--- /dev/null
+++ b/src/prompts/eval_prompt.py
@@ -0,0 +1,11 @@
+eval_prompt = """Create a natural, conversational request for an AI assistant to perform this specific test scenario:
+
+Tool: {tool.name} - {tool.description}
+Purpose: {test_case.description}
+
+Parameters to include:
+{test_case_inputs}
+
+Craft a single, fluent sentence that naturally incorporates these parameter values as if you're asking for help with this specific task. Make it sound like a real user request rather than a technical specification.
+
+Natural language request:"""
\ No newline at end of file
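Because `eval_prompt` uses `str.format` attribute lookups (`{tool.name}`, `{test_case.description}`), it can be filled directly with the tool and test-case objects, which is how `TestGenerator.create_eval_prompt` uses it later in this diff. A standalone illustration with stand-in dataclasses (the project's real `ToolDefinition`/`TestCase` types are not reproduced here):

```python
import json
from dataclasses import dataclass

from src.prompts.eval_prompt import eval_prompt


@dataclass
class StubTool:  # stand-in for ToolDefinition
    name: str
    description: str


@dataclass
class StubCase:  # stand-in for TestCase
    description: str


tool = StubTool(name="install_python", description="Installs a Python runtime")
case = StubCase(description="Install Python 3.11 on a clean host")
print(eval_prompt.format(
    tool=tool,
    test_case=case,
    test_case_inputs=json.dumps({"version": "3.11"}, indent=2),
))
```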
diff --git a/src/prompts/tool_prompt.py b/src/prompts/tool_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..c217a9185cc2c23662946b83b892391793ece23b
--- /dev/null
+++ b/src/prompts/tool_prompt.py
@@ -0,0 +1,67 @@
+tool_prompt = """
+You are an expert in generating comprehensive test cases for tools accessed through MCP (Model Context Protocol) servers on Linux operating systems. Your task is to create diverse, realistic test cases that thoroughly validate tool functionality.
+## Tool Definition
+Name: {tool.name}
+Description: {tool.description}
+Parameters: {input_properties}
+Tool Source Code: {tool_function_str}
+(since the tool may lack a formal outputSchema, the source code is the authoritative reference for output format/structure)
+
+## Instructions
+1. Generate {tests_per_tool} diverse test cases covering these categories:
+   - Happy Path (80%): Normal, expected usage scenarios with valid inputs
+   - Error Cases (20%): Invalid or edge-case inputs that should trigger proper error handling
+
+2. For each test case, provide these fields:
+   - `description`: A concise explanation of the scenario and its test intent.
+   - `input`: A JSON object with concrete and plausible parameter values. Avoid generic placeholders; use specific, realistic values as users would (e.g., use "3.11" for a Python version).
+   - `expect`:
+     - `status`: "success" for happy path, or "error" if an error is expected.
+     - `validationRules`: An **array of assertion rules** to precisely check the tool's response. **Validation rules are critical for automated testing, so each rule must be clear, unambiguous, and directly translatable to Python test code.**
+       Every rule must include 3 key fields:
+       - `type`: One of [`contains`, `equals`, `schema`, `llm`]
+       - `value`: A **machine-parsable value** (no redundant natural language) in the specified format—see below for details.
+       - `message`: A helpful description to show if validation fails.
+     - **Choose the validation `type` and construct `value` as follows:**
+       - **contains**: Verify the response includes an **exact, static substring** (max 4 words/fragments).
+         - `value`: `"[exact_text_fragment]"` (wrap in double quotes; use single quotes inside for clarity, e.g., `" 'Python 3.11 installed' "`)
+       - **equals**: Verify the **entire response** matches an exact, fixed value (fixed numbers, short strings, booleans).
+         - `value`: For strings: `"[exact_string]"`; for numbers/booleans: `[exact_value]` (no quotes)
+       - **schema**: Validate JSON structure/data types (required fields, value types).
+         - `value`: `[valid_json_schema]`
+       - **llm**: For semantic validation requiring human-like judgment (e.g., summary accuracy).
+         - `value`: `Natural language specifying semantic criteria.`
+
+   - **For successful ("success") test cases:**
+     - Prefer `schema` if the response is structured. Prefer `contains` for specific fragments, and `equals` only for fixed outputs. Use `llm` for **semantic validation requiring human judgment** (e.g., summary accuracy).
+
+   - **For error ("error") test cases:**
+     - Validate the **presence and clarity of error indications** (e.g., specific error messages, error codes, required fields in the error response).
+     - Prefer `"contains"` for specific fragments/errors and `"schema"` only if error objects are structured.
+     - Make sure the rule can be programmatically checked.
+
+## Output Format
+
+Return a **pure JSON array** of test cases in the following structure:
+
+```json
+[
+  {{
+    "description": "A brief but precise description of the test scenario.",
+    "input": {{ /* Concrete parameter values */ }},
+    "expect": {{
+      "status": "success|error",
+      "validationRules": [
+        {{
+          "type": "contains|equals|schema|llm",
+          "value": "xxx", // See format above; must be directly checkable
+          "message": "Custom failure explanation for this rule."
+        }}
+        /* ... more validation rules as appropriate ... */
+      ]
+    }}
+  }}
+  /* ... more test cases ... */
]
+```
+"""
\ No newline at end of file
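The `contains`, `equals`, and `schema` rule types defined above are meant to be directly checkable in Python; only `llm` needs a model call. The project's actual validator (`Response_validator_withenv`) is not part of this diff, so the following is an illustrative sketch of how the first three types could be evaluated, assuming the third-party `jsonschema` package for the `schema` type:

```python
import json

import jsonschema  # third-party; assumed available for the "schema" rule type


def check_rule(rule: dict, output: str) -> bool:
    """Evaluate one validationRule against a tool's textual output (sketch only)."""
    rule_type, expected = rule["type"], rule["value"]
    if rule_type == "contains":
        return expected in output   # exact, static substring
    if rule_type == "equals":
        return output == expected   # entire response matches a fixed value
    if rule_type == "schema":
        try:
            jsonschema.validate(json.loads(output), expected)
            return True
        except (jsonschema.ValidationError, json.JSONDecodeError):
            return False
    raise ValueError(f"rule type {rule_type!r} requires an LLM-based check")
```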
diff --git a/src/prompts/val_prompt.py b/src/prompts/val_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1f78aa05c7cbeeeaa44e2b0abcf2d1b865e2656
--- /dev/null
+++ b/src/prompts/val_prompt.py
@@ -0,0 +1,46 @@
+val_prompt_tool = """You are an expert evaluator responsible for assessing whether a specific MCP tool executed correctly for a given query. The validation rule serves as the expected criterion to validate the tool's output.
+
+**Tool:** {tool_name}
+**Input:** {input}
+**Validation Rule:** {validation_rule}
+**Execution Output:** {output}
+
+Did the tool execute correctly and produce output that meets the expectations defined by the validation rule? Answer "yes" or "no" and provide a brief explanation.
+
+Output format:
+```json
+{{
+  "answer": "yes" | "no",
+  "explanation": "Explanation of the result"
+}}
+```"""
+
+val_prompt_eval = """You are an expert evaluator specializing in assessing test cases for tools accessed via MCP (Model Context Protocol) servers on Linux operating systems. Your core responsibility is to verify whether the final output of a chat session (which executes MCP tools) aligns with the expected output of the corresponding test case.
+
+### 1. Test Case Category Definition
+Test cases are divided into two types, and you need to first confirm the category of the current case:
+- **Happy-path cases**: These cases represent normal, expected usage scenarios with valid inputs—the chat session should execute tools without errors and return results that match expected behavior.
+- **Error cases**: These cases involve invalid inputs or edge cases—the chat session should trigger proper error handling (e.g., return error prompts, avoid abnormal crashes) instead of normal results.
+
+### 2. Current Test Case Information
+- **User Query**: {{ query }}
+- **Test Case Type**: {% if expect_type == "success" %}Happy-path case (valid inputs, expect normal execution){% else %}Error case (invalid/edge inputs, expect proper error handling){% endif %}
+{{ expected_output }}
+- **Chat Session's Final Output**: {{ output }}
+
+### 3. Evaluation Task
+For the above test case, focus on two key points to evaluate:
+1. Whether the chat session's tool execution process matches the case type (e.g., happy-path cases should have no execution errors; error cases should trigger error handling).
+2. Whether the chat session's final output is consistent with the "Expected Output" (including result content for happy-path cases, or error prompt logic for error cases).
+
+Answer "yes" if the final output meets the test case's expectations; answer "no" otherwise. Provide a brief explanation to support your judgment.
+
+### 4. Output Format
+```json
+{{ '{{' }}
+  "answer": "yes" | "no",
+  "explanation": "Clear explanation of why the final output meets/does not meet expectations"
+{{ '}}' }}
+```
+"""
+
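Unlike the `str.format`-style templates in the other prompt modules, `val_prompt_eval` uses Jinja-style delimiters (`{{ ... }}`, `{% ... %}`), with the literal braces of the output JSON escaped as `{{ '{{' }}`. Assuming it is rendered with Jinja2 (the consuming validator is not shown in this diff), rendering would look roughly like:

```python
# Assumption: val_prompt_eval is rendered with Jinja2; the real validator is not in this diff.
from jinja2 import Template

from src.prompts.val_prompt import val_prompt_eval

rendered = Template(val_prompt_eval).render(
    query="Please install Python 3.11 for me",
    expect_type="success",
    expected_output="- **Expected Output**: the response reports Python 3.11 as installed",
    output="Python 3.11.9 installed successfully",
)
print(rendered)
```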
diff --git a/src/test_generator/TestGenerator.py b/src/test_generator/TestGenerator.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e97fab0122a881608ffdcaaa9e8ff81fdec7e56
--- /dev/null
+++ b/src/test_generator/TestGenerator.py
@@ -0,0 +1,295 @@
+import json
+import datetime
+import uuid
+import re
+import os
+from typing import List
+from ..llm.LLM import LLMClient
+from ..type.types_def import ToolDefinition, TestCase
+from ..prompts.tool_prompt import tool_prompt
+from ..prompts.eval_prompt import eval_prompt
+from ..client.Client import Configuration
+from ..client.MCPClient import MCPClient
+from ..utils.read_source_code import ReadSourceCode
+
+class TestGenerator:
+    """
+    Generator for test cases using a Large Language Model
+    """
+
+    def __init__(self, api_key: str = None, config_path: str = None):
+        """
+        Create a new test generator
+
+        Args:
+            api_key: API key for the language model
+            config_path: Path to the MCP server config file
+        """
+        config_loader = Configuration()
+        self.config = config_loader.load_config(config_path)
+        self.llm = LLMClient(api_key)
+        self.readsc = ReadSourceCode(config_path)
+
+    async def run(self):
+        # Build one MCP client per configured server
+        servers = [MCPClient(name, srv_config) for name, srv_config in self.config["mcpServers"].items()]
+        tests_per_tool = self.config["numTestsPerTool"]
+        for server in servers:
+
+            # Connect to the server
+            print("\n========================================")
+            print(f"Testing server: {server.name}")
+            print("========================================\n")
+            await server.initialize()
+
+            # Get available tools
+            tools = await server.list_tools()
+            if not tools:
+                print(f"Warning: no tools found on server {server.name}. Nothing to test.")
+                await server.cleanup()
+                continue
+            print(f"Found {len(tools)} tools:")
+            print("\n".join([f"{tool.format_for_llm()}" for tool in tools]))
+
+            # Generate tests
+            print(f"Generating {tests_per_tool} tests per tool...")
+            test_cases = await self.generate_tests_for_each_server(tools, tests_per_tool, server.name)
+            print(f"Generated {len(test_cases)} test cases in total.")
+
+            self.save_to_file(server.name, test_cases)
+
+            await server.cleanup()
+
+    async def generate_tests_for_each_server(
+        self,
+        tools: List[ToolDefinition],
+        tests_per_tool: int,
+        server_name: str
+    ) -> List[TestCase]:
+        """
+        Generate test cases for the given tools
+
+        Args:
+            tools: Tool definitions to generate tests for
+            tests_per_tool: Number of tests to generate per tool
+            server_name: Name of the MCP server
+
+        Returns:
+            List of generated test cases
+        """
+        all_tests: List[TestCase] = []
+        tool_functions = self.readsc.get_code(server_name)
+        for tool in tools:
+            try:
+                print(f"Generating tests for tool interface: {tool.name}")
+
+                tool_prompt_formatted = self.create_tool_prompt(tool, tests_per_tool, tool_functions[tool.name])
+
+                response = self.llm.get_response(
+                    [{"role": "user", "content": tool_prompt_formatted}]
+                )
+
+                if response:
+                    test_cases = self.parse_response(response, tool.name)
+
+                    # Generate natural language queries for each test case
+                    for test_case in test_cases:
+                        print(f"Generating natural language query for {tool.name}")
+                        eval_prompt_formatted = self.create_eval_prompt(tool, test_case)
+                        try:
+                            test_case.query = self.llm.get_response(
+                                [{"role": "user", "content": eval_prompt_formatted}]
+                            )
+                        except Exception as err:
+                            print(f"Failed to generate natural language query for {tool.name}: {err}")
+                            test_case.query = ''
+
+                    all_tests.extend(test_cases)
+                    print(f"Generated {len(test_cases)} tests for {tool.name}")
+
+                else:
+                    print(f"No response received for {tool.name}")
+
+            except Exception as error:
+                print(f"Error generating tests for tool {tool.name}: {error}")
+
+        return all_tests
+
+    def create_tool_prompt(self, tool: ToolDefinition, tests_per_tool: int, tool_function_str: str) -> str:
+        """
+        Create a prompt for the LLM to generate test cases for testing tool execution
+
+        Args:
+            tool: Tool definition to generate tests for
+            tests_per_tool: Number of tests to generate per tool
+            tool_function_str: Source code of the tool, used as the reference for its output structure
+
+        Returns:
+            Formatted prompt string
+        """
+        # Extract input schema properties safely
+        input_properties = "{}"
+        if tool.input_schema and hasattr(tool.input_schema, 'properties'):
+            input_properties = json.dumps(tool.input_schema.properties, indent=2)
+
+        formatted_prompt = tool_prompt.format(
+            tool=tool,
+            input_properties=input_properties,
+            tests_per_tool=tests_per_tool,
+            tool_function_str=tool_function_str
+        )
+        return formatted_prompt
+
+    def create_eval_prompt(self, tool: ToolDefinition, test_case: TestCase) -> str:
+        """
+        Create a prompt for the LLM to turn a generated test case into a natural language query for end-to-end agent runs
+
+        Args:
+            tool: Tool definition the test case targets
+            test_case: Test case to convert into a natural language query
+
+        Returns:
+            Formatted prompt string
+        """
+        input_json = "{}"
+        if test_case.input:
+            input_json = json.dumps(test_case.input, indent=2)
+
+        formatted_prompt = eval_prompt.format(tool=tool, test_case=test_case, test_case_inputs=input_json)
+        return formatted_prompt
+
+    def parse_response(self, response_text: str, tool_name: str) -> List[TestCase]:
+        """
+        Parse the LLM's response into test cases
+
+        Args:
+            response_text: LLM's response text
+            tool_name: Name of the tool being tested
+
+        Returns:
+            List of parsed test cases
+        """
+        json_content = response_text
+
+        try:
+            # Extract JSON content between backticks if present
+            json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response_text)
+            if json_match and json_match.group(1):
+                json_content = json_match.group(1)
+            else:
+                # Attempt to gracefully handle cases where the LLM might forget the backticks
+                print(f"[{tool_name}] LLM response did not contain JSON within backticks. Attempting to parse directly.")
+
+            # Parse JSON
+            try:
+                parsed_json = json.loads(json_content)
+            except json.JSONDecodeError as parse_error:
+                print(f"[{tool_name}] Failed to parse JSON from LLM response. Error: {parse_error}")
+                print(f"[{tool_name}] Raw response text was: {response_text}")
+                return []  # Return empty if JSON parsing fails
+
+            # Ensure parsed_json is a list
+            if not isinstance(parsed_json, list):
+                print(f"[{tool_name}] Parsed JSON is not an array. LLM response might be malformed. Raw response: {response_text}")
+                # If it's a single object that looks like a test case, wrap it in an array
+                if (isinstance(parsed_json, dict) and
+                    'description' in parsed_json and
+                    'input' in parsed_json and
+                    'expect' in parsed_json):
+                    print(f"[{tool_name}] Attempting to recover by wrapping single test case object in an array.")
+                    parsed_json = [parsed_json]
+                else:
+                    return []
+
+            valid_test_cases: List[TestCase] = []
+
+            for index, test in enumerate(parsed_json):
+                # Basic validation for essential fields
+                if not isinstance(test, dict):
+                    print(f"[{tool_name}] Test case at index {index} is not a valid object. Skipping.")
+                    continue
+
+                if not test.get('description') or not isinstance(test['description'], str):
+                    print(f"[{tool_name}] Test case at index {index} is missing or has an invalid 'description'. Skipping: {json.dumps(test)}")
+                    continue
+
+                if 'input' not in test or not isinstance(test['input'], dict):
+                    print(f"[{tool_name}] Test case \"{test['description']}\" is missing or has an invalid 'input'. Skipping: {json.dumps(test)}")
+                    continue
+
+                if not test.get('expect') or not isinstance(test['expect'], dict):
+                    print(f"[{tool_name}] Test case \"{test['description']}\" is missing or has an invalid 'expect'. Skipping: {json.dumps(test)}")
+                    continue
+
+                expected_outcome = test['expect']
+                if (not expected_outcome.get('status') or
+                    expected_outcome['status'] not in ['success', 'error']):
+                    print(f"[{tool_name}] Test case \"{test['description']}\" has a missing or invalid 'expect.status'. Skipping: {json.dumps(test)}")
+                    continue
+
+                # Create test case
+                test_case = TestCase(
+                    id=str(uuid.uuid4()),
+                    toolName=tool_name,
+                    description=test['description'],
+                    query='',
+                    input=test['input'],
+                    expect={
+                        "status": expected_outcome['status'],
+                        "validation_rules": expected_outcome.get('validationRules', []) or []
+                    }
+                )
+                valid_test_cases.append(test_case)
+            return valid_test_cases
+
+        except Exception as error:
+            # Catch any other unexpected errors during processing
+            print(f"[{tool_name}] Unexpected error in parse_response: {error}")
+            print(f"[{tool_name}] Response text was: {response_text}")
+            return []
+
+    def testcases_to_dict(self, testcases: List[TestCase]) -> List:
+        res = []
+        for case in testcases:
+            res.append({
+                "id": case.id,
+                "toolName": case.toolName,
+                "description": case.description,
+                "query": case.query,
+                "input": case.input,
+                "expect": case.expect
+            })
+        return res
+
+    def save_to_file(self, server_name: str, testcases: List[TestCase]):
+        """
+        Save the generated test cases (a JSON array) to a timestamped folder under .logs
+        """
+        testcases = self.testcases_to_dict(testcases)
+        try:
+            if not isinstance(testcases, list):
+                raise ValueError("input data should be a JSON array")
+
+            current_timestamp = datetime.datetime.utcnow().isoformat()
+            safe_timestamp = current_timestamp.replace(":", "-").replace(".", "-")
+
+            folderpath = os.path.join(".logs", f'{server_name}_{safe_timestamp}')
+            if not os.path.exists(folderpath):
+                os.makedirs(folderpath)
+
+            filename = "testcases.json"
+            filepath = os.path.join(folderpath, filename)
+            with open(filepath, 'w', encoding='utf-8') as file:
+                json.dump(testcases, file, ensure_ascii=False, indent=4)
+            print(f"{server_name} test cases are successfully saved into {filepath}")
+            return True
+
+        except IOError as e:
+            print(f"File operation error: {e}")
+        except ValueError as e:
+            print(f"Data format error: {e}")
+        except Exception as e:
+            print(f"Unexpected error: {e}")
+
+        return False
\ No newline at end of file
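Each entry that `save_to_file` writes to `.logs/<server>_<timestamp>/testcases.json` carries the fields produced by `testcases_to_dict` (`id`, `toolName`, `description`, `query`, `input`, `expect`), with `expect.validation_rules` holding the rules described in `tool_prompt`. A minimal sketch of reading that file back, roughly the way the `val-cases` step would; the validator itself is not included in this diff, and the path below is just the default from `main.py`:

```python
# Sketch only: load generated test cases the way a downstream validator might.
import json

with open(".logs/perf_mcp_2025-09-12T06-43-29-026631/testcases.json", encoding="utf-8") as f:
    cases = json.load(f)

for case in cases:
    print(case["id"], case["toolName"], case["expect"]["status"])
    for rule in case["expect"]["validation_rules"]:
        # type/value/message as specified in tool_prompt's validationRules format
        print("  rule:", rule.get("type"), "->", rule.get("message"))
```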