From 0bbfdfff8bd341dbfd403915b2451204b32ce2ad Mon Sep 17 00:00:00 2001 From: J12-Kyrie Date: Tue, 30 Sep 2025 00:26:57 -0700 Subject: [PATCH] feat: add beam search improvements and test framework - Enhanced beam search functionality in sampler.py - Updated sampling metadata for better beam search support - Added comprehensive test framework in test_beam/ directory - Improved __init__.py with new imports and configurations --- test_beam/common_beam_tests.py | 413 ++++++++++++ test_beam/test_v0.py | 12 + test_beam/test_v1.py | 12 + vllm_mindspore/__init__.py | 122 ++-- .../model_executor/layers/sampler.py | 631 +++++++++++++++++- .../model_executor/sampling_metadata.py | 190 ++++++ 6 files changed, 1326 insertions(+), 54 deletions(-) create mode 100644 test_beam/common_beam_tests.py create mode 100644 test_beam/test_v0.py create mode 100644 test_beam/test_v1.py diff --git a/test_beam/common_beam_tests.py b/test_beam/common_beam_tests.py new file mode 100644 index 00000000..ae7e2c4e --- /dev/null +++ b/test_beam/common_beam_tests.py @@ -0,0 +1,413 @@ +import os +import time +from typing import List, Dict + +import vllm_mindspore # Ensure backend is imported/initialized +from vllm import LLM +from vllm.sampling_params import BeamSearchParams + + +def _set_architecture_env(use_v1: bool) -> None: + if use_v1: + os.environ["VLLM_USE_V1"] = "1" + else: + # Explicitly set to 0 for consistency across runs + os.environ["VLLM_USE_V1"] = "0" + + +def _make_llm(model_path: str, + max_num_seqs: int = 4, + max_model_len: int = 1024, + gpu_memory_utilization: float = 0.7) -> LLM: + return LLM( + model=model_path, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, + ) + + +# Unified parameter sets used across v0 and v1 +PARAM_SETS: Dict[str, BeamSearchParams] = { + "basic": BeamSearchParams(beam_width=4, max_tokens=100, temperature=0.7, length_penalty=1.0, ignore_eos=False), + "batch": BeamSearchParams(beam_width=3, max_tokens=45, temperature=0.7, length_penalty=1.2, ignore_eos=False), + "token_vs_text": BeamSearchParams(beam_width=3, max_tokens=40, temperature=0.7, length_penalty=1.0, ignore_eos=False), +} + +PARAM_VARIATIONS: List[Dict] = [ + { + "name": "High Temperature Diverse Generation", + "params": BeamSearchParams(beam_width=3, max_tokens=50, temperature=1.2, length_penalty=0.8, ignore_eos=False), + "prompt": "The future of artificial intelligence", + }, + { + "name": "Low Temperature Focused Generation", + "params": BeamSearchParams(beam_width=4, max_tokens=60, temperature=0.1, length_penalty=1.5, ignore_eos=False), + "prompt": "Machine learning algorithms", + }, + { + "name": "Zero Temperature Deterministic", + "params": BeamSearchParams(beam_width=2, max_tokens=40, temperature=0.0, length_penalty=1.0, ignore_eos=False), + "prompt": "Python programming language", + }, + { + "name": "Ignore EOS Test", + "params": BeamSearchParams(beam_width=3, max_tokens=30, temperature=0.5, length_penalty=1.0, ignore_eos=True), + "prompt": "Hello world", + }, + { + "name": "Strong Length Penalty", + "params": BeamSearchParams(beam_width=4, max_tokens=80, temperature=0.8, length_penalty=2.0, ignore_eos=False), + "prompt": "Deep learning neural networks", + }, + { + "name": "Weak Length Penalty", + "params": BeamSearchParams(beam_width=3, max_tokens=70, temperature=0.6, length_penalty=0.5, ignore_eos=False), + "prompt": "Natural language processing", + }, +] + +EDGE_CASES: List[Dict] = [ + { + "name": "Very Short Generation", + "params": 
BeamSearchParams(beam_width=2, max_tokens=1, temperature=0.5, length_penalty=1.0, ignore_eos=False), + "prompt": "Hello", + }, + { + "name": "Single Beam", + "params": BeamSearchParams(beam_width=1, max_tokens=30, temperature=0.0, length_penalty=1.0, ignore_eos=False), + "prompt": "Single beam test", + }, + { + "name": "Empty Prompt", + "params": BeamSearchParams(beam_width=2, max_tokens=20, temperature=0.7, length_penalty=1.0, ignore_eos=False), + "prompt": "", + }, + { + "name": "Very Long Prompt", + "params": BeamSearchParams(beam_width=2, max_tokens=30, temperature=0.5, length_penalty=1.0, ignore_eos=False), + "prompt": ( + "This is a very long prompt that contains many words and should test " + "the model's ability to handle longer input sequences while still generating " + "meaningful continuations with beam search algorithm" + ), + }, +] + + +def test_basic(llm: LLM) -> None: + print("=== Basic Beam Search Test ===") + prompts = [{"prompt": "I am"}] + params = PARAM_SETS["basic"] + results = llm.beam_search(prompts, params) + for i, result in enumerate(results): + print(f"Result {i+1}:") + for j, sequence in enumerate(result.sequences, 1): + print(f" Sequence {j}:") + print(f" Text: {sequence.text!r}") + print(f" Score: {sequence.cum_logprob:.4f}") + print(f" Finish reason: {sequence.finish_reason}") + print("---") + + +def test_parameter_variations(llm: LLM) -> None: + print("\n=== Parameter Variations Test ===") + for test_case in PARAM_VARIATIONS: + print(f"\n--- {test_case['name']} ---") + params = test_case["params"] + prompts = [{"prompt": test_case["prompt"]}] + start_time = time.time() + results = llm.beam_search(prompts, params) + end_time = time.time() + print(f"Generation time: {end_time - start_time:.2f}s") + for sequence in results[0].sequences: + print(f" Text: {sequence.text!r}") + print(f" Score: {sequence.cum_logprob:.4f}") + print(f" Token count: {len(sequence.tokens)}") + print(f" Finish reason: {sequence.finish_reason}") + + +def test_batch_processing(llm: LLM) -> None: + print("\n=== Batch Processing Test ===") + multi_prompts = [ + {"prompt": "Science and technology"}, + {"prompt": "Climate change solutions"}, + {"prompt": "Space exploration missions"}, + {"prompt": "Renewable energy sources"}, + ] + params = PARAM_SETS["batch"] + print(f"Processing {len(multi_prompts)} prompts in batch...") + start_time = time.time() + results = llm.beam_search(multi_prompts, params) + end_time = time.time() + print(f"Batch processing time: {end_time - start_time:.2f}s") + print(f"Average time per prompt: {(end_time - start_time) / len(multi_prompts):.2f}s") + for i, result in enumerate(results): + print(f"\nPrompt {i+1}: {multi_prompts[i]['prompt']}") + print("Generated sequences:") + for j, sequence in enumerate(result.sequences, 1): + print(f" Beam {j}: {sequence.text!r}") + print(f" Score: {sequence.cum_logprob:.4f}") + print(f" Tokens: {len(sequence.tokens)}") + + +def test_beam_width_comparison(llm: LLM) -> None: + print("\n=== Beam Width Comparison Test ===") + prompt = [{"prompt": "Artificial intelligence will"}] + beam_widths = [1, 2, 4, 6, 8] + for beam_width in beam_widths: + print(f"\n--- Beam Width: {beam_width} ---") + params = BeamSearchParams( + beam_width=beam_width, + max_tokens=50, + temperature=0.8, + length_penalty=1.0, + ignore_eos=False, + ) + start_time = time.time() + results = llm.beam_search(prompt, params) + end_time = time.time() + print(f"Generation time: {end_time - start_time:.2f}s") + print(f"Number of sequences: {len(results[0].sequences)}") + 
texts = [seq.text for seq in results[0].sequences] + unique_texts = set(texts) + print(f"Unique sequences: {len(unique_texts)}/{len(texts)}") + best_sequence = results[0].sequences[0] + print(f"Best sequence: {best_sequence.text!r}") + print(f"Best score: {best_sequence.cum_logprob:.4f}") + + +def test_token_vs_text_input(llm: LLM) -> None: + print("\n=== Token vs Text Input Test ===") + tokenizer = llm.get_tokenizer() + test_text = "The benefits of renewable energy" + text_prompt = [{"prompt": test_text}] + token_ids = tokenizer.encode(test_text) + token_prompt = [{"prompt_token_ids": token_ids}] + print(f"Original text: {test_text!r}") + print(f"Token IDs: {token_ids}") + print(f"Decoded tokens: {tokenizer.decode(token_ids)!r}") + params = PARAM_SETS["token_vs_text"] + print("\n--- Text Input ---") + try: + results_text = llm.beam_search(text_prompt, params) + print(f"Generated {len(results_text[0].sequences)} sequences") + for i, sequence in enumerate(results_text[0].sequences, 1): + print(f" Sequence {i}: {sequence.text!r}") + print(f" Score: {sequence.cum_logprob:.4f}") + print(f" Tokens: {len(sequence.tokens)}") + except Exception as e: + print(f"Text input error: {e}") + print("\n--- Token Input ---") + try: + results_tokens = llm.beam_search(token_prompt, params) + print(f"Generated {len(results_tokens[0].sequences)} sequences") + for i, sequence in enumerate(results_tokens[0].sequences, 1): + print(f" Sequence {i}: {sequence.text!r}") + print(f" Score: {sequence.cum_logprob:.4f}") + print(f" Tokens: {len(sequence.tokens)}") + except Exception as e: + print(f"Token input error: {e}") + print("\n--- Verification ---") + if 'results_text' in locals() and 'results_tokens' in locals(): + try: + text_scores = [seq.cum_logprob for seq in results_text[0].sequences] + token_scores = [seq.cum_logprob for seq in results_tokens[0].sequences] + score_diff = max(abs(a - b) for a, b in zip(text_scores, token_scores)) + if score_diff < 1e-6: + print("✓ Scores match between text and token input methods") + else: + print(f"⚠ Score difference: {score_diff:.2e}") + text_outputs = [seq.text for seq in results_text[0].sequences] + token_outputs = [seq.text for seq in results_tokens[0].sequences] + if text_outputs == token_outputs: + print("✓ Generated texts are identical") + else: + print("⚠ Generated texts differ") + for i, (t, tok) in enumerate(zip(text_outputs, token_outputs)): + if t != tok: + print(f" Sequence {i+1} differs:") + print(f" Text: {t!r}") + print(f" Token: {tok!r}") + except Exception as e: + print(f"Verification error: {e}") + else: + print("⚠ Cannot perform verification due to previous errors") + + +def test_edge_cases(llm: LLM) -> None: + print("\n=== Edge Cases Test ===") + for case in EDGE_CASES: + print(f"\n--- {case['name']} ---") + prompts = [{"prompt": case["prompt"]}] + try: + start_time = time.time() + results = llm.beam_search(prompts, case["params"]) + end_time = time.time() + print(f"Success! 
Time: {end_time - start_time:.2f}s") + print(f"Generated {len(results[0].sequences)} sequences") + for i, sequence in enumerate(results[0].sequences, 1): + print(f" Sequence {i}: {sequence.text!r}") + print(f" Score: {sequence.cum_logprob:.4f}") + except Exception as e: + print(f"Error: {e}") + + +def test_large_batch_processing(llm: LLM) -> None: + print("\n=== Large Batch Processing Test ===") + large_batch_prompts: List[Dict[str, str]] = [] + for i in range(4): + large_batch_prompts.extend([ + {"prompt": f"Topic {i+1}: The importance of"}, + {"prompt": f"Research area {i+1}: Recent advances in"}, + {"prompt": f"Technology {i+1}: The future of"}, + {"prompt": f"Innovation {i+1}: How to improve"}, + ]) + params = BeamSearchParams(beam_width=2, max_tokens=25, temperature=0.6, length_penalty=1.0, ignore_eos=False) + print(f"Processing batch of {len(large_batch_prompts)} prompts...") + start_time = time.time() + results = llm.beam_search(large_batch_prompts, params) + end_time = time.time() + print(f"Large batch processing completed in {end_time - start_time:.2f}s") + print(f"Average time per prompt: {(end_time - start_time) / len(large_batch_prompts):.3f}s") + print(f"Throughput: {len(large_batch_prompts) / (end_time - start_time):.2f} prompts/second") + print("\nSample results (first 3 prompts):") + for i in range(min(3, len(results))): + print(f" Prompt {i+1}: {large_batch_prompts[i]['prompt']}") + best_result = results[i].sequences[0].text + print(f" Best result: {best_result!r}") + + +def test_memory_stress(llm: LLM) -> None: + print("\n=== Memory Stress Test ===") + stress_tests = [ + {"name": "High Beam Width", "beam_width": 8, "batch_size": 2, "max_tokens": 30}, + {"name": "Large Batch", "beam_width": 3, "batch_size": 4, "max_tokens": 25}, + {"name": "Long Generation", "beam_width": 4, "batch_size": 2, "max_tokens": 80}, + {"name": "Balanced Load", "beam_width": 4, "batch_size": 3, "max_tokens": 40}, + ] + for test_config in stress_tests: + print(f"\n--- {test_config['name']} ---") + print( + f"Configuration: beam_width={test_config['beam_width']}, " + f"batch_size={test_config['batch_size']}, " + f"max_tokens={test_config['max_tokens']}" + ) + prompts = [ + {"prompt": f"Stress test prompt {i+1}: Describe the concept of"} + for i in range(test_config["batch_size"]) + ] + params = BeamSearchParams( + beam_width=test_config["beam_width"], + max_tokens=test_config["max_tokens"], + temperature=0.7, + length_penalty=1.0, + ignore_eos=False, + ) + try: + start_time = time.time() + results = llm.beam_search(prompts, params) + end_time = time.time() + print(f" ✓ Success! 
Time: {end_time - start_time:.2f}s") + print(f" Generated {sum(len(r.sequences) for r in results)} total sequences") + total_tokens = sum(len(seq.tokens) for result in results for seq in result.sequences) + print(f" Total tokens generated: {total_tokens}") + print(f" Tokens per second: {total_tokens / (end_time - start_time):.1f}") + except Exception as e: + print(f" ✗ Failed: {e}") + + +def test_parameter_sensitivity(llm: LLM) -> None: + print("\n=== Parameter Sensitivity Analysis ===") + base_prompt = [{"prompt": "The key to success in artificial intelligence is"}] + param_combinations = [ + {"beam_width": 3, "temperature": 0.0, "length_penalty": 1.0, "name": "Zero Temperature"}, + {"beam_width": 3, "temperature": 0.5, "length_penalty": 1.0, "name": "Low Temperature"}, + {"beam_width": 3, "temperature": 1.0, "length_penalty": 1.0, "name": "High Temperature"}, + {"beam_width": 3, "temperature": 1.5, "length_penalty": 1.0, "name": "Very High Temperature"}, + {"beam_width": 3, "temperature": 0.7, "length_penalty": 0.5, "name": "Weak Length Penalty"}, + {"beam_width": 3, "temperature": 0.7, "length_penalty": 1.5, "name": "Strong Length Penalty"}, + {"beam_width": 3, "temperature": 0.7, "length_penalty": 2.0, "name": "Very Strong Length Penalty"}, + {"beam_width": 1, "temperature": 0.7, "length_penalty": 1.0, "name": "Single Beam"}, + {"beam_width": 6, "temperature": 0.7, "length_penalty": 1.0, "name": "Wide Beam Search"}, + ] + results_comparison = [] + for config in param_combinations: + print(f"\n--- {config['name']} ---") + params = BeamSearchParams( + beam_width=config["beam_width"], + max_tokens=50, + temperature=config["temperature"], + length_penalty=config["length_penalty"], + ignore_eos=False, + ) + start_time = time.time() + results = llm.beam_search(base_prompt, params) + end_time = time.time() + sequences = results[0].sequences + best_score = sequences[0].cum_logprob + avg_length = sum(len(seq.tokens) for seq in sequences) / len(sequences) + unique_texts = len(set(seq.text for seq in sequences)) + result_analysis = { + "config": config["name"], + "best_score": best_score, + "avg_length": avg_length, + "diversity": unique_texts / len(sequences), + "execution_time": end_time - start_time, + "best_text": sequences[0].text, + } + results_comparison.append(result_analysis) + print(f" Best score: {best_score:.4f}") + print(f" Average length: {avg_length:.1f} tokens") + print(f" Diversity: {unique_texts}/{len(sequences)} unique sequences") + print(f" Time: {end_time - start_time:.2f}s") + print(f" Best text: {sequences[0].text!r}") + print("\n--- Parameter Sensitivity Summary ---") + print("Configuration | Best Score | Avg Length | Diversity | Time") + print("-" * 65) + for result in results_comparison: + print( + f"{result['config']:<20} | {result['best_score']:>9.4f} | " + f"{result['avg_length']:>9.1f} | {result['diversity']:>8.2f} | " + f"{result['execution_time']:>6.2f}s" + ) + + +def run_beam_tests(model_path: str, + use_v1: bool, + max_num_seqs: int = 4, + max_model_len: int = 1024, + gpu_memory_utilization: float = 0.7) -> None: + print( + f"Starting Comprehensive vLLM-MindSpore Beam Search Test Suite (use_v1={use_v1})" + ) + print("=" * 70) + _set_architecture_env(use_v1) + llm = _make_llm( + model_path=model_path, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, + ) + try: + test_basic(llm) + test_parameter_variations(llm) + test_batch_processing(llm) + test_beam_width_comparison(llm) + test_token_vs_text_input(llm) + 
test_edge_cases(llm) + test_large_batch_processing(llm) + test_memory_stress(llm) + test_parameter_sensitivity(llm) + print("\n" + "=" * 70) + print("All tests completed successfully!") + print("vLLM-MindSpore beam search functionality verified across architectures.") + except Exception as e: + print(f"\nTest suite failed with error: {e}") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + # Default invocation for manual running; adjust model path as needed. + run_beam_tests(model_path="/home/ma-user/work/Qwen2.5-7B-Instruct", use_v1=False) \ No newline at end of file diff --git a/test_beam/test_v0.py b/test_beam/test_v0.py new file mode 100644 index 00000000..62983388 --- /dev/null +++ b/test_beam/test_v0.py @@ -0,0 +1,12 @@ +import vllm_mindspore +from common_beam_tests import run_beam_tests + + +if __name__ == "__main__": + run_beam_tests( + model_path="/home/ma-user/work/Qwen2.5-7B-Instruct", + use_v1=False, + max_num_seqs=4, + max_model_len=1024, + gpu_memory_utilization=0.7, + ) diff --git a/test_beam/test_v1.py b/test_beam/test_v1.py new file mode 100644 index 00000000..ac0a088a --- /dev/null +++ b/test_beam/test_v1.py @@ -0,0 +1,12 @@ +import vllm_mindspore +from common_beam_tests import run_beam_tests + + +if __name__ == "__main__": + run_beam_tests( + model_path="/home/ma-user/work/Qwen2.5-7B-Instruct", + use_v1=True, + max_num_seqs=4, + max_model_len=1024, + gpu_memory_utilization=0.7, + ) \ No newline at end of file diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index efd917b0..e38544af 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -85,19 +85,6 @@ from vllm_mindspore.v1.engine.core import shutdown vllm.v1.engine.core.DPEngineCoreProc.shutdown = shutdown -from vllm_mindspore.v1.core.kv_cache_utils import (get_kv_cache_config, - unify_kv_cache_configs) - -vllm.v1.core.kv_cache_utils.get_kv_cache_config = get_kv_cache_config -vllm.v1.engine.core.get_kv_cache_config = get_kv_cache_config -vllm.v1.core.kv_cache_utils.unify_kv_cache_configs = unify_kv_cache_configs -vllm.v1.engine.core.unify_kv_cache_configs = unify_kv_cache_configs - -from vllm_mindspore.v1.core.single_type_kv_cache_manager import ( - spec_manager_map) - -vllm.v1.core.single_type_kv_cache_manager.spec_manager_map = spec_manager_map - from vllm_mindspore.utils import ( make_tensor_with_pad, async_tensor_h2d, @@ -105,18 +92,6 @@ from vllm_mindspore.utils import ( ms_memory_profiling, ) -from vllm_mindspore.config import CacheDType, _CacheConfig, \ - get_current_and_parent_class_attr_docs - -vllm.config.CacheConfig = _CacheConfig -vllm.config.CacheDType = CacheDType -vllm.config.get_attr_docs = get_current_and_parent_class_attr_docs -import vllm.engine.arg_utils - -vllm.engine.arg_utils.CacheDType = CacheDType -vllm.engine.arg_utils.CacheConfig = _CacheConfig -vllm.engine.arg_utils.get_attr_docs = get_current_and_parent_class_attr_docs - vllm.utils.make_tensor_with_pad = make_tensor_with_pad vllm.utils.async_tensor_h2d = async_tensor_h2d vllm.utils.cuda_is_initialized = ascend_is_initialized @@ -197,17 +172,11 @@ from vllm_mindspore.worker.cache_engine import ( ms_swap_out, ) -from vllm_mindspore.utils import get_dtype_size import vllm.worker.cache_engine vllm.worker.cache_engine.CacheEngine._allocate_kv_cache = ms_allocate_kv_cache vllm.worker.cache_engine.CacheEngine.swap_in = ms_swap_in vllm.worker.cache_engine.CacheEngine.swap_out = ms_swap_out -vllm.worker.cache_engine.get_dtype_size = get_dtype_size - -import vllm.v1.kv_cache_interface 
- -vllm.v1.kv_cache_interface.get_dtype_size = get_dtype_size from vllm_mindspore.model_executor.model_loader.weight_utils import ( safetensors_weights_iterator, ) @@ -279,15 +248,18 @@ vllm.executor.ray_distributed_executor.initialize_ray_cluster = ( initialize_ray_cluster) vllm.v1.utils.CoreEngineActorManager.__init__ = core_engine_actor_manager_init -from .config import (_verify_quantization, _verify_args, vllm_config_post_init, - vllm_config_get_quantization_config, model_post_init, - _get_and_verify_dtype, stateless_init_dp_group, - has_unfinished_dp, v1_process_validate_sampling_params) +from .config import ( + _verify_quantization, + _verify_args, + vllm_config_post_init, + model_post_init, + _get_and_verify_dtype, + stateless_init_dp_group, + has_unfinished_dp, +) vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init -vllm.config.VllmConfig._get_quantization_config = staticmethod( - vllm_config_get_quantization_config) vllm.config.SchedulerConfig._verify_args = _verify_args vllm.config.CompilationConfig.model_post_init = model_post_init vllm.config._get_and_verify_dtype = _get_and_verify_dtype @@ -344,22 +316,16 @@ from vllm.inputs.registry import InputProcessingContext InputProcessingContext.call_hf_processor = call_hf_processor -from vllm_mindspore.multimodal.inputs import (as_kwargs, batched_reduce_data, - flat_build_elems, - flat_reduce_data, from_items, - MultiModalFieldElem, _try_stack) +from vllm_mindspore.multimodal.inputs import as_kwargs, \ + from_items, MultiModalFieldElem, build_elems -from vllm.multimodal.inputs import MultiModalBatchedField -from vllm.multimodal.inputs import MultiModalFlatField from vllm.multimodal.inputs import MultiModalKwargs +from vllm.multimodal.inputs import MultiModalFlatField -MultiModalBatchedField._reduce_data = batched_reduce_data -MultiModalFlatField.build_elems = flat_build_elems -MultiModalFlatField._reduce_data = flat_reduce_data MultiModalKwargs.as_kwargs = as_kwargs MultiModalKwargs.from_items = from_items -MultiModalKwargs._try_stack = _try_stack +MultiModalFlatField.build_elems = build_elems vllm.multimodal.inputs.MultiModalFieldElem = MultiModalFieldElem vllm.v1.serial_utils.MultiModalFieldElem = MultiModalFieldElem @@ -458,18 +424,74 @@ vllm.v1.utils.copy_slice = copy_slice vllm.v1.worker.gpu_input_batch.copy_slice = copy_slice vllm.v1.utils.CoreEngineActorManager.create_dp_placement_groups = staticmethod( create_dp_placement_groups) - +# Enhanced sampler monkey patching with conversion wrappers from vllm_mindspore.model_executor.layers.sampler import ( + sampler_forward_with_conversion, _apply_top_k_top_p, _apply_min_p, _random_sample, + _greedy_sample, + _multinomial, + _get_ranks, + get_logprobs, + get_pythonized_sample_results, + _build_sampler_output, + _get_prompt_logprob_if_needed, + _get_sampled_logprob_if_needed, ) + +from vllm_mindspore.model_executor.sampling_metadata import ( + MindSporeSamplingMetadata, + SamplingTensors, + TensorConverter, + create_mindspore_sampling_metadata, +) + import vllm.model_executor.layers +# Setup conversion wrapper for main Sampler.forward method +def _setup_sampler_conversion(): + """Setup conversion wrappers for critical sampler methods.""" + # Store original forward method + original_forward = vllm.model_executor.layers.sampler.Sampler.forward + + # Apply conversion wrapper + vllm.model_executor.layers.sampler.Sampler.forward = sampler_forward_with_conversion(original_forward) + + print("INFO: Sampler conversion 
wrappers initialized") + +# Apply conversion setup +_setup_sampler_conversion() + +# Core sampling functions with conversion wrappers (CRITICAL FIXES) vllm.model_executor.layers.sampler._apply_top_k_top_p = _apply_top_k_top_p vllm.model_executor.layers.sampler._apply_min_p = _apply_min_p vllm.model_executor.layers.sampler._random_sample = _random_sample +# Enhanced sampling functions for beam search +vllm.model_executor.layers.sampler._greedy_sample = _greedy_sample +vllm.model_executor.layers.sampler._multinomial = _multinomial + +# Logprobs and ranking functions with conversion (CRITICAL for beam search) +vllm.model_executor.layers.sampler.get_logprobs = get_logprobs +vllm.model_executor.layers.sampler._get_ranks = _get_ranks + +# Result processing functions +vllm.model_executor.layers.sampler.get_pythonized_sample_results = get_pythonized_sample_results +vllm.model_executor.layers.sampler._build_sampler_output = _build_sampler_output +vllm.model_executor.layers.sampler._get_prompt_logprob_if_needed = _get_prompt_logprob_if_needed +vllm.model_executor.layers.sampler._get_sampled_logprob_if_needed = _get_sampled_logprob_if_needed + +# Add tensor conversion utilities to vllm namespace for broader access +vllm.model_executor.sampling_metadata.MindSporeSamplingMetadata = MindSporeSamplingMetadata +vllm.model_executor.sampling_metadata.TensorConverter = TensorConverter +vllm.model_executor.sampling_metadata.create_mindspore_sampling_metadata = create_mindspore_sampling_metadata + +# Update SamplingTensors.from_lists to use MindSpore version +vllm.model_executor.sampling_metadata.SamplingTensors.from_lists = SamplingTensors.from_lists + +print("INFO: vLLM-MindSpore cross-framework tensor conversion initialized") + from vllm_mindspore.v1.sample.ops.penalties import _convert_to_tensors import vllm.v1.sample.ops.penalties @@ -571,8 +593,4 @@ DPAsyncMPClient.get_core_engine_for_request = get_core_engine_for_request DPAsyncMPClient.add_request_async = add_request_async DPAsyncMPClient.process_engine_outputs = staticmethod(process_engine_outputs) -from vllm.v1.engine.processor import Processor - -Processor._validate_sampling_params = v1_process_validate_sampling_params - check_ready() diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index e581c87d..9b84509c 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -17,15 +17,139 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""A layer that samples the next tokens from the model's outputs.""" +""" +A layer that samples the next tokens from the model's outputs. +This module provides: +1. Core sampling algorithms: greedy, multinomial, top-k/top-p/min-p +2. Beam search support through enhanced logprob computation +3. Cross-framework compatibility via unified tensor conversion +4. 
Memory-optimized implementations for large vocabulary models + +Module organization: +- Sampling algorithms: _greedy_sample, _multinomial, _random_sample +- Logit processing: _apply_top_k_top_p, _apply_min_p +- Logprob computation: get_logprobs (critical for beam search) +- Tensor conversion: Integration with TensorConverter +""" + +from typing import Optional, Union, Tuple, Dict +from dataclasses import dataclass import mindspore as ms from mindspore import mint -from vllm.model_executor.sampling_metadata import SequenceGroupToSample +from vllm.model_executor.sampling_metadata import SequenceGroupToSample, SamplingMetadata +from vllm.sampling_params import SamplingType +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs, SequenceOutput, CompletionSequenceGroupOutput + +# Import unified tensor conversion utilities +from vllm_mindspore.model_executor.sampling_metadata import ( + MindSporeSamplingMetadata, + create_mindspore_sampling_metadata, + TensorConverter +) # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = list[tuple[list[int], list[int]]] +# Types of temporary data structures used for +# computing sample_result +SampleMetadataType = Dict[SamplingType, Tuple[list[int], list[SequenceGroupToSample]]] +MultinomialSamplesType = Dict[SamplingType, ms.Tensor] +SampleResultsDictType = Dict[int, Tuple[list[int], list[int]]] + + +# Encapsulates temporary data structures for computing +# sample_result. +# +# * For multi-step scheduling: must be returned +# by `Sampler.forward()` and used later to compute the pythonized +# sample_result +# +# * For single-step scheduling: consumed immediately +# inside `Sampler.forward()` to compute pythonized sample_result. +@dataclass +class SampleResultArgsType: + sample_metadata: SampleMetadataType + multinomial_samples: MultinomialSamplesType + sample_results_dict: SampleResultsDictType + sampling_metadata: SamplingMetadata + greedy_samples: Optional[ms.Tensor] + + +# Union of non-deferred (single-step scheduling) +# vs deferred (multi-step scheduling) +# sample result types +MaybeDeferredSampleResultType = Union[SampleResultType, SampleResultArgsType] + +# Abbreviation of the _sample() return type +SampleReturnType = Tuple[MaybeDeferredSampleResultType, Optional[ms.Tensor]] + + +class SamplerOutput: + """For each sequence group, we generate a list of SequenceOutput object, + each of which contains one possible candidate for the next token. + + This data structure implements methods, so it can be used like a list, but + also has optional fields for device tensors. + """ + + def __init__( + self, + outputs: list[CompletionSequenceGroupOutput], + sampled_token_probs: Optional[ms.Tensor] = None, + sampled_token_ids: Optional[ms.Tensor] = None, + logprobs: Optional[ms.Tensor] = None, + deferred_sample_results_args: Optional[SampleResultArgsType] = None, + ): + self.outputs = outputs + self.sampled_token_probs = sampled_token_probs + self.sampled_token_ids = sampled_token_ids + self.logprobs = logprobs + self.deferred_sample_results_args = deferred_sample_results_args + + def __getitem__(self, idx: int) -> CompletionSequenceGroupOutput: + return self.outputs[idx] + + def __len__(self) -> int: + return len(self.outputs) + + def __iter__(self): + return iter(self.outputs) + + +def _get_next_prompt_tokens( + seq_group: SequenceGroupToSample) -> tuple[int, ...]: + """Get a list of next prompt tokens to compute logprob from a + given sequence group. + + It is used to compute prompt logprob. 
Imagine you have logprob for each + query token. Query token needs to know the next prompt token id to compute + prompt logprob. This is a helper to obtain next prompt token ids. + + This API has to be used only when the caller knows seq_group is in prefill + stage. + + Returns: + A list of next prompt tokens to compute logprob. + """ + assert seq_group.is_prompt, ( + "Caller should ensure the sequence group is in a prefill stage.") + seq_ids = seq_group.seq_ids + query_len = seq_group.query_len + assert query_len is not None + # prompt has only 1 seq id. + assert len(seq_ids) == 1 + seq_data = seq_group.seq_data[seq_ids[0]] + computed_len = seq_data.get_num_computed_tokens() + prompt_tokens = seq_data.prompt_token_ids + # +1 because we are looking for a next prompt token. + next_token_index_start = computed_len + 1 + next_token_index_end = min(computed_len + query_len + 1, + len(prompt_tokens)) + next_prompt_tokens = prompt_tokens[ + next_token_index_start:next_token_index_end] + return next_prompt_tokens + def _apply_top_k_top_p( logits: ms.Tensor, @@ -116,3 +240,506 @@ def _random_sample( results.append((next_token_ids, parent_ids)) sample_idx += num_parent_seqs return results + + +def _multinomial( + probs: ms.Tensor, + num_samples: int, + seq_groups: Optional[list[SequenceGroupToSample]] = None, +) -> ms.Tensor: + """MindSpore-compatible multinomial sampling avoiding GPU<->CPU sync. + + Args: + probs: [batch_size, vocab_size] probability distribution + num_samples: Number of samples per distribution + seq_groups: Sequence groups for deterministic sampling + + Returns: + ms.Tensor: [batch_size, num_samples] sampled token ids + """ + if num_samples > 1: + probs = probs.repeat_interleave(num_samples, dim=0) + + # Create exponential distribution tensor + q = mint.empty_like(probs) + + if seq_groups is None: + # Standard random sampling + q.exponential_() + else: + # Deterministic sampling with per-sequence generators + sample_idx = 0 + for seq_group in seq_groups: + seq_ids = seq_group.seq_ids + stride = len(seq_ids) * num_samples + + # MindSpore generator compatibility + if seq_group.generator is not None: + # Use MindSpore random seed for deterministic sampling + ms.set_seed(seq_group.generator.initial_seed()) + + q[sample_idx:sample_idx + stride].exponential_() + sample_idx += stride + + # Gumbel-max trick: argmax(log(probs) - log(-log(uniform))) + return probs.div_(q).argmax(dim=1).view(-1, num_samples) + + +def _greedy_sample( + selected_seq_groups: list[SequenceGroupToSample], + samples: ms.Tensor, +) -> SampleResultType: + """Enhanced greedy sampling with beam search support. + + Args: + selected_seq_groups: Selected sequence groups + samples: [num_samples] argmax results tensor + + Returns: + SampleResultType: [(next_token_ids, parent_ids), ...] 
+ """ + samples_lst = samples.asnumpy().tolist() + sample_idx = 0 + results: SampleResultType = [] + + for seq_group in selected_seq_groups: + if not seq_group.do_sample: + results.append(([], [])) + continue + + seq_ids = seq_group.seq_ids + num_parent_seqs = len(seq_ids) + + # Beam search: can have multiple sequences per group + if num_parent_seqs == 1: + # Standard greedy sampling + parent_ids = [0] + next_token_ids = [samples_lst[sample_idx]] + else: + # Beam search: multiple parents + parent_ids = list(range(num_parent_seqs)) + next_token_ids = samples_lst[sample_idx:sample_idx + num_parent_seqs] + + results.append((next_token_ids, parent_ids)) + sample_idx += num_parent_seqs + + return results + + +def _get_ranks(x: ms.Tensor, indices: ms.Tensor) -> ms.Tensor: + """ + Calculate ranks of chosen tokens in MindSpore with optimized memory usage. + + Args: + x: [N, M] logprob tensor where N=num_tokens, M=vocab_size + indices: [N] chosen token indices + + Returns: + ms.Tensor: [N] ranks of chosen tokens (1-based) + """ + # Memory-efficient implementation: avoid creating intermediate vals tensor + # Use gather to get values and expand dimensions in one operation + chosen_values = x.gather(1, indices.unsqueeze(1)) # [N, 1] + + # Count tokens with higher probability using broadcast comparison + # This avoids storing the vals tensor separately + rank_counts = (x > chosen_values).sum(1) + + # Return 1-based rank + return rank_counts.add_(1) + + +def _get_prompt_logprob_if_needed( + seq_group: SequenceGroupToSample, + selected_logprobs: ms.Tensor, + ranks: ms.Tensor, + top_token_ids: ms.Tensor, + top_logprobs: ms.Tensor, + selected_logprobs_idx: int, + top_logprob_idx: int, +): + """Compute prompt logprobs if needed (MindSpore compatible).""" + sampling_params = seq_group.sampling_params + is_prompt = seq_group.is_prompt + + prompt_logprobs: Optional[PromptLogprobs] = None + if is_prompt and sampling_params.prompt_logprobs is not None: + prompt_logprobs = [] + num_logprobs = sampling_params.prompt_logprobs + next_prompt_tokens = _get_next_prompt_tokens(seq_group) + + # Convert to Python lists for efficiency + selected_logprob_items = selected_logprobs[ + selected_logprobs_idx:selected_logprobs_idx + len(next_prompt_tokens) + ].tolist() + rank_items = ranks[ + selected_logprobs_idx:selected_logprobs_idx + len(next_prompt_tokens) + ].tolist() + + for idx, token_id in enumerate(next_prompt_tokens): + # Build prompt logprobs dictionary + prompt_logprobs_dict: dict[int, tuple[float, int]] = { + token_id: (selected_logprob_items[idx], rank_items[idx]) + } + + # Add top-k logprobs if requested + if num_logprobs > 0: + top_ids = top_token_ids[top_logprob_idx, :num_logprobs].tolist() + top_probs = top_logprobs[top_logprob_idx, :num_logprobs].tolist() + top_ranks = range(1, num_logprobs + 1) + + prompt_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip(top_ids, top_probs, top_ranks) + }) + + prompt_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in prompt_logprobs_dict.items() + }) + top_logprob_idx += 1 + + selected_logprobs_idx += len(next_prompt_tokens) + + return prompt_logprobs, top_logprob_idx, selected_logprobs_idx + + +def _get_sampled_logprob_if_needed( + seq_group: SequenceGroupToSample, + sample_result: tuple[list[int], list[int]], + selected_logprobs: ms.Tensor, + ranks: ms.Tensor, + top_token_ids: ms.Tensor, + top_logprobs: ms.Tensor, + selected_logprobs_idx: int, + top_logprob_idx: int, +): + """Compute sample 
logprobs for beam search candidates.""" + seq_ids = seq_group.seq_ids + num_logprobs = seq_group.sampling_params.logprobs + sampled_logprobs: SampleLogprobs = [] + next_token_ids, parent_seq_ids = sample_result + + if seq_group.do_sample: + assert len(next_token_ids) > 0 + + if num_logprobs is None: + # No detailed logprobs requested + for next_token_id in next_token_ids: + sampled_logprobs.append({next_token_id: Logprob(float('inf'))}) + else: + # Detailed logprobs for beam search + selected_logprob_items = selected_logprobs[ + selected_logprobs_idx:selected_logprobs_idx + len(next_token_ids) + ].tolist() + rank_items = ranks[ + selected_logprobs_idx:selected_logprobs_idx + len(next_token_ids) + ].tolist() + + for idx, (next_token_id, parent_id) in enumerate(zip(next_token_ids, parent_seq_ids)): + # Build sampled logprobs dictionary + sampled_logprobs_dict = { + next_token_id: (selected_logprob_items[idx], rank_items[idx]) + } + + # Add top-k candidates (CRITICAL for beam search) + if num_logprobs is not None and num_logprobs > 0: + top_ids = top_token_ids[ + top_logprob_idx + parent_id, :num_logprobs + ].tolist() + top_probs = top_logprobs[ + top_logprob_idx + parent_id, :num_logprobs + ].tolist() + top_ranks = range(1, num_logprobs + 1) + + sampled_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip(top_ids, top_probs, top_ranks) + }) + + sampled_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in sampled_logprobs_dict.items() + }) + + # Update indices for next sequence group + selected_logprobs_idx += len(next_token_ids) + top_logprob_idx += len(seq_ids) + + return sampled_logprobs, top_logprob_idx, selected_logprobs_idx + + +def get_logprobs( + logprobs: ms.Tensor, + sampling_metadata, # Union[MindSporeSamplingMetadata, SamplingMetadata] + sample_results: SampleResultType, +) -> tuple[list[Optional[PromptLogprobs]], list[SampleLogprobs]]: + """Calculate logprobs for beam search candidates. + + This is the CRITICAL function for beam search - it generates the + top-k candidates that beam search uses for expansion. 
+ + Args: + logprobs: [num_tokens, vocab_size] model log probabilities + sampling_metadata: Sampling configuration and metadata (auto-detects type) + sample_results: Sampling results from _sample() + + Returns: + tuple: (prompt_logprobs, sample_logprobs) for each sequence group + """ + # Auto-detect and convert sampling metadata type + if not isinstance(sampling_metadata, MindSporeSamplingMetadata): + sampling_metadata = create_mindspore_sampling_metadata(sampling_metadata) + + # Collect query indices and next token IDs + query_indices: list[int] = [] + next_token_ids: list[int] = [] + largest_num_logprobs = -1 + + # Process each sequence group for logprob collection + for (seq_group, sample_result) in zip(sampling_metadata.seq_groups, sample_results): + sampling_params = seq_group.sampling_params + + # Handle prompt logprobs + if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): + largest_num_logprobs = max(largest_num_logprobs, sampling_params.prompt_logprobs) + next_prompt_tokens = _get_next_prompt_tokens(seq_group) + query_indices.extend(seq_group.prompt_logprob_indices) + next_token_ids.extend(next_prompt_tokens) + + # Handle sample logprobs (KEY for beam search) + if seq_group.do_sample: + token_ids, parent_seq_ids = sample_result + query_idx = seq_group.sample_indices[0] + + # Extend query indices for all parent sequences + query_indices.extend([query_idx + parent_id for parent_id in parent_seq_ids]) + next_token_ids.extend(token_ids) + + # Update largest logprobs count (beam search uses this) + if sampling_params.logprobs is not None: + largest_num_logprobs = max(largest_num_logprobs, sampling_params.logprobs) + + if len(query_indices) == 0: + # No logprobs needed + empty_sampled_logprob: SampleLogprobs = [] + empty_prompt_logprob: Optional[PromptLogprobs] = None + num_seq_groups = len(sampling_metadata.seq_groups) + return [empty_prompt_logprob] * num_seq_groups, [empty_sampled_logprob] * num_seq_groups + + selected_logprobs, ranks = None, None + top_logprobs, top_token_ids = None, None + + # Calculate logprobs if needed + if largest_num_logprobs >= 0: + # Convert to MindSpore tensors + query_indices_gpu = ms.Tensor(query_indices, dtype=ms.int64) + next_token_ids_gpu = ms.Tensor(next_token_ids, dtype=ms.int64) + + # Extract selected token logprobs + selected_logprobs = logprobs[query_indices_gpu, next_token_ids_gpu] + + # Calculate ranks using MindSpore operations + ranks = _get_ranks(logprobs[query_indices_gpu], next_token_ids_gpu) + + # Get top-k logprobs for beam search candidates + if largest_num_logprobs > 0: + top_logprobs, top_token_ids = mint.topk(logprobs, largest_num_logprobs, dim=-1) + top_logprobs = top_logprobs.asnumpy() + top_token_ids = top_token_ids.asnumpy() + + # Transfer to CPU + selected_logprobs = selected_logprobs.asnumpy() + ranks = ranks.asnumpy() + + # Build final logprobs results + prompt_logprobs_per_seq_group: list[Optional[PromptLogprobs]] = [] + sample_logprobs_per_seq_group: list[SampleLogprobs] = [] + top_logprob_idx = 0 + selected_logprobs_idx = 0 + + for seq_group, sample_result in zip(sampling_metadata.seq_groups, sample_results): + # Process prompt logprobs + (prompt_logprobs, top_logprob_idx, selected_logprobs_idx) = _get_prompt_logprob_if_needed( + seq_group, selected_logprobs, ranks, top_token_ids, top_logprobs, + selected_logprobs_idx, top_logprob_idx + ) + prompt_logprobs_per_seq_group.append(prompt_logprobs) + + # Process sample logprobs (CRITICAL for beam search) + (sampled_logprobs, top_logprob_idx, 
selected_logprobs_idx) = _get_sampled_logprob_if_needed( + seq_group, sample_result, selected_logprobs, ranks, top_token_ids, + top_logprobs, selected_logprobs_idx, top_logprob_idx + ) + sample_logprobs_per_seq_group.append(sampled_logprobs) + + return prompt_logprobs_per_seq_group, sample_logprobs_per_seq_group + + +def get_pythonized_sample_results( + sample_result_args: SampleResultArgsType +) -> SampleResultType: + """Convert MindSpore GPU tensors to Python lists. + + This function handles the GPU→CPU sync and Pythonization of + sampling results for both single-step and multi-step scheduling. + + Args: + sample_result_args: GPU-side sampling arguments + + Returns: + SampleResultType: Python list of (token_ids, parent_ids) tuples + """ + # Unpack arguments + (sample_metadata, sampling_metadata, greedy_samples, + multinomial_samples, sample_results_dict) = ( + sample_result_args.sample_metadata, + sample_result_args.sampling_metadata, + sample_result_args.greedy_samples, + sample_result_args.multinomial_samples, + sample_result_args.sample_results_dict, + ) + + # Process each sampling type + for sampling_type in SamplingType: + if sampling_type not in sample_metadata: + continue + + (seq_group_id, seq_groups) = sample_metadata[sampling_type] + + if sampling_type == SamplingType.GREEDY: + # Greedy sampling: use enhanced _greedy_sample + sample_results = _greedy_sample(seq_groups, greedy_samples) + elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): + # Random sampling: use enhanced _random_sample + sample_results = _random_sample(seq_groups, multinomial_samples[sampling_type]) + + # Update results dictionary + sample_results_dict.update(zip(seq_group_id, sample_results)) + + # Return ordered results + return [ + sample_results_dict.get(i, ([], [])) + for i in range(len(sampling_metadata.seq_groups)) + ] + + +def _build_sampler_output( + maybe_deferred_sample_results: MaybeDeferredSampleResultType, + sampling_metadata: SamplingMetadata, + prompt_logprobs: Optional[list[Optional[PromptLogprobs]]], + sample_logprobs: Optional[list[SampleLogprobs]], + on_device_tensors: Optional[tuple[ms.Tensor, ms.Tensor, ms.Tensor]], + skip_sampler_cpu_output: bool = False, +) -> SamplerOutput: + """Build SamplerOutput with MindSpore tensor compatibility. 
+ + Args: + maybe_deferred_sample_results: Sampling results (immediate or deferred) + sampling_metadata: Sampling metadata + prompt_logprobs: Prompt logprobs per sequence group + sample_logprobs: Sample logprobs per sequence group + on_device_tensors: GPU-side tensors (probs, logprobs, sampled_tokens) + skip_sampler_cpu_output: Whether to defer CPU serialization + + Returns: + SamplerOutput: Final sampler output with proper tensor formats + """ + sampler_output: list[CompletionSequenceGroupOutput] = [] + + if skip_sampler_cpu_output: + # Multi-step scheduling: defer Pythonization + assert isinstance(maybe_deferred_sample_results, SampleResultArgsType) + deferred_sample_results_args = maybe_deferred_sample_results + else: + # Single-step scheduling: immediate Pythonization + assert prompt_logprobs is not None + assert sample_logprobs is not None + assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) + deferred_sample_results_args = None + + # Build completion outputs + for (seq_group, sample_result, group_prompt_logprobs, group_sample_logprobs) in zip( + sampling_metadata.seq_groups, maybe_deferred_sample_results, + prompt_logprobs, sample_logprobs + ): + seq_ids = seq_group.seq_ids + next_token_ids, parent_ids = sample_result + seq_outputs: list[SequenceOutput] = [] + + # Create sequence outputs for each sampled token + for parent_id, next_token_id, logprobs in zip( + parent_ids, next_token_ids, group_sample_logprobs + ): + seq_outputs.append( + SequenceOutput(seq_ids[parent_id], next_token_id, logprobs) + ) + + sampler_output.append( + CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs) + ) + + # Handle on-device tensors + if on_device_tensors is not None: + (sampled_token_probs, logprobs_tensor, sampled_token_ids) = on_device_tensors + else: + sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, None) + + return SamplerOutput( + outputs=sampler_output, + sampled_token_probs=sampled_token_probs, + sampled_token_ids=sampled_token_ids, + logprobs=logprobs_tensor, + deferred_sample_results_args=deferred_sample_results_args + ) + +def apply_sampling_penalties( + logits: ms.Tensor, + p: Optional[ms.Tensor] = None, + k: Optional[ms.Tensor] = None, + min_p: Optional[ms.Tensor] = None, +) -> ms.Tensor: + """ + Apply sampling penalties to logits. + + This function assumes all input tensors are already MindSpore tensors. + Use TensorConverter.ensure_mindspore_tensor() for conversion if needed. + + Args: + logits: [batch_size, vocab_size] MindSpore tensor + p: top-p parameters as MindSpore tensor (optional) + k: top-k parameters as MindSpore tensor (optional) + min_p: min-p parameters as MindSpore tensor (optional) + + Returns: + ms.Tensor: Processed logits with penalties applied + """ + processed_logits = logits + + # Apply top-k/top-p if provided + if p is not None and k is not None: + processed_logits = _apply_top_k_top_p(processed_logits, p, k) + + # Apply min-p if provided + if min_p is not None: + processed_logits = _apply_min_p(processed_logits, min_p) + + return processed_logits + + +def sampler_forward_with_conversion(original_forward): + """ + Decorator to wrap Sampler.forward() with automatic tensor conversion. + + This enables transparent cross-framework compatibility for the complete + sampling pipeline including beam search. 
+ """ + def wrapper(self, logits, sampling_metadata): + # Convert sampling metadata to MindSpore-compatible version + ms_sampling_metadata = create_mindspore_sampling_metadata(sampling_metadata) + # Call the original forward method with converted metadata + return original_forward(self, logits, ms_sampling_metadata) + return wrapper + + + diff --git a/vllm_mindspore/model_executor/sampling_metadata.py b/vllm_mindspore/model_executor/sampling_metadata.py index 5c9efd06..0289fbe4 100644 --- a/vllm_mindspore/model_executor/sampling_metadata.py +++ b/vllm_mindspore/model_executor/sampling_metadata.py @@ -18,10 +18,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +MindSpore Sampling Metadata Module + +This module provides: +1. SamplingTensors: Core tensor structures for sampling operations +2. TensorConverter: Unified cross-framework tensor conversion utilities +3. MindSporeSamplingMetadata: MindSpore-compatible metadata wrapper + +Key design principles: +- Single responsibility: Each class has a clear, focused purpose +- Unified interfaces: All tensor conversion goes through TensorConverter +- Minimal dependencies: Avoid circular imports +""" + from array import array from dataclasses import dataclass +from typing import Optional, Dict import mindspore as ms +import torch from mindspore import Tensor from vllm.utils import is_pin_memory_available, make_tensor_with_pad @@ -127,3 +143,177 @@ class SamplingTensors: prompt_tokens=prompt_t, output_tokens=output_t, ) + + +class TensorConverter: + """Unified tensor conversion utilities for cross-framework compatibility.""" + + _DTYPE_MAPPING = { + torch.float32: ms.float32, + torch.float16: ms.float16, + torch.bfloat16: ms.bfloat16, + torch.int32: ms.int32, + torch.int64: ms.int64, + torch.bool: ms.bool_, + torch.uint8: ms.uint8, + } + + @classmethod + def ensure_mindspore_tensor(cls, tensor) -> ms.Tensor: + """ + Unified tensor conversion entry point. + + Args: + tensor: Input tensor (PyTorch, MindSpore, or other types) + + Returns: + ms.Tensor: MindSpore tensor + """ + if tensor is None: + return None + + if isinstance(tensor, ms.Tensor): + return tensor + + if isinstance(tensor, torch.Tensor): + return cls.torch_to_mindspore(tensor) + + # Handle other types (lists, scalars, etc.) 
+ try: + return ms.Tensor(tensor) + except Exception: + print(f"Warning: Failed to convert {type(tensor)} to MindSpore tensor, returning original") + return tensor + + @classmethod + def torch_to_mindspore(cls, tensor: torch.Tensor) -> ms.Tensor: + """Convert PyTorch tensor to MindSpore tensor with robust error handling.""" + if tensor is None: + return None + + if not isinstance(tensor, torch.Tensor): + return tensor + + try: + # Handle scalar tensors + if tensor.numel() == 1: + return ms.Tensor(tensor.item(), dtype=cls._convert_dtype(tensor.dtype)) + + # Convert via numpy for efficiency + if tensor.is_cuda: + numpy_array = tensor.detach().cpu().numpy() + else: + numpy_array = tensor.detach().numpy() + + ms_tensor = ms.Tensor(numpy_array, dtype=cls._convert_dtype(tensor.dtype)) + return ms_tensor + + except Exception as e: + # Robust fallback: convert via Python lists + try: + if tensor.is_cuda: + data = tensor.detach().cpu().tolist() + else: + data = tensor.detach().tolist() + return ms.Tensor(data, dtype=cls._convert_dtype(tensor.dtype)) + except: + # Last resort: return original tensor and log warning + print(f"Warning: Failed to convert tensor {tensor.shape} {tensor.dtype}, using original") + return tensor + + @classmethod + def _convert_dtype(cls, torch_dtype: torch.dtype) -> ms.dtype: + """Convert PyTorch dtype to MindSpore dtype.""" + return cls._DTYPE_MAPPING.get(torch_dtype, ms.float32) + + @classmethod + def convert_categorized_sample_indices(cls, categorized_indices: dict) -> dict: + """Convert categorized sample indices from PyTorch to MindSpore.""" + converted_indices = {} + for sampling_type, indices in categorized_indices.items(): + converted_indices[sampling_type] = TensorConverter.ensure_mindspore_tensor(indices) + return converted_indices + + @classmethod + def convert_batch_tensors(cls, *tensors): + """ + Convert multiple tensors in batch for efficiency. 
+ + Args: + *tensors: Variable number of tensors to convert + + Returns: + tuple: Converted MindSpore tensors + """ + return tuple(TensorConverter.ensure_mindspore_tensor(tensor) for tensor in tensors) + + +# Enhanced MindSpore Sampling Metadata with proper attribute handling +class MindSporeSamplingMetadata: + """MindSpore-compatible wrapper for SamplingMetadata with complete tensor conversion.""" + + def __init__(self, original_metadata): + self.original = original_metadata + self._sampling_tensors = None + self._converted_cache = {} + + @property + def seq_groups(self): + """Pass through sequence groups directly.""" + return self.original.seq_groups + + @property + def selected_token_indices(self) -> Optional[ms.Tensor]: + """Convert selected token indices to MindSpore tensor.""" + if 'selected_token_indices' not in self._converted_cache: + original_indices = getattr(self.original, 'selected_token_indices', None) + self._converted_cache['selected_token_indices'] = TensorConverter.ensure_mindspore_tensor(original_indices) + return self._converted_cache['selected_token_indices'] + + @property + def categorized_sample_indices(self) -> dict: + """Convert categorized sample indices to MindSpore tensors.""" + if 'categorized_sample_indices' not in self._converted_cache: + original_indices = getattr(self.original, 'categorized_sample_indices', {}) + self._converted_cache['categorized_sample_indices'] = TensorConverter.convert_categorized_sample_indices(original_indices) + return self._converted_cache['categorized_sample_indices'] + + @property + def num_prompts(self) -> int: + """Pass through num_prompts.""" + return getattr(self.original, 'num_prompts', 0) + + @property + def skip_sampler_cpu_output(self) -> bool: + """Pass through skip_sampler_cpu_output.""" + return getattr(self.original, 'skip_sampler_cpu_output', False) + + @property + def reuse_sampling_tensors(self) -> bool: + """Pass through reuse_sampling_tensors.""" + return getattr(self.original, 'reuse_sampling_tensors', False) + + @property + def sampling_tensors(self): + """Get MindSpore-compatible sampling tensors.""" + if self._sampling_tensors is None: + original_tensors = getattr(self.original, 'sampling_tensors', None) + # Use SamplingTensors directly as it already supports MindSpore + self._sampling_tensors = original_tensors + return self._sampling_tensors + + # Pass-through all other attributes + def __getattr__(self, name): + return getattr(self.original, name) + + def __repr__(self) -> str: + return f"MindSporeSamplingMetadata(original={self.original})" + + +# Convenience functions +def create_mindspore_sampling_metadata(original_metadata) -> MindSporeSamplingMetadata: + """Create MindSpore-compatible sampling metadata from original.""" + if isinstance(original_metadata, MindSporeSamplingMetadata): + return original_metadata + return MindSporeSamplingMetadata(original_metadata) + -- Gitee
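Editor's note: the core idea of this patch is that vllm's `Sampler.forward` is monkey-patched by `sampler_forward_with_conversion` so that the incoming sampling metadata is wrapped and its tensors are converted lazily (and cached) at the framework boundary via `MindSporeSamplingMetadata`/`TensorConverter`. The sketch below reproduces that wrap-and-cache pattern in isolation; it is only an illustration under stated assumptions — `FakeSampler`, `FakeMetadata`, `LazyMetadata`, and `to_backend` are hypothetical stand-ins, not names from the patch, and no vllm/torch/MindSpore APIs are used.

    # Minimal, self-contained sketch of the wrap-and-convert pattern used by the patch.
    # All names here are illustrative stand-ins; the real code targets vllm's Sampler
    # and converts torch tensors to MindSpore tensors.
    from dataclasses import dataclass, field
    from typing import Any, Callable, Dict, List


    def to_backend(value: Any) -> Any:
        """Stand-in for TensorConverter.ensure_mindspore_tensor: convert on demand."""
        if isinstance(value, list):      # pretend lists are the "foreign" tensor type
            return tuple(value)          # ...and tuples are the "native" one
        return value


    class LazyMetadata:
        """Wrap original metadata; convert attributes lazily and cache the results."""

        def __init__(self, original: Any) -> None:
            self._original = original
            self._cache: Dict[str, Any] = {}

        def __getattr__(self, name: str) -> Any:
            # Only called for attributes not set in __init__, so no recursion here.
            if name not in self._cache:
                self._cache[name] = to_backend(getattr(self._original, name))
            return self._cache[name]


    @dataclass
    class FakeMetadata:
        selected_token_indices: List[int] = field(default_factory=lambda: [0, 1, 2])


    class FakeSampler:
        def forward(self, logits: Any, sampling_metadata: Any) -> Any:
            # The "native" code path only ever sees converted attributes.
            return sampling_metadata.selected_token_indices


    def forward_with_conversion(original_forward: Callable) -> Callable:
        """Same shape as sampler_forward_with_conversion: wrap metadata, then delegate."""
        def wrapper(self, logits, sampling_metadata):
            return original_forward(self, logits, LazyMetadata(sampling_metadata))
        return wrapper


    # Monkey-patch exactly once at import time, as __init__.py does for Sampler.forward.
    FakeSampler.forward = forward_with_conversion(FakeSampler.forward)

    if __name__ == "__main__":
        print(FakeSampler().forward(logits=None, sampling_metadata=FakeMetadata()))
        # -> (0, 1, 2): the wrapped forward received lazily converted metadata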