diff --git a/models/cv/classification/clip/ixformer/README.md b/models/cv/classification/clip/ixformer/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5b43ffada6dc50dadd1d13234d0ba468f8f4b82c
--- /dev/null
+++ b/models/cv/classification/clip/ixformer/README.md
@@ -0,0 +1,40 @@
+# CLIP
+
+## Description
+
+CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. It can be instructed in natural language to predict the most relevant text snippet for a given image, without directly optimizing for the task, similarly to the zero-shot capabilities of GPT-2 and GPT-3. CLIP matches the performance of the original ResNet50 on ImageNet zero-shot without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision.
+
+## Setup
+
+### Install
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+pip3 install -U transformers==4.27.1
+```
+
+### Download
+
+Pretrained model: go to the website and find the pre-trained model you need. Here we use clip-vit-base-patch32.
+
+```bash
+# Download the model from the website and make sure its path is "/home/data/openai/clip-vit-base-patch32"
+mkdir -p /home/data/openai
+unzip clip-vit-base-patch32.zip -d /home/data/openai
+```
+
+## Run model
+
+### Run inference
+
+Modify the model path in the inference.py script if your model is stored elsewhere, then run:
+```bash
+python3 inference.py
+```
diff --git a/models/cv/classification/clip/ixformer/inference.py b/models/cv/classification/clip/ixformer/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..013c96e8cd5123312751021a3ae0ee898515826e
--- /dev/null
+++ b/models/cv/classification/clip/ixformer/inference.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
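+#
+# This script benchmarks zero-shot image-text matching with the ixformer CLIPModel:
+# it loads clip-vit-base-patch32 in FP16, replicates a sample COCO image into batches
+# of 32 to 2048, scores each batch against two candidate captions, and reports the
+# throughput (QPS) of a single timed forward pass after a short warmup.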
+ +import time + +import requests +import torch + +# from transformers import CLIPModel +from ixformer.inference.models.clip import CLIPModel +from PIL import Image +from torch.cuda import profiler +from transformers import CLIPProcessor + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model = ( + CLIPModel.from_pretrained("/home/data/openai/clip-vit-base-patch32") + .to(device) + .half() +) +model = model.eval() +processor = CLIPProcessor.from_pretrained("/home/data/openai/clip-vit-base-patch32") + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +batch_size_list = [32, 64, 128, 256, 512, 1024, 2048] +with torch.no_grad(): + for batch_size in batch_size_list: + images = [image for item in range(batch_size)] + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], + images=images, + return_tensors="pt", + padding=True, + ) + inputs["input_ids"] = inputs["input_ids"].to(device) + inputs["attention_mask"] = inputs["attention_mask"].to(device) + inputs["pixel_values"] = inputs["pixel_values"].to(device).half() + # warmup + for i in range(2): + outputs = model(**inputs) + torch.cuda.synchronize() + profiler.start() + start_time = time.perf_counter() + outputs = model(**inputs) + profiler.stop() + torch.cuda.synchronize() + end_time = time.perf_counter() + logits_per_image = ( + outputs.logits_per_image + ) # this is the image-text similarity score + probs = logits_per_image.softmax( + dim=1 + ) # we can take the softmax to get the label probabilities + print(probs[:5]) + print(probs[-5:-1]) + + print("QPS: ", batch_size / (end_time - start_time)) \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..247ba8e387c66c7ad1ca94c1e1760886fbaf3801 --- /dev/null +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md @@ -0,0 +1,57 @@ +# ChatGLM3-6B-32K + +## Description + +ChatGLM3-6B-32K further enhances the understanding of long text capabilities based on ChatGLM3-6B, enabling better handling of contexts up to 32K in length. Specifically, we have updated the positional encoding and designed more targeted long text training methods, using a 32K context length during the training phase. In practical use, if your context length is mostly within 8K, we recommend using ChatGLM3-6B; if you need to handle context lengths exceeding 8K, we recommend using ChatGLM3-6B-32K. + +## Setup + +### Install + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. 
+ +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install transformers==4.37.1 +``` + +### Download + +Pretrained model: + +```bash +mkdir -p /data/chatglm/ +mv chatglm3-6b-32k.zip/tar /data/chatglm/ +``` + +## Run model + +```bash +python3 offline_inference.py --model /data/chatglm/chatglm3-6b-32k --trust-remote-code --temperature 0.0 --max-tokens 256 +``` + +## Use the server + +### Start the server + +```bash +python3 -m vllm.entrypoints.openai.api_server --model /data/chatglm/chatglm3-6b-32k --gpu-memory-utilization 0.9 --max-num-batched-tokens 8193 \ + --max-num-seqs 32 --disable-log-requests --host 127.0.0.1 --port 12345 --trust-remote-code +``` + +### Test using the OpenAI interface + +```bash +python3 server_inference.py --host 127.0.0.1 --port 12345 --model_path /data/chatglm/chatglm3-6b-32k +``` + +## Results + +| Model | Precision | tokens | QPS | +| --------------- | --------- | ------ | ------ | +| ChatGLM3-6B-32K | FP16 | 745 | 110.85 | diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..bc731079f72988cd20c5a68b3ccb4e192769c8fb --- /dev/null +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py @@ -0,0 +1,127 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams.__init__).parameters.values() + )[1:] + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = args.model.strip() + model_name = model_name if args.model[-1] != "/" else model_name[:-1] + model_name = model_name.rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. 
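+    # engine_params collects every vllm.EngineArgs field parsed from the CLI
+    # (e.g. model, dtype, tensor_parallel_size), so the engine is configured
+    # entirely by command-line flags.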
+ llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + logging.warning( + "For now, openai api chat interface(v1/chat/completions) need you provide a chat template to process prompt(str) for better results. " + "Otherwise, you have to use the default chat template, which may lead to bad answers. But, the process of building chat input is complex " + "for some models and the rule of process can not be written as a jinja file. Fortunately, the v1/completions interface support List[int] " + "params. This means you can process the prompt firstly, then send the List[int] to v1/completions and consider it as v1/chat/completions " + "to use when you use openai api." + ) + tokenizer = llm.get_tokenizer() + prompts_new = [] + for prompt in prompts: + input_idx = ( + tokenizer.build_chat_input(prompt)["input_ids"][0].cpu().tolist() + ) + prompts_new.append(input_idx) + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..f6dcf8b88dd25d95c972cee251291c9e515fd9b8 --- /dev/null +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. + +import argparse +import time + +from openai import OpenAI +from transformers import AutoTokenizer + + +def send_request( + api_url: str, + prompt: str, + output_len: int, + stream: bool, +) -> None: + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key="EMPTY", + base_url=api_url, + ) + + models = client.models.list() + model = models.data[0].id + + completion = client.completions.create( + model=model, + # messages=[{"role": "user", "content": prompt},], + prompt=prompt, + n=1, + stream=stream, + max_tokens=output_len, + temperature=0.0, + ) + + if stream: + for each_com in completion: + print(each_com) + else: + print("++++++++++++++++++") + print(completion) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark the online serving throughput." + ) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--stream", action="store_true") + parser.add_argument("--output_token", type=int, default=1024) + parser.add_argument("--model_path", type=str) + + args = parser.parse_args() + api_url = f"http://{args.host}:{args.port}/v1" + + prompts = [ + "你好", + "Which city is the capital of China?", + "1 + 1 = ?", + "中国的首都是哪里", + "请讲以下内容翻译为英文:\n你好,我来自中国。", + ] + + tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) + prompts = [tokenizer.build_chat_input(i).input_ids.tolist() for i in prompts] + + for prompt in prompts: + send_request(api_url, prompt, args.output_token, args.stream) \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1fc072d8d51e35109a97c17b5476e7bf3aa1448b --- /dev/null +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py @@ -0,0 +1,371 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from copy import deepcopy +from typing import Tuple, List, Union + +import codecs +import logging +import argparse + +# 瀵逛簬chat妯″瀷锛屾垨鑰呮ā鍨嬮渶瑕佺壒瀹氱殑杈撳叆锛岄渶瑕佸prompt杩涜棰濆鐨勫鐞嗐 +# 濡傛灉鎮ㄥ湪浣跨敤涓湁棰濆鐨刾rompt澶勭悊鏂瑰紡闇姹傛垨鑰呴敊璇弽棣堬紝鍙互鑱旂郴鐜嬪潥鎴栬呭珐浜氶锛屾垜浠細瀵筸odelzoo杩涜鏇存柊閫傞厤銆 + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. 
By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. 
The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}" + ) + else: + logging.warning( + "No chat template provided. Chat API will not work.") + +def default_build_chat(tokenizer,prompt): + return prompt + +def chatglm2_build_chat(tokenizer,prompt): + return tokenizer.build_prompt(prompt) + +def chatglm3_build_chat(tokenizer,prompt): + return tokenizer.build_chat_input(prompt).input_ids[0].tolist() + +def llama2_build_chat(tokenizer,prompt): + return f"[INST]{prompt}[/INST]" + +# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py +def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): + def _parse_messages(messages, split_role="user"): + system, rounds = "", [] + round = [] + for i, message in enumerate(messages): + if message["role"] == "system": + assert i == 0 + system = message["content"] + continue + if message["role"] == split_role and round: + rounds.append(round) + round = [] + round.append(message) + if round: + rounds.append(round) + return system, rounds + + messages = [{"role": "user", "content": f"{prompt}"}] + max_new_tokens = max_new_tokens + max_input_tokens = 4096 - max_new_tokens + system, rounds = _parse_messages(messages, split_role="user") + system_tokens = tokenizer.encode(system) + max_history_tokens = max_input_tokens - len(system_tokens) + + history_tokens = [] + for round in rounds[::-1]: + round_tokens = [] + for message in round: + if message["role"] == "user": + round_tokens.append(195) + else: + round_tokens.append(196) + round_tokens.extend(tokenizer.encode(message["content"])) + if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: + history_tokens = round_tokens + history_tokens # concat left + if len(history_tokens) < max_history_tokens: + continue + break + + input_tokens = system_tokens + history_tokens + if messages[-1]["role"] != "assistant": + input_tokens.append(196) + input_tokens = input_tokens[-max_input_tokens:] # truncate left + return input_tokens + +def qwen_build_chat( + tokenizer, + query: str, + history: List[Tuple[str, str]] = None, + system: str = "", + max_window_size: int = 6144, + chat_format: str = "chatml", +): + if history is None: + history = [] + + if chat_format == "chatml": + im_start, im_end = "<|im_start|>", "<|im_end|>" + im_start_tokens = 
[tokenizer.im_start_id] + im_end_tokens = [tokenizer.im_end_id] + nl_tokens = tokenizer.encode("\n") + + def _tokenize_str(role, content): + return f"{role}\n{content}", tokenizer.encode( + role, allowed_special=set() + ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) + + system_text, system_tokens_part = _tokenize_str("system", system) + system_tokens = im_start_tokens + system_tokens_part + im_end_tokens + + raw_text = "" + context_tokens = [] + + for turn_query, turn_response in reversed(history): + query_text, query_tokens_part = _tokenize_str("user", turn_query) + query_tokens = im_start_tokens + query_tokens_part + im_end_tokens + response_text, response_tokens_part = _tokenize_str( + "assistant", turn_response + ) + response_tokens = im_start_tokens + response_tokens_part + im_end_tokens + + next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens + prev_chat = ( + f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" + ) + + current_context_size = ( + len(system_tokens) + len(next_context_tokens) + len(context_tokens) + ) + if current_context_size < max_window_size: + context_tokens = next_context_tokens + context_tokens + raw_text = prev_chat + raw_text + else: + break + + context_tokens = system_tokens + context_tokens + raw_text = f"{im_start}{system_text}{im_end}" + raw_text + context_tokens += ( + nl_tokens + + im_start_tokens + + _tokenize_str("user", query)[1] + + im_end_tokens + + nl_tokens + + im_start_tokens + + tokenizer.encode("assistant") + + nl_tokens + ) + raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" + + elif chat_format == "raw": + raw_text = query + context_tokens = tokenizer.encode(raw_text) + else: + raise NotImplementedError(f"Unknown chat format {chat_format!r}") + + return raw_text, context_tokens + +def codellama_build_chat(tokenizer,prompt): + return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. 
Please wrap your code answer using ```:{}[/INST]".format(prompt) + +def build_chat(tokenizer, prompt, model_name, **kwargs): + model_name = model_name.lower() + # return str or list[int] + if "chatglm2" in model_name: + prompt = chatglm2_build_chat(tokenizer,prompt) + elif "chatglm3" in model_name: + prompt = chatglm3_build_chat(tokenizer,prompt) + elif "llama2" in model_name and 'chat' in model_name: + prompt = llama2_build_chat(tokenizer,prompt) + elif "baichuan2" in model_name and 'chat' in model_name: + prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length']) + elif "qwen" in model_name and 'chat' in model_name: + prompt = qwen_build_chat(tokenizer,prompt) + elif "code" in model_name and 'llama' in model_name: + prompt = codellama_build_chat(tokenizer,prompt) + else: + prompt = default_build_chat(tokenizer,prompt) + return prompt + + +# for output +def default_post_process(output): + return output + +def glm2_post_process(output): + output = output.strip() + output = output.replace("[[璁粌鏃堕棿]]", "2023骞") + return output + +def glm3_post_process(output, history=[]): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[璁粌鏃堕棿]]", "2023骞") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content + +def post_process(response, model_name,**kwargs): + model_name = model_name.lower() + if "chatglm2" in model_name: + response = glm2_post_process(response) + elif "chatglm3" in model_name: + response = glm3_post_process(response) + else: + response = default_post_process(response) + return response \ No newline at end of file diff --git a/models/nlp/large_language_model/stablelm/vllm/README.md b/models/nlp/large_language_model/stablelm/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cff35998f5762b600061b1050282aa8a92122ade --- /dev/null +++ b/models/nlp/large_language_model/stablelm/vllm/README.md @@ -0,0 +1,40 @@ +# StableLm-2-1_6B + +## Description + +Stable LM 2 1.6B is a decoder-only language model with 1.6 billion parameters. It has been pre-trained on a diverse multilingual and code dataset, comprising 2 trillion tokens, for two epochs. This model is designed for various natural language processing tasks, including text generation and dialogue systems. Due to its extensive training on such a large and diverse dataset, Stable LM 2 1.6B can effectively capture the nuances of language, including grammar, semantics, and contextual relationships, which enhances the quality and accuracy of the generated text. 
+ +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +pip3 install transformers +``` + +### Download + +-Model: + +```bash +# Download model from the website and make sure the model's path is "data/stablelm/stablelm-2-1_6b" +mkdir -p data/stablelm/stablelm-2-1_6b +``` + +## Inference + +```bash +export CUDA_VISIBLE_DEVICES=0,1 +python3 offline_inference.py --model ./data/stablelm/stablelm-2-1_6b --max-tokens 256 -tp 1 --temperature 0.0 +``` + +## Results + +| Model | QPS | +| ---------- | ----- | +| StableLM | 254.3 | diff --git a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..40678a62ea18296ecdd53cbbcf7d8c3c25e0950d --- /dev/null +++ b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py @@ -0,0 +1,135 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams.__init__).parameters.values() + )[1:] + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = args.model.strip() + model_name = model_name if args.model[-1] != "/" else model_name[:-1] + model_name = model_name.rsplit("/")[-1] + + # Sample prompts. + prompts = [ + "What signs may indicate that a person is experiencing anxiety?", + "Describe how to make cheese pizza.", + "Write a review article on the development of 5G networks.", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. 
" + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") \ No newline at end of file diff --git a/models/nlp/large_language_model/stablelm/vllm/utils.py b/models/nlp/large_language_model/stablelm/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c6def85dedc08ef9c3a489ce9dc5b1ff4a5e48b0 --- /dev/null +++ b/models/nlp/large_language_model/stablelm/vllm/utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import codecs +import logging +import argparse + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. 
Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md b/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2dc49881226176329767fcf52f9d0742a4912056 --- /dev/null +++ b/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md @@ -0,0 +1,46 @@ +# MiniCPM-V-2 + +## Description + +MiniCPM-V-2 is a compact and efficient language model designed for various natural language processing (NLP) tasks. Building on its predecessor, MiniCPM-V-1, this model integrates advancements in architecture and optimization techniques, making it suitable for deployment in resource-constrained environments.s + +## Setup + +### Install + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev +pip3 install timm==0.9.10 +pip3 install transformers +pip3 install --user --upgrade pillow -i https://pypi.tuna.tsinghua.edu.cn/simple +``` + +### Download + +-Model: +Note: Due to the official weights missing some necessary files for vllm execution, you can download the additional files from here: to ensure that the file directory matches the structure shown here: . 
+
+```bash
+# Download model from the website and make sure the model's path is "data/MiniCPM-V-2"
+mkdir data
+
+```
+
+## Inference
+
+```bash
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PATH=/usr/local/corex/bin:${PATH}
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
+```
+
+```bash
+wget https://img.zcool.cn/community/012e285a1ea496a8012171323c6bf1.jpg -O dog.jpg
+python3 minicpmv-2.0-offline.py --model-path /path/to/model --image-path ./dog.jpg
+```
diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py b/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6add4d8f00fcc8bb307767d149dad8009f182b0
--- /dev/null
+++ b/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from PIL import Image
+from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
+import argparse
+
+def main(args):
+    # List of image file paths
+    ## wget https://img.zcool.cn/community/012e285a1ea496a8012171323c6bf1.jpg@3000w_1l_0o_100sh.jpg -O dog.jpg
+    IMAGES = [
+        args.image_path,  # local image path
+    ]
+
+    # Model name or path
+    MODEL_NAME = args.model_path  # local model path or Hugging Face model name
+
+    # Open the image and convert it to RGB
+    image = Image.open(IMAGES[0]).convert("RGB")
+
+    # Initialize the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+
+    # Initialize the language model
+    llm = LLM(model=MODEL_NAME,
+              gpu_memory_utilization=0.95,  # use nearly all of the GPU memory
+              trust_remote_code=True,
+              max_model_len=1024,
+              max_num_seqs=1,
+              max_num_batched_tokens=1024,)  # adjust this value according to available memory
+
+    # Build the chat messages
+    messages = [{'role': 'user', 'content': '(<image>./</image>)\n' + '请描述这张图片'}]
+
+    # Apply the chat template to the messages
+    prompt = tokenizer.apply_chat_template(messages)
+
+    # Set the stop token ids
+    # 2.0
+    stop_token_ids = [tokenizer.eos_id]
+    # 2.5
+    #stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+    # 2.6
+    # stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    # stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    # Set the generation parameters
+    sampling_params = SamplingParams(
+        stop_token_ids=stop_token_ids,
+        # temperature=0.7,
+        # top_p=0.8,
+        # top_k=100,
+        # seed=3472,
+        max_tokens=128,
+        # min_tokens=150,
+        temperature=0,
+        use_beam_search=False,
+        # length_penalty=1.2,
+        best_of=1)
+
+    # Get the model output
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": {
+            "image": image
+        }
+    }, sampling_params=sampling_params)
+    print(outputs[0].outputs[0].text)
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default=None, help="model path")
+    parser.add_argument("--image-path", type=str, default=None, help="sample image path")
+    args = parser.parse_args()
+
+    main(args)
\ No newline at end of file