diff --git a/aikg/benchmark/aikgbench/llm/common/Concat.py b/aikg/benchmark/aikgbench/llm/common/Concat.py new file mode 100644 index 0000000000000000000000000000000000000000..c9ce58e49bb5d23f0c2e84466a1461a42e2e8ea3 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/Concat.py @@ -0,0 +1,49 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs concatenation of input tensors. + """ + + def __init__(self, concat_dim): + super(Model, self).__init__() + self.concat_dim = concat_dim + + def forward(self, x, y): + """ + Perform concatenation of two input tensors. + + Args: + x: First input tensor + y: Second input tensor + + Returns: + Concatenated tensor along the specified dimension + """ + return torch.cat((x, y), dim=self.concat_dim) + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with float16 data type as tested in test_concat.py + """ + # Use float16 test case + shape1 = (28, 32, 4096) + shape2 = (28, 64, 4096) + + # Generate random tensors similar to test_concat.py + input0 = torch.randn(shape1, dtype=torch.float16) + input1 = torch.randn(shape2, dtype=torch.float16) + return [input0, input1] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For concatenation, we need the concatenation dimension. + Default to 1 (second dimension) as in test_concat.py. + """ + return [1] # Default concatenation dimension (dim=1) diff --git a/aikg/benchmark/aikgbench/llm/common/Cumsum.py b/aikg/benchmark/aikgbench/llm/common/Cumsum.py new file mode 100644 index 0000000000000000000000000000000000000000..ee8dce31ff825680dc9bd98fc4594ff20ddd6521 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/Cumsum.py @@ -0,0 +1,58 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs cumulative sum operation. + """ + + def __init__(self, dim=1): + super(Model, self).__init__() + self.dim = dim + + def forward(self, x): + """ + Perform cumulative sum operation on input tensor. + + Args: + x: Input tensor + + Returns: + Cumulative sum tensor along the specified dimension + """ + return torch.cumsum(x, dim=self.dim) + + +# Model parameters - using the same parameters as in test_cumsum.py +# Default parameters for cumsum operation +default_params = { + 'dim': 1, # Default axis/dimension for cumsum + 'shape': (28, 32, 4096), # Default tensor shape + 'dtype': torch.float16 # Default data type +} + +# Supported axes for cumsum operation +supported_axes = [0, 1, 2] # For 3D tensor (28, 32, 4096) + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_cumsum.py + """ + # Use default parameters from test_cumsum.py + shape = default_params['shape'] + dtype = default_params['dtype'] + + # Generate random tensor similar to test_cumsum.py + input0 = torch.randn(shape, dtype=dtype) + return [input0] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For cumsum operation, we need the dimension/axis. 
+ """ + return [default_params['dim']] # Default dimension (axis=1) diff --git a/aikg/benchmark/aikgbench/llm/common/DynamicNTK.py b/aikg/benchmark/aikgbench/llm/common/DynamicNTK.py new file mode 100644 index 0000000000000000000000000000000000000000..033d5cc0fcfe851bed88fc021c2c5fbd28b186e8 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/DynamicNTK.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn +import math + + +class Model(nn.Module): + """ + Simple model that performs dynamic NTK (Neural Tangent Kernel) operation. + """ + + def __init__(self, output_type=1): + super(Model, self).__init__() + self.output_type = output_type + + def forward(self, position_ids, inv_freqs, seq_lens): + """ + Perform dynamic NTK operation to generate sinusoidal position embeddings. + + Args: + position_ids: Position IDs tensor [num_tokens] + inv_freqs: Inverse frequencies tensor [batch, dim/2] + seq_lens: Sequence lengths tensor [batch] + + Returns: + Tuple of (sin_output, cos_output) tensors + """ + off = 0 + num_tokens = position_ids.shape[0] + dim = inv_freqs.shape[1] * 2 + batch_num = seq_lens.shape[0] + + # Determine output type + otype = torch.float16 if self.output_type == 0 else torch.bfloat16 + + sin_out = torch.zeros([num_tokens, dim], dtype=torch.float32) + cos_out = torch.zeros([num_tokens, dim], dtype=torch.float32) + + for batch_id in range(batch_num): + pos_len = seq_lens[batch_id] + freqs = torch.einsum('i,j->ij', + position_ids[off:off + + pos_len].to(torch.float32), + inv_freqs[batch_id]) + emb = torch.cat((freqs, freqs), dim=-1) + cos_out[off:off + pos_len, :] = emb.cos() + sin_out[off:off + pos_len, :] = emb.sin() + off += pos_len + + return sin_out.to(otype), cos_out.to(otype) + + +# Model parameters - using the same parameters as in test_dynamic_ntk.py +# Default parameters for dynamic NTK operation +default_params = { + 'output_type': 1, # Default output type (1 for bfloat16, 0 for float16) + 'batch': 16, # Default batch size + 'num_tokens': 256, # Default number of tokens + 'dim': 128, # Default dimension + 'max_seq_len': 256000 # Default maximum sequence length +} + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_dynamic_ntk.py + """ + # Use default parameters from test_dynamic_ntk.py + batch = default_params['batch'] + num_tokens = default_params['num_tokens'] + dim = default_params['dim'] + max_seq_len = default_params['max_seq_len'] + + # Generate test data similar to test_dynamic_ntk.py + aux_array = torch.arange(0, dim, 2, dtype=torch.float32) / dim + batch_base = torch.randint(10000, 50000, [batch], dtype=torch.float32) + position_ids = torch.randint( + 0, max_seq_len, [num_tokens], dtype=torch.int32) + inv_freqs = torch.zeros([batch, int(dim / 2)], dtype=torch.float32) + + for i in range(batch): + inv_freqs[i, :] = 1.0 / batch_base[i] ** aux_array + + avg_seq_len = int(num_tokens / batch) + seq_lens = torch.ones([batch], dtype=torch.int32) * avg_seq_len + seq_lens[0] += num_tokens - avg_seq_len * batch + + return [position_ids, inv_freqs, seq_lens] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For dynamic NTK operation, we need the output type. 
+ """ + return [default_params['output_type']] # Default output type (1 for bfloat16) diff --git a/aikg/benchmark/aikgbench/llm/common/FastSoftMax.py b/aikg/benchmark/aikgbench/llm/common/FastSoftMax.py new file mode 100644 index 0000000000000000000000000000000000000000..947464f2f3fabe4cd8c41cc5c26bc3f7ae72bbf3 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/FastSoftMax.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs fast softmax operation. + """ + + def __init__(self, head_num=8, q_seq_len=None): + super(Model, self).__init__() + self.head_num = head_num + self.q_seq_len = q_seq_len if q_seq_len is not None else [ + 200] # Default sequence length + + def forward(self, x): + """ + Perform fast softmax operation on input tensor. + + Args: + x: Input tensor to be processed + + Returns: + Softmax output tensor + """ + golden = torch.empty_like(x) + start = 0 + + for i in range(len(self.q_seq_len)): + end = start + self.head_num * self.q_seq_len[i] * self.q_seq_len[i] + cur_data_input = x[start:end].reshape(-1, self.q_seq_len[i]) + cur_golden = torch.softmax(cur_data_input.to( + torch.float32), dim=-1).to(torch.float16) + golden[start:end] = cur_golden.reshape(-1) + start = end + + return golden + + +# Model parameters - using the same parameters as in test_fastsoftmax_operation.py +# Default parameters for fast softmax operation +default_params = { + 'head_num': 8, # Default number of heads + 'batch_size': 4, # Default batch size + 'seq_len_range': (100, 300) # Default sequence length range +} + +# Default sequence lengths for testing +default_seq_lens = [200, 150, 250, 180] # Example sequence lengths + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_fastsoftmax_operation.py + """ + # Use default parameters from test_fastsoftmax_operation.py + batch_size = default_params['batch_size'] + head_num = default_params['head_num'] + q_seq_len = default_seq_lens + + # Generate data_input_list based on q_seq_len + data_input_list = [] + for i in range(batch_size): + data_input = torch.randn( + head_num * q_seq_len[i] * q_seq_len[i]).to(torch.float16) + data_input_list.append(data_input) + + # Concatenate all inputs + data_input = torch.cat(data_input_list) + + return [data_input] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For fast softmax operation, we need head_num and q_seq_len. + """ + # Use default sequence lengths for initialization + return [default_params['head_num'], default_seq_lens] diff --git a/aikg/benchmark/aikgbench/llm/common/Fill.py b/aikg/benchmark/aikgbench/llm/common/Fill.py new file mode 100644 index 0000000000000000000000000000000000000000..772636148788f633a34ad1e6f9f8e1a604a0674b --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/Fill.py @@ -0,0 +1,63 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs fill operation with mask. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, value): + """ + Perform fill operation on input tensor. 
+ + Args: + x: Input tensor to be filled + value: Value to fill with + + Returns: + Filled tensor + """ + return torch.full_like(x, value.item()) + + +# Model parameters - using the same parameters as in test_fill.py +# Default parameters for fill operation +default_params = { + 'with_mask': True, # Default to use mask + 'value': -10000, # Default fill value + 'shape': (5, 5), # Default tensor shape + 'dtype': torch.float16 # Default data type +} + +# Supported fill values for testing +supported_values = [-10000, 10000] + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_fill.py + """ + # Use default parameters from test_fill.py + shape = default_params['shape'] + dtype = default_params['dtype'] + + # Generate input tensor similar to test_fill.py + input0 = torch.rand(shape).to(dtype) + + # Generate fill value + input1 = torch.tensor([default_params['value']], dtype=dtype) + + return [input0, input1] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For fill operation, no parameters needed. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/common/Gather.py b/aikg/benchmark/aikgbench/llm/common/Gather.py new file mode 100644 index 0000000000000000000000000000000000000000..b3046eb9746746942e585f8bbed83677f34528be --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/Gather.py @@ -0,0 +1,63 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs gather operation along a specified axis. + """ + + def __init__(self): + super(Model, self).__init__() + self.axis = 1 + + def forward(self, x, indices): + """ + Perform gather operation on input tensor using indices along the specified axis. + + Args: + x: Input tensor to gather from + indices: Indices tensor + + Returns: + Gathered tensor + """ + return torch.gather(x, dim=self.axis, index=indices) + + +# Model parameters - using the same parameters as in test_gather.py +# Default parameters for gather operation +default_params = { + 'axis': 1, # Default axis for gather + 'input_shape': (3, 5), # Default input tensor shape + 'indices_shape': (3, 4), # Default indices tensor shape + 'dtype': torch.float16, # Default data type + 'indices_dtype': torch.int64 # Default indices data type +} + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_gather.py + """ + # Use default parameters from test_gather.py + input_shape = default_params['input_shape'] + indices_shape = default_params['indices_shape'] + dtype = default_params['dtype'] + indices_dtype = default_params['indices_dtype'] + + # Generate input tensor + input0 = torch.randn(input_shape, dtype=dtype) + # Generate indices tensor (values in range [0, input_shape[axis])) + input1 = torch.randint( + 0, input_shape[1], indices_shape, dtype=indices_dtype) + return [input0, input1] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For gather operation, no parameters needed. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/common/IndexAdd.py b/aikg/benchmark/aikgbench/llm/common/IndexAdd.py new file mode 100644 index 0000000000000000000000000000000000000000..bd0916e8bd73632fbad9d46279e9773df23417eb --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/IndexAdd.py @@ -0,0 +1,62 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs index add operation. 
+ """ + + def __init__(self, index_type, axis): + super(Model, self).__init__() + self.index_type = index_type + self.axis = axis + + def forward(self, x, indices, values, alpha): + """ + Perform index add operation on input tensor. + + Args: + x: Input tensor to be modified + indices: Indices tensor + values: Values tensor to add + alpha: Alpha scaling factor + + Returns: + Modified tensor + """ + if self.index_type == 1: + cloned_x = x.clone() + cloned_x.index_add_(self.axis, indices, values, alpha=alpha.item()) + return cloned_x + + return x + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on test_index_add.py + """ + axis = 0 + n, k = 1024, 4096 + num_indices = 90 + shape0 = (n, k) + shape1 = (num_indices,) + shape2 = (num_indices, k) + shape3 = (1,) + + input0 = torch.rand(shape0, dtype=torch.half) + input1 = torch.arange(num_indices, dtype=torch.int32) + input2 = torch.rand(shape2, dtype=torch.half) + input3 = torch.rand(shape3, dtype=torch.half) + + return [input0, input1, input2, input3] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on test_index_add.py parameters + """ + return [1, 0] # indexType=1, axis=0 diff --git a/aikg/benchmark/aikgbench/llm/common/KvCache.py b/aikg/benchmark/aikgbench/llm/common/KvCache.py new file mode 100644 index 0000000000000000000000000000000000000000..ddb2c73d4625a143553c9423628b95806ceeef6a --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/KvCache.py @@ -0,0 +1,79 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs KV cache update operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, newkv, layer_id, cache_in, token_offset, seqlen): + """ + Update KV cache with newkv according to token_offset and seqlen. + + Args: + newkv: [ntokens, hidden_size] + layer_id: [1] (int tensor) + cache_in: [layer, batch, max_seqlen, hidden_size] + token_offset: [batch] (int tensor) + seqlen: [batch] (int tensor) + + Returns: + cache_out: [layer, batch, max_seqlen, hidden_size] + """ + # Clone cache_in to cache_out + cache_out = cache_in.clone() + layer_id_val = layer_id.item() if layer_id.numel( + ) == 1 else int(layer_id[0].item()) + batch = seqlen.shape[0] + hidden_size = newkv.shape[1] + prefix_ntokens = 0 + for i in range(batch): + for j in range(seqlen[i].item()): + pos = token_offset[i].item() - seqlen[i].item() + j + cache_out[layer_id_val, i, pos, + :] = newkv[prefix_ntokens + j, :] + prefix_ntokens += seqlen[i].item() + return cache_out + + +# Default parameters for kv_cache operation +default_params = { + 'layer': 28, + 'layer_id': 0, + 'batch': 16, + 'max_seqlen': 384, + 'hidden_size': 1024 +} + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+ Returns tensors with different data types as tested in test_kv_cache_operation.py + """ + layer = default_params['layer'] + layer_id = default_params['layer_id'] + batch = default_params['batch'] + max_seqlen = default_params['max_seqlen'] + hidden_size = default_params['hidden_size'] + seqlen = torch.randint(1, max_seqlen // 2, (batch,), dtype=torch.int32) + token_offset = seqlen.clone() + ntokens = seqlen.sum().item() + newkv = (torch.rand(ntokens, hidden_size) - 0.5) * 10 # [-5, 5] + newkv = newkv.to(torch.float16) + cache_in = torch.zeros(layer, batch, max_seqlen, + hidden_size, dtype=torch.float16) + layer_id_tensor = torch.tensor([layer_id], dtype=torch.int32) + return [newkv, layer_id_tensor, cache_in, token_offset, seqlen] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For kv_cache operation, no extra init params needed. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/common/Nonzero.py b/aikg/benchmark/aikgbench/llm/common/Nonzero.py new file mode 100644 index 0000000000000000000000000000000000000000..4b98483360f07614dc1cc6f835342455b422f477 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/Nonzero.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs nonzero operation with padding. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + """ + Find nonzero indices and pad the result to match the input size. + + Args: + x: Input tensor + + Returns: + result: Padded nonzero indices tensor + num_non_negative: Number of nonzero elements + """ + num_non_negative = torch.count_nonzero(x) + padding_num = x.numel() - num_non_negative + # nonzero as tuple of indices, stack to shape [ndim, num_non_negative] + result = torch.stack(list(torch.nonzero(x, as_tuple=True))) + # pad with zeros to shape [ndim, numel] + if padding_num > 0: + padding = torch.zeros( + (x.shape[0], padding_num), dtype=result.dtype, device=result.device) + result = torch.cat((result, padding), dim=-1).long() + else: + result = result.long() + return result, torch.tensor(num_non_negative).long() + + +def get_inputs(): + """ + Generate random input tensor for testing. + """ + input0 = torch.randint(0, 2, (2, 490), dtype=torch.int64) + return [input0] + + +def get_init_inputs(): + return [] diff --git a/aikg/benchmark/aikgbench/llm/common/Onehot.py b/aikg/benchmark/aikgbench/llm/common/Onehot.py new file mode 100644 index 0000000000000000000000000000000000000000..9a698691e4ade075d6ad7a76d7dd428df7a1d178 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/Onehot.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs onehot operation. + """ + + def __init__(self, axis=-1, depth=10): + super(Model, self).__init__() + self.axis = axis + self.depth = depth + + def forward(self, x): + """ + Perform onehot encoding on input tensor. 
+
+        Args:
+            x: Input tensor (indices)
+
+        Returns:
+            Onehot encoded tensor
+        """
+        # PyTorch one_hot always puts the new axis at the end, so we may need to permute
+        onehot = torch.nn.functional.one_hot(x, num_classes=self.depth)
+        if self.axis != -1 and self.axis != x.dim():
+            # Move the new onehot axis to the desired position
+            dims = list(range(onehot.dim()))
+            new_axis = self.axis if self.axis >= 0 else x.dim() + 1 + self.axis
+            dims = dims[:-1]
+            dims.insert(new_axis, onehot.dim() - 1)
+            onehot = onehot.permute(*dims)
+        return onehot
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    shape0 = (16,)
+    input0 = torch.randint(0, 10, shape0, dtype=torch.int64)
+    return [input0]
+
+
+def get_init_inputs():
+    return [-1, 10]
diff --git a/aikg/benchmark/aikgbench/llm/common/Pad.py b/aikg/benchmark/aikgbench/llm/common/Pad.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fa2f51e6b46dec36ff3f1d7508de8aac429a230
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/common/Pad.py
@@ -0,0 +1,70 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs pad operation (sequence gather from padded output).
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, tmp_out, padding_offset, seq_len, input_ids):
+        """
+        Gather the last valid output for each sequence in the batch.
+
+        Args:
+            tmp_out: [token_num, hidden_dim] tensor
+            padding_offset: [1, token_num] tensor
+            seq_len: [batch, 1] tensor
+            input_ids: [batch, total_length] tensor
+
+        Returns:
+            out: [batch, hidden_dim] tensor
+        """
+        batch = input_ids.shape[0]
+        hidden_dim = tmp_out.shape[1]
+        out = torch.zeros((batch, hidden_dim),
+                          dtype=tmp_out.dtype, device=tmp_out.device)
+        temp_val = 0
+        for i in range(batch):
+            temp_val = temp_val + seq_len[i][0].item()
+            out[i] = tmp_out[temp_val - 1].cpu()
+        return out
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing (pure PyTorch version).
+    """
+    batch = 32
+    total_length = 64
+    hidden_dim = 4096
+    seq_len = torch.randint(1, total_length, (batch,), dtype=torch.int32)
+    input_ids = torch.zeros((batch, total_length), dtype=torch.long)
+    token_num = seq_len.sum().item()
+    tmp_out = (torch.rand(token_num, hidden_dim) *
+               2 - 1).to(torch.float16)  # [-1, 1]
+    # Build input_ids
+    for i in range(batch):
+        input_ids[i, :seq_len[i]] = torch.randint(
+            1, 50, (seq_len[i],), dtype=torch.long)
+    # Build padding_offset
+    zeros_num = total_length - seq_len
+    cum_offsets_now = torch.cumsum(zeros_num, dim=0)
+    padding_offset = []
+    for i in range(batch):
+        if i == 0:
+            padding_offset += [0] * seq_len[0].item()
+        else:
+            padding_offset += [cum_offsets_now[i - 1].item()] * \
+                seq_len[i].item()
+    padding_offset = torch.tensor(
+        padding_offset, dtype=torch.int32).reshape(1, token_num)
+    seq_len = seq_len.reshape(batch, 1)
+    return [tmp_out, padding_offset, seq_len, input_ids]
+
+
+def get_init_inputs():
+    return []
diff --git a/aikg/benchmark/aikgbench/llm/common/Slice.py b/aikg/benchmark/aikgbench/llm/common/Slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cac14f56fc049b15b4783cf31cad0dbbc1caecf
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/common/Slice.py
@@ -0,0 +1,40 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs slice operation.
+    """
+
+    def __init__(self, offsets, size):
+        super(Model, self).__init__()
+        self.offsets = offsets
+        self.size = size
+
+    def forward(self, x):
+        """
+        Slice the input tensor according to offsets and size.
+
+        Args:
+            x: Input tensor
+
+        Returns:
+            Sliced tensor
+        """
+        return x[self.offsets[0]:self.offsets[0]+self.size[0],
+                 self.offsets[1]:self.offsets[1]+self.size[1]]
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    return [torch.randn(32, 128).half()]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    """
+    return [[2, 8], [10, 100]]
diff --git a/aikg/benchmark/aikgbench/llm/common/Sort.py b/aikg/benchmark/aikgbench/llm/common/Sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b385eb4f3b1404df31f3dd2bff9d69f93fd3eca
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/common/Sort.py
@@ -0,0 +1,40 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs sort operation (top-k).
+    """
+
+    def __init__(self, num):
+        super(Model, self).__init__()
+        self.num = num
+
+    def forward(self, x):
+        """
+        Perform top-k operation on the input tensor.
+
+        Args:
+            x: Input tensor
+
+        Returns:
+            values: Top-k values
+            indices: Top-k indices
+        """
+        values, indices = torch.topk(x, k=self.num, largest=True)
+        return values, indices.int()
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    return [torch.randint(-65504, 65504, (10, 22, 4096)).float().half()]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    """
+    return [1000]  # Reduced to 1000 so k does not exceed the last dimension
diff --git a/aikg/benchmark/aikgbench/llm/common/Split.py b/aikg/benchmark/aikgbench/llm/common/Split.py
new file mode 100644
index 0000000000000000000000000000000000000000..c38c52a96ad1413ae5ce7283ffb2857ea7d8c9ef
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/common/Split.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs split operation.
+    """
+
+    def __init__(self, split_dim, split_num):
+        super(Model, self).__init__()
+        self.split_dim = split_dim
+        self.split_num = split_num
+
+    def forward(self, x):
+        """
+        Split the input tensor into chunks.
+
+        Args:
+            x: Input tensor
+
+        Returns:
+            List of split tensors
+        """
+        return torch.chunk(x, chunks=self.split_num, dim=self.split_dim)
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    return [torch.rand(4096, 22016).half()]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    """
+    return [-1, 2]
diff --git a/aikg/benchmark/aikgbench/llm/common/Transpose.py b/aikg/benchmark/aikgbench/llm/common/Transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..259de3398b83c8c481ad4fdc381005bd77eec71a
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/common/Transpose.py
@@ -0,0 +1,38 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs transpose operation.
+    """
+
+    def __init__(self, perm):
+        super(Model, self).__init__()
+        self.perm = perm
+
+    def forward(self, x):
+        """
+        Transpose the input tensor according to perm.
+
+        Args:
+            x: Input tensor
+
+        Returns:
+            Transposed tensor
+        """
+        return x.permute(*self.perm)
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    return [torch.randn(32, 128).half()]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+ """ + return [[1, 0]] diff --git a/aikg/benchmark/aikgbench/llm/common/Unpad.py b/aikg/benchmark/aikgbench/llm/common/Unpad.py new file mode 100644 index 0000000000000000000000000000000000000000..221decdd790b53099561f79775634e8981a6f546 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/Unpad.py @@ -0,0 +1,111 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs unpad operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, input_ids, cum_offsets_now, token_num, seq_len): + """ + Remove padding from input_ids and generate corresponding offsets. + + Args: + input_ids: [batch, total_length] tensor + cum_offsets_now: [batch, 1] tensor + token_num: [1, 1] tensor + seq_len: [batch, 1] tensor + + Returns: + x_remove_padding: [1, batch * total_length] tensor + cum_offsets_out: [batch, 1] tensor + padding_offset: [1, batch * total_length] tensor + """ + batch = input_ids.shape[0] + total_length_imm = input_ids.shape[1] + + # Remove padding from input_ids + x_remove_padding = input_ids[0, :seq_len[0]] + for i in range(1, batch): + x_remove_padding = torch.cat( + [x_remove_padding, input_ids[i, :seq_len[i]]]) + + # Pad to full length + target_length = batch * total_length_imm + current_length = x_remove_padding.shape[0] + if current_length < target_length: + padding_size = target_length - current_length + padding = torch.zeros( + padding_size, dtype=x_remove_padding.dtype, device=x_remove_padding.device) + x_remove_padding = torch.cat([x_remove_padding, padding]) + + x_remove_padding = x_remove_padding.reshape( + 1, batch * total_length_imm) + + # Generate cum_offsets_out + cum_offsets_out = torch.zeros( + batch, 1, dtype=torch.int32, device=input_ids.device) + for i in range(1, batch): + cum_offsets_out[i] = cum_offsets_now[i - 1] + + # Generate padding_offset + padding_offset = [] + for i in range(batch): + if i == 0: + padding_offset += [0] * seq_len[0].item() + else: + padding_offset += [cum_offsets_now[i - 1] + [0].item()] * seq_len[i].item() + + # Add zeros for padding + zero_offset = torch.zeros(1, batch * total_length_imm - token_num[0][0].item(), + dtype=torch.int32, device=input_ids.device) + padding_offset = torch.tensor( + padding_offset, dtype=torch.int32, device=input_ids.device) + padding_offset = torch.cat([padding_offset, zero_offset.flatten()]) + padding_offset = padding_offset.reshape(1, batch * total_length_imm) + + return x_remove_padding.long(), cum_offsets_out, padding_offset + + +def get_inputs(): + """ + Generate random input tensors for testing (pure PyTorch version). + """ + batch = 32 + total_length_imm = 64 + + # Generate seq_len + seq_len = torch.randint(1, total_length_imm + 1, + (batch,), dtype=torch.int32) + + # Generate input_ids + input_ids = torch.zeros((batch, total_length_imm), dtype=torch.int32) + for i in range(batch): + input_ids[i, :seq_len[i]] = torch.randint( + 1, 50, (seq_len[i],), dtype=torch.int32) + + # Generate cum_offsets_now + zeros_num = total_length_imm - seq_len + cum_offsets_now = torch.cumsum(zeros_num, dim=0) + + # Generate token_num + token_num = seq_len.sum() + + # Reshape tensors + cum_offsets_now = cum_offsets_now.reshape(batch, 1) + token_num = token_num.reshape(1, 1) + seq_len = seq_len.reshape(batch, 1) + + return [input_ids.long(), cum_offsets_now, token_num, seq_len] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. 
+ """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/common/View.py b/aikg/benchmark/aikgbench/llm/common/View.py new file mode 100644 index 0000000000000000000000000000000000000000..5a525253f25d7548ae78d7b3d943c4766889c83e --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/common/View.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs view operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + """ + Perform view operation on input tensor. + + Args: + x: Input tensor + + Returns: + Reshaped tensor + """ + # Reshape to 2D + return x.view(2, -1) + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + x = torch.rand(2, 64, dtype=torch.float16) + return [x] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/elewise/ElewiseAdd.py b/aikg/benchmark/aikgbench/llm/elewise/ElewiseAdd.py new file mode 100644 index 0000000000000000000000000000000000000000..9c81e3403bc73236c1611907916f3be144972724 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/elewise/ElewiseAdd.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs element-wise addition between two tensors. + """ + + def __init__(self): + super(Model, self).__init__() + # For element-wise addition, we don't need learnable parameters + # The addition will be performed between two input tensors + + def forward(self, x, y): + """ + Perform element-wise addition between two tensors. + + Args: + x: First input tensor + y: Second input tensor + + Returns: + Element-wise sum of x and y + """ + return torch.add(x, y) + + +# Model parameters - using the same shape as in test_add.py +shape = (1000000,) +batch_size = 1 # For element-wise operations, batch size is typically 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_add.py + """ + # Generate random tensors similar to test_add.py + input0 = torch.rand(shape) + input1 = torch.rand(shape) + return [input0, input1] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For element-wise addition, no specific initialization parameters are needed. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/elewise/ElewiseCast.py b/aikg/benchmark/aikgbench/llm/elewise/ElewiseCast.py new file mode 100644 index 0000000000000000000000000000000000000000..50676730dfc1b18cf52710af95ba77fd709d0d08 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/elewise/ElewiseCast.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs data type casting on input tensor. + """ + + def __init__(self, output_dtype=torch.float32): + super(Model, self).__init__() + self.output_dtype = output_dtype + + def forward(self, x): + """ + Perform data type casting on input tensor. + + Args: + x: Input tensor + + Returns: + Input tensor cast to the specified output data type + """ + return x.type(self.output_dtype) + + +# Model parameters - using the same shape as in test_cast.py +shape = (10000,) +batch_size = 1 # For element-wise operations, batch size is typically 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+ Returns tensors with different data types as tested in test_cast.py + """ + # Generate random tensors similar to test_cast.py (range [-5, 5]) + input0 = torch.rand(shape) * 10 - 5 + return [input0] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For data type casting, we need the target output data type. + Default to float16. + """ + return [torch.float16] diff --git a/aikg/benchmark/aikgbench/llm/elewise/ElewiseEqual.py b/aikg/benchmark/aikgbench/llm/elewise/ElewiseEqual.py new file mode 100644 index 0000000000000000000000000000000000000000..b3d11ff593d1bf4dcf4829f9859019f71e319e62 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/elewise/ElewiseEqual.py @@ -0,0 +1,49 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs element-wise equality comparison between two tensors. + """ + + def __init__(self): + super(Model, self).__init__() + # For element-wise equality comparison, we don't need learnable parameters + + def forward(self, x, y): + """ + Perform element-wise equality comparison between two tensors. + + Args: + x: First input tensor + y: Second input tensor + + Returns: + Boolean tensor indicating element-wise equality (converted to int8) + """ + return torch.eq(x, y).int().to(torch.int8) + + +# Model parameters - using the same shape as in test_equal.py +shape = (8, 6) +batch_size = 1 # For element-wise operations, batch size is typically 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_equal.py + """ + # Generate random tensors similar to test_equal.py (range [0, 100]) + input0 = torch.rand(shape) * 100 + input1 = torch.rand(shape) * 100 + return [input0, input1] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For element-wise equality comparison, no specific initialization parameters are needed. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/elewise/ElewiseMul.py b/aikg/benchmark/aikgbench/llm/elewise/ElewiseMul.py new file mode 100644 index 0000000000000000000000000000000000000000..7d87922f1d0965fc0595bf2b023ed0107004dd3f --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/elewise/ElewiseMul.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs element-wise multiplication between two tensors. + """ + + def __init__(self): + super(Model, self).__init__() + # For element-wise multiplication, we don't need learnable parameters + # The multiplication will be performed between two input tensors + + def forward(self, x, y): + """ + Perform element-wise multiplication between two tensors. + + Args: + x: First input tensor + y: Second input tensor + + Returns: + Element-wise product of x and y + """ + return torch.mul(x, y) + + +# Model parameters - using the same shape as in test_mul.py +shape = (1000000,) +batch_size = 1 # For element-wise operations, batch size is typically 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_mul.py + """ + # Generate random tensors similar to test_mul.py (range [0, 100]) + input0 = torch.rand(shape) * 100 + input1 = torch.rand(shape) * 100 + return [input0, input1] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For element-wise multiplication, no specific initialization parameters are needed. 
+ """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/elewise/ElewiseMuls.py b/aikg/benchmark/aikgbench/llm/elewise/ElewiseMuls.py new file mode 100644 index 0000000000000000000000000000000000000000..7c732a6e46292c72106c99ab9ae9dec3f7aef721 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/elewise/ElewiseMuls.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs element-wise scalar multiplication on input tensor. + """ + + def __init__(self, scalar_value=2): + super(Model, self).__init__() + self.scalar_value = scalar_value + + def forward(self, x): + """ + Perform element-wise scalar multiplication on input tensor. + + Args: + x: Input tensor + + Returns: + Element-wise product of input tensor and scalar value + """ + return torch.mul(x, self.scalar_value) + + +# Model parameters - using the same shape as in test_muls.py +shape = (1000000,) +batch_size = 1 # For element-wise operations, batch size is typically 1 +scalar_value = 2 # From test_muls.py OP_PARAM_MULS["varAttr"] + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_muls.py + """ + # Generate random tensors similar to test_muls.py (range [0, 100]) + input0 = torch.rand(shape) * 100 + return [input0] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For element-wise scalar multiplication, we need the scalar value. + """ + return [scalar_value] diff --git a/aikg/benchmark/aikgbench/llm/elewise/ElewiseRealDiv.py b/aikg/benchmark/aikgbench/llm/elewise/ElewiseRealDiv.py new file mode 100644 index 0000000000000000000000000000000000000000..28828f5f29bf2b6b08dc3791e65b349fc18f88b0 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/elewise/ElewiseRealDiv.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs element-wise real division between two tensors. + """ + + def __init__(self): + super(Model, self).__init__() + # For element-wise division, we don't need learnable parameters + + def forward(self, x, y): + """ + Perform element-wise real division between two tensors. + + Args: + x: First input tensor (dividend) + y: Second input tensor (divisor) + + Returns: + Element-wise quotient of x divided by y + """ + return torch.div(x, y) + + +# Model parameters - using the same shape as in test_real_div.py +shape = (1000000,) +batch_size = 1 # For element-wise operations, batch size is typically 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_real_div.py + """ + # Generate random tensors similar to test_real_div.py (range [0, 100]) + input0 = torch.rand(shape) * 100 + input1 = torch.rand(shape) * 100 + # Avoid division by zero by replacing zeros with small values + input1 = torch.where(input1 == 0, torch.tensor(2**(-4)), input1) + return [input0, input1] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For element-wise division, no specific initialization parameters are needed. 
+ """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/elewise/ElewiseTanh.py b/aikg/benchmark/aikgbench/llm/elewise/ElewiseTanh.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b978c46d49be8bfa9f1dc1a4d9938063a95a48 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/elewise/ElewiseTanh.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs tanh activation function on input tensor. + """ + + def __init__(self): + super(Model, self).__init__() + # For tanh activation, we don't need learnable parameters + + def forward(self, x): + """ + Apply tanh activation function to input tensor. + + Args: + x: Input tensor + + Returns: + Tanh activation of input tensor + """ + return torch.tanh(x) + + +# Model parameters - using the same shape as in test_tanh.py +shape = (1000000,) +batch_size = 1 # For element-wise operations, batch size is typically 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_tanh.py + """ + # Generate random tensors similar to test_tanh.py (range [-100, 100]) + input0 = torch.rand(shape) * 200 - 100 + return [input0] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For tanh activation, no specific initialization parameters are needed. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/elewise/FastSoftMax.py b/aikg/benchmark/aikgbench/llm/elewise/FastSoftMax.py new file mode 100644 index 0000000000000000000000000000000000000000..7a51516fefccf5cb3d9bec9ba1a4f32cc48b4de5 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/elewise/FastSoftMax.py @@ -0,0 +1,82 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs fast softmax operation. + Based on test_fastsoftmax_operation.py from opstest/python/operations/fast_soft_max/ + """ + + def __init__(self, head_num=8, q_seq_len=None): + super(Model, self).__init__() + self.head_num = head_num + self.q_seq_len = q_seq_len if q_seq_len is not None else [ + 200] # Default sequence length + + def forward(self, x): + """ + Perform fast softmax operation on input tensor. + + Args: + x: Input tensor to be processed + + Returns: + Softmax output tensor + """ + golden = torch.empty_like(x) + start = 0 + + for i in range(len(self.q_seq_len)): + end = start + self.head_num * self.q_seq_len[i] * self.q_seq_len[i] + cur_data_input = x[start:end].reshape(-1, self.q_seq_len[i]) + cur_golden = torch.softmax(cur_data_input.to( + torch.float32), dim=-1).to(torch.float16) + golden[start:end] = cur_golden.reshape(-1) + start = end + + return golden + + +# Model parameters - using the same parameters as in test_fastsoftmax_operation.py +# Default parameters for fast softmax operation +default_params = { + 'head_num': 8, # Default number of heads + 'batch_size': 4, # Default batch size + 'seq_len_range': (100, 300) # Default sequence length range +} + +# Default sequence lengths for testing +default_seq_lens = [200, 150, 250, 180] # Example sequence lengths + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+    Returns tensors with different data types as tested in test_fastsoftmax_operation.py
+    """
+    # Use default parameters from test_fastsoftmax_operation.py
+    batch_size = default_params['batch_size']
+    head_num = default_params['head_num']
+    q_seq_len = default_seq_lens
+
+    # Generate data_input_list based on q_seq_len
+    data_input_list = []
+    for i in range(batch_size):
+        data_input = torch.randn(
+            head_num * q_seq_len[i] * q_seq_len[i]).to(torch.float16)
+        data_input_list.append(data_input)
+
+    # Concatenate all inputs
+    data_input = torch.cat(data_input_list)
+
+    return [data_input]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    For fast softmax operation, we need head_num and q_seq_len.
+    """
+    # Use default sequence lengths for initialization
+    return [default_params['head_num'], default_seq_lens]
diff --git a/aikg/benchmark/aikgbench/llm/index/DynamicQuantUpdateScatter.py b/aikg/benchmark/aikgbench/llm/index/DynamicQuantUpdateScatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..924cb31876b2807e5825e9777f1fee28ffaf0f65
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/index/DynamicQuantUpdateScatter.py
@@ -0,0 +1,81 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs DynamicQuantUpdateScatter operation.
+    """
+
+    def __init__(self, reduce="add", axis=0):
+        super(Model, self).__init__()
+        self.reduce = reduce
+        self.axis = axis
+
+    def forward(self, var, var_scale, indices, updates, smooth_scales):
+        """
+        Perform DynamicQuantUpdateScatter operation.
+
+        Args:
+            var: Quantized variable tensor (int8)
+            var_scale: Variable scale tensor (float32)
+            indices: Indices tensor
+            updates: Updates tensor (float16/bfloat16)
+            smooth_scales: Smooth scales tensor (float16/bfloat16)
+
+        Returns:
+            Tuple of (y, var, var_scale) tensors
+        """
+        # Dequantize var using var_scale
+        var_dequantized = var.float() * var_scale
+
+        # Apply smooth scales to updates
+        scaled_updates = updates * smooth_scales
+
+        # Create a copy of var_dequantized for scatter operation
+        output = var_dequantized.clone()
+
+        # Ensure data type compatibility
+        output = output.to(torch.float32)
+        scaled_updates = scaled_updates.to(torch.float32)
+
+        # Perform scatter operation based on reduce mode
+        if self.reduce == "add":
+            output.scatter_add_(self.axis, indices, scaled_updates)
+        else:
+            output.scatter_(self.axis, indices, scaled_updates)
+
+        # Re-quantize the result
+        # Find the scale for the updated tensor
+        max_val = torch.abs(output).max()
+        new_scale = max_val / 127.0  # Assuming int8 quantization
+
+        # Quantize to int8
+        y = torch.clamp(torch.round(output / new_scale), -
+                        128, 127).to(torch.int8)
+
+        return y, output, new_scale.unsqueeze(0)
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    # Create tensors with appropriate shapes and types
+    var_shape = [4, 4]
+    indices_shape = [2, 4]
+
+    var = torch.randint(-128, 127, var_shape, dtype=torch.int8)
+    var_scale = torch.randn(1, dtype=torch.float32)
+    indices = torch.randint(0, 4, indices_shape, dtype=torch.int64)  # changed to int64
+    updates = torch.randn(indices_shape, dtype=torch.float16)
+    smooth_scales = torch.randn(indices_shape, dtype=torch.float16)
+
+    return [var, var_scale, indices, updates, smooth_scales]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+ """ + return ["add", 0] # reduce="add", axis=0 diff --git a/aikg/benchmark/aikgbench/llm/index/EmbeddingBag.py b/aikg/benchmark/aikgbench/llm/index/EmbeddingBag.py new file mode 100644 index 0000000000000000000000000000000000000000..1005651f07ef11b998106a950622a10f3b43431f --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/index/EmbeddingBag.py @@ -0,0 +1,70 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs EmbeddingBag operation. + """ + + def __init__(self, scale_grad_by_freq=False, mode='sum', sparse=False, include_last_offset=False, padding_idx=-1): + super(Model, self).__init__() + self.scale_grad_by_freq = scale_grad_by_freq + self.mode = mode + self.sparse = sparse + self.include_last_offset = include_last_offset + self.padding_idx = padding_idx + + def forward(self, weight, indices, offsets, per_sample_weights=None): + """ + Perform EmbeddingBag operation. + + Args: + weight: Embedding weight tensor + indices: Indices tensor + offsets: Offsets tensor + per_sample_weights: Per-sample weights tensor (optional) + + Returns: + Embedding bag output tensor + """ + # Use torch.nn.functional.embedding_bag for the operation + output = torch.nn.functional.embedding_bag( + indices, + weight, + offsets=offsets, + max_norm=None, + norm_type=2, + scale_grad_by_freq=self.scale_grad_by_freq, + mode=self.mode, + sparse=self.sparse, + per_sample_weights=per_sample_weights, + include_last_offset=self.include_last_offset, + padding_idx=self.padding_idx, + ) + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use shapes from gen_data.py: weight = np.random.randn(9).reshape(3, 3), indices = np.random.randint(0, 3, size=6) + num_weights = 3 + weight_shape = [3, 3] + indices_shape = [6] + + weight = torch.randn(weight_shape, dtype=torch.float32) + indices = torch.randint(0, num_weights, indices_shape, dtype=torch.int64) + offsets = torch.tensor([0, 2, 4, 5], dtype=torch.int64) + per_sample_weights = torch.ones(indices_shape, dtype=torch.float32) + + return [weight, indices, offsets, per_sample_weights] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [False, 'sum', False, False, 1] # scale_grad_by_freq=False, mode='sum', sparse=False, include_last_offset=False, padding_idx=1 diff --git a/aikg/benchmark/aikgbench/llm/index/EmbeddingDenseGradV2.py b/aikg/benchmark/aikgbench/llm/index/EmbeddingDenseGradV2.py new file mode 100644 index 0000000000000000000000000000000000000000..65429ccd10388a99cec4775ccaab5100b00b8534 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/index/EmbeddingDenseGradV2.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs EmbeddingDenseGradV2 operation. + """ + + def __init__(self, num_weights=4, padding_idx=0, scale_grad_by_freq=False): + super(Model, self).__init__() + self.num_weights = num_weights + self.padding_idx = padding_idx + self.scale_grad_by_freq = scale_grad_by_freq + + def forward(self, grad, sort_indices, pos_idx, num_weights_tensor=None, padding_idx_tensor=None, scale_grad_by_freq_tensor=None): + """ + Perform EmbeddingDenseGradV2 operation. 
+ + Args: + grad: Gradient tensor + sort_indices: Sorted indices tensor + pos_idx: Position indices tensor + num_weights_tensor: Number of weights tensor (optional) + padding_idx_tensor: Padding index tensor (optional) + scale_grad_by_freq_tensor: Scale gradient by frequency tensor (optional) + + Returns: + Embedding gradient tensor + """ + # Use provided tensors or fall back to instance variables + num_weights = num_weights_tensor.item( + ) if num_weights_tensor is not None else self.num_weights + padding_idx = padding_idx_tensor.item( + ) if padding_idx_tensor is not None else self.padding_idx + scale_grad_by_freq = scale_grad_by_freq_tensor.item( + ) if scale_grad_by_freq_tensor is not None else self.scale_grad_by_freq + + # Use torch.ops.aten.embedding_dense_backward for the operation + result = torch.ops.aten.embedding_dense_backward( + grad_output=grad, + indices=sort_indices, + num_weights=num_weights, + padding_idx=padding_idx, + scale_grad_by_freq=scale_grad_by_freq + ) + + return result + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use shapes from gen_data.py: grad = np.random.randn(6).reshape(2, 3), num_weights = 4 + grad_shape = [2, 3] + num_weights = 4 + + grad = torch.randn(grad_shape, dtype=torch.float32) + sort_indices = torch.randint(0, num_weights, (2,), dtype=torch.int32) + pos_idx = torch.randint(0, 2, (2,), dtype=torch.int32) + num_weights_tensor = torch.tensor([num_weights], dtype=torch.int32) + padding_idx_tensor = torch.tensor([0], dtype=torch.int32) + scale_grad_by_freq_tensor = torch.tensor([False], dtype=torch.bool) + + return [grad, sort_indices, pos_idx, num_weights_tensor, padding_idx_tensor, scale_grad_by_freq_tensor] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [4, 0, False] # num_weights=4, padding_idx=0, scale_grad_by_freq=False diff --git a/aikg/benchmark/aikgbench/llm/index/FeedsRepeat.py b/aikg/benchmark/aikgbench/llm/index/FeedsRepeat.py new file mode 100644 index 0000000000000000000000000000000000000000..990c3a478ebb26d5613dbc80aaa565a2bb8092fe --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/index/FeedsRepeat.py @@ -0,0 +1,64 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs FeedsRepeat operation. + """ + + def __init__(self, output_feeds_size=500): + super(Model, self).__init__() + self.output_feeds_size = output_feeds_size + + def forward(self, feeds, feeds_repeat_times): + """ + Perform FeedsRepeat operation. + + Args: + feeds: Input feeds tensor + feeds_repeat_times: Repeat times for each feed + + Returns: + Repeated and padded feeds tensor + """ + # Repeat feeds according to feeds_repeat_times + repeated_feeds = torch.repeat_interleave( + feeds, feeds_repeat_times, dim=0) + + # Calculate total repeated size + total_repeated = torch.sum(feeds_repeat_times) + + # Calculate padding needed + pad_size = self.output_feeds_size - total_repeated + + if pad_size > 0: + # Pad with zeros to reach output_feeds_size + pad_shape = list(repeated_feeds.shape) + pad_shape[0] = pad_size + padding = torch.zeros( + pad_shape, dtype=repeated_feeds.dtype, device=repeated_feeds.device) + output = torch.cat([repeated_feeds, padding], dim=0) + else: + # Truncate if output_feeds_size is smaller than total repeated + output = repeated_feeds[:self.output_feeds_size] + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+    """
+    # Use shapes from gen_data.py: feeds = np.array([1, 2, 3, 4, 5, 6]).reshape(2, 3)
+    feeds = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
+    feeds_repeat_times = torch.tensor([100, 200], dtype=torch.int32)
+
+    return [feeds, feeds_repeat_times]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    """
+    return [500]  # output_feeds_size=500
diff --git a/aikg/benchmark/aikgbench/llm/index/GatherV3.py b/aikg/benchmark/aikgbench/llm/index/GatherV3.py
new file mode 100644
index 0000000000000000000000000000000000000000..79ce808b919ec1f88eebee6dd2e0b501bfb70632
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/index/GatherV3.py
@@ -0,0 +1,53 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs GatherV3 operation.
+    """
+
+    def __init__(self, axis=0):
+        super(Model, self).__init__()
+        self.axis = axis
+
+    def forward(self, self_tensor, indices, axis_tensor=None):
+        """
+        Perform GatherV3 operation.
+
+        Args:
+            self_tensor: Input tensor
+            indices: Index tensor
+            axis_tensor: Axis tensor (optional, uses self.axis if not provided)
+
+        Returns:
+            Gathered tensor
+        """
+        # Use the provided axis_tensor or fall back to self.axis
+        axis = axis_tensor.item() if axis_tensor is not None else self.axis
+
+        # Use torch.gather to gather values along the specified axis
+        result = torch.gather(self_tensor, axis, indices)
+        return result
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    # Use shapes from gen_data.py: input_data = torch.randn(4, 2), index = torch.randint(0, 4, (2,))
+    self_shape = [4, 2]
+    indices_shape = [2]
+
+    self_tensor = torch.randn(self_shape, dtype=torch.float32)
+    indices = torch.randint(0, 4, indices_shape, dtype=torch.int64)  # changed to int64
+    axis_tensor = torch.tensor([0], dtype=torch.int64)
+
+    return [self_tensor, indices, axis_tensor]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    """
+    return [0]  # axis=0
diff --git a/aikg/benchmark/aikgbench/llm/index/InplaceIndexAddWithSorted.py b/aikg/benchmark/aikgbench/llm/index/InplaceIndexAddWithSorted.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3adae31589b194998eceed488144c47082a0603
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/index/InplaceIndexAddWithSorted.py
@@ -0,0 +1,61 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs InplaceIndexAddWithSorted operation.
+    """
+
+    def __init__(self, axis=0):
+        super(Model, self).__init__()
+        self.axis = axis
+
+    def forward(self, var, value, sorted_indices, pos, alpha=1.0):
+        """
+        Perform InplaceIndexAddWithSorted operation.
+
+        Args:
+            var: Base tensor to add into
+            value: Source tensor with values to add
+            sorted_indices: Sorted index tensor
+            pos: Position tensor
+            alpha: Scaling factor
+
+        Returns:
+            Tensor with values added at specified indices
+        """
+        # Create a copy of var to avoid modifying the original
+        output = var.clone()
+
+        # Use index_add_ to perform in-place index add operation
+        # This adds values at the specified indices along the given axis
+        output.index_add_(self.axis, sorted_indices, value, alpha=alpha)
+
+        return output
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+ """ + # Use shapes from gen_data.py: self = np.random.randn(4, 2), index = torch.randint(0, 4, (4,)) + var_shape = [4, 2] + value_shape = [4, 2] + sorted_indices_shape = [4] + pos_shape = [4] + + var = torch.randn(var_shape, dtype=torch.float32) + value = torch.randn(value_shape, dtype=torch.float32) + sorted_indices = torch.randint( + 0, 4, sorted_indices_shape, dtype=torch.int32) + pos = torch.randint(0, 4, pos_shape, dtype=torch.int32) + + return [var, value, sorted_indices, pos] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [0] # axis=0 diff --git a/aikg/benchmark/aikgbench/llm/index/MaskedSelectV3.py b/aikg/benchmark/aikgbench/llm/index/MaskedSelectV3.py new file mode 100644 index 0000000000000000000000000000000000000000..d4aa88e10b5cc269ed641966eff1d2d1722edb94 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/index/MaskedSelectV3.py @@ -0,0 +1,45 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs MaskedSelectV3 operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, mask): + """ + Perform MaskedSelectV3 operation. + + Args: + x: Input tensor + mask: Boolean mask tensor + + Returns: + Selected elements as 1D tensor + """ + # Use torch.masked_select to select elements based on boolean mask + result = torch.masked_select(x, mask) + return result + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use shapes from gen_data.py: input_tensor = torch.tensor([[1, 2, 3], [4, 5, 6]]) + x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.int32) + mask = torch.tensor( + [[True, False, True], [False, True, False]], dtype=torch.bool) + + return [x, mask] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/index/ScatterAddWithSorted.py b/aikg/benchmark/aikgbench/llm/index/ScatterAddWithSorted.py new file mode 100644 index 0000000000000000000000000000000000000000..5a8a1666e3e7358e361375ea3b7122a893530f43 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/index/ScatterAddWithSorted.py @@ -0,0 +1,61 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs ScatterAddWithSorted operation. + """ + + def __init__(self, dim=0): + super(Model, self).__init__() + self.dim = dim + + def forward(self, var, value, sorted_index, pos, reduction="add"): + """ + Perform ScatterAddWithSorted operation. + + Args: + var: Base tensor to scatter into + value: Source tensor with values to scatter + sorted_index: Sorted index tensor + pos: Position tensor + reduction: Reduction method ("add" for scatter_add) + + Returns: + Scattered tensor with accumulated values + """ + # Create a copy of var to avoid modifying the original + output = var.clone() + + # Use scatter_add_ to perform in-place scatter add operation + # This accumulates values at the same indices + output.scatter_add_(self.dim, sorted_index, value) + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+    """
+    # Use shapes from gen_data.py: self = torch.randn(4, 4), index = torch.randint(0, 3, (3, 4))
+    var_shape = [4, 4]
+    value_shape = [4, 4]
+    sorted_index_shape = [3, 4]
+    pos_shape = [3, 4]
+
+    var = torch.randn(var_shape, dtype=torch.float32)
+    value = torch.randn(value_shape, dtype=torch.float32)
+    sorted_index = torch.randint(
+        0, 3, sorted_index_shape, dtype=torch.int64)  # use int64 for the index dtype
+    pos = torch.randint(0, 4, pos_shape, dtype=torch.int32)
+
+    return [var, value, sorted_index, pos]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    """
+    return [0]  # dim=0
diff --git a/aikg/benchmark/aikgbench/llm/index/ScatterElementsV2.py b/aikg/benchmark/aikgbench/llm/index/ScatterElementsV2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c35f7c2e90cdaccc0ca2c3020d55edf62b132a0
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/index/ScatterElementsV2.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs ScatterElementsV2 operation.
+    """
+
+    def __init__(self, dim=0):
+        super(Model, self).__init__()
+        self.dim = dim
+
+    def forward(self, self_tensor, index, src):
+        """
+        Perform ScatterElementsV2 operation.
+
+        Args:
+            self_tensor: Base tensor to scatter into
+            index: Index tensor specifying positions
+            src: Source tensor with values to scatter
+
+        Returns:
+            Scattered tensor
+        """
+        # Create a copy of self_tensor to avoid modifying the original
+        output = self_tensor.clone()
+
+        # Use scatter_ to perform in-place scatter operation along specified dimension
+        output.scatter_(self.dim, index, src)
+
+        return output
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    # Use shapes from gen_data.py: self_shape = [3, 4], index_shape = [2, 3], src_shape = [2, 3]
+    self_shape = [3, 4]
+    index_shape = [2, 3]
+    src_shape = [2, 3]
+
+    self_tensor = torch.randn(self_shape, dtype=torch.float32)
+    # indices within valid range
+    index = torch.randint(0, 3, index_shape, dtype=torch.int64)
+    src = torch.randn(src_shape, dtype=torch.float32)
+
+    return [self_tensor, index, src]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    """
+    return [0]  # dim=0
diff --git a/aikg/benchmark/aikgbench/llm/index/ScatterList.py b/aikg/benchmark/aikgbench/llm/index/ScatterList.py
new file mode 100644
index 0000000000000000000000000000000000000000..e15d79e3eea028048235859ac4945cff59f13dbe
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/index/ScatterList.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs ScatterList operation.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, self_tensor, index, src):
+        """
+        Perform ScatterList operation.
+
+        Args:
+            self_tensor: Base tensor to scatter into
+            index: Index tensor specifying positions
+            src: Source tensor with values to scatter
+
+        Returns:
+            Scattered tensor
+        """
+        # Create a copy of self_tensor to avoid modifying the original
+        output = self_tensor.clone()
+
+        # Simplified stand-in for ScatterList with the gen_data shapes: drop the
+        # leading list dimension of src and copy the indexed rows into the output
+        rows = index.flatten().long()
+        output[rows] = src.reshape(self_tensor.shape)[rows]
+
+        return output
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
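+    Mirrors gen_data.py: the var tensor is (5, 3, 4) float32, the indices are
+    (1, 2) int64 and the updates tensor is (1, 5, 3, 4) float32.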
+ """ + # Use shapes from gen_data.py: varRefShape = [5, 3, 4], indiceShape = [1, 2], updatesShape = [1, 5, 3, 4] + varRefShape = [5, 3, 4] + indiceShape = [1, 2] + updatesShape = [1, 5, 3, 4] + + self_tensor = torch.randn(varRefShape, dtype=torch.float32) + # indices within valid range + index = torch.randint(0, 3, indiceShape, dtype=torch.int64) + src = torch.randn(updatesShape, dtype=torch.float32) + + return [self_tensor, index, src] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/index/TopKV3.py b/aikg/benchmark/aikgbench/llm/index/TopKV3.py new file mode 100644 index 0000000000000000000000000000000000000000..d82e50f9d7c1bcb8d716a04a1f907c6c58de727e --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/index/TopKV3.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs TopKV3 operation. + """ + + def __init__(self, k=2, dim=1, largest=True, sorted=True): + super(Model, self).__init__() + self.k = k + self.dim = dim + self.largest = largest + self.sorted = sorted + + def forward(self, self_tensor): + """ + Perform TopKV3 operation. + + Args: + self_tensor: Input tensor + + Returns: + Tuple of (values, indices) tensors + """ + values, indices = torch.topk(self_tensor, k=self.k, dim=self.dim, + largest=self.largest, sorted=self.sorted) + return values, indices + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use shapes from gen_data.py: input_shape = [2, 16] + input_shape = [2, 16] + self_tensor = torch.randn(input_shape, dtype=torch.float32) + return [self_tensor] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [2, 1, True, True] # k=2, dim=1, largest=True, sorted=True diff --git a/aikg/benchmark/aikgbench/llm/matmul/BatchMatMulV3.py b/aikg/benchmark/aikgbench/llm/matmul/BatchMatMulV3.py new file mode 100644 index 0000000000000000000000000000000000000000..566495287ac7b71f7d91872d68011f41700d8006 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/matmul/BatchMatMulV3.py @@ -0,0 +1,74 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs BatchMatMulV3 operation. + """ + + def __init__(self, adj_x1=False, adj_x2=False, offset_x=0, enable_hf32=False): + super(Model, self).__init__() + self.adj_x1 = adj_x1 + self.adj_x2 = adj_x2 + self.offset_x = offset_x + self.enable_hf32 = enable_hf32 + + def forward(self, x1, x2, bias=None, offset_w=None): + """ + Perform BatchMatMulV3 operation. + + Args: + x1: First input tensor + x2: Second input tensor + bias: Optional bias tensor + offset_w: Optional offset tensor + + Returns: + Output tensor after batch matrix multiplication + """ + # Apply adjoint operations if needed + if self.adj_x1: + x1 = x1.transpose(-2, -1) + if self.adj_x2: + x2 = x2.transpose(-2, -1) + + # Perform batch matrix multiplication + output = torch.matmul(x1, x2) + + # Add bias if provided + if bias is not None: + output = output + bias + + # Add offset if provided + if offset_w is not None: + output = output + offset_w + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. 
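+    Uses the README shapes: x1 is (2, 16, 32) and x2 is (2, 32, 16), with an
+    optional (2, 1, 16) bias and a (2, 16, 16) offset, all float32.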
+ """ + # Use shapes from README: x1(2, 16, 32), x2(2, 32, 16) + batch_size, m, k, n = 2, 16, 32, 16 + + # Generate input tensors + x1 = torch.randn(batch_size, m, k, dtype=torch.float32) + x2 = torch.randn(batch_size, k, n, dtype=torch.float32) + + # Generate optional bias + bias = torch.randn(batch_size, 1, n, dtype=torch.float32) + + # Generate optional offset + offset_w = torch.randn(batch_size, m, n, dtype=torch.float32) + + return [x1, x2, bias, offset_w] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [False, False, 0, False] # adj_x1=False, adj_x2=False, offset_x=0, enable_hf32=False diff --git a/aikg/benchmark/aikgbench/llm/matmul/ComplexMatMul.py b/aikg/benchmark/aikgbench/llm/matmul/ComplexMatMul.py new file mode 100644 index 0000000000000000000000000000000000000000..eab97519bccdfa8f97238c609545f0498ee92f13 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/matmul/ComplexMatMul.py @@ -0,0 +1,62 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs ComplexMatMul operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, bias): + """ + Perform ComplexMatMul operation. + + Args: + x: First complex input tensor + y: Second complex input tensor + bias: Complex bias tensor + + Returns: + Complex output tensor after matrix multiplication + """ + # Perform complex matrix multiplication + output = torch.matmul(x, y) + + # Add bias + output = output + bias + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use complex64 data type as specified in README + batch_size, m, k, n = 2, 16, 32, 16 + + # Generate complex input tensors + x_real = torch.randn(batch_size, m, k, dtype=torch.float32) + x_imag = torch.randn(batch_size, m, k, dtype=torch.float32) + x = torch.complex(x_real, x_imag) + + y_real = torch.randn(batch_size, k, n, dtype=torch.float32) + y_imag = torch.randn(batch_size, k, n, dtype=torch.float32) + y = torch.complex(y_real, y_imag) + + # Generate complex bias + bias_real = torch.randn(batch_size, m, n, dtype=torch.float32) + bias_imag = torch.randn(batch_size, m, n, dtype=torch.float32) + bias = torch.complex(bias_real, bias_imag) + + return [x, y, bias] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/matmul/GemmV2.py b/aikg/benchmark/aikgbench/llm/matmul/GemmV2.py new file mode 100644 index 0000000000000000000000000000000000000000..4fba348216201c6cb99c2a0931fbdfb7465ae61b --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/matmul/GemmV2.py @@ -0,0 +1,64 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs GemmV2 operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, A, B, alpha, beta, C): + """ + Perform GemmV2 operation. + + Args: + A: First input tensor + B: Second input tensor + alpha: Alpha scaling factor + beta: Beta scaling factor + C: Third input tensor + + Returns: + Output tensor: out = α(A @ B) + βC + """ + # Perform matrix multiplication + matmul_result = torch.matmul(A, B) + + # Apply alpha scaling + scaled_result = alpha * matmul_result + + # Apply beta scaling to C and add + output = scaled_result + beta * C + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. 
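+    Uses the README shapes: A and B are (2, 2) float16 matrices, alpha and beta
+    are 1-element float16 tensors, and C is a (2, 2) float32 matrix.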
+ """ + # Use shapes from README: A(2, 2), B(2, 2), C(2, 2) + m, n, k = 2, 2, 2 + + # Generate input tensors (using float16 as specified) + A = torch.randn(m, k, dtype=torch.float16) + B = torch.randn(k, n, dtype=torch.float16) + + # Generate scaling factors + alpha = torch.randn(1, dtype=torch.float16) + beta = torch.randn(1, dtype=torch.float16) + + # Generate C tensor (using float32 as specified) + C = torch.randn(m, n, dtype=torch.float32) + + return [A, B, alpha, beta, C] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/matmul/MatMulV3.py b/aikg/benchmark/aikgbench/llm/matmul/MatMulV3.py new file mode 100644 index 0000000000000000000000000000000000000000..712ef63e284e30d775d45e83f67d1cb188ecb510 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/matmul/MatMulV3.py @@ -0,0 +1,74 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs MatMulV3 operation. + """ + + def __init__(self, transpose_x1=False, transpose_x2=False, offset_x=0, enable_hf32=False): + super(Model, self).__init__() + self.transpose_x1 = transpose_x1 + self.transpose_x2 = transpose_x2 + self.offset_x = offset_x + self.enable_hf32 = enable_hf32 + + def forward(self, x1, x2, bias=None, offset_w=None): + """ + Perform MatMulV3 operation. + + Args: + x1: First input tensor + x2: Second input tensor + bias: Optional bias tensor + offset_w: Optional offset tensor + + Returns: + Output tensor after matrix multiplication + """ + # Apply transpose operations if needed + if self.transpose_x1: + x1 = x1.transpose(-2, -1) + if self.transpose_x2: + x2 = x2.transpose(-2, -1) + + # Perform matrix multiplication + output = torch.matmul(x1, x2) + + # Add bias if provided + if bias is not None: + output = output + bias + + # Add offset if provided + if offset_w is not None: + output = output + offset_w + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use shapes from README: x1(2, 16, 32), x2(2, 32, 16) + batch_size, m, k, n = 2, 16, 32, 16 + + # Generate input tensors + x1 = torch.randn(batch_size, m, k, dtype=torch.float32) + x2 = torch.randn(batch_size, k, n, dtype=torch.float32) + + # Generate optional bias + bias = torch.randn(batch_size, 1, n, dtype=torch.float32) + + # Generate optional offset + offset_w = torch.randn(batch_size, m, n, dtype=torch.float32) + + return [x1, x2, bias, offset_w] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [False, False, 0, False] # transpose_x1=False, transpose_x2=False, offset_x=0, enable_hf32=False diff --git a/aikg/benchmark/aikgbench/llm/matmul/Mmad.py b/aikg/benchmark/aikgbench/llm/matmul/Mmad.py new file mode 100644 index 0000000000000000000000000000000000000000..9303af50cf0bf623cc844deb998b83e5c7280a12 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/matmul/Mmad.py @@ -0,0 +1,55 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs Mmad operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, A, B, Bias): + """ + Perform Mmad operation. 
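+        Computes C = A @ B + Bias, with Bias broadcast across the rows of the
+        product.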
+ + Args: + A: Left matrix tensor [M, K] + B: Right matrix tensor [K, N] + Bias: Bias tensor [N] + + Returns: + Output tensor C = A * B + Bias + """ + # Perform matrix multiplication + output = torch.matmul(A, B) + + # Add bias to each row + output = output + Bias + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use shapes from README: A(32, 32), B(32, 32), Bias(1, 32) + M, K, N = 32, 32, 32 + + # Generate input tensors (using float16 as specified) + A = torch.randn(M, K, dtype=torch.float16) + B = torch.randn(K, N, dtype=torch.float16) + + # Generate bias (using float as specified) + Bias = torch.randn(1, N, dtype=torch.float32) + + return [A, B, Bias] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/matmul/QuantBatchMatmulV3.py b/aikg/benchmark/aikgbench/llm/matmul/QuantBatchMatmulV3.py new file mode 100644 index 0000000000000000000000000000000000000000..a149d92ffd80ce1adda281bb726ba233d2625d10 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/matmul/QuantBatchMatmulV3.py @@ -0,0 +1,69 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs QuantBatchMatmulV3 operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, x1, x2, scale, offset, bias=None): + """ + Perform QuantBatchMatmulV3 operation. + + Args: + x1: First quantized input tensor + x2: Second quantized input tensor + scale: Scale factor for quantization + offset: Offset factor for quantization + bias: Optional bias tensor + + Returns: + Quantized output tensor + """ + # Convert to float for matrix multiplication + x1_float = x1.float() + x2_float = x2.float() + + # Perform matrix multiplication + output = torch.matmul(x1_float, x2_float) + + # Add bias if provided + if bias is not None: + output = output + bias + + # Apply scale and offset + output = output * scale + offset + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use shapes from README: x1(16, 32), x2(32, 16) + m, k, n = 16, 32, 16 + + # Generate quantized input tensors (using int8 as specified) + x1 = torch.randint(-128, 127, (m, k), dtype=torch.int8) + x2 = torch.randint(-128, 127, (k, n), dtype=torch.int8) + + # Generate scale and offset + scale = torch.randn(1, dtype=torch.float32) + offset = torch.randn(1, dtype=torch.float32) + + # Generate optional bias + bias = torch.randn(n, dtype=torch.float32) + + return [x1, x2, scale, offset, bias] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/matmul/WeightQuantBatchMatmulV2.py b/aikg/benchmark/aikgbench/llm/matmul/WeightQuantBatchMatmulV2.py new file mode 100644 index 0000000000000000000000000000000000000000..70b668b083d164f7eda6a57222021d0ebce253e9 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/matmul/WeightQuantBatchMatmulV2.py @@ -0,0 +1,77 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs WeightQuantBatchMatmulV2 operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, weight, antiquantScale, antiquantOffsetOptional=None, quantScaleOptional=None, quantOffsetOptional=None): + """ + Perform WeightQuantBatchMatmulV2 operation. 
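+        Dequantizes the weight as (weight + antiquant_offset) * antiquant_scale,
+        multiplies x by the dequantized weight, and optionally applies the output
+        quantization scale and offset.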
+ + Args: + x: Input tensor + weight: Quantized weight tensor + antiquantScale: Scale for weight dequantization + antiquantOffsetOptional: Optional offset for weight dequantization + quantScaleOptional: Optional scale for output quantization + quantOffsetOptional: Optional offset for output quantization + + Returns: + Output tensor after weight dequantization and matrix multiplication + """ + # Dequantize weight: ANTIQUANT(weight) = (weight + antiquantOffset) * antiquantScale + dequantized_weight = weight.float() + if antiquantOffsetOptional is not None: + dequantized_weight = dequantized_weight + antiquantOffsetOptional + dequantized_weight = dequantized_weight * antiquantScale + + # Convert x to float32 for matrix multiplication if needed + x_float = x.float() if x.dtype != torch.float32 else x + + # Perform matrix multiplication + output = torch.matmul(x_float, dequantized_weight) + + # Apply output quantization if provided + if quantScaleOptional is not None: + output = output * quantScaleOptional + if quantOffsetOptional is not None: + output = output + quantOffsetOptional + + return output + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use shapes from README: x(16, 32), weight(32, 16) + m, k, n = 16, 32, 16 + + # Generate input tensors (using float16 as specified) + x = torch.randn(m, k, dtype=torch.float16) + + # Generate quantized weight (using int8 as specified) + weight = torch.randint(-128, 127, (k, n), dtype=torch.int8) + + # Generate dequantization parameters + antiquantScale = torch.randn(1, dtype=torch.float16) + antiquantOffsetOptional = torch.randn(1, dtype=torch.float16) + + # Generate optional quantization parameters + quantScaleOptional = torch.randn(1, dtype=torch.float32) + quantOffsetOptional = torch.randn(1, dtype=torch.float32) + + return [x, weight, antiquantScale, antiquantOffsetOptional, quantScaleOptional, quantOffsetOptional] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/norm/AddLayerNorm.py b/aikg/benchmark/aikgbench/llm/norm/AddLayerNorm.py new file mode 100644 index 0000000000000000000000000000000000000000..b7ba6f6a748c8b943d41273553118c8b22d65474 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/AddLayerNorm.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs AddLayerNorm operation. + """ + + def __init__(self, epsilon=1e-6, additional_output=False): + super(Model, self).__init__() + self.epsilon = epsilon + self.additional_output = additional_output + + def forward(self, x1, x2, gamma, beta, bias=None): + """ + Perform AddLayerNorm operation. 
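+        Computes x = x1 + x2 (+ bias) and then applies LayerNorm over the last
+        dimension: y = (x - mean) * rstd * gamma + beta.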
+ + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + beta: Shift parameter tensor + bias: Optional bias tensor + + Returns: + Tuple of (output, mean, rstd, x) if additional_output=True + Otherwise just the output tensor + """ + # Add the two input tensors + if bias is not None: + x = x1 + x2 + bias + else: + x = x1 + x2 + + # Get input shape and reshape for layer norm + input_shape = x.shape + row_size = x.shape[-1] + row_count = 1 + for i in range(0, len(input_shape) - 1): + row_count *= input_shape[i] + + x_shape = (row_count, row_size) + x_mean_shape = (row_count, 1) + + # Reshape for layer norm computation + x_reshaped = x.reshape(x_shape) + + # Compute mean and variance + x_mean = torch.mean(x_reshaped, dim=1, keepdim=True) + x_var = torch.var(x_reshaped, dim=1, keepdim=True, + unbiased=False) + self.epsilon + x_rstd = 1.0 / torch.sqrt(x_var) + + # Broadcast tensors to match x_shape + x_mean_broadcast = x_mean.expand(x_shape) + x_rstd_broadcast = x_rstd.expand(x_shape) + gamma_broadcast = gamma.expand(x_shape) + beta_broadcast = beta.expand(x_shape) + + # Apply layer normalization + y = torch.multiply(torch.multiply( + x_reshaped - x_mean_broadcast, x_rstd_broadcast), gamma_broadcast) + beta_broadcast + + # Reshape back to original shape + y = y.reshape(input_shape) + x_mean = x_mean.reshape(input_shape[:-1] + (1,)) + x_rstd = x_rstd.reshape(input_shape[:-1] + (1,)) + + if self.additional_output: + return y, x_mean, x_rstd, x + else: + return y + + +def get_inputs(): + """ + Generate random input tensors for testing. + """ + # Use the same shapes as in gen_data.py + batch_size, seq_len, hidden_size = 1, 2, 8 + + # Generate input tensors + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate gamma and beta parameters + gamma = torch.randn(hidden_size, dtype=torch.float32) + beta = torch.randn(hidden_size, dtype=torch.float32) + + # Generate optional bias + bias = torch.randn(hidden_size, dtype=torch.float32) + + return [x1, x2, gamma, beta, bias] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + """ + return [1e-6, True] # epsilon=1e-6, additional_output=True diff --git a/aikg/benchmark/aikgbench/llm/norm/AddLayerNormGrad.py b/aikg/benchmark/aikgbench/llm/norm/AddLayerNormGrad.py new file mode 100644 index 0000000000000000000000000000000000000000..01ce7b919d8ee571cb06a736b1f394f892f17548 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/AddLayerNormGrad.py @@ -0,0 +1,86 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs AddLayerNormGrad operation. + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, dy, x1, x2, rstd, mean, gamma, dsum): + """ + Perform AddLayerNormGrad operation (backward pass). 
+ + Args: + dy: Gradient of output + x1: First input tensor + x2: Second input tensor + rstd: Reciprocal standard deviation from forward pass + mean: Mean from forward pass + gamma: Scale parameter tensor + dsum: Sum of gradients + + Returns: + Tuple of (dx, dgamma, dbeta) where: + - dx is the gradient with respect to input + - dgamma is the gradient with respect to gamma + - dbeta is the gradient with respect to beta + """ + # Add the two input tensors (same as forward pass) + x = x1 + x2 + + # Compute gradients for layer norm backward + # This is a simplified implementation of the backward pass + N = x.shape[0] + C = x.shape[-1] + + # Compute gradients for gamma and beta + dgamma = torch.sum(dy * (x - mean) * rstd, dim=0) + dbeta = torch.sum(dy, dim=0) + + # Compute gradient with respect to input + # Simplified gradient computation + dx = dy * gamma * rstd + + return dx, dgamma, dbeta + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use similar shapes as other layer norm operations + batch_size, seq_len, hidden_size = 2, 1, 16 + + # Generate input tensors + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate gradient of output + dy = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate statistics from forward pass + x = x1 + x2 + mean = torch.mean(x, dim=-1, keepdim=True) + var = torch.var(x, dim=-1, keepdim=True, unbiased=False) + 1e-6 + rstd = 1.0 / torch.sqrt(var) + + # Generate gamma parameter + gamma = torch.randn(hidden_size, dtype=torch.float32) + + # Generate dsum (simplified) + dsum = torch.randn_like(dy) + + return [dy, x1, x2, rstd, mean, gamma, dsum] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For gradient operations, no specific initialization parameters are needed. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/norm/AddLayerNormQuant.py b/aikg/benchmark/aikgbench/llm/norm/AddLayerNormQuant.py new file mode 100644 index 0000000000000000000000000000000000000000..069fcdba71c0a7a42f201bb5664fefd46764a25a --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/AddLayerNormQuant.py @@ -0,0 +1,125 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs AddLayerNormQuant operation. + Based on + """ + + def __init__(self, epsilon=1e-6, additional_output=False, quant_mode="symmetric"): + super(Model, self).__init__() + self.epsilon = epsilon + self.additional_output = additional_output + self.quant_mode = quant_mode + + def forward(self, x1, x2, gamma, beta, bias, scales1, scales2, zero_points1, zero_points2): + """ + Perform AddLayerNormQuant operation. 
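+        Applies the AddLayerNorm computation and then quantizes the result twice
+        to int8: y_i = round(y / scales_i + zero_points_i), clamped to [-128, 127].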
+ + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + beta: Shift parameter tensor + bias: Bias tensor + scales1: Quantization scales for first output + scales2: Quantization scales for second output + zero_points1: Quantization zero points for first output + zero_points2: Quantization zero points for second output + + Returns: + Tuple of (y1, y2, x, out_scales1, out_scales2) where: + - y1, y2 are quantized outputs + - x is the sum of input tensors + - out_scales1, out_scales2 are output scales + """ + # Add the two input tensors + if bias is not None: + x = x1 + x2 + bias + else: + x = x1 + x2 + + # Get input shape and reshape for layer norm + input_shape = x.shape + row_size = x.shape[-1] + row_count = 1 + for i in range(0, len(input_shape) - 1): + row_count *= input_shape[i] + + x_shape = (row_count, row_size) + x_mean_shape = (row_count, 1) + + # Reshape for layer norm computation + x_reshaped = x.reshape(x_shape) + + # Compute mean and variance + x_mean = torch.mean(x_reshaped, dim=1, keepdim=True) + x_var = torch.var(x_reshaped, dim=1, keepdim=True, + unbiased=False) + self.epsilon + x_rstd = 1.0 / torch.sqrt(x_var) + + # Broadcast tensors to match x_shape + x_mean_broadcast = x_mean.expand(x_shape) + x_rstd_broadcast = x_rstd.expand(x_shape) + gamma_broadcast = gamma.expand(x_shape) + beta_broadcast = beta.expand(x_shape) + + # Apply layer normalization + y = torch.multiply(torch.multiply( + x_reshaped - x_mean_broadcast, x_rstd_broadcast), gamma_broadcast) + beta_broadcast + + # Reshape back to original shape + y = y.reshape(input_shape) + + # Quantize outputs + y1_quantized = torch.round( + y / scales1 + zero_points1).clamp(-128, 127).to(torch.int8) + y2_quantized = torch.round( + y / scales2 + zero_points2).clamp(-128, 127).to(torch.int8) + + # Compute output scales + out_scales1 = scales1 + out_scales2 = scales2 + + if self.additional_output: + return y1_quantized, y2_quantized, x, out_scales1, out_scales2 + else: + return y1_quantized, y2_quantized + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use similar shapes as other layer norm operations + batch_size, seq_len, hidden_size = 1, 2, 8 + + # Generate input tensors + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate gamma and beta parameters + gamma = torch.randn(hidden_size, dtype=torch.float32) + beta = torch.randn(hidden_size, dtype=torch.float32) + + # Generate bias + bias = torch.randn(hidden_size, dtype=torch.float32) + + # Generate quantization parameters + scales1 = torch.rand(1, dtype=torch.float32) * 0.1 + 0.01 + scales2 = torch.rand(1, dtype=torch.float32) * 0.1 + 0.01 + zero_points1 = torch.zeros(1, dtype=torch.float32) + zero_points2 = torch.zeros(1, dtype=torch.float32) + + return [x1, x2, gamma, beta, bias, scales1, scales2, zero_points1, zero_points2] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. 
+ Based on parameters + """ + return [1e-6, True, "symmetric"] # epsilon=1e-6, additional_output=True, quant_mode="symmetric" diff --git a/aikg/benchmark/aikgbench/llm/norm/AddRmsNorm.py b/aikg/benchmark/aikgbench/llm/norm/AddRmsNorm.py new file mode 100644 index 0000000000000000000000000000000000000000..c8269a7338a684c1574eb9cbbdfd9959dcbe8e12 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/AddRmsNorm.py @@ -0,0 +1,71 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs AddRmsNorm operation. + Based on + """ + + def __init__(self, epsilon=1e-6): + super(Model, self).__init__() + self.epsilon = epsilon + + def forward(self, x1, x2, gamma): + """ + Perform AddRmsNorm operation. + + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + + Returns: + Tuple of (output, rstd, x) where: + - output is the normalized tensor + - rstd is the reciprocal standard deviation + - x is the sum of input tensors + """ + # Add the two input tensors + x = x1 + x2 + + # Compute RMS (Root Mean Square) normalization + # Unlike LayerNorm, RmsNorm doesn't subtract the mean + x_squared = x.pow(2) + x_rms = torch.sqrt(x_squared.mean(dim=-1, keepdim=True) + self.epsilon) + x_rstd = 1.0 / x_rms + + # Apply normalization + x_normalized = x * x_rstd + + # Apply scale parameter + output = x_normalized * gamma + + return output, x_rstd, x + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use the same shapes as in gen_data.py + batch_size, seq_len, hidden_size = 2, 1, 16 + + # Generate input tensors + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate gamma parameter + gamma = torch.randn(hidden_size, dtype=torch.float32) + + return [x1, x2, gamma] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [1e-6] # epsilon=1e-6 diff --git a/aikg/benchmark/aikgbench/llm/norm/AddRmsNormCast.py b/aikg/benchmark/aikgbench/llm/norm/AddRmsNormCast.py new file mode 100644 index 0000000000000000000000000000000000000000..454726e0f8c86ebc9580389fc39c397616e4d9ae --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/AddRmsNormCast.py @@ -0,0 +1,77 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs AddRmsNormCast operation. + Based on + """ + + def __init__(self, epsilon=1e-6): + super(Model, self).__init__() + self.epsilon = epsilon + + def forward(self, x1, x2, gamma): + """ + Perform AddRmsNormCast operation. + + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + + Returns: + Tuple of (y1, y2, rstd, x) where: + - y1 is the cast output (float) + - y2 is the normalized output (original dtype) + - rstd is the reciprocal standard deviation + - x is the sum of input tensors + """ + # Add the two input tensors + x = x1 + x2 + + # Compute RMS (Root Mean Square) normalization + x_squared = x.pow(2) + x_rms = torch.sqrt(x_squared.mean(dim=-1, keepdim=True) + self.epsilon) + x_rstd = 1.0 / x_rms + + # Apply normalization + x_normalized = x * x_rstd + + # Apply scale parameter + output = x_normalized * gamma + + # Cast to float for y1 + y1 = output.to(torch.float32) + + # Keep original dtype for y2 + y2 = output + + return y1, y2, x_rstd, x + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+ Based on + """ + # Use similar shapes as other rms norm operations + batch_size, seq_len, hidden_size = 2, 1, 16 + + # Generate input tensors (using float16 as specified in README) + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16) + + # Generate gamma parameter + gamma = torch.randn(hidden_size, dtype=torch.float16) + + return [x1, x2, gamma] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [1e-6] # epsilon=1e-6 diff --git a/aikg/benchmark/aikgbench/llm/norm/AddRmsNormDynamicQuant.py b/aikg/benchmark/aikgbench/llm/norm/AddRmsNormDynamicQuant.py new file mode 100644 index 0000000000000000000000000000000000000000..c9767215ccea71b55a31e9d60f9e9909ee391b8a --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/AddRmsNormDynamicQuant.py @@ -0,0 +1,86 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs AddRmsNormDynamicQuant operation. + Based on + """ + + def __init__(self, epsilon=1e-6): + super(Model, self).__init__() + self.epsilon = epsilon + + def forward(self, x1, x2, gamma, smooth_scale1, smooth_scale2): + """ + Perform AddRmsNormDynamicQuant operation. + + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + smooth_scale1: Smooth scale for first output + smooth_scale2: Smooth scale for second output + + Returns: + Tuple of (y1, y2, x, scale1, scale2) where: + - y1, y2 are dynamically quantized outputs + - x is the sum of input tensors + - scale1, scale2 are dynamic quantization scales + """ + # Add the two input tensors + x = x1 + x2 + + # Compute RMS (Root Mean Square) normalization + x_squared = x.pow(2) + x_rms = torch.sqrt(x_squared.mean(dim=-1, keepdim=True) + self.epsilon) + x_rstd = 1.0 / x_rms + + # Apply normalization + x_normalized = x * x_rstd + + # Apply scale parameter + output = x_normalized * gamma + + # Dynamic quantization for first output + max_val1 = torch.max(torch.abs(output)) + scale1 = max_val1 / 127.0 + y1 = torch.round(output / scale1).clamp(-128, 127).to(torch.int8) + + # Dynamic quantization for second output + max_val2 = torch.max(torch.abs(output)) + scale2 = max_val2 / 127.0 + y2 = torch.round(output / scale2).clamp(-128, 127).to(torch.int8) + + return y1, y2, x, scale1, scale2 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use similar shapes as other rms norm operations + batch_size, seq_len, hidden_size = 2, 1, 16 + + # Generate input tensors (using float16 as specified in README) + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16) + + # Generate gamma parameter + gamma = torch.randn(hidden_size, dtype=torch.float16) + + # Generate smooth scales + smooth_scale1 = torch.rand(1, dtype=torch.float16) * 0.1 + 0.01 + smooth_scale2 = torch.rand(1, dtype=torch.float16) * 0.1 + 0.01 + + return [x1, x2, gamma, smooth_scale1, smooth_scale2] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. 
+ Based on parameters + """ + return [1e-6] # epsilon=1e-6 diff --git a/aikg/benchmark/aikgbench/llm/norm/AddRmsNormQuant.py b/aikg/benchmark/aikgbench/llm/norm/AddRmsNormQuant.py new file mode 100644 index 0000000000000000000000000000000000000000..15f6d4528178dd16ff011ad6d54ca3c6f9439931 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/AddRmsNormQuant.py @@ -0,0 +1,94 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs AddRmsNormQuant operation. + Based on + """ + + def __init__(self, epsilon=1e-6, axis=-1, div_mode=False): + super(Model, self).__init__() + self.epsilon = epsilon + self.axis = axis + self.div_mode = div_mode + + def forward(self, x1, x2, gamma, scales1, scales2, zero_points1, zero_points2): + """ + Perform AddRmsNormQuant operation. + + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + scales1: Quantization scales for first output + scales2: Quantization scales for second output + zero_points1: Quantization zero points for first output + zero_points2: Quantization zero points for second output + + Returns: + Tuple of (y1, y2, x) where: + - y1, y2 are quantized outputs + - x is the sum of input tensors + """ + # Add the two input tensors + x = x1 + x2 + + # Compute RMS (Root Mean Square) normalization + x_squared = x.pow(2) + x_rms = torch.sqrt(x_squared.mean( + dim=self.axis, keepdim=True) + self.epsilon) + x_rstd = 1.0 / x_rms + + # Apply normalization + x_normalized = x * x_rstd + + # Apply scale parameter + output = x_normalized * gamma + + # Quantize outputs + if self.div_mode: + y1_quantized = torch.round( + output / scales1 + zero_points1).clamp(-128, 127).to(torch.int8) + y2_quantized = torch.round( + output / scales2 + zero_points2).clamp(-128, 127).to(torch.int8) + else: + y1_quantized = torch.round( + output * scales1 + zero_points1).clamp(-128, 127).to(torch.int8) + y2_quantized = torch.round( + output * scales2 + zero_points2).clamp(-128, 127).to(torch.int8) + + return y1_quantized, y2_quantized, x + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use similar shapes as other rms norm operations + batch_size, seq_len, hidden_size = 2, 1, 16 + + # Generate input tensors (using float16 as specified in README) + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16) + + # Generate gamma parameter + gamma = torch.randn(hidden_size, dtype=torch.float16) + + # Generate quantization parameters + scales1 = torch.rand(1, dtype=torch.float32) * 0.1 + 0.01 + scales2 = torch.rand(1, dtype=torch.float32) * 0.1 + 0.01 + zero_points1 = torch.zeros(1, dtype=torch.int32) + zero_points2 = torch.zeros(1, dtype=torch.int32) + + return [x1, x2, gamma, scales1, scales2, zero_points1, zero_points2] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [1e-6, -1, False] # epsilon=1e-6, axis=-1, div_mode=False diff --git a/aikg/benchmark/aikgbench/llm/norm/BatchNormV3.py b/aikg/benchmark/aikgbench/llm/norm/BatchNormV3.py new file mode 100644 index 0000000000000000000000000000000000000000..0b65d91bfb28445be816bdef1e15726381d0f9e6 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/BatchNormV3.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs BatchNormV3 operation. 
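+    Wraps torch.ops.aten.native_batch_norm and returns the normalized output
+    together with the running statistics and the saved mean/rstd.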
+ Based on + """ + + def __init__(self, momentum=0.1, epsilon=1e-5, is_training=True): + super(Model, self).__init__() + self.momentum = momentum + self.epsilon = epsilon + self.is_training = is_training + + def forward(self, x, weight, bias, running_mean, running_var): + """ + Perform BatchNormV3 operation. + + Args: + x: Input tensor + weight: Weight parameter tensor + bias: Bias parameter tensor + running_mean: Running mean tensor + running_var: Running variance tensor + + Returns: + Tuple of (output, running_mean, running_var, save_mean, save_rstd) + """ + # Use PyTorch's native batch norm + output, save_mean, save_rstd = torch.ops.aten.native_batch_norm( + input=x, + weight=weight, + bias=bias, + running_mean=running_mean, + running_var=running_var, + training=self.is_training, + momentum=self.momentum, + eps=self.epsilon + ) + + return output, running_mean, running_var, save_mean, save_rstd + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use the same shapes as in gen_data.py + batch_size, channels, height, width = 1, 2, 1, 4 + + # Generate input tensor + x = torch.randn(batch_size, channels, height, width, dtype=torch.float32) + + # Generate weight and bias parameters + weight = torch.ones(channels, dtype=torch.float32) + bias = torch.zeros(channels, dtype=torch.float32) + + # Generate running statistics + running_mean = torch.zeros(channels, dtype=torch.float32) + running_var = torch.ones(channels, dtype=torch.float32) + + return [x, weight, bias, running_mean, running_var] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [0.1, 1e-5, True] # momentum=0.1, epsilon=1e-5, is_training=True diff --git a/aikg/benchmark/aikgbench/llm/norm/DeepNorm.py b/aikg/benchmark/aikgbench/llm/norm/DeepNorm.py new file mode 100644 index 0000000000000000000000000000000000000000..d1e9c905c43f92734f4174ad1c1566121634b692 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/DeepNorm.py @@ -0,0 +1,75 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs DeepNorm operation. + Based on + """ + + def __init__(self, alpha=0.3, epsilon=1e-6): + super(Model, self).__init__() + self.alpha = alpha + self.epsilon = epsilon + + def forward(self, x, gx, beta, gamma): + """ + Perform DeepNorm operation. + + Args: + x: Input tensor + gx: Gate tensor + beta: Shift parameter tensor + gamma: Scale parameter tensor + + Returns: + Tuple of (mean, rstd, y) where: + - mean is the mean of the normalized tensor + - rstd is the reciprocal standard deviation + - y is the normalized output tensor + """ + # Apply alpha scaling and add gate + x_add = x * self.alpha + gx + + # Compute mean and variance + mean = x_add.mean(-1, keepdim=True) + diff = x_add - mean + variance = diff.pow(2).mean(-1, keepdim=True) + + # Compute reciprocal standard deviation + rstd = torch.rsqrt(variance + self.epsilon) + + # Apply normalization + output = gamma * diff * rstd + beta + + return mean, rstd, output + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+ Based on + """ + # Use the same shapes as in gen_data.py + batch_size, seq_len, hidden_size = 3, 1, 4 + + # Generate input tensors + x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + dtype=torch.float32).reshape(batch_size, seq_len, hidden_size) + gx = torch.tensor([2, 2, 2, 4, 4, 4, 6, 6, 6, 8, 8, 8], + dtype=torch.float32).reshape(batch_size, seq_len, hidden_size) + + # Generate beta and gamma parameters + beta = torch.tensor([0, 1, 2, 3], dtype=torch.float32) + gamma = torch.tensor([0, 1, 2, 3], dtype=torch.float32) + + return [x, gx, beta, gamma] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [0.3, 1e-6] # alpha=0.3, epsilon=1e-6 diff --git a/aikg/benchmark/aikgbench/llm/norm/DeepNormGrad.py b/aikg/benchmark/aikgbench/llm/norm/DeepNormGrad.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d29bd841efbd10dbef6aaf2986e001dc487f21 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/DeepNormGrad.py @@ -0,0 +1,99 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs DeepNormGrad operation. + Based on + """ + + def __init__(self, alpha=0.3): + super(Model, self).__init__() + self.alpha = alpha + + def forward(self, dy, x, gx, gamma, mean, rstd): + """ + Perform DeepNormGrad operation (backward pass). + + Args: + dy: Gradient of output + x: Input tensor from forward pass + gx: Gate tensor from forward pass + gamma: Scale parameter tensor + mean: Mean from forward pass + rstd: Reciprocal standard deviation from forward pass + + Returns: + Tuple of (dx, dgx, dbeta, dgamma) where: + - dx is the gradient with respect to input x + - dgx is the gradient with respect to gate gx + - dbeta is the gradient with respect to beta + - dgamma is the gradient with respect to gamma + """ + # Apply alpha scaling and add gate (same as forward pass) + x_add = x * self.alpha + gx + + # Compute normalized output (same as forward pass) + diff = x_add - mean + output = gamma * diff * rstd + + # Compute gradients for DeepNorm backward + # This is a simplified implementation of the backward pass + + # Gradient with respect to gamma + dgamma = torch.sum(dy * diff * rstd, dim=0) + + # Gradient with respect to beta (simplified) + dbeta = torch.sum(dy, dim=0) + + # Gradient with respect to normalized output + dy_normalized = dy * gamma * rstd + + # Gradient with respect to x_add + dx_add = dy_normalized + + # Gradient with respect to x and gx + dx = dx_add * self.alpha + dgx = dx_add + + return dx, dgx, dbeta, dgamma + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+ Based on + """ + # Use similar shapes as other deep norm operations + batch_size, seq_len, hidden_size = 3, 1, 4 + + # Generate input tensors + x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + dtype=torch.float32).reshape(batch_size, seq_len, hidden_size) + gx = torch.tensor([2, 2, 2, 4, 4, 4, 6, 6, 6, 8, 8, 8], + dtype=torch.float32).reshape(batch_size, seq_len, hidden_size) + + # Generate gradient of output + dy = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate statistics from forward pass + alpha = 0.3 + x_add = x * alpha + gx + mean = x_add.mean(-1, keepdim=True) + diff = x_add - mean + variance = diff.pow(2).mean(-1, keepdim=True) + 1e-6 + rstd = 1.0 / torch.sqrt(variance) + + # Generate gamma parameter + gamma = torch.tensor([0, 1, 2, 3], dtype=torch.float32) + + return [dy, x, gx, gamma, mean, rstd] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [0.3] # alpha=0.3 diff --git a/aikg/benchmark/aikgbench/llm/norm/DuaQuantizeAddLayerNorm.py b/aikg/benchmark/aikgbench/llm/norm/DuaQuantizeAddLayerNorm.py new file mode 100644 index 0000000000000000000000000000000000000000..2b6d18b185722530cb09e29bb656be73ad8ee623 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/DuaQuantizeAddLayerNorm.py @@ -0,0 +1,118 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs DuaQuantizeAddLayerNorm operation. + Based on + """ + + def __init__(self, dtype=torch.float32, axis=-1, epsilon=1e-6, additional_output=False): + super(Model, self).__init__() + self.dtype = dtype + self.axis = axis + self.epsilon = epsilon + self.additional_output = additional_output + + def forward(self, x1, x2, gamma, beta, bias, scales1, scales2, zero_points1, zero_points2): + """ + Perform DuaQuantizeAddLayerNorm operation. 
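+        Applies the AddLayerNorm computation and then quantizes the result with
+        two independent scale/zero-point pairs, producing two int8 outputs.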
+ + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + beta: Shift parameter tensor + bias: Bias tensor + scales1: Quantization scales for first output + scales2: Quantization scales for second output + zero_points1: Quantization zero points for first output + zero_points2: Quantization zero points for second output + + Returns: + Tuple of (y1, y2, x) where: + - y1, y2 are quantized outputs + - x is the sum of input tensors + """ + # Add the two input tensors + if bias is not None: + x = x1 + x2 + bias + else: + x = x1 + x2 + + # Get input shape and reshape for layer norm + input_shape = x.shape + row_size = x.shape[-1] + row_count = 1 + for i in range(0, len(input_shape) - 1): + row_count *= input_shape[i] + + x_shape = (row_count, row_size) + x_mean_shape = (row_count, 1) + + # Reshape for layer norm computation + x_reshaped = x.reshape(x_shape) + + # Compute mean and variance + x_mean = torch.mean(x_reshaped, dim=1, keepdim=True) + x_var = torch.var(x_reshaped, dim=1, keepdim=True, + unbiased=False) + self.epsilon + x_rstd = 1.0 / torch.sqrt(x_var) + + # Broadcast tensors to match x_shape + x_mean_broadcast = x_mean.expand(x_shape) + x_rstd_broadcast = x_rstd.expand(x_shape) + gamma_broadcast = gamma.expand(x_shape) + beta_broadcast = beta.expand(x_shape) + + # Apply layer normalization + y = torch.multiply(torch.multiply( + x_reshaped - x_mean_broadcast, x_rstd_broadcast), gamma_broadcast) + beta_broadcast + + # Reshape back to original shape + y = y.reshape(input_shape) + + # Quantize outputs + y1_quantized = torch.round( + y / scales1 + zero_points1).clamp(-128, 127).to(torch.int8) + y2_quantized = torch.round( + y / scales2 + zero_points2).clamp(-128, 127).to(torch.int8) + + return y1_quantized, y2_quantized, x + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use similar shapes as other layer norm operations + batch_size, seq_len, hidden_size = 1, 2, 8 + + # Generate input tensors + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate gamma and beta parameters + gamma = torch.randn(hidden_size, dtype=torch.float32) + beta = torch.randn(hidden_size, dtype=torch.float32) + + # Generate bias + bias = torch.randn(hidden_size, dtype=torch.float32) + + # Generate quantization parameters + scales1 = torch.rand(1, dtype=torch.float32) * 0.1 + 0.01 + scales2 = torch.rand(1, dtype=torch.float32) * 0.1 + 0.01 + zero_points1 = torch.zeros(1, dtype=torch.float32) + zero_points2 = torch.zeros(1, dtype=torch.float32) + + return [x1, x2, gamma, beta, bias, scales1, scales2, zero_points1, zero_points2] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [torch.float32, -1, 1e-6, False] # dtype=float32, axis=-1, epsilon=1e-6, additional_output=False diff --git a/aikg/benchmark/aikgbench/llm/norm/GroupNormGrad.py b/aikg/benchmark/aikgbench/llm/norm/GroupNormGrad.py new file mode 100644 index 0000000000000000000000000000000000000000..da943c3230cf095dd6d4fe04c721f4c223018789 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/GroupNormGrad.py @@ -0,0 +1,110 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs GroupNormGrad operation. 
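+    Simplified backward pass for group normalization, returning the gradients
+    dx, dgamma and dbeta.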
+    Based on
+    """
+
+    def __init__(self, num_groups=2, data_format="NCHW", dx_is_require=True, dgamma_is_require=True, dbeta_is_require=True):
+        super(Model, self).__init__()
+        self.num_groups = num_groups
+        self.data_format = data_format
+        self.dx_is_require = dx_is_require
+        self.dgamma_is_require = dgamma_is_require
+        self.dbeta_is_require = dbeta_is_require
+
+    def forward(self, dy, mean, rstd, x, gamma):
+        """
+        Perform GroupNormGrad operation (backward pass).
+
+        Args:
+            dy: Gradient of output
+            mean: Mean from forward pass
+            rstd: Reciprocal standard deviation from forward pass
+            x: Input tensor from forward pass
+            gamma: Scale parameter tensor
+
+        Returns:
+            Tuple of (dx, dgamma, dbeta) where:
+            - dx is the gradient with respect to input
+            - dgamma is the gradient with respect to gamma
+            - dbeta is the gradient with respect to beta
+        """
+        # Get input dimensions
+        N, C = x.shape[:2]
+        remaining_dims = x.shape[2:]
+        HxW = 1
+        for size in remaining_dims:
+            HxW *= size
+
+        # Reshape for group norm computation
+        x_reshaped = x.reshape(N, self.num_groups, C // self.num_groups, HxW)
+        dy_reshaped = dy.reshape(N, self.num_groups, C // self.num_groups, HxW)
+
+        # Compute gradients for group norm backward
+        # This is a simplified implementation of the backward pass
+        if self.dgamma_is_require:
+            dgamma = torch.sum(dy * (x - mean) * rstd, dim=(0, 2, 3))
+        else:
+            dgamma = torch.zeros_like(gamma)
+
+        if self.dbeta_is_require:
+            dbeta = torch.sum(dy, dim=(0, 2, 3))
+        else:
+            dbeta = torch.zeros_like(gamma)
+
+        # Compute gradient with respect to input
+        if self.dx_is_require:
+            dx = dy * gamma.view(1, C, 1, 1) * rstd
+        else:
+            dx = torch.zeros_like(x)
+
+        return dx, dgamma, dbeta
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    Based on
+    """
+    # Use similar shapes as other group norm operations
+    batch_size, channels, height, width = 4, 2, 8, 8
+
+    # Generate input tensor
+    x = torch.randn(batch_size, channels, height, width, dtype=torch.float32)
+
+    # Generate gradient of output
+    dy = torch.randn(batch_size, channels, height, width, dtype=torch.float32)
+
+    # Generate statistics from forward pass
+    N, C = x.shape[:2]
+    remaining_dims = x.shape[2:]
+    HxW = 1
+    for size in remaining_dims:
+        HxW *= size
+
+    x_reshaped = x.reshape(N, 2, C // 2, HxW)  # num_groups=2
+    mean = torch.mean(x_reshaped, dim=(2, 3), keepdim=True)
+    var = torch.var(x_reshaped, dim=(2, 3),
+                    keepdim=True, unbiased=False) + 1e-6
+    rstd = 1.0 / torch.sqrt(var)
+
+    # Broadcast the per-group statistics back to the input shape
+    mean = mean.expand(N, 2, C // 2, HxW).reshape(x.shape)
+    rstd = rstd.expand(N, 2, C // 2, HxW).reshape(x.shape)
+
+    # Generate gamma parameter
+    gamma = torch.randn(channels, dtype=torch.float32)
+
+    return [dy, mean, rstd, x, gamma]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    Based on parameters
+    """
+    return [2, "NCHW", True, True, True]  # num_groups=2, data_format="NCHW", dx_is_require=True, dgamma_is_require=True, dbeta_is_require=True
diff --git a/aikg/benchmark/aikgbench/llm/norm/GroupNormSilu.py b/aikg/benchmark/aikgbench/llm/norm/GroupNormSilu.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f0108f069434210d1e007ba27e5aeafe8b06b2b
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/norm/GroupNormSilu.py
@@ -0,0 +1,83 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs GroupNormSilu operation.
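+    Applies group normalization via torch.ops.aten.native_group_norm and
+    optionally follows it with a SiLU activation (x * sigmoid(x)).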
+ Based on + """ + + def __init__(self, num_groups=2, epsilon=1e-6, activate_silu=True): + super(Model, self).__init__() + self.num_groups = num_groups + self.epsilon = epsilon + self.activate_silu = activate_silu + + def forward(self, x, gamma, beta): + """ + Perform GroupNormSilu operation. + + Args: + x: Input tensor + gamma: Scale parameter tensor + beta: Shift parameter tensor + + Returns: + Tuple of (output, mean_out, rstd_out) where: + - output is the normalized and silu-activated tensor + - mean_out is the mean of each group + - rstd_out is the reciprocal standard deviation of each group + """ + # Get input dimensions + N, C = x.shape[:2] + remaining_dims = x.shape[2:] + HxW = 1 + for size in remaining_dims: + HxW *= size + + # Use PyTorch's native group norm + output, mean_out, rstd_out = torch.ops.aten.native_group_norm( + input=x, + weight=gamma, + bias=beta, + N=N, + C=C, + HxW=HxW, + group=self.num_groups, + eps=self.epsilon + ) + + # Apply SiLU activation if enabled + if self.activate_silu: + sigmoid_out = 1 / (1 + torch.exp(-output)) + output = output * sigmoid_out + + return output, mean_out, rstd_out + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use the same shapes as in gen_data.py + batch_size, channels, height, width = 4, 2, 8, 8 + + # Generate input tensor + x = torch.rand(batch_size, channels, height, width, + dtype=torch.float32) * 0.9 + 0.1 + + # Generate gamma and beta parameters + gamma = torch.rand(channels, dtype=torch.float32) * 0.9 + 0.1 + beta = torch.rand(channels, dtype=torch.float32) * 0.9 + 0.1 + + return [x, gamma, beta] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [2, 1e-6, True] # num_groups=2, epsilon=1e-6, activate_silu=True diff --git a/aikg/benchmark/aikgbench/llm/norm/GroupNormSwish.py b/aikg/benchmark/aikgbench/llm/norm/GroupNormSwish.py new file mode 100644 index 0000000000000000000000000000000000000000..86336b6cafbf3b2821affe17197b8ab61ef3823e --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/GroupNormSwish.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs GroupNormSwish operation. + Based on + """ + + def __init__(self, num_groups=8, epsilon=1e-5, activate_swish=True, swish_scale=1.0): + super(Model, self).__init__() + self.num_groups = num_groups + self.epsilon = epsilon + self.activate_swish = activate_swish + self.swish_scale = swish_scale + + def forward(self, x, gamma, beta): + """ + Perform GroupNormSwish operation. + + Args: + x: Input tensor + gamma: Scale parameter tensor + beta: Shift parameter tensor + + Returns: + Tuple of (output, mean_out, rstd_out) where: + - output is the normalized and swish-activated tensor + - mean_out is the mean of each group + - rstd_out is the reciprocal standard deviation of each group + """ + # Get input dimensions + N, C = x.shape[:2] + remaining_dims = x.shape[2:] + HxW = 1 + for size in remaining_dims: + HxW *= size + + # Use PyTorch's native group norm + output, mean_out, rstd_out = torch.ops.aten.native_group_norm( + input=x, + weight=gamma, + bias=beta, + N=N, + C=C, + HxW=HxW, + group=self.num_groups, + eps=self.epsilon + ) + + # Apply Swish activation if enabled + if self.activate_swish: + sigmoid_out = 1 / (1 + torch.exp(-self.swish_scale * output)) + output = output * sigmoid_out + + return output, mean_out, rstd_out + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+ Based on + """ + # Use the same shapes as in gen_data.py + N, C = 100, 32 + x_shape = (N, C) + + # Generate input tensor + x = torch.rand(x_shape, dtype=torch.float16) + + # Generate gamma and beta parameters + gamma = torch.rand(C, dtype=torch.float16) + beta = torch.rand(C, dtype=torch.float16) + + return [x, gamma, beta] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [8, 1e-5, True, 1.0] # num_groups=8, epsilon=1e-5, activate_swish=True, swish_scale=1.0 diff --git a/aikg/benchmark/aikgbench/llm/norm/GroupNormSwishGrad.py b/aikg/benchmark/aikgbench/llm/norm/GroupNormSwishGrad.py new file mode 100644 index 0000000000000000000000000000000000000000..447998e7932588b5e4f839860509f3b5f095fae2 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/GroupNormSwishGrad.py @@ -0,0 +1,123 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs GroupNormSwishGrad operation. + Based on + """ + + def __init__(self, num_groups=2, data_format="NCHW", swish_scale=1.0, dgamma_is_require=True, dbeta_is_require=True): + super(Model, self).__init__() + self.num_groups = num_groups + self.data_format = data_format + self.swish_scale = swish_scale + self.dgamma_is_require = dgamma_is_require + self.dbeta_is_require = dbeta_is_require + + def forward(self, dy, mean, rstd, x, gamma, beta): + """ + Perform GroupNormSwishGrad operation (backward pass). + + Args: + dy: Gradient of output + mean: Mean from forward pass + rstd: Reciprocal standard deviation from forward pass + x: Input tensor from forward pass + gamma: Scale parameter tensor + beta: Shift parameter tensor + + Returns: + Tuple of (dx_out, dgamma_out, dbeta_out) where: + - dx_out is the gradient with respect to input + - dgamma_out is the gradient with respect to gamma + - dbeta_out is the gradient with respect to beta + """ + # Get input dimensions + N, C = x.shape[:2] + remaining_dims = x.shape[2:] + HxW = 1 + for size in remaining_dims: + HxW *= size + + # Reshape for group norm computation + x_reshaped = x.reshape(N, self.num_groups, C // self.num_groups, HxW) + + # Apply group normalization + x_normalized = (x_reshaped - mean) * rstd + output = x_normalized * gamma + beta + + # Apply Swish activation + sigmoid_out = 1 / (1 + torch.exp(-self.swish_scale * output)) + swish_output = output * sigmoid_out + + # Compute gradients for Swish backward + # d(swish_output)/d(output) = sigmoid_out + output * sigmoid_out * (1 - sigmoid_out) * swish_scale + swish_grad = sigmoid_out + swish_output * \ + (1 - sigmoid_out) * self.swish_scale + + # Apply gradient through Swish + dy_swish = dy * swish_grad + + # Compute gradients for group norm backward + if self.dgamma_is_require: + dgamma_out = torch.sum(dy_swish * x_normalized, dim=0) + else: + dgamma_out = torch.zeros_like(gamma) + + if self.dbeta_is_require: + dbeta_out = torch.sum(dy_swish, dim=0) + else: + dbeta_out = torch.zeros_like(beta) + + # Compute gradient with respect to input + dx_out = dy_swish * gamma * rstd + + return dx_out, dgamma_out, dbeta_out + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+    Based on
+    """
+    # Use similar shapes as other group norm operations
+    batch_size, channels, height, width = 4, 2, 8, 8
+
+    # Generate input tensor
+    x = torch.randn(batch_size, channels, height, width, dtype=torch.float32)
+
+    # Generate gradient of output
+    dy = torch.randn(batch_size, channels, height, width, dtype=torch.float32)
+
+    # Generate statistics from forward pass
+    N, C = x.shape[:2]
+    remaining_dims = x.shape[2:]
+    HxW = 1
+    for size in remaining_dims:
+        HxW *= size
+
+    x_reshaped = x.reshape(N, 2, C // 2, HxW)  # num_groups=2
+    mean = torch.mean(x_reshaped, dim=(2, 3), keepdim=True)
+    var = torch.var(x_reshaped, dim=(2, 3),
+                    keepdim=True, unbiased=False) + 1e-6
+    rstd = 1.0 / torch.sqrt(var)
+
+    # Keep the per-group statistics shape (N, num_groups, 1, 1) expected by forward
+
+    # Generate gamma and beta parameters
+    gamma = torch.randn(channels, dtype=torch.float32)
+    beta = torch.randn(channels, dtype=torch.float32)
+
+    return [dy, mean, rstd, x, gamma, beta]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    Based on parameters
+    """
+    return [2, "NCHW", 1.0, True, True]  # num_groups=2, data_format="NCHW", swish_scale=1.0, dgamma_is_require=True, dbeta_is_require=True
diff --git a/aikg/benchmark/aikgbench/llm/norm/InplaceAddLayerNorm.py b/aikg/benchmark/aikgbench/llm/norm/InplaceAddLayerNorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4a47d5f977df4f7d549da1ee6a869249d0a8b60
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/norm/InplaceAddLayerNorm.py
@@ -0,0 +1,104 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs InplaceAddLayerNorm operation.
+    Based on
+    """
+
+    def __init__(self, epsilon=1e-6, additional_output=False):
+        super(Model, self).__init__()
+        self.epsilon = epsilon
+        self.additional_output = additional_output
+
+    def forward(self, x1, x2, gamma, beta, bias=None):
+        """
+        Perform InplaceAddLayerNorm operation.
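+        Computes x = x1 + x2 (+ bias) and then applies layer normalization
+        over the last dimension: y = (x - mean) * rstd * gamma + beta.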
+ + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + beta: Shift parameter tensor + bias: Optional bias tensor + + Returns: + Tuple of (y, mean, rstd, x) if additional_output=True + Otherwise just the output tensor + """ + # Add the two input tensors (inplace operation simulated) + if bias is not None: + x = x1 + x2 + bias + else: + x = x1 + x2 + + # Get input shape and reshape for layer norm + input_shape = x.shape + row_size = x.shape[-1] + row_count = 1 + for i in range(0, len(input_shape) - 1): + row_count *= input_shape[i] + + x_shape = (row_count, row_size) + x_mean_shape = (row_count, 1) + + # Reshape for layer norm computation + x_reshaped = x.reshape(x_shape) + + # Compute mean and variance + x_mean = torch.mean(x_reshaped, dim=1, keepdim=True) + x_var = torch.var(x_reshaped, dim=1, keepdim=True, + unbiased=False) + self.epsilon + x_rstd = 1.0 / torch.sqrt(x_var) + + # Broadcast tensors to match x_shape + x_mean_broadcast = x_mean.expand(x_shape) + x_rstd_broadcast = x_rstd.expand(x_shape) + gamma_broadcast = gamma.expand(x_shape) + beta_broadcast = beta.expand(x_shape) + + # Apply layer normalization + y = torch.multiply(torch.multiply( + x_reshaped - x_mean_broadcast, x_rstd_broadcast), gamma_broadcast) + beta_broadcast + + # Reshape back to original shape + y = y.reshape(input_shape) + x_mean = x_mean.reshape(input_shape[:-1] + (1,)) + x_rstd = x_rstd.reshape(input_shape[:-1] + (1,)) + + if self.additional_output: + return y, x_mean, x_rstd, x + else: + return y + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use similar shapes as other layer norm operations + batch_size, seq_len, hidden_size = 1, 2, 8 + + # Generate input tensors + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate gamma and beta parameters + gamma = torch.randn(hidden_size, dtype=torch.float32) + beta = torch.randn(hidden_size, dtype=torch.float32) + + # Generate optional bias + bias = torch.randn(hidden_size, dtype=torch.float32) + + return [x1, x2, gamma, beta, bias] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [1e-6, True] # epsilon=1e-6, additional_output=True diff --git a/aikg/benchmark/aikgbench/llm/norm/InplaceAddRmsNorm.py b/aikg/benchmark/aikgbench/llm/norm/InplaceAddRmsNorm.py new file mode 100644 index 0000000000000000000000000000000000000000..e416468c7ef48f921e78dfd5b4e228f9b82049fb --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/InplaceAddRmsNorm.py @@ -0,0 +1,71 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs InplaceAddRmsNorm operation. + Based on + """ + + def __init__(self, epsilon=1e-6): + super(Model, self).__init__() + self.epsilon = epsilon + + def forward(self, x1, x2, gamma): + """ + Perform InplaceAddRmsNorm operation. 
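+        Computes x = x1 + x2 and then applies RMS normalization over the
+        last dimension: y = x / sqrt(mean(x^2) + epsilon) * gamma.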
+ + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + + Returns: + Tuple of (output, rstd, x) where: + - output is the normalized tensor + - rstd is the reciprocal standard deviation + - x is the sum of input tensors + """ + # Add the two input tensors (inplace operation simulated) + x = x1 + x2 + + # Compute RMS (Root Mean Square) normalization + # Unlike LayerNorm, RmsNorm doesn't subtract the mean + x_squared = x.pow(2) + x_rms = torch.sqrt(x_squared.mean(dim=-1, keepdim=True) + self.epsilon) + x_rstd = 1.0 / x_rms + + # Apply normalization + x_normalized = x * x_rstd + + # Apply scale parameter + output = x_normalized * gamma + + return output, x_rstd, x + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use similar shapes as other rms norm operations + batch_size, seq_len, hidden_size = 2, 1, 16 + + # Generate input tensors + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate gamma parameter + gamma = torch.randn(hidden_size, dtype=torch.float32) + + return [x1, x2, gamma] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [1e-6] # epsilon=1e-6 diff --git a/aikg/benchmark/aikgbench/llm/norm/InstanceNormV3.py b/aikg/benchmark/aikgbench/llm/norm/InstanceNormV3.py new file mode 100644 index 0000000000000000000000000000000000000000..32d67005e0eefe46cb308a0ee765af989e1bc136 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/InstanceNormV3.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs InstanceNormV3 operation. + Based on + """ + + def __init__(self, epsilon=1e-5, data_format="NCHW"): + super(Model, self).__init__() + self.epsilon = epsilon + self.data_format = data_format + + def forward(self, x, gamma, beta): + """ + Perform InstanceNormV3 operation. + + Args: + x: Input tensor + gamma: Scale parameter tensor + beta: Shift parameter tensor + + Returns: + Tuple of (output, mean, variance) where: + - output is the normalized tensor + - mean is the mean of each instance + - variance is the variance of each instance + """ + # Determine reduction axes based on data format + if self.data_format == 'NHWC': + reduce_axis = [1, 2] + gamma = gamma.reshape([1, 1, 1, gamma.shape[0]]) + beta = beta.reshape([1, 1, 1, beta.shape[0]]) + else: # NCHW + reduce_axis = [2, 3] + gamma = gamma.reshape([1, gamma.shape[0], 1, 1]) + beta = beta.reshape([1, beta.shape[0], 1, 1]) + + # Compute mean and variance + mean = torch.mean(x, dim=reduce_axis, keepdim=True) + var = torch.mean(torch.pow((x - mean), 2), + dim=reduce_axis, keepdim=True) + + # Compute reciprocal standard deviation + rstd = 1 / torch.sqrt(var + self.epsilon) + + # Apply normalization + tmp_x = (x - mean) * rstd + output = tmp_x * gamma + beta + + return output, mean, var + + +def get_inputs(): + """ + Generate random input tensors for testing. 
+    Based on
+    """
+    # Use the same shapes as in gen_data.py
+    batch_size, channels, height, width = 1, 8, 4, 4
+
+    # Generate input tensor
+    x = torch.ones(batch_size, channels, height,
+                   width, dtype=torch.float32) * 0.77
+
+    # Generate gamma and beta parameters
+    gamma = torch.ones(channels, dtype=torch.float32) * 1.5
+    beta = torch.ones(channels, dtype=torch.float32) * 0.5
+
+    return [x, gamma, beta]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    Based on parameters
+    """
+    return [1e-5, "NCHW"]  # epsilon=1e-5, data_format="NCHW"
diff --git a/aikg/benchmark/aikgbench/llm/norm/LayerNormGradV3.py b/aikg/benchmark/aikgbench/llm/norm/LayerNormGradV3.py
new file mode 100644
index 0000000000000000000000000000000000000000..fec7536ac2a7eea5449d7ae2a0fbddc68fff9c11
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/norm/LayerNormGradV3.py
@@ -0,0 +1,76 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs LayerNormGradV3 operation.
+    Based on
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, dy, x, rstd, mean, gamma):
+        """
+        Perform LayerNormGradV3 operation (backward pass).
+
+        Args:
+            dy: Gradient of output
+            x: Input tensor from forward pass
+            rstd: Reciprocal standard deviation from forward pass
+            mean: Mean from forward pass
+            gamma: Scale parameter tensor
+
+        Returns:
+            Tuple of (pd_x, pd_gamma, pd_beta) where:
+            - pd_x is the gradient with respect to input
+            - pd_gamma is the gradient with respect to gamma
+            - pd_beta is the gradient with respect to beta
+        """
+        # Compute gradients for layer norm backward
+        # This is a simplified implementation of the backward pass
+
+        # Gradient with respect to gamma (reduce over all dims except the last
+        # so the result matches gamma's shape)
+        reduce_dims = tuple(range(dy.dim() - 1))
+        pd_gamma = torch.sum(dy * (x - mean) * rstd, dim=reduce_dims)
+
+        # Gradient with respect to beta
+        pd_beta = torch.sum(dy, dim=reduce_dims)
+
+        # Gradient with respect to input
+        pd_x = dy * gamma * rstd
+
+        return pd_x, pd_gamma, pd_beta
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    Based on
+    """
+    # Use similar shapes as other layer norm operations
+    batch_size, seq_len, hidden_size = 1, 2, 32
+
+    # Generate input tensor
+    x = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32)
+
+    # Generate gradient of output
+    dy = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32)
+
+    # Generate statistics from forward pass
+    mean = torch.mean(x, dim=-1, keepdim=True)
+    var = torch.var(x, dim=-1, keepdim=True, unbiased=False) + 1e-6
+    rstd = 1.0 / torch.sqrt(var)
+
+    # Generate gamma parameter
+    gamma = torch.randn(hidden_size, dtype=torch.float32)
+
+    return [dy, x, rstd, mean, gamma]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    For gradient operations, no specific initialization parameters are needed.
+    """
+    return []
diff --git a/aikg/benchmark/aikgbench/llm/norm/LayerNormV4.py b/aikg/benchmark/aikgbench/llm/norm/LayerNormV4.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e01458b3f5abbf8d9299565c1b109e80883a474
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/norm/LayerNormV4.py
@@ -0,0 +1,68 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs LayerNormV4 operation.
+    Based on
+    """
+
+    def __init__(self, epsilon=1e-5):
+        super(Model, self).__init__()
+        self.epsilon = epsilon
+
+    def forward(self, x, normalized_shape, gamma, beta):
+        """
+        Perform LayerNormV4 operation.
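+        Normalizes over normalized_shape:
+        y = (x - mean) / sqrt(var + epsilon) * gamma + beta.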
+
+        Args:
+            x: Input tensor
+            normalized_shape: Shape of the normalized dimensions
+            gamma: Scale parameter tensor
+            beta: Shift parameter tensor
+
+        Returns:
+            Tuple of (output, mean, rstd) where:
+            - output is the normalized tensor
+            - mean is the mean of the normalized dimensions
+            - rstd is the reciprocal standard deviation
+        """
+        # Use PyTorch's native layer norm
+        # native_layer_norm already returns the reciprocal standard deviation
+        output, mean, rstd = torch.ops.aten.native_layer_norm(
+            x, normalized_shape, gamma, beta, eps=self.epsilon
+        )
+
+        return output, mean, rstd
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    Based on
+    """
+    # Use the same shapes as in gen_data.py
+    batch_size, seq_len, hidden_size = 1, 2, 32
+
+    # Generate input tensor
+    x = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32)
+
+    # Generate normalized_shape (same as gamma shape)
+    normalized_shape = (hidden_size,)
+
+    # Generate gamma and beta parameters
+    gamma = torch.ones(hidden_size, dtype=torch.float32)
+    beta = torch.zeros(hidden_size, dtype=torch.float32)
+
+    return [x, normalized_shape, gamma, beta]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    Based on parameters
+    """
+    return [1e-5]  # epsilon=1e-5
diff --git a/aikg/benchmark/aikgbench/llm/norm/QuantizeAddLayerNorm.py b/aikg/benchmark/aikgbench/llm/norm/QuantizeAddLayerNorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a690077d52d35e345587976a542e37538f46db38
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/norm/QuantizeAddLayerNorm.py
@@ -0,0 +1,112 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs QuantizeAddLayerNorm operation.
+    Based on
+    """
+
+    def __init__(self, dtype=torch.float32, axis=-1, epsilon=1e-6, additional_output=False):
+        super(Model, self).__init__()
+        self.dtype = dtype
+        self.axis = axis
+        self.epsilon = epsilon
+        self.additional_output = additional_output
+
+    def forward(self, x1, x2, gamma, beta, bias, scales, zero_points):
+        """
+        Perform QuantizeAddLayerNorm operation.
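+        Computes x = x1 + x2 (+ bias), applies layer normalization over the
+        last dimension, then quantizes the result to int8 with
+        y = clamp(round(y_norm / scales + zero_points), -128, 127).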
+ + Args: + x1: First input tensor + x2: Second input tensor + gamma: Scale parameter tensor + beta: Shift parameter tensor + bias: Bias tensor + scales: Quantization scales + zero_points: Quantization zero points + + Returns: + Tuple of (y, x) where: + - y is the quantized output + - x is the sum of input tensors + """ + # Add the two input tensors + if bias is not None: + x = x1 + x2 + bias + else: + x = x1 + x2 + + # Get input shape and reshape for layer norm + input_shape = x.shape + row_size = x.shape[-1] + row_count = 1 + for i in range(0, len(input_shape) - 1): + row_count *= input_shape[i] + + x_shape = (row_count, row_size) + x_mean_shape = (row_count, 1) + + # Reshape for layer norm computation + x_reshaped = x.reshape(x_shape) + + # Compute mean and variance + x_mean = torch.mean(x_reshaped, dim=1, keepdim=True) + x_var = torch.var(x_reshaped, dim=1, keepdim=True, + unbiased=False) + self.epsilon + x_rstd = 1.0 / torch.sqrt(x_var) + + # Broadcast tensors to match x_shape + x_mean_broadcast = x_mean.expand(x_shape) + x_rstd_broadcast = x_rstd.expand(x_shape) + gamma_broadcast = gamma.expand(x_shape) + beta_broadcast = beta.expand(x_shape) + + # Apply layer normalization + y = torch.multiply(torch.multiply( + x_reshaped - x_mean_broadcast, x_rstd_broadcast), gamma_broadcast) + beta_broadcast + + # Reshape back to original shape + y = y.reshape(input_shape) + + # Quantize output + y_quantized = torch.round( + y / scales + zero_points).clamp(-128, 127).to(torch.int8) + + return y_quantized, x + + +def get_inputs(): + """ + Generate random input tensors for testing. + Based on + """ + # Use similar shapes as other layer norm operations + batch_size, seq_len, hidden_size = 1, 2, 8 + + # Generate input tensors + x1 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + x2 = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32) + + # Generate gamma and beta parameters + gamma = torch.randn(hidden_size, dtype=torch.float32) + beta = torch.randn(hidden_size, dtype=torch.float32) + + # Generate bias + bias = torch.randn(hidden_size, dtype=torch.float32) + + # Generate quantization parameters + scales = torch.rand(1, dtype=torch.float32) * 0.1 + 0.01 + zero_points = torch.zeros(1, dtype=torch.float32) + + return [x1, x2, gamma, beta, bias, scales, zero_points] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + Based on parameters + """ + return [torch.float32, -1, 1e-6, False] # dtype=float32, axis=-1, epsilon=1e-6, additional_output=False diff --git a/aikg/benchmark/aikgbench/llm/norm/RmsNorm.py b/aikg/benchmark/aikgbench/llm/norm/RmsNorm.py new file mode 100644 index 0000000000000000000000000000000000000000..a621ac569c3eaa84a6c5e749fa3e22db58d5760d --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/norm/RmsNorm.py @@ -0,0 +1,60 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs RmsNorm operation. + """ + + def __init__(self, epsilon=1e-6): + super(Model, self).__init__() + self.epsilon = epsilon + + def forward(self, x, gamma): + """ + Perform RmsNorm operation. 
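+        Computes y = x / sqrt(mean(x^2, dim=-1) + epsilon) * gamma.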
+
+        Args:
+            x: Input tensor
+            gamma: Scale parameter tensor
+
+        Returns:
+            Tuple of (output, rstd) where output is the normalized tensor
+        """
+        # Compute RMS (Root Mean Square) normalization
+        # Unlike LayerNorm, RmsNorm doesn't subtract the mean
+        x_squared = x.pow(2)
+        x_rms = torch.sqrt(x_squared.mean(dim=-1, keepdim=True) + self.epsilon)
+        x_rstd = 1.0 / x_rms
+
+        # Apply normalization
+        x_normalized = x * x_rstd
+
+        # Apply scale parameter
+        output = x_normalized * gamma
+
+        return output, x_rstd
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    """
+    # Use the same shapes as in gen_data.py
+    batch_size, seq_len, hidden_size = 2, 1, 16
+
+    # Generate input tensor
+    x = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32)
+
+    # Generate gamma parameter
+    gamma = torch.randn(hidden_size, dtype=torch.float32)
+
+    return [x, gamma]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    """
+    return [1e-6]  # epsilon=1e-6
diff --git a/aikg/benchmark/aikgbench/llm/norm/RmsNormGrad.py b/aikg/benchmark/aikgbench/llm/norm/RmsNormGrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c240ce5a14c11137328350f43dcef2644f687c7
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/norm/RmsNormGrad.py
@@ -0,0 +1,70 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs RmsNormGrad operation.
+    """
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, dy, x, rstd, gamma):
+        """
+        Perform RmsNormGrad operation (backward pass).
+
+        Args:
+            dy: Gradient of output
+            x: Input tensor from forward pass
+            rstd: Reciprocal standard deviation from forward pass
+            gamma: Scale parameter tensor
+
+        Returns:
+            Tuple of (dx, dgamma) where:
+            - dx is the gradient with respect to input
+            - dgamma is the gradient with respect to gamma
+        """
+        # Compute gradients for RMS norm backward
+        # This is a simplified implementation of the backward pass
+
+        # Gradient with respect to gamma (reduce over all dims except the last
+        # so the result matches gamma's shape)
+        reduce_dims = tuple(range(dy.dim() - 1))
+        dgamma = torch.sum(dy * x * rstd, dim=reduce_dims)
+
+        # Gradient with respect to input
+        dx = dy * gamma * rstd
+
+        return dx, dgamma
+
+
+def get_inputs():
+    """
+    Generate random input tensors for testing.
+    Based on
+    """
+    # Use similar shapes as other rms norm operations
+    batch_size, seq_len, hidden_size = 2, 1, 16
+
+    # Generate input tensor
+    x = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32)
+
+    # Generate gradient of output
+    dy = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32)
+
+    # Generate statistics from forward pass
+    x_squared = x.pow(2)
+    x_rms = torch.sqrt(x_squared.mean(dim=-1, keepdim=True) + 1e-6)
+    rstd = 1.0 / x_rms
+
+    # Generate gamma parameter
+    gamma = torch.randn(hidden_size, dtype=torch.float32)
+
+    return [dy, x, rstd, gamma]
+
+
+def get_init_inputs():
+    """
+    Return initialization parameters for the model.
+    For gradient operations, no specific initialization parameters are needed.
+    """
+    return []
diff --git a/aikg/benchmark/aikgbench/llm/quant/ElewiseDequantPerChannel.py b/aikg/benchmark/aikgbench/llm/quant/ElewiseDequantPerChannel.py
new file mode 100644
index 0000000000000000000000000000000000000000..2de17c4f314d0ba3d6aa292efc9ff35b82c3cc9c
--- /dev/null
+++ b/aikg/benchmark/aikgbench/llm/quant/ElewiseDequantPerChannel.py
@@ -0,0 +1,67 @@
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """
+    Simple model that performs per-channel dequantization on input tensor.
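+    Dequantizes int8 input to float16 per channel:
+    out = (x - offset) * scale (or x * scale when offset is None),
+    with scale and offset broadcast along the last dimension.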
+ """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, scale, offset=None): + """ + Perform per-channel dequantization on input tensor. + + Args: + x: Input int8 tensor + scale: Scale tensor for dequantization + offset: Offset tensor for dequantization (optional) + + Returns: + Dequantized float16 tensor + """ + # Convert to float32 for processing + x_float = x.to(torch.float32) + scale_float = scale.to(torch.float32) + + if offset is not None: + offset_float = offset.to(torch.float32) + # Dequantize with offset + x_dequant = (x_float - offset_float) * scale_float + else: + # Dequantize without offset + x_dequant = x_float * scale_float + + # Clip to float16 range and convert to float16 + out = torch.clamp(x_dequant, -65504, 65504) + out = out.to(torch.float16) + + return out + + +# Model parameters - using the same shape as in test_dequant_per_channel.py +shape = (10, 8192) # Default shape from test case +scale_shape = (8192,) # Scale shape +batch_size = 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_dequant_per_channel.py + """ + # Generate random tensors similar to test_dequant_per_channel.py (range [-5, 5]) + input0 = torch.randint(-5, 5, shape, dtype=torch.int8) + input1 = torch.rand(scale_shape) * 10 - 5 + input2 = torch.randint(-5, 5, scale_shape, dtype=torch.int8) + return [input0, input1, input2] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For per-channel dequantization, no specific initialization parameters are needed. + """ + return [] diff --git a/aikg/benchmark/aikgbench/llm/quant/ElewiseDynamicQuant.py b/aikg/benchmark/aikgbench/llm/quant/ElewiseDynamicQuant.py new file mode 100644 index 0000000000000000000000000000000000000000..6b9d14980e7180d2807243712ef8a245bbe56c19 --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/quant/ElewiseDynamicQuant.py @@ -0,0 +1,86 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs dynamic quantization on input tensor. + """ + + def __init__(self, asymmetric=False): + super(Model, self).__init__() + self.asymmetric = asymmetric + + def forward(self, x): + """ + Perform dynamic quantization on input tensor. 
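+        Per-row (last dimension) quantization to int8. Symmetric mode uses
+        scale = max(|x|) / 127; asymmetric mode uses scale = (max - min) / 255
+        together with a per-row offset.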
+ + Args: + x: Input tensor (float16/float32/bfloat16) + + Returns: + Tuple of (quantized_tensor, scale, offset) + - quantized_tensor: int8 tensor + - scale: float32 tensor for scaling + - offset: float32 tensor for offset (only for asymmetric) + """ + # Convert to float32 for processing + x_float = x.to(torch.float32) + + if self.asymmetric: + # Asymmetric quantization + row_max = torch.max(x_float, dim=-1, keepdim=True)[0] + row_min = torch.min(x_float, dim=-1, keepdim=True)[0] + out_scale = (row_max - row_min) / 255.0 + out_offset = -(row_max + row_min) / (2.0 * out_scale) + + # Avoid division by zero + out_scale = torch.where(out_scale == 0, torch.tensor( + 1e-6, device=out_scale.device), out_scale) + + x_scaled = x_float / out_scale + x_offset = x_scaled + out_offset + x_clipped = torch.clamp(x_offset, -128, 127) + out_x = torch.round(x_clipped) + + return (out_x.to(torch.int8), + out_scale.squeeze(-1).to(torch.float32), + out_offset.squeeze(-1).to(torch.float32)) + else: + # Symmetric quantization + input_abs = torch.abs(x_float) + scale = torch.max(input_abs, dim=-1, keepdim=True)[0] + out_scale = scale / 127.0 + + # Avoid division by zero + out_scale = torch.where(out_scale == 0, torch.tensor( + 1e-6, device=out_scale.device), out_scale) + + x_scaled = x_float * 127.0 / scale + out_x = torch.round(x_scaled) + + return (out_x.to(torch.int8), + out_scale.squeeze(-1).to(torch.float32)) + + +# Model parameters - using the same shape as in test_dynamic_quant.py +shape = (2, 32, 32) # Default shape from test case +batch_size = 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_dynamic_quant.py + """ + # Generate random tensors similar to test_dynamic_quant.py (range [-5, 10]) + input0 = torch.rand(shape) * 15 - 5 + return [input0] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For dynamic quantization, we need the asymmetric flag. + """ + return [False] # Default to symmetric quantization diff --git a/aikg/benchmark/aikgbench/llm/quant/ElewiseQuantPerChannel.py b/aikg/benchmark/aikgbench/llm/quant/ElewiseQuantPerChannel.py new file mode 100644 index 0000000000000000000000000000000000000000..702fe3e76314f0cfdfb362b549fd56a27aa4b42d --- /dev/null +++ b/aikg/benchmark/aikgbench/llm/quant/ElewiseQuantPerChannel.py @@ -0,0 +1,74 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs per-channel quantization on input tensor. + """ + + def __init__(self, min_neg_127=False): + super(Model, self).__init__() + self.min_neg_127 = min_neg_127 + + def forward(self, x, scale, offset=None): + """ + Perform per-channel quantization on input tensor. 
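+        Computes out = clamp(round(x / scale) + offset, lower_bound, 127) as int8,
+        where lower_bound is -127 or -128 (per min_neg_127) and scale/offset
+        broadcast along the last dimension.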
+ + Args: + x: Input tensor (float16/float32/bfloat16) + scale: Scale tensor for quantization + offset: Offset tensor for quantization (optional) + + Returns: + Quantized int8 tensor + """ + # Convert to float32 for processing + x_float = x.to(torch.float32) + scale_float = scale.to(torch.float32) + + # Set lower bound based on configuration + int8_lower_bound = -127 if self.min_neg_127 else -128 + + # Avoid division by zero + scale_safe = torch.where(scale_float == 0, torch.tensor( + 1e-6, device=scale_float.device), scale_float) + + # Quantize + x_scaled = x_float / scale_safe + x_rounded = torch.round(x_scaled) + + if offset is not None: + offset_float = offset.to(torch.float32) + x_rounded = x_rounded + offset_float + + # Clip to int8 range + out = torch.clamp(x_rounded, int8_lower_bound, 127) + + return out.to(torch.int8) + + +# Model parameters - using the same shape as in test_quant_per_channel.py +shape = (10, 8192) # Default shape from test case +scale_shape = (8192,) # Scale shape +batch_size = 1 + + +def get_inputs(): + """ + Generate random input tensors for testing. + Returns tensors with different data types as tested in test_quant_per_channel.py + """ + # Generate random tensors similar to test_quant_per_channel.py (range [-5, 5]) + input0 = torch.rand(shape) * 10 - 5 + input1 = torch.rand(scale_shape) * 10 - 5 + input2 = torch.randint(-5, 5, scale_shape, dtype=torch.int8) + return [input0, input1, input2] + + +def get_init_inputs(): + """ + Return initialization parameters for the model. + For per-channel quantization, we need the min_neg_127 flag. + """ + return [False] # Default to -128 as lower bound
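For reference, a minimal driver sketch showing how these benchmark files are typically consumed: get_init_inputs() feeds the Model constructor and get_inputs() feeds forward(). The runner and the module path below are assumptions for illustration, not part of this diff.

# Minimal sketch; "aikgbench.llm.norm.RmsNorm" is a hypothetical import path.
import importlib

def run_once(module_name="aikgbench.llm.norm.RmsNorm"):
    mod = importlib.import_module(module_name)
    model = mod.Model(*mod.get_init_inputs())   # e.g. Model(1e-6) for RmsNorm
    outputs = model(*mod.get_inputs())          # positional tensors from get_inputs()
    return outputs

if __name__ == "__main__":
    out = run_once()
    print([o.shape for o in out] if isinstance(out, tuple) else out.shape)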