diff --git a/configs/general/llm_finetune_dense_template.yaml b/configs/general/llm_finetune_dense_template.yaml
index 2daf15914795da206dcc76abeceb02234347bca9..7fb37b72ed2fb47dc79e2ad93e6edb186dd26af0 100644
--- a/configs/general/llm_finetune_dense_template.yaml
+++ b/configs/general/llm_finetune_dense_template.yaml
@@ -87,8 +87,6 @@ parallel_config:
   context_parallel: 1          # Set the number of sequence parallel
   use_seq_parallel: True       # Corresponding to Megatron Short Sequence Parallelism
   micro_batch_num: 2           # Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1
-  vocab_emb_dp: False          # Shard embedding in model parallel or data parallel. If True, the embedding lookup
-# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process.
 micro_batch_interleave_num: 1  # Set the size of the interleave micro batch data in each step of the training. This parameter is used to calculate the actual loss value
 
 # callbacks
diff --git a/configs/general/llm_finetune_moe_template.yaml b/configs/general/llm_finetune_moe_template.yaml
index 8379a68cd31ce2a37bd95f01e6ed0b145ca42dd3..bcab079acb6a52d5d0ca67e7401fe2fda5da27d9 100644
--- a/configs/general/llm_finetune_moe_template.yaml
+++ b/configs/general/llm_finetune_moe_template.yaml
@@ -101,8 +101,6 @@ parallel_config:
   context_parallel: 1          # Set the number of sequence parallel
   use_seq_parallel: True       # Corresponding to Megatron Short Sequence Parallelism
   micro_batch_num: 2           # Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1
-  vocab_emb_dp: False          # Shard embedding in model parallel or data parallel. If True, the embedding lookup
-# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process.
 micro_batch_interleave_num: 1  # Set the size of the interleave micro batch data in each step of the training. This parameter is used to calculate the actual loss value
 
 # callbacks
diff --git a/configs/general/llm_pretrain_dense_template.yaml b/configs/general/llm_pretrain_dense_template.yaml
index 6eafb3dab0e97b52779165c4e735d6f554673f98..77bc4b30c29492f64a8da5d42e2132dbebfe860e 100644
--- a/configs/general/llm_pretrain_dense_template.yaml
+++ b/configs/general/llm_pretrain_dense_template.yaml
@@ -91,8 +91,6 @@ parallel_config:
   context_parallel: 1          # Set the number of sequence parallel
   use_seq_parallel: True       # Corresponding to Megatron Short Sequence Parallelism
   micro_batch_num: 2           # Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1
-  vocab_emb_dp: False          # Shard embedding in model parallel or data parallel. If True, the embedding lookup
-# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process.
 micro_batch_interleave_num: 1  # Set the size of the interleave micro batch data in each step of the training. This parameter is used to calculate the actual loss value
 
 # callbacks
diff --git a/configs/general/llm_pretrain_moe_template.yaml b/configs/general/llm_pretrain_moe_template.yaml
index 4d03a999cb2318ede14b27b9799cfd0ececd6083..05866361108d7bc7886e1ea8546c26174873d24c 100644
--- a/configs/general/llm_pretrain_moe_template.yaml
+++ b/configs/general/llm_pretrain_moe_template.yaml
@@ -105,8 +105,6 @@ parallel_config:
   context_parallel: 1          # Set the number of sequence parallel
   use_seq_parallel: True       # Corresponding to Megatron Short Sequence Parallelism
   micro_batch_num: 2           # Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1
-  vocab_emb_dp: False          # Shard embedding in model parallel or data parallel. If True, the embedding lookup
-# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process.
 micro_batch_interleave_num: 1  # Set the size of the interleave micro batch data in each step of the training. This parameter is used to calculate the actual loss value
 
 # callbacks
diff --git a/configs/general/run_general_task.yaml b/configs/general/run_general_task.yaml
index 0086c02da29770d0438b5f0352a075b0d98e59c7..bf4c79887e7e784755eeb361e761c1f99809961b 100644
--- a/configs/general/run_general_task.yaml
+++ b/configs/general/run_general_task.yaml
@@ -46,7 +46,6 @@ parallel_config:
   pipeline_stage: 1
   use_seq_parallel: False
   micro_batch_num: 1
-  vocab_emb_dp: True
   gradient_aggregation_group: 4
 micro_batch_interleave_num: 1
 
diff --git a/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml b/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml
index fcf03a6dfb4ae7e03d260205b870415fe587c571..bc398dea778984f5bafb9064d02031efe4ad727c 100644
--- a/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml
+++ b/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml
@@ -101,7 +101,6 @@ parallel_config:
   pipeline_stage: 4
   micro_batch_num: &micro_batch_num 16
   expert_parallel: 4
-  vocab_emb_dp: True
   use_seq_parallel: True
   gradient_aggregation_group: 1
 # when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process.
diff --git a/mindformers/core/parallel_config.py b/mindformers/core/parallel_config.py
index bf67e7812fee88f0d36d11d1d8b2adc666c36702..726ef5b89caee899c8c2b96aaef51210ecdc217d 100644
--- a/mindformers/core/parallel_config.py
+++ b/mindformers/core/parallel_config.py
@@ -61,14 +61,14 @@ def build_parallel_config(config):
         if not isinstance(config.parallel_config, TransformerOpParallelConfig):
             logger.info("initial parallel_config from dict: %s", config.parallel_config)
             if config.parallel_config.auto_parallel or config.parallel_config.pipeline_stage > 1:
-                logger.info("pipeline_stage = %s > 1, vocab_emd_dp will be reset to False.",
+                logger.warning("pipeline_stage = %s > 1, vocab_emb_dp will be reset to False.",
                             config.parallel_config.pipeline_stage)
                 config.parallel_config.vocab_emb_dp = False
             _set_rp_matmul_mem_coef(config.parallel_config.get('mem_coeff', 0.1))
             if config.parallel_config.context_parallel_algo and \
                 config.parallel_config.context_parallel_algo == "hybird_cp":
-                logger.warning(f"context_parallel_algo `hybird_cp` will not take effect in later versions, "
-                               f"and will be replaced by `hybrid_cp` in the new version.")
+                logger.warning("context_parallel_algo `hybird_cp` will not take effect in later versions, "
+                               "and will be replaced by `hybrid_cp` in the new version.")
             config.parallel_config = TransformerOpParallelConfig(recompute=config.recompute_config,
                                                                  swap=config.swap_config,
                                                                  **config.parallel_config)
diff --git a/mindformers/parallel_core/model_parallel_config.py b/mindformers/parallel_core/model_parallel_config.py
index 54ce51219a4deb1183d343cb9c46acb466480762..2b31f3090b2a946b0d6ac7b83cec905239a5ed95 100644
--- a/mindformers/parallel_core/model_parallel_config.py
+++ b/mindformers/parallel_core/model_parallel_config.py
@@ -93,7 +93,11 @@ class ModelParallelConfig:
     """
 
     vocab_emb_dp: Optional[bool] = False
-    """Whether to split the vocabulary only along the dp dimension. Default: True."""
+    """
+    Whether to split the vocabulary only along the dp dimension.
+    Setting this to True is not supported at present;
+    if set to True, it will be converted to False automatically.
+    """
 
     ###################
     # Training
diff --git a/mindformers/parallel_core/training_graph/tensor_parallel/layers.py b/mindformers/parallel_core/training_graph/tensor_parallel/layers.py
index 76c3e45ef6d16f648d31be8c3ddd7d76c60547a8..9979dabc3d40b36546225f6204766d4df2ee8029 100644
--- a/mindformers/parallel_core/training_graph/tensor_parallel/layers.py
+++ b/mindformers/parallel_core/training_graph/tensor_parallel/layers.py
@@ -106,6 +106,8 @@ class VocabParallelEmbedding(nn.Cell):
         self.weight = Parameter(init_method([self.num_embeddings, self.embedding_dim]), name="weight")
         self.embedding_morph = P.Morph(
             self.embedding_func, embedding_infer_shape, embedding_infer_dtype).add_prim_attr("self_define_shard", True)
+        self.enable_embedding_tp = self.tp > 1 and self.num_embeddings % self.tp == 0
+
         self.reshape = Reshape()
         self.config = config
         if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation():
@@ -144,7 +146,7 @@ class VocabParallelEmbedding(nn.Cell):
         bs, seq_len = input_.shape
         _, hidden = weight.shape
         input_ = self.reshape(input_, (bs * seq_len,))
-        if self.tp > 1:
+        if self.enable_embedding_tp:
             # Build the mask. # Mask the input.
             input_ = input_ - self.vocab_start_index
             masked_input = self.relu(input_)
@@ -157,7 +159,7 @@ class VocabParallelEmbedding(nn.Cell):
         output_parallel = mint.nn.functional.embedding(masked_input, weight)
 
         # Mask the output embedding.
-        if self.tp > 1:
+        if self.enable_embedding_tp:
             input_mask = input_mask.expand_dims(-1)
             output_parallel = ops.mul(output_parallel, input_mask)
 
@@ -175,10 +177,10 @@ class VocabParallelEmbedding(nn.Cell):
                 output = output.reshape(bs, -1, hidden)
             return output
 
-        if self.tp == 1:
-            output = output_parallel
-        else:
+        if self.enable_embedding_tp:
             output = ops.AllReduce(group=self.group)(output_parallel)
+        else:
+            output = output_parallel
 
         return output
 
@@ -189,24 +191,18 @@ class VocabParallelEmbedding(nn.Cell):
         else:
             out_strategy = layout("dp", "cp", "None")
 
-        if config.vocab_emb_dp or (self.num_embeddings % self.tp != 0):
-            self.embedding_morph.shard(
-                in_strategy=(
-                    layout("dp", "cp",),
-                    layout("None", "None"),
-                ),
-                out_strategy=(
-                    out_strategy,
-                )
-            )
+        if self.enable_embedding_tp:
+            embedding_strategy = layout("tp", "None")
         else:
-            self.embedding_morph.shard(
-                in_strategy=(
-                    layout("dp", "cp"),
-                    layout("tp", "None"),
-                ),
-                out_strategy=(out_strategy,),
-            )
+            embedding_strategy = layout("None", "None")
+
+        self.embedding_morph.shard(
+            in_strategy=(
+                layout("dp", "cp"),
+                embedding_strategy,
+            ),
+            out_strategy=(out_strategy,),
+        )
 
     def sharding_propagation(self, config: TransformerConfig):
         pass
@@ -258,7 +254,7 @@ class ColumnParallelLinear(nn.Cell):
                  transpose_b: bool = True,
                  bias_init: Callable = None
                  ):
-        super(ColumnParallelLinear, self).__init__()
+        super().__init__()
         if gather_output:
             raise NotImplementedError("For ColumnParallelLinear, `gather_output` is not supported for now")
         if stride > 1:
@@ -470,7 +466,7 @@ class RowParallelLinear(nn.Cell):
                  transpose_b: bool = True,
                  bias_init: Callable = None
                  ):
-        super(RowParallelLinear, self).__init__()
+        super().__init__()
         if input_is_parallel:
             raise NotImplementedError("For RowParallelLinear, `input_is_parallel` is not supported for now")
         if stride > 1:
diff --git a/mindformers/parallel_core/transformer_config.py b/mindformers/parallel_core/transformer_config.py
index fa8238769caa54643a25c32f95f0ceec52da8e02..c0e0ddd14cf2e46b8aaf4e81f28111073cf8775d 100644
--- a/mindformers/parallel_core/transformer_config.py
+++ b/mindformers/parallel_core/transformer_config.py
@@ -435,6 +435,10 @@ class TransformerConfig(ModelParallelConfig, MFModelConfig):
         if not isinstance(self.attention_dropout, float) or not 0 <= self.attention_dropout < 1:
             raise ValueError(f"attention_dropout should be a float within [0, 1), but get {self.attention_dropout}.")
 
+        if self.vocab_emb_dp:
+            logger.warning("vocab_emb_dp is not supported in MCore, it will be converted to False automatically.")
+            self.vocab_emb_dp = False
+
         if self.pad_token_id is None:
             self.pad_token_id = 0