diff --git a/tests/st/test_ut/test_models/test_auto/test_tokenization_auto.py b/tests/st/test_ut/test_models/test_auto/test_tokenization_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e87533b9ddef7ee233b8f3d59eff43d9b70f404
--- /dev/null
+++ b/tests/st/test_ut/test_models/test_auto/test_tokenization_auto.py
@@ -0,0 +1,376 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test tokenization auto."""
+import os
+import shutil
+import json
+import sys
+from unittest.mock import patch, MagicMock
+import yaml
+import pytest
+import sentencepiece as spm
+from mindformers.models.auto.tokenization_auto import AutoTokenizer, is_experimental_mode
+from mindformers.models.auto.configuration_auto import AutoConfig
+from mindformers.models.auto.tokenization_auto import TOKENIZER_MAPPING
+from mindformers.models.auto.tokenization_auto import tokenizer_class_from_name
+
+# Prepend the directory two levels above this file to sys.path (despite the name, not the repository root)
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, PROJECT_ROOT)
+
+
+class TestAutoTokenizer:
+    """ test auto tokenizer """
+    @classmethod
+    def setup_class(cls):
+        """ Build a scratch directory holding a tiny SentencePiece model shared by all tests """
+        here = os.path.dirname(os.path.abspath(__file__))
+        cls.test_dir = os.path.join(here, "temp_tokenizer_test_coverage")
+        if os.path.exists(cls.test_dir):
+            shutil.rmtree(cls.test_dir)
+        os.makedirs(cls.test_dir)
+        cls.vocab_file = os.path.join(cls.test_dir, "tokenizer.model")
+
+        # Train a dummy sentencepiece model on a one-sentence corpus
+        corpus_file = os.path.join(cls.test_dir, "corpus.txt")
+        with open(corpus_file, "w", encoding="utf-8") as corpus:
+            corpus.write("This is a test corpus for sentencepiece training. One Two Three.")
+
+        train_args = {
+            "input": corpus_file,
+            "model_prefix": os.path.join(cls.test_dir, "tokenizer"),
+            "vocab_size": 100,  # Increased vocab size to avoid "smaller than required_chars" error
+            "model_type": 'bpe',
+            "character_coverage": 1.0,
+            "user_defined_symbols": ['<pad>'],
+        }
+        spm.SentencePieceTrainer.Train(**train_args)
+
+    @classmethod
+    def teardown_class(cls):
+        """ Remove the scratch directory created in setup_class, if it still exists """
+        if os.path.exists(cls.test_dir):
+            shutil.rmtree(cls.test_dir)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_is_experimental_mode(self):
+        """
+        Exercise `is_experimental_mode` with directories, a plain file and an unknown name.
+        """
+        # 1. A directory that holds a .yaml file is reported as non-experimental (origin mode)
+        dir_with_yaml = os.path.join(self.test_dir, "mode_yaml")
+        os.makedirs(dir_with_yaml, exist_ok=True)
+        with open(os.path.join(dir_with_yaml, "model.yaml"), 'w', encoding="utf-8") as yaml_fp:
+            yaml_fp.write("key: value")
+        assert not is_experimental_mode(dir_with_yaml)
+
+        # 2. An empty directory (no .yaml) is reported as experimental (HF style)
+        dir_without_yaml = os.path.join(self.test_dir, "mode_json")
+        os.makedirs(dir_without_yaml, exist_ok=True)
+        # Nothing is written into this directory
+        assert is_experimental_mode(dir_without_yaml)
+
+        # 3. A string that is not an existing path and is presumably not a supported model name;
+        # the global TOKENIZER_SUPPORT_LIST is deliberately left untouched here
+        assert is_experimental_mode("unknown_model_string")
+
+        # 4. An existing path that is a regular file rather than a directory
+        plain_file = os.path.join(self.test_dir, "dummy_file.txt")
+        with open(plain_file, 'w', encoding="utf-8") as plain_fp:
+            plain_fp.write("content")
+        assert is_experimental_mode(plain_file)
+
+    @patch("mindformers.tools.MindFormerRegister")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_from_pretrained_origin_mode(self, mock_register):
+        """
+        Load a tokenizer through `from_pretrained` from a directory that contains a YAML file
+        (origin mode); the registry lookup is mocked so no real tokenizer class is built.
+        """
+        model_dir = os.path.join(self.test_dir, "origin_mode_load")
+        os.makedirs(model_dir, exist_ok=True)
+
+        # The directory gets the trained vocab file plus a YAML config that points at it
+        shutil.copy(self.vocab_file, os.path.join(model_dir, "tokenizer.model"))
+
+        tokenizer_section = {
+            "type": "LlamaTokenizer",
+            "vocab_file": "tokenizer.model"
+        }
+        yaml_content = {"processor": {"tokenizer": tokenizer_section}}
+
+        yaml_path = os.path.join(model_dir, "mindspore_model.yaml")
+        # Dump the config so that the directory is recognised via its .yaml file
+        with open(yaml_path, 'w', encoding="utf-8") as yaml_fp:
+            yaml.dump(yaml_content, yaml_fp)
+
+        # The mocked registry hands back a fake class whose from_pretrained
+        # returns a sentinel object that can be compared against the result
+        fake_tokenizer = MagicMock()
+        fake_cls = MagicMock()
+        fake_cls.from_pretrained.return_value = fake_tokenizer
+        mock_register.get_cls.return_value = fake_cls
+
+        # Call SUT
+        loaded = AutoTokenizer.from_pretrained(model_dir)
+
+        # Expectations:
+        # 1. the YAML file selects origin mode
+        # 2. origin mode asks the registry for the class named in the YAML
+        mock_register.get_cls.assert_called_with(module_type='tokenizer', class_name='LlamaTokenizer')
+        # 3. the class returned by the registry is the one that loads the tokenizer
+        fake_cls.from_pretrained.assert_called()
+        assert loaded == fake_tokenizer
+
+    @patch("mindformers.models.auto.tokenization_auto.tokenizer_class_from_name")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_from_pretrained_experimental_mode(self, mock_class_from_name):
+        """
+        Load a tokenizer through `from_pretrained` from a directory that only has a
+        tokenizer_config.json (experimental / HF style); class resolution is mocked.
+        """
+        model_dir = os.path.join(self.test_dir, "exp_mode_load")
+        os.makedirs(model_dir, exist_ok=True)
+
+        shutil.copy(self.vocab_file, os.path.join(model_dir, "tokenizer.model"))
+
+        # tokenizer_config.json names the tokenizer class and the vocab file
+        hf_config = {
+            "tokenizer_class": "LlamaTokenizer",
+            "vocab_file": "tokenizer.model"
+        }
+        with open(os.path.join(model_dir, "tokenizer_config.json"), 'w', encoding="utf-8") as config_fp:
+            json.dump(hf_config, config_fp)
+
+        # Every class-name lookup resolves to a fake class returning a sentinel tokenizer
+        fake_tokenizer = MagicMock()
+        fake_cls = MagicMock()
+        fake_cls.from_pretrained.return_value = fake_tokenizer
+        mock_class_from_name.return_value = fake_cls
+
+        # Call SUT
+        loaded = AutoTokenizer.from_pretrained(model_dir)
+        mock_class_from_name.assert_called_with("LlamaTokenizerFast")
+        fake_cls.from_pretrained.assert_called_with(model_dir, _from_auto=True, _commit_hash=None)
+        assert loaded == fake_tokenizer
+
+    @patch("mindformers.models.auto.tokenization_auto.tokenizer_class_from_name")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_from_pretrained_with_explicit_type(self, mock_class_from_name):
+        """
+        Call `from_pretrained` with an explicit `tokenizer_type` keyword argument.
+        """
+        fake_tokenizer = MagicMock()
+        fake_cls = MagicMock()
+        fake_cls.from_pretrained.return_value = fake_tokenizer
+        mock_class_from_name.return_value = fake_cls
+
+        # "llama" is expected to be a key of TOKENIZER_MAPPING_NAMES. The origin/experimental
+        # mode decision is made first; the tokenizer_type argument then determines which
+        # tokenizer class name is looked up.
+
+        # Presumably neither an existing local path nor a supported name -> experimental mode
+        missing_path = "dummy_path_not_exist"
+
+        loaded = AutoTokenizer.from_pretrained(missing_path, tokenizer_type="llama")
+
+        # "llama" must be resolved through the fast class name "LlamaTokenizerFast"
+        mock_class_from_name.assert_called_with("LlamaTokenizerFast")
+        assert loaded == fake_tokenizer
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_invalid_yaml_name(self):
+        """
+        Check that `invalid_yaml_name` flags a name that is not a supported model.
+        """
+        # A truthy result means the name was rejected as invalid
+        assert AutoTokenizer.invalid_yaml_name("invalid_name")
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_register_custom_tokenizer(self):
+        """
+        Test `AutoTokenizer.register` API; the registration is always undone afterwards.
+        """
+
+        class MyConfig(AutoConfig):
+            pass
+
+        class MyTokenizer:
+            pass
+
+        # Register
+        AutoTokenizer.register(MyConfig, slow_tokenizer_class=MyTokenizer, exist_ok=True)
+        try:
+            # Check registration
+            assert MyConfig in TOKENIZER_MAPPING
+            assert TOKENIZER_MAPPING[MyConfig][0] == MyTokenizer
+        finally:
+            # Cleanup runs even if an assertion fails, so the global mapping is not left
+            # polluted for later tests. TOKENIZER_MAPPING is a _LazyAutoMapping object,
+            # not a dict; registered entries live in its _extra_content.
+            # pylint: disable=W0212
+            if MyConfig in TOKENIZER_MAPPING._extra_content:
+                del TOKENIZER_MAPPING._extra_content[MyConfig]
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_tokenizer_class_from_name_helper_real(self):
+        """An unknown class name resolves to None when the real helper is executed"""
+        resolved_cls = tokenizer_class_from_name("UnknownTokenizerClassXYZ")
+        assert resolved_cls is None
+
+    # The support list is replaced by a minimal one shaped as {'type': {'name_suffix': ['full_name']}};
+    # e.g. glm4_9b -> type: glm4, suffix: 9b.
+    @patch("mindformers.models.auto.tokenization_auto.TOKENIZER_SUPPORT_LIST", {'glm4': {'9b': ['glm4_9b']}})
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_invalid_yaml_name_logic(self):
+        """Check `invalid_yaml_name` for supported, unknown and partially matching names"""
+        # Valid cases
+        # Names present in the patched support list are not invalid and raise nothing
+        assert not AutoTokenizer.invalid_yaml_name("glm4_9b")
+        assert not AutoTokenizer.invalid_yaml_name("mindspore/glm4_9b")
+
+        # Invalid cases
+        # "unknown_model" matches no type in the support list: reported invalid, no exception
+        assert AutoTokenizer.invalid_yaml_name("unknown_model")
+
+        # "glm4_unknown" starts with the known type "glm4" but is not a listed model -> ValueError
+        with pytest.raises(ValueError):
+            AutoTokenizer.invalid_yaml_name("glm4_unknown")
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_origin_mode_download_logic(self):
+        """Test origin mode triggering download paths (everything touching the filesystem is mocked)"""
+        # MindFormerBook and os.path.exists are mocked to simulate the download paths.
+        # os.makedirs is mocked as well so that no directories are really created under /tmp.
+        with patch("mindformers.models.auto.tokenization_auto.MindFormerBook") as mock_book, \
+                patch("os.path.exists") as mock_exists, \
+                patch("os.makedirs"), \
+                patch("mindformers.models.auto.tokenization_auto.set_default_yaml_file") as mock_set_yaml, \
+                patch("mindformers.models.auto.tokenization_auto.AutoTokenizer._get_class_name_from_yaml") as \
+                        mock_get_cls, \
+                patch("mindformers.tools.MindFormerRegister") as _:
+            mock_book.get_xihe_checkpoint_download_folder.return_value = "/tmp/xihe"
+            mock_book.get_default_checkpoint_download_folder.return_value = "/tmp/default"
+            mock_exists.return_value = False  # Simulate file needs download logic trigger (though code just mkdirs)
+            mock_get_cls.return_value = ("LlamaTokenizer", MagicMock(processor=MagicMock(tokenizer={})))
+            # Mock set_default_yaml_file to avoid actual file ops or checks
+            mock_set_yaml.return_value = None
+
+            # 1. Mindspore prefix
+            AutoTokenizer.get_class_from_origin_mode("mindspore/glm4_9b")
+            # Check that it tried to access the download folder, confirming logic path
+            mock_book.get_xihe_checkpoint_download_folder.assert_called()
+
+            # 2. Default prefix
+            with patch("mindformers.models.auto.tokenization_auto.AutoTokenizer.invalid_yaml_name", return_value=False):
+                AutoTokenizer.get_class_from_origin_mode("glm4_9b")
+                mock_book.get_default_checkpoint_download_folder.assert_called()
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_origin_mode_error_handling(self):
+        """Test the exceptions raised by get_class_from_origin_mode for bad inputs"""
+        # 1. A non-string argument is rejected with TypeError
+        with pytest.raises(TypeError):
+            AutoTokenizer.get_class_from_origin_mode(123)
+
+        # 2. Path is reported as an existing dir, but the YAML lookup yields no class name -> ValueError
+        with patch("os.path.isdir", return_value=True), \
+                patch("os.path.exists", return_value=True), \
+                patch("mindformers.models.auto.tokenization_auto.AutoTokenizer._get_class_name_from_yaml",
+                      return_value=(None, None)):
+            with pytest.raises(ValueError):
+                AutoTokenizer.get_class_from_origin_mode("/tmp/dummy_dir")
+
+        # 3. Neither an existing path nor a valid model name -> FileNotFoundError (os.makedirs is stubbed)
+        with patch("mindformers.models.auto.tokenization_auto.AutoTokenizer.invalid_yaml_name", return_value=True), \
+                patch("os.path.exists", return_value=False), \
+                patch("os.path.isdir", return_value=False), \
+                patch("os.makedirs") as _:
+            with pytest.raises(FileNotFoundError):
+                AutoTokenizer.get_class_from_origin_mode("unsupported_model")
+
+    @patch("mindformers.models.auto.tokenization_auto.AutoConfig")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_experimental_mode_legacy_automap(self, mock_auto_config):
+        """Test experimental mode with legacy auto_map format (list/tuple)"""
+        json_dir = os.path.join(self.test_dir, "legacy_automap")
+        os.makedirs(json_dir, exist_ok=True)
+
+        # 1. Setup tokenizer config with LEGACY auto_map (list)
+        tokenizer_config_data = {
+            "auto_map": ["AutoTokenizer", "LlamaTokenizer"],
+        }
+
+        # 2. Mock AutoConfig to avoid loading real config (which might fail in test env)
+        # and to strictly control attributes.
+        mock_config_instance = MagicMock()
+        mock_config_instance.tokenizer_class = None
+        # auto_map must be set explicitly: a bare MagicMock would auto-create a truthy attribute,
+        # which would presumably send from_pretrained into its config-level auto_map handling.
+        mock_config_instance.auto_map = None
+
+        mock_auto_config.from_pretrained.return_value = mock_config_instance
+
+        with open(os.path.join(json_dir, "tokenizer_config.json"), 'w', encoding="utf-8") as f:
+            json.dump(tokenizer_config_data, f)
+
+        with patch("mindformers.models.auto.tokenization_auto.get_tokenizer_config",
+                   return_value=tokenizer_config_data), \
+                patch("mindformers.models.auto.tokenization_auto.resolve_trust_remote_code", return_value=True), \
+                patch("mindformers.models.auto.tokenization_auto.get_class_from_dynamic_module") as mock_get_class:
+            mock_tokenizer = MagicMock()
+            mock_get_class.return_value = mock_tokenizer
+
+            AutoTokenizer.from_pretrained(json_dir, trust_remote_code=True)
+            # Verify it tried to load from dynamic module using legacy format
+            mock_get_class.assert_called()
+
+    @patch("mindformers.models.auto.tokenization_auto.TOKENIZER_MAPPING")
+    @patch("mindformers.models.auto.tokenization_auto.AutoConfig")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_from_pretrained_value_errors(self, mock_auto_config, mock_mapping):
+        """Failure paths of from_pretrained that originate from a ValueError"""
+        # keys() of the mocked mapping yields a plain list of mocks with __name__ (no LazyMapping iteration)
+        fake_config_cls = MagicMock()
+        fake_config_cls.__name__ = "MockConfig"
+        mock_mapping.keys.return_value = [fake_config_cls]
+
+        # 1. Tokenizer type not found
+        # ValueError is wrapped in RuntimeError by @experimental_mode_func_checker
+        with pytest.raises(RuntimeError):
+            AutoTokenizer.from_pretrained("dummy", tokenizer_type="InvalidType")
+
+        # 2. Config class unrecognized
+        fake_config = MagicMock()
+        fake_config.tokenizer_class = None
+
+        mock_auto_config.from_pretrained.return_value = fake_config
+
+        # ValueError is wrapped in RuntimeError by @experimental_mode_func_checker
+        empty_tokenizer_config = patch("mindformers.models.auto.tokenization_auto.get_tokenizer_config", return_value={})
+        with pytest.raises(RuntimeError), empty_tokenizer_config:
+            AutoTokenizer.from_pretrained("dummy_path_val_error")
diff --git a/tests/st/test_ut/test_tokenizers/test_tokenization_utils.py b/tests/st/test_ut/test_tokenizers/test_tokenization_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e963db2d185a2b9879616b32d4bd870a6f6ab7c0
--- /dev/null
+++ b/tests/st/test_ut/test_tokenizers/test_tokenization_utils.py
@@ -0,0 +1,412 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test tokenization utils."""
+import os
+import shutil
+import sys
+import pytest
+from mindformers.models.tokenization_utils import (
+    PreTrainedTokenizer,
+    Trie,
+    AddedToken,
+    _is_whitespace,
+    _is_control,
+    _is_punctuation,
+    _is_end_of_word,
+    _is_start_of_word,
+    _insert_one_token_to_ordered_list
+)
+
+# Prepend the directory two levels above this file to sys.path (despite the name, not the repository root)
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, PROJECT_ROOT)
+
+
+class ConcreteTokenizer(PreTrainedTokenizer):
+    """ Minimal whitespace tokenizer whose vocab is read from a one-token-per-line file """
+    def __init__(self, vocab_file, **kwargs):
+        self.vocab = {}
+        with open(vocab_file, encoding="utf-8") as vocab_fp:
+            for raw_line in vocab_fp:
+                token = raw_line.strip()
+                if token:  # blank lines carry no token
+                    self.vocab[token] = len(self.vocab)
+
+        # Make sure the UNK token has an id, appending it when the file lacks one
+        self.vocab.setdefault("<unk>", len(self.vocab))
+        # Reverse lookup table: id -> token
+        self.ids_to_tokens = {idx: tok for tok, idx in self.vocab.items()}
+
+        super().__init__(unk_token="<unk>", **kwargs)
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        return self.vocab
+
+    def _tokenize(self, text):
+        # Simple whitespace tokenizer for testing
+        return text.split()
+
+    def _convert_token_to_id(self, token):
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+
+class TestTokenizationUtils:
+    """ test tokenization utils """
+    @classmethod
+    def setup_class(cls):
+        """ Create a scratch directory holding a small one-token-per-line vocab file """
+        here = os.path.dirname(os.path.abspath(__file__))
+        cls.test_dir = os.path.join(here, "temp_tokenization_utils_coverage")
+        if os.path.exists(cls.test_dir):
+            shutil.rmtree(cls.test_dir)
+        os.makedirs(cls.test_dir)
+
+        vocab_tokens = ("hello", "world", "this", "is", "a", "test", "tokenizer", "[CLS]", "[SEP]", "[MASK]")
+        cls.vocab_file = os.path.join(cls.test_dir, "vocab.txt")
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_fp:
+            vocab_fp.write("\n".join(vocab_tokens))
+
+    @classmethod
+    def teardown_class(cls):
+        """ Remove the scratch directory created in setup_class, if it still exists """
+        if os.path.exists(cls.test_dir):
+            shutil.rmtree(cls.test_dir)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_trie_logic(self):
+        """Adding tokens to a Trie and splitting text on them"""
+        token_trie = Trie()
+        # Two registered tokens are cut out of the surrounding text
+        token_trie.add("[CLS]")
+        token_trie.add("extra_id_1")
+
+        sentence = "[CLS] This is a extra_id_1 test"
+        first_split = token_trie.split(sentence)
+        assert first_split == ["[CLS]", " This is a ", "extra_id_1", " test"]
+
+        # Adding the same token again must not change the split result
+        token_trie.add("[CLS]")
+        second_split = token_trie.split(sentence)
+        assert first_split == second_split
+
+        # Adding the empty string must not raise
+        token_trie.add("")
+
+        # When one token is a prefix of another, the longer token wins
+        token_trie.add("extra_id_100")
+        long_sentence = "extra_id_100 should match full not extra_id_1"
+        long_split = token_trie.split(long_sentence)
+        assert long_split[0] == "extra_id_100"
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_trie_atom_methods(self):
+        """Call the split_atom_* helpers of Trie directly with hand-built states"""
+        # split_atom_1: a state whose trie node contains "" marks a completed token
+        # Build states in which the path that started at offset 0 has ended
+        active_states = {0: {"": 1}, 5: {}}
+        cut_points = []
+        sample = "sample text"
+        new_cut_points = Trie.split_atom_1(active_states, sample, cut_points)
+        # The start offset (0) and len(text) (11) are expected in the returned offsets
+        assert new_cut_points == [0, 11]
+
+        # split_atom_2: reset all states or drop selected ones
+        # Case 1: reset=True empties the states
+        active_states = {0: {}, 1: {}}
+        remaining = Trie.split_atom_2(reset=True, states=active_states)
+        assert remaining == {}
+
+        # Case 2: reset=False removes only the keys listed in to_remove
+        active_states = {0: "val0", 1: "val1", 2: "val2"}
+        drop_keys = {1}
+        remaining = Trie.split_atom_2(reset=False, to_remove=drop_keys, states=active_states)
+        assert remaining == {0: "val0", 2: "val2"}
+
+        # split_atom_3: target the `if "" in looktrie_pointer:` branch that updates start, end, skip
+        finished_node = {"": 1}
+        active_states = {0: finished_node}
+
+        _, new_start, new_end, new_skip = Trie.split_atom_3(active_states, current=0, text="ab", start=0, skip=0)
+        assert new_start == 0
+        # end is updated to lookahead_index which starts at current(0)
+        assert new_end == 0
+        assert new_skip == 0
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_module_helper_functions(self):
+        """
+        Cover the module-level character/word predicates and the ordered-insert helper.
+        """
+        # _is_whitespace
+        whitespace_chars = (" ", "\t", "\n", "\r", "\u00A0")
+        for blank_char in whitespace_chars:
+            assert _is_whitespace(blank_char)
+        assert not _is_whitespace("a")
+
+        # _is_control
+        # \t, \n, \r are NOT control in this specific function logic (explicitly excluded)
+        layout_chars = ("\t", "\n", "\r")
+        for layout_char in layout_chars:
+            assert not _is_control(layout_char)
+        assert _is_control("\x00")  # Null char is control (Cc)
+        assert not _is_control("a")
+
+        # _is_punctuation
+        # ASCII non-letter/number characters check: 33-47, 58-64, 91-96, 123-126
+        punctuation_marks = ("!", ",", "$")  # "$" is code point 36
+        for mark in punctuation_marks:
+            assert _is_punctuation(mark)
+        assert not _is_punctuation("A")
+
+        # _is_end_of_word: checks last char
+        assert _is_end_of_word("word.")
+        assert not _is_end_of_word("word")
+
+        # _is_start_of_word: checks first char
+        assert _is_start_of_word(".word")
+        assert not _is_start_of_word("word")
+
+        # _insert_one_token_to_ordered_list
+        ordered_tokens = ["a", "c"]
+        _insert_one_token_to_ordered_list(ordered_tokens, "b")
+        assert ordered_tokens == ["a", "b", "c"]
+
+        # Insert existing
+        _insert_one_token_to_ordered_list(ordered_tokens, "b")
+        assert ordered_tokens == ["a", "b", "c"]  # No duplicate
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_tokenizer_instantiation(self):
+        """A freshly built tokenizer exposes the loaded vocab and the UNK token"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        assert tok.vocab_size > 0
+        assert "hello" in tok.get_vocab()
+        assert tok.unk_token == "<unk>"
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_tokenize_basic(self):
+        """tokenize() splits text on whitespace for known and unknown words alike"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        known_text = "hello world"
+        known_tokens = tok.tokenize(known_text)
+        assert known_tokens == ["hello", "world"]
+
+        # A word missing from the vocab is still returned unchanged at the token level
+        mixed_text = "hello unknown_word"
+        mixed_tokens = tok.tokenize(mixed_text)
+        assert mixed_tokens == ["hello", "unknown_word"]
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_convert_tokens_to_ids(self):
+        """convert_tokens_to_ids with a list, a single string and None"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        token_ids = tok.convert_tokens_to_ids(["hello", "world"])
+        assert isinstance(token_ids, list)
+        assert len(token_ids) == 2
+        assert token_ids[0] == tok.vocab["hello"]
+
+        # A single string yields a single id
+        single_id = tok.convert_tokens_to_ids("hello")
+        assert single_id == tok.vocab["hello"]
+
+        # None is passed through
+        assert tok.convert_tokens_to_ids(None) is None
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_convert_ids_to_tokens(self):
+        """convert_ids_to_tokens with a list, a single int and an out-of-range id"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        hello_id = tok.vocab["hello"]
+        as_list = tok.convert_ids_to_tokens([hello_id])
+        assert as_list == ["hello"]
+
+        # A single int yields a single token
+        as_single = tok.convert_ids_to_tokens(hello_id)
+        assert as_single == "hello"
+
+        # An id beyond the vocab size is expected to raise IndexError
+        with pytest.raises(IndexError):
+            tok.convert_ids_to_tokens(99999)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_special_tokens_handling(self):
+        """A special token added at runtime is registered and survives tokenization"""
+        tok = ConcreteTokenizer(self.vocab_file)
+
+        # Register one additional special token
+        special = AddedToken("[SPECIAL]", special=True)
+        tok.add_special_tokens({"additional_special_tokens": [special]})
+
+        assert "[SPECIAL]" in tok.get_added_vocab()
+
+        # The special token must come out of tokenize() as one piece
+        sentence = "hello [SPECIAL] world"
+        pieces = tok.tokenize(sentence)
+        assert "[SPECIAL]" in pieces
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_added_tokens_decoder_setter_validation(self):
+        """The added_tokens_decoder setter accepts {int: AddedToken|str} and rejects the rest"""
+        tok = ConcreteTokenizer(self.vocab_file)
+
+        # int key -> AddedToken value is accepted
+        decoder_with_token = {100: AddedToken("token")}
+        tok.added_tokens_decoder = decoder_with_token
+        assert tok.added_tokens_decoder[100].content == "token"
+
+        decoder_with_str = {101: "token_str"}
+        tok.added_tokens_decoder = decoder_with_str
+        assert tok.added_tokens_decoder[101].content == "token_str"
+
+        # A non-int key is rejected
+        with pytest.raises(ValueError):
+            tok.added_tokens_decoder = {"bad_key": "val"}
+
+        # A value that is neither str nor AddedToken is rejected
+        with pytest.raises(ValueError):
+            tok.added_tokens_decoder = {102: 12345}
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_encode_decode_cycle(self):
+        """Encoding a sentence and decoding the ids gives the sentence back"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        sentence = "hello world"
+
+        # Text -> ids, without any special tokens
+        encoded_ids = tok.encode(sentence, add_special_tokens=False)
+        assert len(encoded_ids) == 2
+
+        # ids -> text
+        round_trip = tok.decode(encoded_ids)
+        assert round_trip.strip() == sentence  # strip because join might add spaces
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_batch_encode_plus(self):
+        """batch_encode_plus returns one id list per input sentence"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        sentences = ["hello world", "this is a test"]
+
+        batch = tok.batch_encode_plus(sentences, padding=False)
+        assert len(batch["input_ids"]) == 2
+        assert len(batch["input_ids"][0]) == 2  # hello world
+        assert len(batch["input_ids"][1]) == 4  # this is a test
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_num_special_tokens_to_add(self):
+        """num_special_tokens_to_add is zero for a tokenizer without special-token logic"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        # ConcreteTokenizer does not override build_inputs_with_special_tokens and sets
+        # no bos/eos tokens, so the default implementation is expected to add nothing.
+        added = tok.num_special_tokens_to_add(pair=False)
+        assert added == 0
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_prepare_for_tokenization(self):
+        """The default prepare_for_tokenization hook returns text and kwargs untouched"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        raw_text = "Raw Text"
+        prepared_text, leftover_kwargs = tok.prepare_for_tokenization(raw_text, custom_arg="val")
+        assert raw_text == prepared_text
+        assert leftover_kwargs == {"custom_arg": "val"}
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_advanced_input_handling(self):
+        """Feed encode_plus the different input types handled by _encode_plus"""
+        tok = ConcreteTokenizer(self.vocab_file)
+        # Only input_ids are of interest, so the optional outputs are switched off
+        ids_only = {"return_attention_mask": False, "return_token_type_ids": False}
+        expected_ids = [tok.vocab["hello"], tok.vocab["world"]]
+
+        # 1. Test list of strings with is_split_into_words=True
+        # Input: ["hello", "world"] -> treated as words, tokenized individually -> ["hello", "world"] -> ids
+        res = tok.encode_plus(["hello", "world"], is_split_into_words=True, **ids_only)
+        assert res['input_ids'] == expected_ids
+
+        # 2. Test list of strings with is_split_into_words=False
+        # Input: ["hello", "world"] -> treated as already tokenized strings
+        res = tok.encode_plus(["hello", "world"], is_split_into_words=False, **ids_only)
+        assert res['input_ids'] == expected_ids
+
+        # 3. Test list of integers (pre-tokenized IDs)
+        res = tok.encode_plus(expected_ids, **ids_only)
+        assert res['input_ids'] == expected_ids
+
+        # 4. Error: is_split_into_words=True but input is invalid (e.g. integer)
+        with pytest.raises(ValueError):
+            tok.encode_plus(123, is_split_into_words=True)
+
+        # 5. Error: Input not valid (e.g. float) and is_split_into_words=False
+        with pytest.raises(ValueError):
+            tok.encode_plus(12.34)
+
+        # 6. Error: return_offsets_mapping=True
+        with pytest.raises(NotImplementedError):
+            tok.encode_plus("hello", return_offsets_mapping=True)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_batch_encode_plus_advanced(self):
+        """Feed batch_encode_plus pre-tokenized, id, pair and invalid batches"""
+        tok = ConcreteTokenizer(self.vocab_file)
+
+        # 1. List of list of strings (batch of pre-tokenized sentences)
+        word_batches = [["hello", "world"], ["test", "tokenizer"]]
+        # is_split_into_words=True means each list item is a sequence of words
+        out = tok.batch_encode_plus(word_batches, is_split_into_words=True, return_attention_mask=False)
+        assert len(out['input_ids']) == 2
+        assert out['input_ids'][0] == [tok.vocab["hello"], tok.vocab["world"]]
+
+        # 2. List of list of ints (batch of IDs)
+        # Note: is_split_into_words=True is required here because otherwise [id1, id2] is interpreted
+        # as a pair (id1, id2), and get_input_ids(id1) fails because single int input is not supported.
+        one_row = [tok.vocab["hello"], tok.vocab["world"]]
+        id_batches = [one_row, one_row]
+        out = tok.batch_encode_plus(id_batches, is_split_into_words=True, return_attention_mask=False)
+        assert out['input_ids'] == id_batches
+
+        # 3. Error: return_offsets_mapping=True
+        with pytest.raises(NotImplementedError):
+            tok.batch_encode_plus(["hello"], return_offsets_mapping=True)
+
+        # 4. Batch with pairs (list of tuples)
+        text_pairs = [("hello", "world"), ("test", "tokenizer")]
+        out = tok.batch_encode_plus(text_pairs, return_attention_mask=False)
+        assert len(out['input_ids']) == 2
+
+        # 5. Test invalid batch input to trigger ValueError
+        # Input list of floats
+        with pytest.raises(ValueError):
+            tok.batch_encode_plus([12.34])