diff --git a/tests/st/test_ut/test_models/test_auto/test_tokenization_auto.py b/tests/st/test_ut/test_models/test_auto/test_tokenization_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e87533b9ddef7ee233b8f3d59eff43d9b70f404
--- /dev/null
+++ b/tests/st/test_ut/test_models/test_auto/test_tokenization_auto.py
@@ -0,0 +1,376 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test tokenization auto."""
+import os
+import shutil
+import json
+import sys
+from unittest.mock import patch, MagicMock
+import yaml
+import pytest
+import sentencepiece as spm
+from mindformers.models.auto.tokenization_auto import AutoTokenizer, is_experimental_mode
+from mindformers.models.auto.configuration_auto import AutoConfig
+from mindformers.models.auto.tokenization_auto import TOKENIZER_MAPPING
+from mindformers.models.auto.tokenization_auto import tokenizer_class_from_name
+
+# Prepend the parent of this file's directory to sys.path. NOTE(review): runs after the mindformers imports above.
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, PROJECT_ROOT)
+
+
+class TestAutoTokenizer:
+    """Tests for AutoTokenizer mode detection, from_pretrained dispatch, registration and error paths."""
+    @classmethod
+    def setup_class(cls):
+        """Create a scratch directory next to this file and train a small SentencePiece BPE model in it."""
+        cls.test_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "temp_tokenizer_test_coverage")
+        if os.path.exists(cls.test_dir):
+            shutil.rmtree(cls.test_dir)
+        os.makedirs(cls.test_dir)
+
+        cls.vocab_file = os.path.join(cls.test_dir, "tokenizer.model")
+
+        # Create dummy sentencepiece model
+        corpus_path = os.path.join(cls.test_dir, "corpus.txt")
+        with open(corpus_path, "w", encoding="utf-8") as f:
+            f.write("This is a test corpus for sentencepiece training. One Two Three.")
+
+        model_prefix = os.path.join(cls.test_dir, "tokenizer")
+        spm.SentencePieceTrainer.Train(
+            input=corpus_path,
+            model_prefix=model_prefix,
+            vocab_size=100,  # Increased vocab size to avoid "smaller than required_chars" error
+            model_type='bpe',
+            character_coverage=1.0,
+            user_defined_symbols=['']  # NOTE(review): empty symbol; possibly a lost placeholder token - confirm
+        )
+
+    @classmethod
+    def teardown_class(cls):
+        """Delete the scratch directory if it still exists."""
+        if os.path.exists(cls.test_dir):
+            shutil.rmtree(cls.test_dir)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_is_experimental_mode(self):
+        """
+        Coverage for `is_experimental_mode`
+        """
+        # 1. Directory with .yaml -> False (Origin Mode)
+        yaml_dir = os.path.join(self.test_dir, "mode_yaml")
+        os.makedirs(yaml_dir, exist_ok=True)
+        with open(os.path.join(yaml_dir, "model.yaml"), 'w', encoding="utf-8") as f:
+            f.write("key: value")
+        assert not is_experimental_mode(yaml_dir)
+
+        # 2. Directory without .yaml -> True (Experimental/HF Mode)
+        json_dir = os.path.join(self.test_dir, "mode_json")
+        os.makedirs(json_dir, exist_ok=True)
+        # Assuming no yaml here
+        assert is_experimental_mode(json_dir)
+
+        # 3. Supported model name logic
+        # We can't easily modify global TOKENIZER_SUPPORT_LIST safely, but we can test unknown string
+        assert is_experimental_mode("unknown_model_string")
+
+        # 4. Path exists but is a file (and unsupported string) -> True
+        dummy_file = os.path.join(self.test_dir, "dummy_file.txt")
+        with open(dummy_file, 'w', encoding="utf-8") as f:
+            f.write("content")
+        assert is_experimental_mode(dummy_file)
+
+    @patch("mindformers.tools.MindFormerRegister")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_from_pretrained_origin_mode(self, mock_register):
+        """
+        Test `from_pretrained` using Origin Mode (YAML detection).
+        This simulates loading from a directory containing a YAML file.
+        """
+        yaml_dir = os.path.join(self.test_dir, "origin_mode_load")
+        os.makedirs(yaml_dir, exist_ok=True)
+
+        # Prepare environment: valid yaml + vocab file
+        shutil.copy(self.vocab_file, os.path.join(yaml_dir, "tokenizer.model"))
+
+        config_data = {
+            "processor": {
+                "tokenizer": {
+                    "type": "LlamaTokenizer",
+                    "vocab_file": "tokenizer.model"
+                }
+            }
+        }
+        with open(os.path.join(yaml_dir, "mindspore_model.yaml"), 'w', encoding="utf-8") as f:
+            yaml.dump(config_data, f)
+
+        # Setup Mock
+        mock_tokenizer_cls = MagicMock()
+        mock_tokenizer_instance = MagicMock()
+        mock_tokenizer_cls.from_pretrained.return_value = mock_tokenizer_instance
+        # When MindFormerRegister.get_cls is called, return our mock class
+        mock_register.get_cls.return_value = mock_tokenizer_cls
+
+        # Call SUT
+        tokenizer = AutoTokenizer.from_pretrained(yaml_dir)
+
+        # Verifications
+        # 1. Should detect yaml -> origin mode
+        # 2. Origin mode calls MindFormerRegister.get_cls(..., class_name='LlamaTokenizer')
+        mock_register.get_cls.assert_called_with(module_type='tokenizer', class_name='LlamaTokenizer')
+        # 3. Should instantiate and call from_pretrained on the retrieved class
+        mock_tokenizer_cls.from_pretrained.assert_called()
+        assert tokenizer == mock_tokenizer_instance
+
+    @patch("mindformers.models.auto.tokenization_auto.tokenizer_class_from_name")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_from_pretrained_experimental_mode(self, mock_class_from_name):
+        """
+        Test `from_pretrained` using Experimental Mode (JSON/HF style).
+        This simulates loading from a directory with tokenizer_config.json
+        """
+        json_dir = os.path.join(self.test_dir, "exp_mode_load")
+        os.makedirs(json_dir, exist_ok=True)
+
+        shutil.copy(self.vocab_file, os.path.join(json_dir, "tokenizer.model"))
+
+        # Create tokenizer_config.json indicating the class
+        config_data = {
+            "tokenizer_class": "LlamaTokenizer",
+            "vocab_file": "tokenizer.model"
+        }
+        with open(os.path.join(json_dir, "tokenizer_config.json"), 'w', encoding="utf-8") as f:
+            json.dump(config_data, f)
+
+        # Setup Mock
+        mock_tokenizer_cls = MagicMock()
+        mock_tokenizer_instance = MagicMock()
+        mock_tokenizer_cls.from_pretrained.return_value = mock_tokenizer_instance
+        mock_class_from_name.return_value = mock_tokenizer_cls
+
+        # Call SUT
+        tokenizer = AutoTokenizer.from_pretrained(json_dir)
+        mock_class_from_name.assert_called_with("LlamaTokenizerFast")
+        mock_tokenizer_cls.from_pretrained.assert_called_with(json_dir, _from_auto=True, _commit_hash=None)
+        assert tokenizer == mock_tokenizer_instance
+
+    @patch("mindformers.models.auto.tokenization_auto.tokenizer_class_from_name")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_from_pretrained_with_explicit_type(self, mock_class_from_name):
+        """
+        Test `from_pretrained` when `tokenizer_type` arg is explicitly provided.
+        """
+        mock_tokenizer_cls = MagicMock()
+        mock_tokenizer_instance = MagicMock()
+        mock_tokenizer_cls.from_pretrained.return_value = mock_tokenizer_instance
+        mock_class_from_name.return_value = mock_tokenizer_cls
+
+        # 'llama' is a known key in TOKENIZER_MAPPING_NAMES
+        # Origin or Exp mode check happens first, but tokenizer_type argument logic inside AutoTokenizer
+        # usually shortcuts or guides the class selection.
+
+        dummy_path = "dummy_path_not_exist"
+        # This triggers experimental mode because it doesn't exist locally (usually) and not in support list
+
+        tokenizer = AutoTokenizer.from_pretrained(dummy_path, tokenizer_type="llama")
+
+        # Verify "llama" -> "LlamaTokenizerFast" (default preference) mapping usage
+        mock_class_from_name.assert_called_with("LlamaTokenizerFast")
+        assert tokenizer == mock_tokenizer_instance
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_invalid_yaml_name(self):
+        """
+        Test the `invalid_yaml_name` method for filtering/validating model names.
+        """
+        # "invalid_name" should return True (invalid)
+        assert AutoTokenizer.invalid_yaml_name("invalid_name")
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_register_custom_tokenizer(self):
+        """
+        Test `AutoTokenizer.register` API.
+        """
+
+        class MyConfig(AutoConfig):
+            pass
+
+        class MyTokenizer:
+            pass
+
+        # Register
+        AutoTokenizer.register(MyConfig, slow_tokenizer_class=MyTokenizer, exist_ok=True)
+
+        try:  # Check registration; the cleanup below must run even when an assertion fails
+            assert MyConfig in TOKENIZER_MAPPING
+            assert TOKENIZER_MAPPING[MyConfig][0] == MyTokenizer
+        finally:
+            # Cleanup
+            # TOKENIZER_MAPPING is a _LazyAutoMapping object, not a dict.
+            # It stores extra content in _extra_content.
+            # pylint: disable=W0212
+            if MyConfig in TOKENIZER_MAPPING._extra_content:
+                del TOKENIZER_MAPPING._extra_content[MyConfig]
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_tokenizer_class_from_name_helper_real(self):
+        """An unknown class name makes tokenizer_class_from_name return None (no mocking)."""
+        res = tokenizer_class_from_name("UnknownTokenizerClassXYZ")
+        assert res is None
+
+    # Correct the mock structure: {'type': {'name_suffix': ['full_name']}}
+    # glm4_9b -> type: glm4, suffix: 9b.
+    @patch("mindformers.models.auto.tokenization_auto.TOKENIZER_SUPPORT_LIST", {'glm4': {'9b': ['glm4_9b']}})
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_invalid_yaml_name_logic(self):
+        """Check invalid_yaml_name against a patched single-entry TOKENIZER_SUPPORT_LIST."""
+        # Valid cases
+        # With corrected mock, these should return False (not invalid) and NOT raise ValueError
+        assert not AutoTokenizer.invalid_yaml_name("glm4_9b")
+        assert not AutoTokenizer.invalid_yaml_name("mindspore/glm4_9b")
+
+        # Invalid cases
+        # unknown_model is not in support list keys, so returns True immediately (no exception)
+        assert AutoTokenizer.invalid_yaml_name("unknown_model")
+
+        # "glm4_unknown" starts with a known prefix "glm4" but fails specific model check, raising ValueError
+        with pytest.raises(ValueError):
+            AutoTokenizer.invalid_yaml_name("glm4_unknown")
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_origin_mode_download_logic(self):
+        """Test origin mode triggering download paths (mocked)"""
+        # We need to mock MindFormerBook and os.path.exists to simulate download paths
+        # Also mock os.makedirs to prevent attempting to create directories for "downloads"
+        with patch("mindformers.models.auto.tokenization_auto.MindFormerBook") as mock_book, \
+             patch("os.path.exists") as mock_exists, \
+             patch("mindformers.models.auto.tokenization_auto.set_default_yaml_file") as mock_set_yaml, \
+             patch("mindformers.models.auto.tokenization_auto.AutoTokenizer._get_class_name_from_yaml") as \
+                 mock_get_cls, \
+             patch("mindformers.tools.MindFormerRegister") as _:
+            mock_book.get_xihe_checkpoint_download_folder.return_value = "/tmp/xihe"
+            mock_book.get_default_checkpoint_download_folder.return_value = "/tmp/default"
+            mock_exists.return_value = False  # Simulate file needs download logic trigger (though code just mkdirs)
+            mock_get_cls.return_value = ("LlamaTokenizer", MagicMock(processor=MagicMock(tokenizer={})))
+
+            # Mock set_default_yaml_file to avoid actual file ops or checks
+            mock_set_yaml.return_value = None
+
+            # 1. Mindspore prefix
+            AutoTokenizer.get_class_from_origin_mode("mindspore/glm4_9b")
+            # Check that it tried to access the download folder, confirming logic path
+            mock_book.get_xihe_checkpoint_download_folder.assert_called()
+
+            # 2. Default prefix
+            with patch("mindformers.models.auto.tokenization_auto.AutoTokenizer.invalid_yaml_name", return_value=False):
+                AutoTokenizer.get_class_from_origin_mode("glm4_9b")
+                mock_book.get_default_checkpoint_download_folder.assert_called()
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_origin_mode_error_handling(self):
+        """Test error paths in get_class_from_origin_mode"""
+        # 1. Not a string
+        with pytest.raises(TypeError):
+            AutoTokenizer.get_class_from_origin_mode(123)
+
+        # 2. Path is dir but no yaml (class_name is None)
+        with patch("os.path.isdir", return_value=True), \
+             patch("os.path.exists", return_value=True), \
+             patch("mindformers.models.auto.tokenization_auto.AutoTokenizer._get_class_name_from_yaml",
+                   return_value=(None, None)):
+            with pytest.raises(ValueError):
+                AutoTokenizer.get_class_from_origin_mode("/tmp/dummy_dir")
+
+        # 3. Unsupported model
+        with patch("mindformers.models.auto.tokenization_auto.AutoTokenizer.invalid_yaml_name", return_value=True), \
+             patch("os.path.exists", return_value=False), \
+             patch("os.path.isdir", return_value=False), \
+             patch("os.makedirs") as _:
+            with pytest.raises(FileNotFoundError):
+                AutoTokenizer.get_class_from_origin_mode("unsupported_model")
+
+    @patch("mindformers.models.auto.tokenization_auto.AutoConfig")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_experimental_mode_legacy_automap(self, mock_auto_config):
+        """Test experimental mode with legacy auto_map format (list/tuple)"""
+        json_dir = os.path.join(self.test_dir, "legacy_automap")
+        os.makedirs(json_dir, exist_ok=True)
+
+        # 1. Setup tokenizer config with LEGACY auto_map (list)
+        tokenizer_config_data = {
+            "auto_map": ["AutoTokenizer", "LlamaTokenizer"],
+        }
+
+        # 2. Mock AutoConfig to avoid loading real config (which might fail in test env)
+        # and to strictly control attributes.
+        mock_config_instance = MagicMock()
+        mock_config_instance.tokenizer_class = None
+        # Ensure auto_map attribute is missing or None to avoid entering the crashing block (lines 532-533)
+        del mock_config_instance.auto_map
+        mock_config_instance.auto_map = None
+
+        mock_auto_config.from_pretrained.return_value = mock_config_instance
+
+        with open(os.path.join(json_dir, "tokenizer_config.json"), 'w', encoding="utf-8") as f:
+            json.dump(tokenizer_config_data, f)
+
+        with patch("mindformers.models.auto.tokenization_auto.get_tokenizer_config",
+                   return_value=tokenizer_config_data), \
+             patch("mindformers.models.auto.tokenization_auto.resolve_trust_remote_code", return_value=True), \
+             patch("mindformers.models.auto.tokenization_auto.get_class_from_dynamic_module") as mock_get_class:
+            mock_tokenizer = MagicMock()
+            mock_get_class.return_value = mock_tokenizer
+
+            AutoTokenizer.from_pretrained(json_dir, trust_remote_code=True)
+            # Verify it tried to load from dynamic module using legacy format
+            mock_get_class.assert_called()
+
+    @patch("mindformers.models.auto.tokenization_auto.TOKENIZER_MAPPING")
+    @patch("mindformers.models.auto.tokenization_auto.AutoConfig")
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_from_pretrained_value_errors(self, mock_auto_config, mock_mapping):
+        """Test ValueError scenarios in from_pretrained"""
+        # Mock keys() to return a simple list of mocks with __name__ to avoid iteration errors over LazyMapping
+        mock_cls = MagicMock()
+        mock_cls.__name__ = "MockConfig"
+        mock_mapping.keys.return_value = [mock_cls]
+
+        # 1. Tokenizer type not found
+        # ValueError is wrapped in RuntimeError by @experimental_mode_func_checker
+        with pytest.raises(RuntimeError):
+            AutoTokenizer.from_pretrained("dummy", tokenizer_type="InvalidType")
+
+        # 2. Config class unrecognized
+        mock_config_instance = MagicMock()
+        mock_config_instance.tokenizer_class = None
+
+        mock_auto_config.from_pretrained.return_value = mock_config_instance
+
+        # ValueError is wrapped in RuntimeError by @experimental_mode_func_checker
+        with pytest.raises(RuntimeError):
+            with patch("mindformers.models.auto.tokenization_auto.get_tokenizer_config", return_value={}):
+                AutoTokenizer.from_pretrained("dummy_path_val_error")
diff --git a/tests/st/test_ut/test_tokenizers/test_tokenization_utils.py b/tests/st/test_ut/test_tokenizers/test_tokenization_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e963db2d185a2b9879616b32d4bd870a6f6ab7c0
--- /dev/null
+++ b/tests/st/test_ut/test_tokenizers/test_tokenization_utils.py
@@ -0,0 +1,412 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test tokenization utils."""
+import os
+import shutil
+import sys
+import pytest
+from mindformers.models.tokenization_utils import (
+    PreTrainedTokenizer,
+    Trie,
+    AddedToken,
+    _is_whitespace,
+    _is_control,
+    _is_punctuation,
+    _is_end_of_word,
+    _is_start_of_word,
+    _insert_one_token_to_ordered_list
+)
+
+# Prepend the parent of this file's directory to sys.path. NOTE(review): runs after the mindformers import above.
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, PROJECT_ROOT)
+
+
+class ConcreteTokenizer(PreTrainedTokenizer):
+    """Whitespace-splitting PreTrainedTokenizer subclass backed by a one-token-per-line vocab file."""
+    def __init__(self, vocab_file, **kwargs):
+        self.vocab = {}
+        with open(vocab_file, encoding="utf-8") as f:
+            for line in f:
+                if line.strip():
+                    token = line.strip()
+                    self.vocab[token] = len(self.vocab)
+
+        # Add UNK token if not present (NOTE(review): the UNK literal here is an empty string - confirm intended)
+        if "" not in self.vocab:
+            self.vocab[""] = len(self.vocab)
+
+        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
+        super().__init__(unk_token="", **kwargs)
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        return self.vocab
+
+    def _tokenize(self, text):
+        # Simple whitespace tokenizer for testing
+        return text.split()
+
+    def _convert_token_to_id(self, token):
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+
+class TestTokenizationUtils:
+    """Tests for Trie, the module-level character helpers and PreTrainedTokenizer via ConcreteTokenizer."""
+    @classmethod
+    def setup_class(cls):
+        """Create a scratch directory next to this file and write a ten-token vocab.txt into it."""
+        cls.test_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "temp_tokenization_utils_coverage")
+        if os.path.exists(cls.test_dir):
+            shutil.rmtree(cls.test_dir)
+        os.makedirs(cls.test_dir)
+
+        # Generate a simple vocab file
+        cls.vocab_file = os.path.join(cls.test_dir, "vocab.txt")
+        with open(cls.vocab_file, "w", encoding="utf-8") as f:
+            vocab_list = ["hello", "world", "this", "is", "a", "test", "tokenizer", "[CLS]", "[SEP]", "[MASK]"]
+            f.write("\n".join(vocab_list))
+
+    @classmethod
+    def teardown_class(cls):
+        """Delete the scratch directory if it still exists."""
+        if os.path.exists(cls.test_dir):
+            shutil.rmtree(cls.test_dir)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_trie_logic(self):
+        """Test Trie add and split functionality"""
+        trie = Trie()
+        # Test basic add and split
+        trie.add("[CLS]")
+        trie.add("extra_id_1")
+
+        text = "[CLS] This is a extra_id_1 test"
+        split_text = trie.split(text)
+        assert split_text == ["[CLS]", " This is a ", "extra_id_1", " test"]
+
+        # Test duplicate add (idempotent)
+        trie.add("[CLS]")
+        split_text_2 = trie.split(text)
+        assert split_text == split_text_2
+
+        # Test empty string add
+        trie.add("")
+
+        # Test matching longest first
+        trie.add("extra_id_100")
+        text_long = "extra_id_100 should match full not extra_id_1"
+        split_long = trie.split(text_long)
+        assert split_long[0] == "extra_id_100"
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_trie_atom_methods(self):
+        """Test static atom methods of Trie class"""
+        # Test split_atom_1: Checks if current state has reached a token end ("")
+        # Construct a state where one path has ended
+        states = {0: {"": 1}, 5: {}}
+        offsets = []
+        text = "sample text"
+        res_offsets = Trie.split_atom_1(states, text, offsets)
+        # Should append start(0) and len(text) to offsets and break
+        assert res_offsets == [0, 11]
+
+        # Test split_atom_2: Reset or cleanup states
+        # Case 1: Reset = True
+        states = {0: {}, 1: {}}
+        res_states = Trie.split_atom_2(reset=True, states=states)
+        assert res_states == {}
+
+        # Case 2: Reset = False, remove specific keys
+        states = {0: "val0", 1: "val1", 2: "val2"}
+        to_remove = {1}
+        res_states = Trie.split_atom_2(reset=False, to_remove=to_remove, states=states)
+        assert res_states == {0: "val0", 2: "val2"}
+
+        # We want to hit: `if "" in looktrie_pointer:` -> update start, end, skip
+        looktrie_pointer = {"": 1}
+        states = {0: looktrie_pointer}
+
+        res_states, res_start, res_end, res_skip = Trie.split_atom_3(states, current=0, text="ab", start=0, skip=0)
+        assert res_start == 0
+        # end is updated to lookahead_index which starts at current(0)
+        assert res_end == 0
+        assert res_skip == 0
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_module_helper_functions(self):
+        """Test standalone helper functions in tokenization_utils"""
+        # _is_whitespace
+        assert _is_whitespace(" ")
+        assert _is_whitespace("\t")
+        assert _is_whitespace("\n")
+        assert _is_whitespace("\r")
+        assert _is_whitespace("\u00A0")
+        assert not _is_whitespace("a")
+
+        # _is_control
+        # \t, \n, \r are NOT control in this specific function logic (explicitly excluded)
+        assert not _is_control("\t")
+        assert not _is_control("\n")
+        assert not _is_control("\r")
+        assert _is_control("\x00")  # Null char is control (Cc)
+        assert not _is_control("a")
+
+        # _is_punctuation
+        assert _is_punctuation("!")
+        assert _is_punctuation(",")
+        # ASCII non-letter/number characters check: 33-47, 58-64, 91-96, 123-126
+        assert _is_punctuation("$")  # 36
+        assert not _is_punctuation("A")
+
+        # _is_end_of_word: checks last char
+        assert _is_end_of_word("word.")
+        assert not _is_end_of_word("word")
+
+        # _is_start_of_word: checks first char
+        assert _is_start_of_word(".word")
+        assert not _is_start_of_word("word")
+
+        # _insert_one_token_to_ordered_list
+        token_list = ["a", "c"]
+        _insert_one_token_to_ordered_list(token_list, "b")
+        assert token_list == ["a", "b", "c"]
+
+        # Insert existing
+        _insert_one_token_to_ordered_list(token_list, "b")
+        assert token_list == ["a", "b", "c"]  # No duplicate
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_tokenizer_instantiation(self):
+        """Test tokenizer initialization and vocab loading"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+        assert tokenizer.vocab_size > 0
+        assert "hello" in tokenizer.get_vocab()
+        assert tokenizer.unk_token == ""
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_tokenize_basic(self):
+        """Test basic tokenization flow"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+        text = "hello world"
+        tokens = tokenizer.tokenize(text)
+        assert tokens == ["hello", "world"]
+
+        # Test with unknown token
+        text_unk = "hello unknown_word"
+        tokens_unk = tokenizer.tokenize(text_unk)
+        assert tokens_unk == ["hello", "unknown_word"]
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_convert_tokens_to_ids(self):
+        """Test converting tokens to IDs"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+        ids = tokenizer.convert_tokens_to_ids(["hello", "world"])
+        assert isinstance(ids, list)
+        assert len(ids) == 2
+        assert ids[0] == tokenizer.vocab["hello"]
+
+        # Single string input
+        id_single = tokenizer.convert_tokens_to_ids("hello")
+        assert id_single == tokenizer.vocab["hello"]
+
+        # None input
+        assert tokenizer.convert_tokens_to_ids(None) is None
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_convert_ids_to_tokens(self):
+        """Test converting IDs to tokens"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+        target_id = tokenizer.vocab["hello"]
+        tokens = tokenizer.convert_ids_to_tokens([target_id])
+        assert tokens == ["hello"]
+
+        # Single int input
+        token_single = tokenizer.convert_ids_to_tokens(target_id)
+        assert token_single == "hello"
+
+        # Out of vocab size
+        with pytest.raises(IndexError):
+            tokenizer.convert_ids_to_tokens(99999)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_special_tokens_handling(self):
+        """Test handling of added special tokens"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+
+        # Add a new special token
+        new_token = AddedToken("[SPECIAL]", special=True)
+        tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
+
+        assert "[SPECIAL]" in tokenizer.get_added_vocab()
+
+        # Test encoding with special token not splitting
+        text = "hello [SPECIAL] world"
+        tokens = tokenizer.tokenize(text)
+        assert "[SPECIAL]" in tokens
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_added_tokens_decoder_setter_validation(self):
+        """Test validation logic in added_tokens_decoder setter"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+
+        # Valid setter
+        valid_dict = {100: AddedToken("token")}
+        tokenizer.added_tokens_decoder = valid_dict
+        assert tokenizer.added_tokens_decoder[100].content == "token"
+
+        valid_dict_str = {101: "token_str"}
+        tokenizer.added_tokens_decoder = valid_dict_str
+        assert tokenizer.added_tokens_decoder[101].content == "token_str"
+
+        # Invalid Key Type
+        with pytest.raises(ValueError):
+            tokenizer.added_tokens_decoder = {"bad_key": "val"}
+
+        # Invalid Value Type
+        with pytest.raises(ValueError):
+            tokenizer.added_tokens_decoder = {102: 12345}
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_encode_decode_cycle(self):
+        """Test full encode and decode cycle"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+        text = "hello world"
+
+        # Encode
+        input_ids = tokenizer.encode(text, add_special_tokens=False)
+        assert len(input_ids) == 2
+
+        # Decode
+        decoded_text = tokenizer.decode(input_ids)
+        assert decoded_text.strip() == text  # strip because join might add spaces
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_batch_encode_plus(self):
+        """Test batch encoding"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+        batch_text = ["hello world", "this is a test"]
+
+        encoded = tokenizer.batch_encode_plus(batch_text, padding=False)
+        assert len(encoded["input_ids"]) == 2
+        assert len(encoded["input_ids"][0]) == 2  # hello world
+        assert len(encoded["input_ids"][1]) == 4  # this is a test
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_num_special_tokens_to_add(self):
+        """Test calculation of added special tokens"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+        # Default build_inputs_with_special_tokens adds nothing if not overridden,
+        # unless we set bos/eos/etc.
+        count = tokenizer.num_special_tokens_to_add(pair=False)
+        assert count == 0
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_prepare_for_tokenization(self):
+        """Test hook for preparing text"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+        text = "Raw Text"
+        processed_text, kwargs = tokenizer.prepare_for_tokenization(text, custom_arg="val")
+        assert text == processed_text
+        assert kwargs == {"custom_arg": "val"}
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_advanced_input_handling(self):
+        """Test advanced input types for coverage of _encode_plus"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+
+        # 1. Test list of strings with is_split_into_words=True
+        # Input: ["hello", "world"] -> treated as words, tokenized individually -> ["hello", "world"] -> ids
+        res = tokenizer.encode_plus(["hello", "world"], is_split_into_words=True, return_attention_mask=False,
+                                    return_token_type_ids=False)
+        assert res['input_ids'] == [tokenizer.vocab["hello"], tokenizer.vocab["world"]]
+
+        # 2. Test list of strings with is_split_into_words=False
+        # Input: ["hello", "world"] -> treated as already tokenized strings
+        res = tokenizer.encode_plus(["hello", "world"], is_split_into_words=False, return_attention_mask=False,
+                                    return_token_type_ids=False)
+        assert res['input_ids'] == [tokenizer.vocab["hello"], tokenizer.vocab["world"]]
+
+        # 3. Test list of integers (pre-tokenized IDs)
+        ids = [tokenizer.vocab["hello"], tokenizer.vocab["world"]]
+        res = tokenizer.encode_plus(ids, return_attention_mask=False, return_token_type_ids=False)
+        assert res['input_ids'] == ids
+
+        # 4. Error: is_split_into_words=True but input is invalid (e.g. integer)
+        with pytest.raises(ValueError):
+            tokenizer.encode_plus(123, is_split_into_words=True)
+
+        # 5. Error: Input not valid (e.g. float) and is_split_into_words=False
+        with pytest.raises(ValueError):
+            tokenizer.encode_plus(12.34)
+
+        # 6. Error: return_offsets_mapping=True
+        with pytest.raises(NotImplementedError):
+            tokenizer.encode_plus("hello", return_offsets_mapping=True)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    def test_batch_encode_plus_advanced(self):
+        """Test advanced inputs for batch_encode_plus"""
+        tokenizer = ConcreteTokenizer(self.vocab_file)
+
+        # 1. List of list of strings (batch of pre-tokenized sentences)
+        batch_tokens = [["hello", "world"], ["test", "tokenizer"]]
+        # is_split_into_words=True means each list item is a sequence of words
+        res = tokenizer.batch_encode_plus(batch_tokens, is_split_into_words=True, return_attention_mask=False)
+        assert len(res['input_ids']) == 2
+        assert res['input_ids'][0] == [tokenizer.vocab["hello"], tokenizer.vocab["world"]]
+
+        # 2. List of list of ints (batch of IDs)
+        # Note: is_split_into_words=True is required here because otherwise [id1, id2] is interpreted
+        # as a pair (id1, id2), and get_input_ids(id1) fails because single int input is not supported.
+        ids = [tokenizer.vocab["hello"], tokenizer.vocab["world"]]
+        batch_ids = [ids, ids]
+        res = tokenizer.batch_encode_plus(batch_ids, is_split_into_words=True, return_attention_mask=False)
+        assert res['input_ids'] == batch_ids
+
+        # 3. Error: return_offsets_mapping=True
+        with pytest.raises(NotImplementedError):
+            tokenizer.batch_encode_plus(["hello"], return_offsets_mapping=True)
+
+        # 4. Batch with pairs (list of tuples)
+        pairs = [("hello", "world"), ("test", "tokenizer")]
+        res = tokenizer.batch_encode_plus(pairs, return_attention_mask=False)
+        assert len(res['input_ids']) == 2
+
+        # 5. Test invalid batch input to trigger ValueError
+        # Input list of floats
+        with pytest.raises(ValueError):
+            tokenizer.batch_encode_plus([12.34])