From 70fb19f35d20c4f13f54a76000ee729689d4d70c Mon Sep 17 00:00:00 2001 From: Hsshuai Date: Mon, 24 Nov 2025 16:41:42 +0800 Subject: [PATCH] add test for tokenization --- .../test_tokenizers/test_tokenization_base.py | 1436 ++++++++++++++++- 1 file changed, 1425 insertions(+), 11 deletions(-) diff --git a/tests/st/test_ut/test_tokenizers/test_tokenization_base.py b/tests/st/test_ut/test_tokenizers/test_tokenization_base.py index 014cca58b..0c93291eb 100644 --- a/tests/st/test_ut/test_tokenizers/test_tokenization_base.py +++ b/tests/st/test_ut/test_tokenizers/test_tokenization_base.py @@ -14,17 +14,27 @@ # ============================================================================ """test tokenizer base.""" import os +import json +import shutil +import tempfile import unittest +from contextlib import ExitStack from unittest.mock import patch -import tempfile from collections import OrderedDict -import yaml + +import mindspore as ms +import numpy as np import pytest +import yaml +from tests.st.test_ut.test_tokenizers.get_vocab_model import get_sp_vocab_model from mindformers import AutoTokenizer, LlamaTokenizer from mindformers.mindformer_book import MindFormerBook from mindformers.models.tokenization_utils import PaddingStrategy, TruncationStrategy -from tests.st.test_ut.test_tokenizers.get_vocab_model import get_sp_vocab_model +from mindformers.models.tokenization_utils_base import ( + AddedToken, BatchEncoding, CharSpan, is_experimental_mode, PreTrainedTokenizerBase, SpecialTokensMixin, TensorType, + to_py_obj, TokenSpan +) value = OrderedDict([ ('ChatGLM4Tokenizer', 'GLMProcessor'), @@ -38,8 +48,9 @@ class TestTokenizerBase(unittest.TestCase): @classmethod def setUpClass(cls): - cls.temp_dir = tempfile.TemporaryDirectory() - cls.path = cls.temp_dir.name + cls._exit_stack = ExitStack() + cls.temp_dir = cls._exit_stack.enter_context(tempfile.TemporaryDirectory()) + cls.path = cls.temp_dir cls.string = "An increasing sequence: one, two, three." 
get_sp_vocab_model("llama2_7b", cls.path) cls.tokenizer_model_path = os.path.join(cls.path, "llama2_7b_tokenizer.model") @@ -87,12 +98,12 @@ class TestTokenizerBase(unittest.TestCase): def test_save_origin_pretrained(self): """test save origin pretrained.""" with patch.object(MindFormerBook, "_TOKENIZER_NAME_TO_PROCESSOR", value): - res = self.tokenizer.save_origin_pretrained("notexist") - assert res is None - res = self.tokenizer.save_origin_pretrained(self.path, "") - assert res is None + self.tokenizer.save_origin_pretrained("notexist") + assert os.path.exists(os.path.join("notexist", "mindspore_model.yaml")) + self.tokenizer.save_origin_pretrained(self.path, None) + assert os.path.exists(os.path.join(self.path, "mindspore_model.yaml")) with pytest.raises(ValueError): - assert res == self.tokenizer.save_origin_pretrained(self.path, None, 'txt') + self.tokenizer.save_origin_pretrained(self.path, None, 'txt') @pytest.mark.level1 @pytest.mark.platform_x86_cpu @@ -135,6 +146,1409 @@ class TestTokenizerBase(unittest.TestCase): res = self.tokenizer.truncate_sequences(ids, pair_ids, 1, TruncationStrategy.ONLY_SECOND) assert res == ([1, 2, 3, 4, 5], [1, 2], [3]) + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_encode_decode(self): + """test encode and decode.""" + text = "Hello world" + encoded = self.tokenizer.encode(text) + assert isinstance(encoded, list) + decoded = self.tokenizer.decode(encoded) + assert isinstance(decoded, str) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encode_plus(self): + """test batch encode plus.""" + texts = ["Hello world", "This is a test"] + res = self.tokenizer.batch_encode_plus(texts, padding=True) + assert 'input_ids' in res + assert 'attention_mask' in res + assert len(res['input_ids']) == 2 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_decode(self): + """test batch decode.""" + sequences = [[1, 48, 87, 85], [1, 48, 87]] + res = self.tokenizer.batch_decode(sequences) + assert isinstance(res, list) + assert len(res) == 2 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_pad(self): + """test pad method.""" + encoded_inputs = { + 'input_ids': [[1, 2, 3], [1, 2, 3, 4, 5]], + } + res = self.tokenizer.pad(encoded_inputs, padding=True) + assert 'input_ids' in res + assert 'attention_mask' in res + assert len(res['input_ids'][0]) == len(res['input_ids'][1]) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_special_tokens_properties(self): + """test special tokens properties.""" + assert self.tokenizer.bos_token is not None + assert self.tokenizer.eos_token is not None + assert self.tokenizer.unk_token is not None + assert self.tokenizer.pad_token is not None + assert isinstance(self.tokenizer.bos_token_id, int) + assert isinstance(self.tokenizer.eos_token_id, int) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_special_tokens_map(self): + """test special tokens map.""" + special_tokens = self.tokenizer.special_tokens_map + assert isinstance(special_tokens, dict) + assert 'bos_token' in special_tokens or 'eos_token' in special_tokens + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_all_special_tokens(self): + """test all special tokens.""" + all_tokens = self.tokenizer.all_special_tokens + assert isinstance(all_tokens, list) + assert 
len(all_tokens) > 0 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_all_special_ids(self): + """test all special ids.""" + all_ids = self.tokenizer.all_special_ids + assert isinstance(all_ids, list) + assert len(all_ids) > 0 + assert all(isinstance(x, int) for x in all_ids) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_encode_plus(self): + """test encode plus.""" + text = "Hello world" + res = self.tokenizer.encode_plus(text, padding=True, truncation=True, max_length=10) + assert 'input_ids' in res + assert isinstance(res['input_ids'], list) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_tokenizer_call(self): + """test tokenizer call method.""" + text = "Hello world" + res = self.tokenizer(text, padding=True, truncation=True, max_length=10) + assert 'input_ids' in res + assert isinstance(res['input_ids'], list) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_prepare_for_model(self): + """test prepare for model.""" + ids = [1, 2, 3, 4, 5] + res = self.tokenizer.prepare_for_model(ids, padding=True, max_length=10) + assert 'input_ids' in res + assert isinstance(res['input_ids'], list) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_build_inputs_with_special_tokens(self): + """test build inputs with special tokens.""" + ids = [1, 2, 3] + res = self.tokenizer.build_inputs_with_special_tokens(ids) + assert isinstance(res, list) + assert len(res) >= len(ids) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_create_token_type_ids_from_sequences(self): + """test create token type ids from sequences.""" + ids = [1, 2, 3] + pair_ids = [4, 5] + res = self.tokenizer.create_token_type_ids_from_sequences(ids) + add_length = 1 if self.tokenizer.add_bos_token else 0 + add_length += 1 if self.tokenizer.add_eos_token else 0 + real_ids_length = add_length + len(ids) + real_pair_ids_length = add_length + len(pair_ids) + assert res == [0] * real_ids_length + res = self.tokenizer.create_token_type_ids_from_sequences(ids, pair_ids) + assert res == [0] * real_ids_length + [1] * real_pair_ids_length + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_get_vocab(self): + """test get vocab.""" + vocab = self.tokenizer.get_vocab() + assert isinstance(vocab, dict) + assert len(vocab) > 0 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_vocab_size(self): + """test vocab size.""" + vocab_size = self.tokenizer.vocab_size + assert isinstance(vocab_size, int) + assert vocab_size > 0 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_model_max_length(self): + """test model max length.""" + max_length = self.tokenizer.model_max_length + assert isinstance(max_length, int) + assert max_length > 0 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_padding_side(self): + """test padding side.""" + padding_side = self.tokenizer.padding_side + assert padding_side in ['left', 'right'] + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncation_side(self): + """test truncation side.""" + truncation_side = self.tokenizer.truncation_side + assert truncation_side in ['left', 'right'] + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + 
@pytest.mark.env_onecard + def test_clean_up_tokenization(self): + """test clean up tokenization.""" + text = "This is a test . It works !" + cleaned = self.tokenizer.clean_up_tokenization(text) + assert isinstance(cleaned, str) + assert cleaned == "This is a test. It works!" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_get_special_tokens_mask(self): + """test get special tokens mask.""" + token_ids = [1, 2, 3, 4, 5] + mask = self.tokenizer.get_special_tokens_mask(token_ids, already_has_special_tokens=True) + assert isinstance(mask, list) + assert len(mask) == len(token_ids) + assert all(x in [0, 1] for x in mask) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_convert_tokens_to_string(self): + """test convert tokens to string.""" + tokens = self.tokenizer.tokenize(self.string) + if tokens: + result = self.tokenizer.convert_tokens_to_string(tokens) + assert isinstance(result, str) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_num_special_tokens_to_add(self): + """test num special tokens to add.""" + num_single = self.tokenizer.num_special_tokens_to_add(pair=False) + num_pair = self.tokenizer.num_special_tokens_to_add(pair=True) + assert isinstance(num_single, int) + assert isinstance(num_pair, int) + assert num_pair >= num_single + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_tokenize(self): + """test tokenize method.""" + text = "Hello world" + tokens = self.tokenizer.tokenize(text) + assert isinstance(tokens, list) + assert len(tokens) > 0 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_is_fast(self): + """test is_fast property.""" + is_fast = self.tokenizer.is_fast + assert isinstance(is_fast, bool) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_max_len_single_sentence(self): + """test max len single sentence.""" + max_len = self.tokenizer.max_len_single_sentence + assert isinstance(max_len, int) + assert max_len > 0 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_max_len_sentences_pair(self): + """test max len sentences pair.""" + max_len = self.tokenizer.max_len_sentences_pair + assert isinstance(max_len, int) + assert max_len > 0 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_pad_with_different_strategies(self): + """test pad with different padding strategies.""" + encoded_inputs = { + 'input_ids': [[1, 2, 3], [1, 2, 3, 4, 5]], + } + # Test longest padding + res = self.tokenizer.pad(encoded_inputs, padding='longest') + assert len(res['input_ids'][0]) == len(res['input_ids'][1]) + # Test max_length padding + res = self.tokenizer.pad(encoded_inputs, padding='max_length', max_length=10) + assert len(res['input_ids'][0]) == 10 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncation_with_stride(self): + """test truncation with stride.""" + ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + res = self.tokenizer.truncate_sequences(ids, None, 5, stride=2) + assert len(res[2]) > 0 # overflowing tokens should exist + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encode_with_pairs(self): + """test batch encode with text pairs.""" + texts = ["Hello", "World"] + text_pairs = ["there", "here"] + res = self.tokenizer.batch_encode_plus( + 
list(zip(texts, text_pairs)), + padding=True + ) + assert 'input_ids' in res + assert len(res['input_ids']) == 2 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_prepare_for_model_with_pairs(self): + """test prepare for model with pair ids.""" + ids = [1, 2, 3] + pair_ids = [4, 5, 6] + res = self.tokenizer.prepare_for_model( + ids, + pair_ids=pair_ids, + add_special_tokens=True, + padding='max_length', + max_length=15 + ) + assert 'input_ids' in res + assert len(res['input_ids']) == 15 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_decode_skip_special_tokens(self): + """test decode with skip special tokens.""" + encoded = self.tokenizer.encode(self.string) + decoded_with_special = self.tokenizer.decode(encoded, skip_special_tokens=False) + decoded_without_special = self.tokenizer.decode(encoded, skip_special_tokens=True) + assert isinstance(decoded_with_special, str) + assert isinstance(decoded_without_special, str) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_added_tokens_decoder(self): + """test added tokens decoder.""" + added_tokens = self.tokenizer.added_tokens_decoder + assert isinstance(added_tokens, dict) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_tokenizer_len(self): + """test tokenizer length.""" + length = len(self.tokenizer) + assert isinstance(length, int) + assert length > 0 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_tokenizer_repr(self): + """test tokenizer representation.""" + repr_str = repr(self.tokenizer) + assert isinstance(repr_str, str) + assert 'LlamaTokenizer' in repr_str + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_encode_empty_string(self): + """test encode empty string.""" + res = self.tokenizer.encode("") + assert isinstance(res, list) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_pad_empty_inputs(self): + """test pad with empty inputs.""" + encoded_inputs = {'input_ids': []} + res = self.tokenizer.pad(encoded_inputs, padding=True) + assert 'input_ids' in res + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncation_with_do_not_truncate(self): + """test truncation with do not truncate strategy.""" + ids = [1, 2, 3, 4, 5] + res = self.tokenizer.truncate_sequences(ids, None, 0, TruncationStrategy.DO_NOT_TRUNCATE) + assert res == (ids, None, []) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_prepare_for_model_return_special_tokens_mask(self): + """test prepare for model return special tokens mask.""" + ids = [1, 2, 3] + res = self.tokenizer.prepare_for_model( + ids, + add_special_tokens=True, + return_special_tokens_mask=True + ) + assert 'special_tokens_mask' in res + assert isinstance(res['special_tokens_mask'], list) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_prepare_for_model_return_length(self): + """test prepare for model return length.""" + ids = [1, 2, 3] + res = self.tokenizer.prepare_for_model( + ids, + return_length=True + ) + assert 'length' in res + assert isinstance(res['length'], int) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_special_token_setters(self): + """test special token setters.""" + original_pad = 
self.tokenizer.pad_token + original_eos = self.tokenizer.eos_token + # Test setting and getting + self.tokenizer.pad_token = original_pad + assert self.tokenizer.pad_token == original_pad + self.tokenizer.eos_token = original_eos + assert self.tokenizer.eos_token == original_eos + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_model_input_names(self): + """test model input names.""" + input_names = self.tokenizer.model_input_names + assert isinstance(input_names, list) + assert 'input_ids' in input_names + + @classmethod + def tearDownClass(cls): + """Clean up temporary directory.""" + if hasattr(cls, '_exit_stack'): + cls._exit_stack.close() + + +class TestBatchEncoding(unittest.TestCase): + """Test BatchEncoding class from tokenization_utils_base.""" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encoding_init(self): + """test batch encoding initialization.""" + data = {'input_ids': [1, 2, 3], 'attention_mask': [1, 1, 1]} + encoding = BatchEncoding(data) + assert encoding['input_ids'] == [1, 2, 3] + assert encoding['attention_mask'] == [1, 1, 1] + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encoding_getitem(self): + """test batch encoding getitem.""" + data = {'input_ids': [[1, 2], [3, 4]], 'attention_mask': [[1, 1], [1, 1]]} + encoding = BatchEncoding(data) + # Test string key + assert encoding['input_ids'] == [[1, 2], [3, 4]] + # Test slice + sliced = encoding[0:1] + assert 'input_ids' in sliced + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encoding_keys_values_items(self): + """test batch encoding keys, values, items.""" + data = {'input_ids': [1, 2, 3], 'attention_mask': [1, 1, 1]} + encoding = BatchEncoding(data) + assert 'input_ids' in encoding.keys() + assert [1, 2, 3] in list(encoding.values()) + assert ('input_ids', [1, 2, 3]) in list(encoding.items()) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encoding_convert_to_tensors_ms(self): + """test batch encoding convert to mindspore tensors.""" + data = {'input_ids': [1, 2, 3]} + encoding = BatchEncoding(data, tensor_type=TensorType.MINDSPORE) + assert isinstance(encoding['input_ids'], ms.Tensor) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encoding_convert_to_tensors_np(self): + """test batch encoding convert to numpy arrays.""" + data = {'input_ids': [1, 2, 3]} + encoding = BatchEncoding(data, tensor_type=TensorType.NUMPY) + assert isinstance(encoding['input_ids'], np.ndarray) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encoding_is_fast(self): + """test batch encoding is_fast property.""" + data = {'input_ids': [1, 2, 3]} + encoding = BatchEncoding(data) + assert isinstance(encoding.is_fast, bool) + assert not encoding.is_fast # No encodings provided + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_batch_encoding_n_sequences(self): + """test batch encoding n_sequences property.""" + data = {'input_ids': [1, 2, 3]} + encoding = BatchEncoding(data, n_sequences=1) + assert encoding.n_sequences == 1 + + +class TestUtilityFunctions(unittest.TestCase): + """Test utility functions from tokenization_utils_base.""" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_to_py_obj_dict(self): + 
"""test to_py_obj with dict input.""" + obj = {'a': ms.Tensor([1, 2, 3]), 'b': [4, 5]} + result = to_py_obj(obj) + assert isinstance(result, dict) + assert isinstance(result['a'], list) + assert result['a'] == [1, 2, 3] + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_to_py_obj_list(self): + """test to_py_obj with list input.""" + obj = [ms.Tensor([1, 2]), ms.Tensor([3, 4])] + result = to_py_obj(obj) + assert isinstance(result, list) + assert result == [[1, 2], [3, 4]] + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_to_py_obj_tensor(self): + """test to_py_obj with mindspore tensor.""" + obj = ms.Tensor([1, 2, 3]) + result = to_py_obj(obj) + assert isinstance(result, list) + assert result == [1, 2, 3] + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_to_py_obj_numpy(self): + """test to_py_obj with numpy array.""" + obj = np.array([1, 2, 3]) + result = to_py_obj(obj) + assert isinstance(result, list) + assert result == [1, 2, 3] + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_to_py_obj_primitive(self): + """test to_py_obj with primitive types.""" + assert to_py_obj(5) == 5 + assert to_py_obj("hello") == "hello" + assert to_py_obj([1, 2, 3]) == [1, 2, 3] + + +class TestAddedToken(unittest.TestCase): + """Test AddedToken class from tokenization_utils_base.""" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_added_token_init(self): + """test added token initialization.""" + token = AddedToken("", special=True) + assert token.content == "" + assert token.special is True + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_added_token_str(self): + """test added token string representation.""" + token = AddedToken("") + assert str(token) == "" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_added_token_normalized_default(self): + """test added token normalized default value.""" + token_special = AddedToken("", special=True) + token_normal = AddedToken("hello", special=False) + assert token_special.normalized is False + assert token_normal.normalized is True + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_added_token_options(self): + """test added token with various options.""" + token = AddedToken( + "", + single_word=True, + lstrip=True, + rstrip=True, + special=True, + normalized=False + ) + assert token.single_word is True + assert token.lstrip is True + assert token.rstrip is True + assert token.special is True + assert token.normalized is False + + +class TestNamedTuples(unittest.TestCase): + """Test NamedTuples from tokenization_utils_base.""" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_char_span(self): + """test CharSpan named tuple.""" + span = CharSpan(start=0, end=5) + assert span.start == 0 + assert span.end == 5 + assert isinstance(span, tuple) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_token_span(self): + """test TokenSpan named tuple.""" + span = TokenSpan(start=0, end=3) + assert span.start == 0 + assert span.end == 3 + assert isinstance(span, tuple) + + +class TestEnums(unittest.TestCase): + """Test Enum classes from tokenization_utils_base.""" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + 
def test_padding_strategy_values(self): + """test PaddingStrategy enum values.""" + assert PaddingStrategy.LONGEST.value == "longest" + assert PaddingStrategy.MAX_LENGTH.value == "max_length" + assert PaddingStrategy.DO_NOT_PAD.value == "do_not_pad" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_padding_strategy_from_string(self): + """test PaddingStrategy from string.""" + strategy = PaddingStrategy("longest") + assert strategy == PaddingStrategy.LONGEST + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncation_strategy_values(self): + """test TruncationStrategy enum values.""" + assert TruncationStrategy.ONLY_FIRST.value == "only_first" + assert TruncationStrategy.ONLY_SECOND.value == "only_second" + assert TruncationStrategy.LONGEST_FIRST.value == "longest_first" + assert TruncationStrategy.DO_NOT_TRUNCATE.value == "do_not_truncate" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncation_strategy_from_string(self): + """test TruncationStrategy from string.""" + strategy = TruncationStrategy("only_first") + assert strategy == TruncationStrategy.ONLY_FIRST + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_tensor_type_values(self): + """test TensorType enum values.""" + assert TensorType.NUMPY.value == "np" + assert TensorType.MINDSPORE.value == "ms" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_explicit_enum_error(self): + """test ExplicitEnum raises error for invalid value.""" + with pytest.raises(ValueError) as exc_info: + PaddingStrategy("invalid_value") + assert "invalid_value" in str(exc_info.value) + assert "not a valid" in str(exc_info.value) + + +class TestPaddingTruncationStrategies(unittest.TestCase): + """Test padding and truncation strategy handling in base class.""" + + @classmethod + def setUpClass(cls): + cls._exit_stack = ExitStack() + cls.temp_dir = cls._exit_stack.enter_context(tempfile.TemporaryDirectory()) + cls.path = cls.temp_dir + get_sp_vocab_model("llama2_7b_test", cls.path) + tokenizer_model_path = os.path.join(cls.path, "llama2_7b_test_tokenizer.model") + cls.tokenizer = LlamaTokenizer(tokenizer_model_path) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_get_padding_truncation_strategies_defaults(self): + """test _get_padding_truncation_strategies with default values.""" + padding_strategy, truncation_strategy, max_length, kwargs = \ + self.tokenizer._get_padding_truncation_strategies() + assert padding_strategy == PaddingStrategy.DO_NOT_PAD + assert truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE + assert max_length is None + assert kwargs == {} + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_get_padding_truncation_with_max_length_only(self): + """test strategy inference when only max_length is provided.""" + _, truncation_strategy, max_length, _ = \ + self.tokenizer._get_padding_truncation_strategies( + max_length=128, verbose=False + ) + assert truncation_strategy == TruncationStrategy.LONGEST_FIRST + assert max_length == 128 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_get_padding_max_length_strategy(self): + """test padding max_length strategy.""" + padding_strategy, _, max_length, _ = \ + self.tokenizer._get_padding_truncation_strategies( + padding='max_length', max_length=256 
+ ) + assert padding_strategy == PaddingStrategy.MAX_LENGTH + assert max_length == 256 + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_get_truncation_strategies(self): + """test different truncation strategies.""" + # Test only_first + _, truncation_strategy, _, _ = \ + self.tokenizer._get_padding_truncation_strategies( + truncation='only_first', max_length=128 + ) + assert truncation_strategy == TruncationStrategy.ONLY_FIRST + + # Test only_second + _, truncation_strategy, _, _ = \ + self.tokenizer._get_padding_truncation_strategies( + truncation='only_second', max_length=128 + ) + assert truncation_strategy == TruncationStrategy.ONLY_SECOND + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_pad_to_multiple_of(self): + """test pad_to_multiple_of parameter.""" + padding_strategy, _, _, _ = \ + self.tokenizer._get_padding_truncation_strategies( + padding=True, pad_to_multiple_of=8 + ) + assert padding_strategy == PaddingStrategy.LONGEST + + @classmethod + def tearDownClass(cls): + """Clean up temporary directory.""" + if hasattr(cls, '_exit_stack'): + cls._exit_stack.close() + + +class TestTruncationEdgeCases(unittest.TestCase): + """Test truncation edge cases in base class.""" + + @classmethod + def setUpClass(cls): + cls._exit_stack = ExitStack() + cls.temp_dir = cls._exit_stack.enter_context(tempfile.TemporaryDirectory()) + cls.path = cls.temp_dir + get_sp_vocab_model("llama2_7b_trunc", cls.path) + tokenizer_model_path = os.path.join(cls.path, "llama2_7b_trunc_tokenizer.model") + cls.tokenizer = LlamaTokenizer(tokenizer_model_path) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncate_left_side(self): + """test left side truncation.""" + original_side = self.tokenizer.truncation_side + self.tokenizer.truncation_side = "left" + ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + ids_truncated, _, _ = self.tokenizer.truncate_sequences( + ids, None, 5, TruncationStrategy.ONLY_FIRST, stride=0 + ) + assert len(ids_truncated) == 5 + assert ids_truncated == [6, 7, 8, 9, 10] + self.tokenizer.truncation_side = original_side + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncate_right_side(self): + """test right side truncation.""" + original_side = self.tokenizer.truncation_side + self.tokenizer.truncation_side = "right" + ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + ids_truncated, _, _ = self.tokenizer.truncate_sequences( + ids, None, 5, TruncationStrategy.ONLY_FIRST, stride=0 + ) + assert len(ids_truncated) == 5 + assert ids_truncated == [1, 2, 3, 4, 5] + self.tokenizer.truncation_side = original_side + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncate_with_zero_removal(self): + """test truncation when no tokens need to be removed.""" + ids = [1, 2, 3] + ids_truncated, pair_truncated, overflow = self.tokenizer.truncate_sequences( + ids, None, 0 + ) + assert ids_truncated == ids + assert pair_truncated is None + assert overflow == [] + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_truncate_longest_first_with_pairs(self): + """test longest_first truncation with pair sequences.""" + ids1 = [1, 2, 3, 4, 5] + ids2 = [6, 7, 8] + ids1_trunc, ids2_trunc, _ = self.tokenizer.truncate_sequences( + ids1, ids2, 4, TruncationStrategy.LONGEST_FIRST + ) + # Should remove from longest sequence first + total_len = len(ids1_trunc) + len(ids2_trunc) + 
assert total_len == 4
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up temporary directory."""
+        if hasattr(cls, '_exit_stack'):
+            cls._exit_stack.close()
+
+
+class TestSpecialTokensMixin(unittest.TestCase):
+    """Test SpecialTokensMixin class methods."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.mixin = SpecialTokensMixin()
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_special_tokens_attributes(self):
+        """test SpecialTokensMixin has correct attributes."""
+        assert hasattr(self.mixin, '_bos_token')
+        assert hasattr(self.mixin, '_eos_token')
+        assert hasattr(self.mixin, '_unk_token')
+        assert hasattr(self.mixin, '_sep_token')
+        assert hasattr(self.mixin, '_pad_token')
+        assert hasattr(self.mixin, '_cls_token')
+        assert hasattr(self.mixin, '_mask_token')
+        assert hasattr(self.mixin, '_additional_special_tokens')
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_special_tokens_init_with_kwargs(self):
+        """test SpecialTokensMixin initialization with kwargs."""
+        mixin = SpecialTokensMixin(
+            bos_token="<s>",
+            eos_token="</s>",
+            unk_token="<unk>"
+        )
+        assert mixin.bos_token == "<s>"
+        assert mixin.eos_token == "</s>"
+        assert mixin.unk_token == "<unk>"
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_special_tokens_setter(self):
+        """test SpecialTokensMixin token setters."""
+        self.mixin.bos_token = "<s>"
+        assert self.mixin._bos_token == "<s>"
+        self.mixin.eos_token = "</s>"
+        assert self.mixin._eos_token == "</s>"
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_special_tokens_setter_with_added_token(self):
+        """test SpecialTokensMixin setter with AddedToken."""
+        token = AddedToken("<pad>", special=True)
+        self.mixin.pad_token = token
+        assert str(self.mixin.pad_token) == "<pad>"
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_special_tokens_setter_invalid_type(self):
+        """test SpecialTokensMixin setter with invalid type."""
+        with pytest.raises(ValueError) as exc_info:
+            self.mixin.bos_token = 123
+        assert "non-string" in str(exc_info.value).lower()
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_additional_special_tokens_setter(self):
+        """test additional_special_tokens setter."""
+        tokens = ["<extra_0>", "<extra_1>"]
+        self.mixin.additional_special_tokens = tokens
+        assert self.mixin._additional_special_tokens == tokens
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_special_tokens_map_extended(self):
+        """test special_tokens_map_extended property."""
+        self.mixin.bos_token = "<s>"
+        self.mixin.eos_token = "</s>"
+        special_map = self.mixin.special_tokens_map_extended
+        assert isinstance(special_map, dict)
+        assert special_map.get('bos_token') == "<s>"
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_all_special_tokens_extended(self):
+        """test all_special_tokens_extended property."""
+        self.mixin.bos_token = "<s>"
+        self.mixin.eos_token = "</s>"
+        all_tokens = self.mixin.all_special_tokens_extended
+        assert isinstance(all_tokens, list)
+        # Check no duplicates
+        token_strs = [str(t) for t in all_tokens]
+        assert len(token_strs) == len(set(token_strs))
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_reset_special_tokens_cache(self):
+        """test reset_special_tokens_cache method."""
+        self.mixin._all_special_tokens = ["<s>"]
+        self.mixin._all_special_ids = [1]
+        self.mixin.reset_special_tokens_cache()
+        assert self.mixin._all_special_tokens == []
+        assert self.mixin._all_special_ids == []
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_special_tokens_init_invalid_additional_tokens(self):
+        """test SpecialTokensMixin init with invalid additional_special_tokens."""
+        with pytest.raises(ValueError) as exc_info:
+            SpecialTokensMixin(additional_special_tokens="not_a_list")
+        assert "not a list or tuple" in str(exc_info.value)
+
+
+class TestConvertAddedTokens(unittest.TestCase):
+    """Test convert_added_tokens class method."""
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_convert_added_token_dict_to_object(self):
+        """test converting dict with __type to AddedToken."""
+        token_dict = {
+            "__type": "AddedToken",
+            "content": "<new_token>",
+            "special": True,
+            "normalized": False
+        }
+        result = PreTrainedTokenizerBase.convert_added_tokens(token_dict)
+        assert isinstance(result, AddedToken)
+        assert result.content == "<new_token>"
+        assert result.special is True
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_convert_added_token_object_to_dict(self):
+        """test converting AddedToken to dict."""
+        token = AddedToken("<new_token>", special=True)
+        result = PreTrainedTokenizerBase.convert_added_tokens(token, save=True)
+        assert isinstance(result, dict)
+        assert result.get("content") == "<new_token>"
+        assert result.get("special") is True
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_convert_added_tokens_list(self):
+        """test converting list of added tokens."""
+        tokens = [
+            {"__type": "AddedToken", "content": "<token_0>", "special": True},
+            {"__type": "AddedToken", "content": "<token_1>", "special": True}
+        ]
+        result = PreTrainedTokenizerBase.convert_added_tokens(tokens)
+        assert isinstance(result, list)
+        assert len(result) == 2
+        assert all(isinstance(t, AddedToken) for t in result)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_convert_added_tokens_nested_dict(self):
+        """test converting nested dict with added tokens."""
+        data = {
+            "special_tokens": {
+                "pad": {"__type": "AddedToken", "content": "<pad>", "special": True}
+            }
+        }
+        result = PreTrainedTokenizerBase.convert_added_tokens(data)
+        assert isinstance(result["special_tokens"]["pad"], AddedToken)
+
+
+class TestPadMethod(unittest.TestCase):
+    """Test _pad method from base class."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls._exit_stack = ExitStack()
+        cls.temp_dir = cls._exit_stack.enter_context(tempfile.TemporaryDirectory())
+        cls.path = cls.temp_dir
+        get_sp_vocab_model("llama2_7b_pad", cls.path)
+        tokenizer_model_path = os.path.join(cls.path, "llama2_7b_pad_tokenizer.model")
+        cls.tokenizer = LlamaTokenizer(tokenizer_model_path)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_pad_do_not_pad(self):
+        """test _pad with DO_NOT_PAD strategy."""
+        encoded = {"input_ids": [1, 2, 3]}
+        result = self.tokenizer._pad(
+            encoded,
+            padding_strategy=PaddingStrategy.DO_NOT_PAD
+        )
+        assert result["input_ids"] == [1, 2, 3]
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_pad_right_side(self):
+        """test _pad with right side padding."""
+        original_side = self.tokenizer.padding_side
+        self.tokenizer.padding_side = "right"
+
+        encoded = {"input_ids": [1, 2, 3]}
+        result = self.tokenizer._pad(
+            encoded,
+
max_length=5, + padding_strategy=PaddingStrategy.MAX_LENGTH, + return_attention_mask=True + ) + assert len(result["input_ids"]) == 5 + assert result["input_ids"][-1] == self.tokenizer.pad_token_id + assert result["attention_mask"] == [1, 1, 1, 0, 0] + + self.tokenizer.padding_side = original_side + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_pad_left_side(self): + """test _pad with left side padding.""" + original_side = self.tokenizer.padding_side + self.tokenizer.padding_side = "left" + + encoded = {"input_ids": [1, 2, 3]} + result = self.tokenizer._pad( + encoded, + max_length=5, + padding_strategy=PaddingStrategy.MAX_LENGTH, + return_attention_mask=True + ) + assert len(result["input_ids"]) == 5 + assert result["input_ids"][0] == self.tokenizer.pad_token_id + assert result["attention_mask"] == [0, 0, 1, 1, 1] + + self.tokenizer.padding_side = original_side + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_pad_with_token_type_ids(self): + """test _pad with token_type_ids.""" + encoded = { + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0] + } + result = self.tokenizer._pad( + encoded, + max_length=5, + padding_strategy=PaddingStrategy.MAX_LENGTH + ) + assert len(result["token_type_ids"]) == 5 + assert result["token_type_ids"][-1] == self.tokenizer.pad_token_type_id + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_pad_to_multiple_of(self): + """test _pad with pad_to_multiple_of.""" + encoded = {"input_ids": [1, 2, 3]} + result = self.tokenizer._pad( + encoded, + max_length=5, + padding_strategy=PaddingStrategy.MAX_LENGTH, + pad_to_multiple_of=8 + ) + # Should pad to 8 (next multiple of 8) + assert len(result["input_ids"]) == 8 + + @classmethod + def tearDownClass(cls): + """Clean up temporary directory.""" + if hasattr(cls, '_exit_stack'): + cls._exit_stack.close() + + +class TestCleanUpTokenization(unittest.TestCase): + """Test clean_up_tokenization static method.""" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_clean_up_spaces_before_punctuation(self): + """test cleaning spaces before punctuation.""" + text = "Hello , world ." + result = PreTrainedTokenizerBase.clean_up_tokenization(text) + assert result == "Hello, world." + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_clean_up_contractions(self): + """test cleaning contractions.""" + text = "I do n't know what 's happening" + result = PreTrainedTokenizerBase.clean_up_tokenization(text) + assert "n't" in result + assert "'s" in result + assert " n't" not in result + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_clean_up_multiple_issues(self): + """test cleaning multiple tokenization issues.""" + text = "It 's a test ! Do n't worry ." + result = PreTrainedTokenizerBase.clean_up_tokenization(text) + assert result == "It's a test! Don't worry." 
+
+
+class TestBatchEncodingConversion(unittest.TestCase):
+    """Test BatchEncoding tensor conversion."""
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_convert_already_tensor(self):
+        """test that already-tensor data is not converted again."""
+        data = {'input_ids': ms.Tensor([1, 2, 3])}
+        encoding = BatchEncoding(data, tensor_type=TensorType.MINDSPORE)
+        assert isinstance(encoding['input_ids'], ms.Tensor)
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_convert_none_tensor_type(self):
+        """test with None tensor type."""
+        data = {'input_ids': [1, 2, 3]}
+        encoding = BatchEncoding(data, tensor_type=None)
+        assert isinstance(encoding['input_ids'], list)
+
+
+class TestFromExperimentalPretrained(unittest.TestCase):
+    """Test from_experimental_pretrained method."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls._exit_stack = ExitStack()
+        cls.temp_dir = cls._exit_stack.enter_context(tempfile.TemporaryDirectory())
+        cls.path = cls.temp_dir
+        # Create a simple tokenizer for testing
+        get_sp_vocab_model("llama2_7b_exp", cls.path)
+        cls.tokenizer_model_path = os.path.join(cls.path, "llama2_7b_exp_tokenizer.model")
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_from_experimental_pretrained_with_config_json(self):
+        """test from_experimental_pretrained with tokenizer_config.json."""
+        # Create tokenizer config
+        config = {
+            "tokenizer_class": "LlamaTokenizer",
+            "model_max_length": 2048,
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "unk_token": "<unk>",
+            "pad_token": "<unk>"
+        }
+        config_path = os.path.join(self.path, "tokenizer_config.json")
+        with open(config_path, 'w', encoding="utf-8") as f:
+            json.dump(config, f)
+        # Copy tokenizer model
+        tokenizer_dest = os.path.join(self.path, "tokenizer.model")
+        if not os.path.exists(tokenizer_dest):
+            shutil.copy(self.tokenizer_model_path, tokenizer_dest)
+
+        # Load tokenizer
+        tokenizer = LlamaTokenizer.from_pretrained(self.path)
+        assert tokenizer is not None
+        assert tokenizer.model_max_length == 2048
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_from_experimental_pretrained_with_special_tokens_map(self):
+        """test from_experimental_pretrained with special_tokens_map.json."""
+        # Create special tokens map
+        special_tokens = {
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "unk_token": "<unk>",
+            "pad_token": "<unk>"
+        }
+        special_tokens_path = os.path.join(self.path, "special_tokens_map.json")
+        with open(special_tokens_path, "w", encoding="utf-8") as f:
+            json.dump(special_tokens, f)
+
+        config = {"tokenizer_class": "LlamaTokenizer"}
+        config_path = os.path.join(self.path, "tokenizer_config.json")
+        with open(config_path, "w", encoding="utf-8") as f:
+            json.dump(config, f)
+
+        tokenizer = LlamaTokenizer.from_pretrained(self.path)
+        assert tokenizer is not None
+
+    @pytest.mark.level1
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    def test_from_experimental_pretrained_nonexistent_dir(self):
+        """test from_experimental_pretrained with nonexistent directory."""
+        with pytest.raises(Exception):
+            LlamaTokenizer.from_pretrained("/nonexistent/path/to/tokenizer")
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up temporary directory."""
+        if hasattr(cls, '_exit_stack'):
+            cls._exit_stack.close()
+
+
+class TestSaveExperimentalPretrained(unittest.TestCase):
+    """Test save_experimental_pretrained method."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls._exit_stack = ExitStack()
+
cls.temp_dir = cls._exit_stack.enter_context(tempfile.TemporaryDirectory()) + cls.path = cls.temp_dir + get_sp_vocab_model("llama2_7b_save", cls.path) + tokenizer_model_path = os.path.join(cls.path, "llama2_7b_save_tokenizer.model") + cls.tokenizer = LlamaTokenizer(tokenizer_model_path) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_save_experimental_pretrained_basic(self): + """test save_experimental_pretrained saves required files.""" + save_dir = os.path.join(self.path, "saved_tokenizer") + os.makedirs(save_dir, exist_ok=True) + + self.tokenizer.save_pretrained(save_dir, save_json=True) + + # Check that tokenizer_config.json exists + config_path = os.path.join(save_dir, "tokenizer_config.json") + assert os.path.exists(config_path) + + # Check that special_tokens_map.json exists + special_tokens_path = os.path.join(save_dir, "special_tokens_map.json") + assert os.path.exists(special_tokens_path) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_save_experimental_pretrained_with_prefix(self): + """test save_experimental_pretrained with filename_prefix.""" + save_dir = os.path.join(self.path, "saved_with_prefix") + os.makedirs(save_dir, exist_ok=True) + + self.tokenizer.save_pretrained( + save_dir, + save_json=True, + filename_prefix="my_tokenizer" + ) + + # Check that files with prefix exist + config_path = os.path.join(save_dir, "my_tokenizer-tokenizer_config.json") + assert os.path.exists(config_path) + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_save_and_load_roundtrip(self): + """test save and load round trip.""" + save_dir = os.path.join(self.path, "roundtrip") + os.makedirs(save_dir, exist_ok=True) + + # Save + self.tokenizer.save_pretrained(save_dir, save_json=True) + + # Load + loaded_tokenizer = LlamaTokenizer.from_pretrained(save_dir) + + # Compare + test_text = "Hello world" + original_encoding = self.tokenizer.encode(test_text) + loaded_encoding = loaded_tokenizer.encode(test_text) + assert original_encoding == loaded_encoding + + @classmethod + def tearDownClass(cls): + """Clean up temporary directory.""" + if hasattr(cls, '_exit_stack'): + cls._exit_stack.close() + + +class TestIsExperimentalMode(unittest.TestCase): + """Test is_experimental_mode utility function.""" + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_is_experimental_mode_with_json(self): + """test is_experimental_mode returns True with json config.""" + with tempfile.TemporaryDirectory() as temp_dir: + path = temp_dir + # Create config.json + config_path = os.path.join(path, "config.json") + with open(config_path, "w", encoding="utf-8") as f: + json.dump({"model_type": "llama"}, f) + + result = is_experimental_mode(path) + assert result is True + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_is_experimental_mode_with_yaml(self): + """test is_experimental_mode returns False with yaml config.""" + with tempfile.TemporaryDirectory() as temp_dir: + path = temp_dir + + # Create yaml file + yaml_path = os.path.join(path, "model.yaml") + with open(yaml_path, "w", encoding="utf-8") as f: + yaml.dump({"model_type": "llama"}, f) + + result = is_experimental_mode(path) + assert result is False + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_is_experimental_mode_with_tokenizer_config(self): + """test is_experimental_mode with tokenizer_config.json.""" 
+ + with tempfile.TemporaryDirectory() as temp_dir: + path = temp_dir + # Create tokenizer_config.json + config_path = os.path.join(path, "tokenizer_config.json") + with open(config_path, "w", encoding="utf-8") as f: + json.dump({"tokenizer_class": "LlamaTokenizer"}, f) + + result = is_experimental_mode(path) + assert result is True + + @pytest.mark.level1 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + def test_is_experimental_mode_nonexistent_path(self): + """test is_experimental_mode with nonexistent path.""" + result = is_experimental_mode("/nonexistent/path") + assert result is True # Should return True for non-URL paths + def create_yaml(model_name, dir_path): """create yaml.""" @@ -152,5 +1566,5 @@ def create_yaml(model_name, dir_path): } } file_name = f'{dir_path}/{model_name}.yaml' - with open(file_name, 'w') as file: + with open(file_name, "w", encoding="utf-8") as file: yaml.dump(yaml_content, file, default_flow_style=False) -- Gitee