From 5da2a5236e353c4b0009be1934aec38ef5eea840 Mon Sep 17 00:00:00 2001 From: Xinrui Chen Date: Mon, 10 Mar 2025 16:40:14 +0800 Subject: [PATCH] [Trainer] Delete TokenClassificationTrainer --- .../tokcls/run_tokcls_bert_base_chinese.yaml | 237 ------------------ .../run_tokcls_bert_base_chinese_cluener.yaml | 237 ------------------ mindformers/__init__.py | 2 - mindformers/dataset/__init__.py | 1 - .../dataset/token_classification_dataset.py | 196 --------------- mindformers/trainer/__init__.py | 1 - mindformers/trainer/build_trainer.py | 6 +- .../trainer/token_classification/__init__.py | 18 -- .../token_classification.py | 196 --------------- 9 files changed, 1 insertion(+), 893 deletions(-) delete mode 100644 configs/tokcls/run_tokcls_bert_base_chinese.yaml delete mode 100644 configs/tokcls/run_tokcls_bert_base_chinese_cluener.yaml delete mode 100644 mindformers/dataset/token_classification_dataset.py delete mode 100644 mindformers/trainer/token_classification/__init__.py delete mode 100644 mindformers/trainer/token_classification/token_classification.py diff --git a/configs/tokcls/run_tokcls_bert_base_chinese.yaml b/configs/tokcls/run_tokcls_bert_base_chinese.yaml deleted file mode 100644 index d098bc2596..0000000000 --- a/configs/tokcls/run_tokcls_bert_base_chinese.yaml +++ /dev/null @@ -1,237 +0,0 @@ -seed: 42 -run_mode: 'train' -output_dir: './output' # path to save checkpoint/strategy -load_checkpoint: '' -src_strategy_path_or_dir: '' -auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model -only_save_strategy: False -resume_training: False - -# context -context: - mode: 0 #0--Graph Mode; 1--Pynative Mode - device_target: "Ascend" - max_call_depth: 10000 - save_graphs: False - device_id: 0 - -# aicc -remote_save_url: "Please input obs url on AICC platform." - -# runner -runner_config: - epochs: 3 - batch_size: 24 - sink_mode: False - sink_size: 2 -runner_wrapper: - type: TrainOneStepCell - - -# parallel -use_parallel: False -parallel: - parallel_mode: 0 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel - gradients_mean: True - enable_alltoall: False - full_batch: False - search_mode: "sharding_propagation" - enable_parallel_optimizer: False - strategy_ckpt_save_file: "./ckpt_strategy.ckpt" -parallel_config: - data_parallel: 1 - model_parallel: 1 - expert_parallel: 1 - pipeline_stage: 1 - micro_batch_num: 1 - gradient_aggregation_group: 4 -micro_batch_interleave_num: 1 - -# moe -moe_config: - expert_num: 1 - capacity_factor: 1.05 - aux_loss_factor: 0.05 - num_experts_chosen: 1 - -# recompute -recompute: False -parallel_optimizer_comm_recompute: False -mp_comm_recompute: True -recompute_slice_activation: False - -# autotune -auto_tune: False -filepath_prefix: './autotune' -autotune_per_step: 10 - -# profile -profile: False -profile_start_step: 1 -profile_stop_step: 10 -init_start_profile: False -profile_communication: False -profile_memory: True - -# Trainer -trainer: - type: TokenClassificationTrainer - model_name: tokcls_bert_base_chinese -# if True, do evaluate during the training process. if false, do nothing. -# note that the task trainer should support _evaluate_in_training function. 
-do_eval: False - -# train dataset -train_dataset: &train_dataset - data_loader: - type: CLUENERDataLoader - dataset_dir: "./cluener/" - stage: "train" - column_names: ["text", "label_id"] - text_transforms: - type: TokenizeWithLabel - max_length: 128 - padding: "max_length" - label_transforms: - type: LabelPadding - max_length: 128 - padding_value: 0 - tokenizer: - type: BertTokenizer - cls_token: '[CLS]' - mask_token: '[MASK]' - pad_token: '[PAD]' - sep_token: '[SEP]' - unk_token: '[UNK]' - is_tokenize_char: True - do_lower_case: False - checkpoint_name_or_path: tokcls_bert_base_chinese - input_columns: ["text", "label_id"] - output_columns: ["input_ids", "token_type_ids", "attention_mask", "label_id"] - column_order: ["input_ids", "token_type_ids", "attention_mask", "label_id"] - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - batch_size: 24 - repeat: 1 - numa_enable: False - prefetch_size: 30 - seed: 2022 -train_dataset_task: - type: TokenClassificationDataset - dataset_config: *train_dataset - -# eval dataset -eval_dataset: &eval_dataset - data_loader: - type: CLUENERDataLoader - dataset_dir: "./cluener/" - stage: "dev" - column_names: ["text", "label_id"] - text_transforms: - type: TokenizeWithLabel - max_length: 128 - padding: "max_length" - label_transforms: - type: LabelPadding - max_length: 128 - padding_value: 0 - tokenizer: - type: BertTokenizer - cls_token: '[CLS]' - mask_token: '[MASK]' - pad_token: '[PAD]' - sep_token: '[SEP]' - unk_token: '[UNK]' - is_tokenize_char: True - do_lower_case: False - checkpoint_name_or_path: tokcls_bert_base_chinese - input_columns: ["text", "label_id"] - output_columns: ["input_ids", "token_type_ids", "attention_mask", "label_id"] - column_order: ["input_ids", "token_type_ids", "attention_mask", "label_id"] - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - batch_size: 24 - repeat: 1 - numa_enable: False - prefetch_size: 30 - seed: 2022 -eval_dataset_task: - type: TokenClassificationDataset - dataset_config: *eval_dataset - -# model -model: - model_config: - type: BertConfig - use_one_hot_embeddings: False - num_labels: 31 - dropout_prob: 0.1 - batch_size: 24 - seq_length: 128 # length of input sentence - vocab_size: 21128 # size of vocab - embedding_size: 768 # size of text feature - num_layers: 12 # model depth - num_heads: 12 # number of attention heads - expand_ratio: 4 - hidden_act: "gelu" # activation - post_layernorm_residual: True # select postlayernorm or prelayernorm - hidden_dropout_prob: 0.1 - attention_probs_dropout_prob: 0.1 - max_position_embeddings: 512 - type_vocab_size: 2 - initializer_range: 0.02 - use_relative_positions: False - use_past: False - compute_dtype: "float32" - checkpoint_name_or_path: "tokcls_bert_base_chinese" - arch: - type: BertForTokenClassification - -# lr schedule -lr_schedule: - type: linear - learning_rate: 0.00003 # 3e-5 - warmup_ratio: 0.1 - total_steps: -1 # -1 means it will load the total steps of the dataset -layer_scale: False -layer_decay: 0.65 - -# optimizer -optimizer: - type: adamw - weight_decay: 0.01 - eps: 0.00000001 # 1e-8 -lr_scale: False -lr_scale_factor: 256 - -# callbacks -callbacks: - - type: MFLossMonitor - - type: CheckpointMonitor - prefix: "mindformers" - save_checkpoint_steps: 100 - integrated_save: True - async_save: False -eval_callbacks: - - type: ObsMonitor - -# metric -metric: - type: EntityScore - -# processor -processor: - type: BertProcessor - return_tensors: ms - tokenizer: - type: BertTokenizer - cls_token: 
'[CLS]' - mask_token: '[MASK]' - pad_token: '[PAD]' - sep_token: '[SEP]' - unk_token: '[UNK]' - is_tokenize_char: True - do_lower_case: False - checkpoint_name_or_path: tokcls_bert_base_chinese diff --git a/configs/tokcls/run_tokcls_bert_base_chinese_cluener.yaml b/configs/tokcls/run_tokcls_bert_base_chinese_cluener.yaml deleted file mode 100644 index 90e3723638..0000000000 --- a/configs/tokcls/run_tokcls_bert_base_chinese_cluener.yaml +++ /dev/null @@ -1,237 +0,0 @@ -seed: 42 -run_mode: 'train' -output_dir: './output' # path to save checkpoint/strategy -load_checkpoint: '' -src_strategy_path_or_dir: '' -auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model -only_save_strategy: False -resume_training: False - -# context -context: - mode: 0 #0--Graph Mode; 1--Pynative Mode - device_target: "Ascend" - max_call_depth: 10000 - save_graphs: False - device_id: 0 - -# aicc -remote_save_url: "Please input obs url on AICC platform." - -# runner -runner_config: - epochs: 3 - batch_size: 24 - sink_mode: False - sink_size: 2 -runner_wrapper: - type: TrainOneStepCell - - -# parallel -use_parallel: False -parallel: - parallel_mode: 0 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel - gradients_mean: True - enable_alltoall: False - full_batch: False - search_mode: "sharding_propagation" - enable_parallel_optimizer: False - strategy_ckpt_save_file: "./ckpt_strategy.ckpt" -parallel_config: - data_parallel: 1 - model_parallel: 1 - expert_parallel: 1 - pipeline_stage: 1 - micro_batch_num: 1 - gradient_aggregation_group: 4 -micro_batch_interleave_num: 1 - -# moe -moe_config: - expert_num: 1 - capacity_factor: 1.05 - aux_loss_factor: 0.05 - num_experts_chosen: 1 - -# recompute -recompute: False -parallel_optimizer_comm_recompute: False -mp_comm_recompute: True -recompute_slice_activation: False - -# autotune -auto_tune: False -filepath_prefix: './autotune' -autotune_per_step: 10 - -# profile -profile: False -profile_start_step: 1 -profile_stop_step: 10 -init_start_profile: False -profile_communication: False -profile_memory: True - -# Trainer -trainer: - type: TokenClassificationTrainer - model_name: tokcls_bert_base_chinese_cluener -# if True, do evaluate during the training process. if false, do nothing. -# note that the task trainer should support _evaluate_in_training function. 
-do_eval: False - -# train dataset -train_dataset: &train_dataset - data_loader: - type: CLUENERDataLoader - dataset_dir: "./cluener/" - stage: "train" - column_names: ["text", "label_id"] - text_transforms: - type: TokenizeWithLabel - max_length: 128 - padding: "max_length" - label_transforms: - type: LabelPadding - max_length: 128 - padding_value: 0 - tokenizer: - type: BertTokenizer - cls_token: '[CLS]' - mask_token: '[MASK]' - pad_token: '[PAD]' - sep_token: '[SEP]' - unk_token: '[UNK]' - is_tokenize_char: True - do_lower_case: False - checkpoint_name_or_path: tokcls_bert_base_chinese - input_columns: ["text", "label_id"] - output_columns: ["input_ids", "token_type_ids", "attention_mask", "label_id"] - column_order: ["input_ids", "token_type_ids", "attention_mask", "label_id"] - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - batch_size: 24 - repeat: 1 - numa_enable: False - prefetch_size: 30 - seed: 2022 -train_dataset_task: - type: TokenClassificationDataset - dataset_config: *train_dataset - -# eval dataset -eval_dataset: &eval_dataset - data_loader: - type: CLUENERDataLoader - dataset_dir: "./cluener/" - stage: "dev" - column_names: ["text", "label_id"] - text_transforms: - type: TokenizeWithLabel - max_length: 128 - padding: "max_length" - label_transforms: - type: LabelPadding - max_length: 128 - padding_value: 0 - tokenizer: - type: BertTokenizer - cls_token: '[CLS]' - mask_token: '[MASK]' - pad_token: '[PAD]' - sep_token: '[SEP]' - unk_token: '[UNK]' - is_tokenize_char: True - do_lower_case: False - checkpoint_name_or_path: tokcls_bert_base_chinese - input_columns: ["text", "label_id"] - output_columns: ["input_ids", "token_type_ids", "attention_mask", "label_id"] - column_order: ["input_ids", "token_type_ids", "attention_mask", "label_id"] - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - batch_size: 24 - repeat: 1 - numa_enable: False - prefetch_size: 30 - seed: 2022 -eval_dataset_task: - type: TokenClassificationDataset - dataset_config: *eval_dataset - -# model -model: - model_config: - type: BertConfig - use_one_hot_embeddings: False - num_labels: 31 - dropout_prob: 0.1 - batch_size: 24 - seq_length: 128 # length of input sentence - vocab_size: 21128 # size of vocab - embedding_size: 768 # size of text feature - num_layers: 12 # model depth - num_heads: 12 # number of attention heads - expand_ratio: 4 - hidden_act: "gelu" # activation - post_layernorm_residual: True # select postlayernorm or prelayernorm - hidden_dropout_prob: 0.1 - attention_probs_dropout_prob: 0.1 - max_position_embeddings: 512 - type_vocab_size: 2 - initializer_range: 0.02 - use_relative_positions: False - use_past: False - compute_dtype: "float32" - checkpoint_name_or_path: "tokcls_bert_base_chinese_cluener" - arch: - type: BertForTokenClassification - -# lr schedule -lr_schedule: - type: linear - learning_rate: 0.00003 # 3e-5 - warmup_ratio: 0.1 - total_steps: -1 # -1 means it will load the total steps of the dataset -layer_scale: False -layer_decay: 0.65 - -# optimizer -optimizer: - type: adamw - weight_decay: 0.01 - eps: 0.00000001 # 1e-8 -lr_scale: False -lr_scale_factor: 256 - -# callbacks -callbacks: - - type: MFLossMonitor - - type: CheckpointMonitor - prefix: "mindformers" - save_checkpoint_steps: 100 - integrated_save: True - async_save: False -eval_callbacks: - - type: ObsMonitor - -# metric -metric: - type: EntityScore - -# processor -processor: - type: BertProcessor - return_tensors: ms - tokenizer: - type: BertTokenizer - 
cls_token: '[CLS]' - mask_token: '[MASK]' - pad_token: '[PAD]' - sep_token: '[SEP]' - unk_token: '[UNK]' - is_tokenize_char: True - do_lower_case: False - checkpoint_name_or_path: tokcls_bert_base_chinese diff --git a/mindformers/__init__.py b/mindformers/__init__.py index f07514dbce..2816e1669e 100644 --- a/mindformers/__init__.py +++ b/mindformers/__init__.py @@ -66,7 +66,6 @@ from mindformers.trainer import ( QuestionAnsweringTrainer, RunnerConfig, TextClassificationTrainer, - TokenClassificationTrainer, Trainer, TrainingArguments, TranslationTrainer, @@ -156,7 +155,6 @@ from mindformers.dataset import ( SQuADDataLoader, SimMask, TextClassificationDataset, - TokenClassificationDataset, TokenizeWithLabel, TokenizerForward, ToolAlpacaDataLoader, diff --git a/mindformers/dataset/__init__.py b/mindformers/dataset/__init__.py index 21612f5ac9..555e76e001 100644 --- a/mindformers/dataset/__init__.py +++ b/mindformers/dataset/__init__.py @@ -74,7 +74,6 @@ from .mask_language_model_dataset import MaskLanguageModelDataset from .mim_dataset import MIMDataset from .question_answering_dataset import QuestionAnsweringDataset from .text_classification_dataset import TextClassificationDataset -from .token_classification_dataset import TokenClassificationDataset from .translation_dataset import TranslationDataset from .zero_shot_image_classification_dataset import ZeroShotImageClassificationDataset from .multi_turn_dataset import MultiTurnDataset diff --git a/mindformers/dataset/token_classification_dataset.py b/mindformers/dataset/token_classification_dataset.py deleted file mode 100644 index 6c98aee025..0000000000 --- a/mindformers/dataset/token_classification_dataset.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Token classification Dataset.""" -from typing import Optional, Union, Callable - -from mindformers.tools.register import MindFormerRegister, MindFormerModuleType -from mindformers.tools.logger import logger -from mindformers.version_control import get_dataset_map -from mindformers.utils import deprecated - -from .dataloader import build_dataset_loader -from ..models.build_tokenizer import build_tokenizer -from .transforms import build_transforms -from .sampler import build_sampler -from .base_dataset import BaseDataset - - -@deprecated(version="1.5.0") -@MindFormerRegister.register(MindFormerModuleType.DATASET) -class TokenClassificationDataset(BaseDataset): - """ - Token classification Dataset. - - Args: - dataset_config (Optional[dict]): - Config for dataset. - data_loader (Union[dict, Callable]): - Config for data loader or a data loader object. - tokenizer (Union[dict, list]): - Tokenizer configuration or object. - text_transforms (Union[dict, list]): - Configurations or objects of one or more transformers of text. - label_transforms (Union[dict, list]): - Configurations or objects of one or more transformers of label. 
- sampler (Union[dict, list]): - Sampler configuration or object. - input_columns (list): - Column name before the map function. - output_columns (list): - Column name after the map function. - batch_size (int): - Size of each batch. Default: 8. - drop_remainder (bool): - Whether to discard the last batch when the number of data items contained - in the last batch is smaller than batch_size. Default: True. - num_parallel_workers (int): - Specifies the number of concurrent processes or threads for map operations - to accelerate processing. Default: 8. - python_multiprocessing (bool): - Enabling the Python Multi-Process Mode to Accelerate Map Operations. Default: False. - repeat (int): - Number of times this dataset is repeated. Default: 1. - seed (int): - Random seed number. Default: 0. - prefetch_size (int): - Buffer queue size of each data processing operation in the pipeline. Default: 1. - numa_enable (bool): - Indicates whether to use the NUMA binding function. Default: False. - auto_tune (bool): - Indicates whether to enable automatic optimization of data processing parameters. Default: False. - autotune_per_step (int): - Specifies the interval for adjusting the configuration step of automatic data acceleration. Default: 10. - filepath_prefix (str): - Path for saving optimized parameter configurations. Default: './autotune'. - profile (bool): - Whether to enable data collection. Default: False. - - Returns: - A dataset for TokenClassificationDataset. - - Examples: - >>> # 1) Create an instance using a MindFormerConfig. - >>> from mindformers.tools.register import MindFormerConfig - >>> from mindformers import MindFormerBook - >>> from mindformers.dataset import TokenClassificationDataset - >>> from mindformers.dataset import check_dataset_config - >>> config_dict_list = MindFormerBook.get_trainer_support_task_list() - >>> config_path = config_dict_list['token_classification']['tokcls_bert_base_chinese'] - >>> # Initialize a MindFormerConfig instance with a specific config file of yaml. - >>> config = MindFormerConfig(config_path) - >>> config.train_dataset.data_loader.dataset_dir = "The required task dataset path" - >>> # Note: - >>> # The detailed data setting could refer to - >>> # https://gitee.com/mindspore/mindformers/blob/dev/docs/task_cards/token_classification.md - >>> check_dataset_config(config) - >>> # use class to build dataset - >>> dataset_from_class = TokenClassificationDataset(config.train_dataset_task.dataset_config) - >>> - >>> # 2) Creating an instance using other parameters. - >>> from mindformers import AutoTokenizer - >>> from mindformers.dataset import TokenClassificationDataset, CLUENERDataLoader - >>> from mindformers.dataset import TokenizeWithLabel, LabelPadding - >>> tokenizer = AutoTokenizer.from_pretrained('tokcls_bert_base_chinese_cluener') - >>> data_loader = CLUENERDataLoader(dataset_dir="The required task dataset path", - ... stage='train', column_names=['text', 'label_id']) - >>> text_transforms = TokenizeWithLabel(max_length=128, padding='max_length', tokenizer=tokenizer) - >>> label_transforms = LabelPadding(max_length=128, padding_value=0) - >>> dataset_from_param = TokenClassificationDataset(data_loader=data_loader, text_transforms=text_transforms, - ... label_transforms=label_transforms, tokenizer=tokenizer, - ... input_columns=['text', 'label_id'], - ... output_columns=['input_ids', 'token_type_ids', - ... 
'attention_mask', 'label_id']) - """ - - # pylint: disable=W0613 - def __new__(cls, - dataset_config: Optional[dict] = None, - data_loader: Union[dict, Callable] = None, - tokenizer: Union[dict, Callable] = None, - text_transforms: Union[dict, list] = None, - label_transforms: Union[dict, list] = None, - sampler: Union[dict, Callable] = None, - input_columns: list = None, - output_columns: list = None, - batch_size: int = 8, - drop_remainder: bool = True, - num_parallel_workers: int = 8, - python_multiprocessing: bool = False, - repeat: int = 1, - seed: int = 0, - prefetch_size: int = 1, - numa_enable: bool = False, - auto_tune: bool = False, - filepath_prefix: str = './autotune', - autotune_per_step: int = 10, - profile: bool = False, - **kwargs): - """new method""" - logger.info("Now Create Token classification Dataset.") - dataset_config = cls.check_dataset_config(dataset_config, locals()) - cls.init_dataset_config(dataset_config) - rank_id, device_num = cls._generate_shard_info() - - if isinstance(dataset_config.data_loader, dict): - dataset = build_dataset_loader(dataset_config.data_loader, - default_args={'num_shards': device_num, 'shard_id': rank_id}) - else: - dataset = dataset_config.data_loader - - if isinstance(dataset_config.tokenizer, dict): - tokenizer = build_tokenizer(dataset_config.tokenizer) - else: - tokenizer = dataset_config.tokenizer - - if (isinstance(dataset_config.text_transforms, list) and isinstance(dataset_config.text_transforms[0], dict)) \ - or isinstance(dataset_config.text_transforms, dict): - text_transforms = build_transforms(dataset_config.text_transforms, default_args={"tokenizer": tokenizer}) - else: - text_transforms = dataset_config.text_transforms - - if (isinstance(dataset_config.label_transforms, list) and isinstance(dataset_config.label_transforms[0], dict))\ - or isinstance(dataset_config.label_transforms, dict): - label_transforms = build_transforms(dataset_config.label_transforms) - else: - label_transforms = dataset_config.label_transforms - - if isinstance(dataset_config.sampler, dict): - sampler = build_sampler(dataset_config.sampler) - else: - sampler = dataset_config.sampler - - if sampler is not None: - dataset = dataset.use_sampler(sampler) - - if text_transforms is not None: - dataset = get_dataset_map(dataset, - input_columns=dataset_config.input_columns, - operations=text_transforms, - output_columns=dataset_config.output_columns, - num_parallel_workers=dataset_config.num_parallel_workers, - python_multiprocessing=dataset_config.python_multiprocessing) - - if label_transforms is not None: - dataset = get_dataset_map(dataset, - input_columns=dataset_config.input_columns[1], - operations=label_transforms, - num_parallel_workers=dataset_config.num_parallel_workers, - python_multiprocessing=dataset_config.python_multiprocessing) - - dataset = dataset.batch(dataset_config.batch_size, - drop_remainder=dataset_config.drop_remainder, - num_parallel_workers=dataset_config.num_parallel_workers) - dataset = dataset.repeat(dataset_config.repeat) - return dataset diff --git a/mindformers/trainer/__init__.py b/mindformers/trainer/__init__.py index dc96704c5e..99f04992d1 100644 --- a/mindformers/trainer/__init__.py +++ b/mindformers/trainer/__init__.py @@ -39,7 +39,6 @@ from .image_to_text_generation import ImageToTextGenerationTrainer from .multi_modal_to_text_generation import MultiModalToTextGenerationTrainer from .translation import TranslationTrainer from .text_classfication import TextClassificationTrainer -from .token_classification import 
TokenClassificationTrainer from .question_answering import QuestionAnsweringTrainer from .causal_language_modeling import CausalLanguageModelingTrainer from .trainer import Trainer diff --git a/mindformers/trainer/build_trainer.py b/mindformers/trainer/build_trainer.py index c93da0b040..ebedcd58cf 100644 --- a/mindformers/trainer/build_trainer.py +++ b/mindformers/trainer/build_trainer.py @@ -16,7 +16,7 @@ from mindformers.tools.register import MindFormerRegister, MindFormerModuleType, MindFormerConfig from . import ImageClassificationTrainer, ZeroShotImageClassificationTrainer, \ MaskedImageModelingTrainer, MaskedLanguageModelingTrainer, \ - TranslationTrainer, TokenClassificationTrainer, TextClassificationTrainer, \ + TranslationTrainer, TextClassificationTrainer, \ ContrastiveLanguageImagePretrainTrainer, QuestionAnsweringTrainer, GeneralTaskTrainer @@ -72,10 +72,6 @@ def register_mf_trainer(): MaskedLanguageModelingTrainer, module_type=MindFormerModuleType.TRAINER, alias="fill_mask") - MindFormerRegister.register_cls( - TokenClassificationTrainer, - module_type=MindFormerModuleType.TRAINER, alias="token_classification") - MindFormerRegister.register_cls( TextClassificationTrainer, module_type=MindFormerModuleType.TRAINER, alias="text_classification") diff --git a/mindformers/trainer/token_classification/__init__.py b/mindformers/trainer/token_classification/__init__.py deleted file mode 100644 index 34bd937dc4..0000000000 --- a/mindformers/trainer/token_classification/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Token Classification Trainer.""" -from .token_classification import TokenClassificationTrainer - -__all__ = [] diff --git a/mindformers/trainer/token_classification/token_classification.py b/mindformers/trainer/token_classification/token_classification.py deleted file mode 100644 index 549b8d7c91..0000000000 --- a/mindformers/trainer/token_classification/token_classification.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================
-"""Token Classification Trainer."""
-from typing import Optional, List, Union
-
-from mindspore.train import Callback
-from mindspore.dataset import GeneratorDataset
-from mindspore.nn import TrainOneStepCell, Optimizer, Cell
-
-from mindformers.dataset import BaseDataset
-from mindformers.models import PreTrainedModel, PreTrainedTokenizerBase
-from mindformers.tools.logger import logger
-from mindformers.tools.register import MindFormerRegister, \
-    MindFormerModuleType, MindFormerConfig
-from mindformers.utils import deprecated
-from ..base_trainer import BaseTrainer
-from ..config_args import ConfigArguments
-from ..training_args import TrainingArguments
-from ...dataset.labels import cluener_labels
-
-
-@deprecated(version="1.5.0")
-@MindFormerRegister.register(MindFormerModuleType.TRAINER)
-class TokenClassificationTrainer(BaseTrainer):
-    """
-    Trainer of the token classification task. It provides training, evaluation and prediction interfaces
-    for the token classification task, allowing users to quickly start the process according to the model
-    name, and also provides a large number of customizable items to meet user needs.
-
-    Args:
-        model_name (str): The model name of the token classification task trainer. Default: None.
-
-    Raises:
-        NotImplementedError: If the train, evaluate or predict method is not implemented.
-    """
-
-    def __init__(self, model_name: str = None):
-        super().__init__("token_classification", model_name)
-
-    def train(self,
-              config: Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]] = None,
-              network: Optional[Union[Cell, PreTrainedModel]] = None,
-              dataset: Optional[Union[BaseDataset, GeneratorDataset]] = None,
-              wrapper: Optional[TrainOneStepCell] = None,
-              optimizer: Optional[Optimizer] = None,
-              callbacks: Optional[Union[Callback, List[Callback]]] = None,
-              **kwargs):
-        """
-        The training API of the token classification task. It allows users to quickly start training or
-        fine-tuning based on initialization conditions or by passing in custom configurations. The
-        configurable items include the network, optimizer, dataset, wrapper, and callbacks.
-
-        Args:
-            config (Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]]):
-                The task config, which is used to configure the dataset, hyper-parameters, optimizer, etc.
-                It supports a config dict or a MindFormerConfig, TrainingArguments or ConfigArguments instance.
-                Default: None.
-            network (Optional[Union[Cell, PreTrainedModel]]): The network for the trainer.
-                It supports a model name, a PreTrainedModel instance or a MindSpore Cell.
-                Default: None.
-            dataset (Optional[Union[BaseDataset, GeneratorDataset]]): The training dataset.
-                It supports a real dataset path, a BaseDataset instance or a MindSpore Dataset.
-                Default: None.
-            wrapper (Optional[TrainOneStepCell]): Wraps the `network` with the `optimizer`.
-                It supports MindSpore's TrainOneStepCell class.
-                Default: None.
-            optimizer (Optional[Optimizer]): The training network's optimizer.
-                It supports MindSpore's Optimizer class.
-                Default: None.
-            callbacks (Optional[Union[Callback, List[Callback]]]): The training callback function.
-                It supports a MindSpore Callback or a list of Callback instances.
-                Default: None.
-
-        Returns:
-            None
-        """
-        self.training_process(
-            config=config,
-            network=network,
-            callbacks=callbacks,
-            dataset=dataset,
-            wrapper=wrapper,
-            optimizer=optimizer,
-            **kwargs)
-
-    def evaluate(self,
-                 config: Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]] = None,
-                 network: Optional[Union[Cell, PreTrainedModel]] = None,
-                 dataset: Optional[Union[BaseDataset, GeneratorDataset]] = None,
-                 callbacks: Optional[Union[Callback, List[Callback]]] = None,
-                 compute_metrics: Optional[Union[dict, set]] = None,
-                 **kwargs):
-        """
-        The evaluation API of the token classification task. It allows users to quickly start evaluation
-        based on initialization conditions or by passing in custom configurations. The configurable items
-        include the network, dataset, callbacks and compute_metrics.
-
-        Args:
-            config (Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]]):
-                The task config, which is used to configure the dataset, hyper-parameters, optimizer, etc.
-                It supports a config dict or a MindFormerConfig, TrainingArguments or ConfigArguments instance.
-                Default: None.
-            network (Optional[Union[Cell, PreTrainedModel]]): The network for the trainer.
-                It supports a model name, a PreTrainedModel instance or a MindSpore Cell.
-                Default: None.
-            dataset (Optional[Union[BaseDataset, GeneratorDataset]]): The evaluation dataset.
-                It supports a real dataset path, a BaseDataset instance or a MindSpore Dataset.
-                Default: None.
-            callbacks (Optional[Union[Callback, List[Callback]]]): The evaluation callback function.
-                It supports a MindSpore Callback or a list of Callback instances.
-                Default: None.
-            compute_metrics (Optional[Union[dict, set]]): The evaluation metric.
-                It supports a dict or set of MindSpore Metric instances.
-                Default: None.
-
-        Returns:
-            None
-        """
-        metric_name = "Entity Metric"
-        kwargs.setdefault("metric_name", metric_name)
-        super().evaluate_process(
-            config=config,
-            network=network,
-            dataset=dataset,
-            compute_metrics=compute_metrics,
-            callbacks=callbacks,
-            **kwargs
-        )
-
-    def predict(self,
-                config: Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]] = None,
-                input_data: Optional[Union[str, list]] = None,
-                network: Optional[Union[Cell, PreTrainedModel]] = None,
-                tokenizer: Optional[PreTrainedTokenizerBase] = None,
-                **kwargs):
-        """
-        The prediction API of the token classification task. It allows users to quickly start prediction
-        based on initialization conditions or by passing in custom configurations. The configurable items
-        include the network, input data, and tokenizer.
-
-        Args:
-            config (Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]]):
-                The task config, which is used to configure the dataset, hyper-parameters, optimizer, etc.
-                It supports a config dict or a MindFormerConfig, TrainingArguments or ConfigArguments instance.
-                Default: None.
-            input_data (Optional[Union[str, list]]): The input data for prediction. Default: None.
-            network (Optional[Union[Cell, PreTrainedModel]]): The network for the trainer.
-                It supports a model name, a PreTrainedModel instance or a MindSpore Cell.
-                Default: None.
-            tokenizer (Optional[PreTrainedTokenizerBase]): The tokenizer for tokenizing the input text.
-                Default: None.
-
-        Returns:
-            A list of prediction results.
-        """
-        config = self.set_config(config)
-
-        logger.info(".........Build Input Data For Predict..........")
-        if input_data is None:
-            input_data = config.input_data
-
-        if not isinstance(input_data, (str, list)):
-            raise ValueError("Input data's type must be one of [str, list]")
-
-        if isinstance(input_data, list):
-            for item in input_data:
-                if not isinstance(item, str):
-                    raise ValueError("Each element of the input data list must be str")
-
-        # Known issue: the batch size must be set to 1 when creating the BERT model.
-        config.model.model_config.batch_size = 1
-
-        max_length = network.config.seq_length if network else config.model.model_config.seq_length
-
-        # Map label ids to CLUENER label names for decoding the predictions.
-        id2label = {label_id: label for label_id, label in enumerate(cluener_labels)}
-
-        return self.predict_process(config=config,
-                                    input_data=input_data,
-                                    task='token_classification',
-                                    network=network,
-                                    tokenizer=tokenizer,
-                                    max_length=max_length,
-                                    padding="max_length",
-                                    id2label=id2label,
-                                    **kwargs)
--
Gitee