diff --git a/data/test/regression/fill_mask_sbert_zh.bin b/data/test/regression/fill_mask_sbert_zh.bin
index 812f7ba2..62581a26 100644
--- a/data/test/regression/fill_mask_sbert_zh.bin
+++ b/data/test/regression/fill_mask_sbert_zh.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280
-size 119940
+oid sha256:4eae921001139d7e3c06331c9ef2213f8fc1c23512acd95751559866fb770e96
+size 121855
diff --git a/data/test/regression/fill_mask_veco_en.bin b/data/test/regression/fill_mask_veco_en.bin
index be3fddc8..4d2dba7d 100644
--- a/data/test/regression/fill_mask_veco_en.bin
+++ b/data/test/regression/fill_mask_veco_en.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705
-size 119619
+oid sha256:f97d34d7450d17d0a93647129ab10d16b1f6e70c34a73b6f7687b79519ee4f71
+size 121563
diff --git a/data/test/regression/fill_mask_veco_zh.bin b/data/test/regression/fill_mask_veco_zh.bin
index c0d27e20..a6eb5621 100644
--- a/data/test/regression/fill_mask_veco_zh.bin
+++ b/data/test/regression/fill_mask_veco_zh.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a
-size 119619
+oid sha256:a8355f27a3235209f206b5e75f4400353e5989e94cf4d71270b42ded8821d536
+size 121563
diff --git a/data/test/regression/sbert-base-tnews.bin b/data/test/regression/sbert-base-tnews.bin
index 1546860f..d2c63ab0 100644
--- a/data/test/regression/sbert-base-tnews.bin
+++ b/data/test/regression/sbert-base-tnews.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2bce1341f4b55d536771dad6e2b280458579f46c3216474ceb8a926022ab53d0
-size 151572
+oid sha256:344ef971bdf310b76c6571d1f4994ab6abc5edc659654d71a4f75b14a30960c2
+size 152926
diff --git a/data/test/regression/sbert_nli.bin b/data/test/regression/sbert_nli.bin
index 68efb778..52e31692 100644
--- a/data/test/regression/sbert_nli.bin
+++ b/data/test/regression/sbert_nli.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6af5024a26337a440c7ea2935fce84af558dd982ee97a2f027bb922cc874292b
-size 61741
+oid sha256:f0aeb07b6c9b40a0cfa7492e839431764e9bece93c906833a07c05e83520a399
+size 63161
diff --git a/data/test/regression/sbert_sen_sim.bin b/data/test/regression/sbert_sen_sim.bin
index 362f762c..1c8efb81 100644
--- a/data/test/regression/sbert_sen_sim.bin
+++ b/data/test/regression/sbert_sen_sim.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bbce084781342ca7274c2e4d02ed5c5de43ba213a3b76328d5994404d6544c41
-size 61745
+oid sha256:7aa5c7a2565ccf0d2eea4baf8adbd0e020dbe36a7159b31156c53141cc9b2df2
+size 63165
diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin
index 6e441f7f..3ad45356 100644
--- a/data/test/regression/sbert_ws_en.bin
+++ b/data/test/regression/sbert_ws_en.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c
-size 61239
+oid sha256:cc6de82a8485fbfa008f6c2d5411cd07ba03e4a780bcb4e67efc6fba3c6ce92f
+size 63597
diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin
index b1841351..a85d787f 100644
--- a/data/test/regression/sbert_ws_zh.bin
+++ b/data/test/regression/sbert_ws_zh.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1
-size 61115
+oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030
+size 63349
diff --git a/data/test/regression/sbert_zero_shot.bin b/data/test/regression/sbert_zero_shot.bin
index 23d40946..04171523 100644
--- a/data/test/regression/sbert_zero_shot.bin
+++ b/data/test/regression/sbert_zero_shot.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85
-size 61589
+oid sha256:01f9b9bf6f8bbf9bb377d4cb6f399b2e5e065381f5b7332343e0db7b4fae72a5
+size 62519
diff --git a/modelscope/exporters/base.py b/modelscope/exporters/base.py
index f19d2bbb..c8b7900e 100644
--- a/modelscope/exporters/base.py
+++ b/modelscope/exporters/base.py
@@ -19,10 +19,13 @@ class Exporter(ABC):
     def from_model(cls, model: Model, **kwargs):
         """Build the Exporter instance.
 
-        @param model: A model instance. it will be used to output the generated file,
+        Args:
+            model: A Model instance. it will be used to generate the intermediate format file,
         and the configuration.json in its model_dir field will be used to create the exporter instance.
-        @param kwargs: Extra kwargs used to create the Exporter instance.
-        @return: The Exporter instance
+            kwargs: Extra kwargs used to create the Exporter instance.
+
+        Returns:
+            The Exporter instance
         """
         cfg = Config.from_file(
             os.path.join(model.model_dir, ModelFile.CONFIGURATION))
@@ -44,10 +47,13 @@ class Exporter(ABC):
         In some cases, several files may be generated,
         So please return a dict which contains the generated name with the file path.
 
-        @param opset: The version of the ONNX operator set to use.
-        @param outputs: The output dir.
-        @param kwargs: In this default implementation,
-        kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape).
-        @return: A dict contains the model name with the model file path.
+        Args:
+            opset: The version of the ONNX operator set to use.
+            outputs: The output dir.
+            kwargs: In this default implementation,
+                kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape).
+
+        Returns:
+            A dict contains the model name with the model file path.
         """
         pass
diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
index 52dab4bc..7cee331b 100644
--- a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
+++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
@@ -27,11 +27,14 @@ class SbertForSequenceClassificationExporter(TorchModelExporter):
                               **kwargs) -> Dict[str, Any]:
         """Generate dummy inputs for model exportation to onnx or other formats by tracing.
 
-        @param shape: A tuple of input shape which should have at most two dimensions.
-        shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor.
-        shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor.
-        @param pair: Generate sentence pairs or single sentences for dummy inputs.
-        @return: Dummy inputs.
+        Args:
+            shape: A tuple of input shape which should have at most two dimensions.
+                shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor.
+                shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor.
+            pair(bool, `optional`): Whether to generate sentence pairs or single sentences.
+
+        Returns:
+            Dummy inputs.
""" cfg = Config.from_file( diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py index 98a23fe5..94ef277a 100644 --- a/modelscope/exporters/torch_model_exporter.py +++ b/modelscope/exporters/torch_model_exporter.py @@ -13,8 +13,8 @@ from modelscope.models import TorchModel from modelscope.pipelines.base import collate_fn from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger -from modelscope.utils.regress_test_utils import compare_arguments_nested -from modelscope.utils.tensor_utils import torch_nested_numpify +from modelscope.utils.regress_test_utils import (compare_arguments_nested, + numpify_tensor_nested) from .base import Exporter logger = get_logger(__name__) @@ -28,49 +28,61 @@ class TorchModelExporter(Exporter): and to provide implementations for generate_dummy_inputs/inputs/outputs methods. """ - def export_onnx(self, outputs: str, opset=11, **kwargs): + def export_onnx(self, output_dir: str, opset=13, **kwargs): """Export the model as onnx format files. In some cases, several files may be generated, So please return a dict which contains the generated name with the file path. - @param opset: The version of the ONNX operator set to use. - @param outputs: The output dir. - @param kwargs: In this default implementation, - you can pass the arguments needed by _torch_export_onnx, other unrecognized args - will be carried to generate_dummy_inputs as extra arguments (such as input shape). - @return: A dict containing the model key - model file path pairs. + Args: + opset: The version of the ONNX operator set to use. + output_dir: The output dir. + kwargs: + model: A model instance which will replace the exporting of self.model. + In this default implementation, + you can pass the arguments needed by _torch_export_onnx, other unrecognized args + will be carried to generate_dummy_inputs as extra arguments (such as input shape). + + Returns: + A dict containing the model key - model file path pairs. """ - model = self.model + model = self.model if 'model' not in kwargs else kwargs.pop('model') if not isinstance(model, nn.Module) and hasattr(model, 'model'): model = model.model - onnx_file = os.path.join(outputs, ModelFile.ONNX_MODEL_FILE) + onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE) self._torch_export_onnx(model, onnx_file, opset=opset, **kwargs) return {'model': onnx_file} - def export_torch_script(self, outputs: str, **kwargs): + def export_torch_script(self, output_dir: str, **kwargs): """Export the model as torch script files. In some cases, several files may be generated, So please return a dict which contains the generated name with the file path. - @param outputs: The output dir. - @param kwargs: In this default implementation, + Args: + output_dir: The output dir. + kwargs: + model: A model instance which will replace the exporting of self.model. + In this default implementation, you can pass the arguments needed by _torch_export_torch_script, other unrecognized args will be carried to generate_dummy_inputs as extra arguments (like input shape). - @return: A dict contains the model name with the model file path. + + Returns: + A dict contains the model name with the model file path. 
""" - model = self.model + model = self.model if 'model' not in kwargs else kwargs.pop('model') if not isinstance(model, nn.Module) and hasattr(model, 'model'): model = model.model - ts_file = os.path.join(outputs, ModelFile.TS_MODEL_FILE) + ts_file = os.path.join(output_dir, ModelFile.TS_MODEL_FILE) # generate ts by tracing self._torch_export_torch_script(model, ts_file, **kwargs) return {'model': ts_file} def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]: """Generate dummy inputs for model exportation to onnx or other formats by tracing. - @return: Dummy inputs. + + Returns: + Dummy inputs. """ return None @@ -93,7 +105,7 @@ class TorchModelExporter(Exporter): def _torch_export_onnx(self, model: nn.Module, output: str, - opset: int = 11, + opset: int = 13, device: str = 'cpu', validation: bool = True, rtol: float = None, @@ -101,18 +113,27 @@ class TorchModelExporter(Exporter): **kwargs): """Export the model to an onnx format file. - @param model: A torch.nn.Module instance to export. - @param output: The output file. - @param opset: The version of the ONNX operator set to use. - @param device: The device used to forward. - @param validation: Whether validate the export file. - @param rtol: The rtol used to regress the outputs. - @param atol: The atol used to regress the outputs. + Args: + model: A torch.nn.Module instance to export. + output: The output file. + opset: The version of the ONNX operator set to use. + device: The device used to forward. + validation: Whether validate the export file. + rtol: The rtol used to regress the outputs. + atol: The atol used to regress the outputs. + kwargs: + dummy_inputs: A dummy inputs which will replace the calling of self.generate_dummy_inputs(). + inputs: An inputs structure which will replace the calling of self.inputs. + outputs: An outputs structure which will replace the calling of self.outputs. """ - dummy_inputs = self.generate_dummy_inputs(**kwargs) - inputs = self.inputs - outputs = self.outputs + dummy_inputs = self.generate_dummy_inputs( + **kwargs) if 'dummy_inputs' not in kwargs else kwargs.pop( + 'dummy_inputs') + inputs = self.inputs if 'inputs' not in kwargs else kwargs.pop( + 'inputs') + outputs = self.outputs if 'outputs' not in kwargs else kwargs.pop( + 'outputs') if dummy_inputs is None or inputs is None or outputs is None: raise NotImplementedError( 'Model property dummy_inputs,inputs,outputs must be set.') @@ -125,7 +146,7 @@ class TorchModelExporter(Exporter): if isinstance(dummy_inputs, Mapping): dummy_inputs = dict(dummy_inputs) - onnx_outputs = list(self.outputs.keys()) + onnx_outputs = list(outputs.keys()) with replace_call(): onnx_export( @@ -160,11 +181,13 @@ class TorchModelExporter(Exporter): outputs_origin = model.forward( *_decide_input_format(model, dummy_inputs)) if isinstance(outputs_origin, Mapping): - outputs_origin = torch_nested_numpify( + outputs_origin = numpify_tensor_nested( list(outputs_origin.values())) + elif isinstance(outputs_origin, (tuple, list)): + outputs_origin = numpify_tensor_nested(outputs_origin) outputs = ort_session.run( onnx_outputs, - torch_nested_numpify(dummy_inputs), + numpify_tensor_nested(dummy_inputs), ) tols = {} @@ -184,19 +207,26 @@ class TorchModelExporter(Exporter): validation: bool = True, rtol: float = None, atol: float = None, + strict: bool = True, **kwargs): """Export the model to a torch script file. - @param model: A torch.nn.Module instance to export. - @param output: The output file. - @param device: The device used to forward. 
- @param validation: Whether validate the export file. - @param rtol: The rtol used to regress the outputs. - @param atol: The atol used to regress the outputs. + Args: + model: A torch.nn.Module instance to export. + output: The output file. + device: The device used to forward. + validation: Whether validate the export file. + rtol: The rtol used to regress the outputs. + atol: The atol used to regress the outputs. + strict: strict mode in torch script tracing. + kwargs: + dummy_inputs: A dummy inputs which will replace the calling of self.generate_dummy_inputs(). """ model.eval() - dummy_inputs = self.generate_dummy_inputs(**kwargs) + dummy_param = 'dummy_inputs' not in kwargs + dummy_inputs = self.generate_dummy_inputs( + **kwargs) if dummy_param else kwargs.pop('dummy_inputs') if dummy_inputs is None: raise NotImplementedError( 'Model property dummy_inputs must be set.') @@ -207,7 +237,7 @@ class TorchModelExporter(Exporter): model.eval() with replace_call(): traced_model = torch.jit.trace( - model, dummy_inputs, strict=False) + model, dummy_inputs, strict=strict) torch.jit.save(traced_model, output) if validation: @@ -216,9 +246,9 @@ class TorchModelExporter(Exporter): model.eval() ts_model.eval() outputs = ts_model.forward(*dummy_inputs) - outputs = torch_nested_numpify(outputs) + outputs = numpify_tensor_nested(outputs) outputs_origin = model.forward(*dummy_inputs) - outputs_origin = torch_nested_numpify(outputs_origin) + outputs_origin = numpify_tensor_nested(outputs_origin) tols = {} if rtol is not None: tols['rtol'] = rtol @@ -240,7 +270,6 @@ def replace_call(): problems. Here we recover the call method to the default implementation of torch.nn.Module, and change it back after the tracing was done. """ - TorchModel.call_origin, TorchModel.__call__ = TorchModel.__call__, TorchModel._call_impl yield TorchModel.__call__ = TorchModel.call_origin diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 913589d8..01b08699 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -69,7 +69,6 @@ class Models(object): space_modeling = 'space-modeling' space_T_en = 'space-T-en' space_T_cn = 'space-T-cn' - tcrf = 'transformer-crf' transformer_softmax = 'transformer-softmax' lcrf = 'lstm-crf' diff --git a/modelscope/metrics/base.py b/modelscope/metrics/base.py index 3a9d810f..1b9db825 100644 --- a/modelscope/metrics/base.py +++ b/modelscope/metrics/base.py @@ -10,9 +10,6 @@ class Metric(ABC): complex metrics for a specific task with or without other Metric subclasses. """ - def __init__(self, trainer=None, *args, **kwargs): - self.trainer = trainer - @abstractmethod def add(self, outputs: Dict, inputs: Dict): """ Append logits and labels within an eval loop. 
diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index 05b72170..f8595fc1 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -34,17 +34,24 @@ class TokenClassificationMetric(Metric): self.labels.append( torch_nested_numpify(torch_nested_detach(ground_truths))) - def __init__(self, return_entity_level_metrics=False, *args, **kwargs): + def __init__(self, + return_entity_level_metrics=False, + label2id=None, + *args, + **kwargs): super().__init__(*args, **kwargs) self.return_entity_level_metrics = return_entity_level_metrics self.preds = [] self.labels = [] + self.label2id = label2id def evaluate(self): - self.id2label = { - id: label - for label, id in self.trainer.label2id.items() - } + label2id = self.label2id + if label2id is None: + assert hasattr(self, 'trainer') + label2id = self.trainer.label2id + + self.id2label = {id: label for label, id in label2id.items()} self.preds = np.concatenate(self.preds, axis=0) self.labels = np.concatenate(self.labels, axis=0) predictions = np.argmax(self.preds, axis=-1) diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index cdc71fcf..1246551e 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -5,11 +5,11 @@ from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download -from modelscope.models.builder import build_model -from modelscope.utils.checkpoint import save_pretrained +from modelscope.models.builder import MODELS, build_model +from modelscope.utils.checkpoint import save_checkpoint, save_pretrained from modelscope.utils.config import Config -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile -from modelscope.utils.device import device_placement, verify_device +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks +from modelscope.utils.device import verify_device from modelscope.utils.logger import get_logger logger = get_logger() @@ -66,7 +66,6 @@ class Model(ABC): revision: Optional[str] = DEFAULT_MODEL_REVISION, cfg_dict: Config = None, device: str = None, - *model_args, **kwargs): """ Instantiate a model from local directory or remote model repo. Note that when loading from remote, the model revision can be specified. 
@@ -90,11 +89,11 @@ class Model(ABC): cfg = Config.from_file( osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task + if 'task' in kwargs: + task_name = kwargs.pop('task') model_cfg = cfg.model - if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type - model_cfg.model_dir = local_model_dir for k, v in kwargs.items(): model_cfg[k] = v @@ -109,15 +108,19 @@ class Model(ABC): # dynamically add pipeline info to model for pipeline inference if hasattr(cfg, 'pipeline'): model.pipeline = cfg.pipeline + + if not hasattr(model, 'cfg'): + model.cfg = cfg return model def save_pretrained(self, target_folder: Union[str, os.PathLike], save_checkpoint_names: Union[str, List[str]] = None, - save_function: Callable = None, + save_function: Callable = save_checkpoint, config: Optional[dict] = None, **kwargs): - """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded + """save the pretrained model, its configuration and other related files to a directory, + so that it can be re-loaded Args: target_folder (Union[str, os.PathLike]): @@ -133,5 +136,10 @@ class Model(ABC): The config for the configuration.json, might not be identical with model.config """ + if config is None and hasattr(self, 'cfg'): + config = self.cfg + assert config is not None, 'Cannot save the model because the model config is empty.' + if isinstance(config, Config): + config = config.to_dict() save_pretrained(self, target_folder, save_checkpoint_names, save_function, config, **kwargs) diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index 7a8e28f4..a35358c1 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -1,10 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.utils.config import ConfigDict +from modelscope.utils.constant import Tasks from modelscope.utils.registry import TYPE_NAME, Registry, build_from_cfg MODELS = Registry('models') BACKBONES = Registry('backbones') +BACKBONES._modules = MODELS._modules HEADS = Registry('heads') @@ -23,30 +25,27 @@ def build_model(cfg: ConfigDict, cfg, MODELS, group_key=task_name, default_args=default_args) -def build_backbone(cfg: ConfigDict, - field: str = None, - default_args: dict = None): +def build_backbone(cfg: ConfigDict, default_args: dict = None): """ build backbone given backbone config dict Args: cfg (:obj:`ConfigDict`): config dict for backbone object. - field (str, optional): field, such as CV, NLP's backbone default_args (dict, optional): Default initialization arguments. """ return build_from_cfg( - cfg, BACKBONES, group_key=field, default_args=default_args) + cfg, BACKBONES, group_key=Tasks.backbone, default_args=default_args) def build_head(cfg: ConfigDict, - group_key: str = None, + task_name: str = None, default_args: dict = None): """ build head given config dict Args: cfg (:obj:`ConfigDict`): config dict for head object. + task_name (str, optional): task name, refer to + :obj:`Tasks` for more details default_args (dict, optional): Default initialization arguments. 
""" - if group_key is None: - group_key = cfg[TYPE_NAME] return build_from_cfg( - cfg, HEADS, group_key=group_key, default_args=default_args) + cfg, HEADS, group_key=task_name, default_args=default_args) diff --git a/modelscope/models/nlp/T5/__init__.py b/modelscope/models/nlp/T5/__init__.py index 7c1cea36..cb0921c6 100644 --- a/modelscope/models/nlp/T5/__init__.py +++ b/modelscope/models/nlp/T5/__init__.py @@ -1,13 +1,17 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .t5_for_text_generation import T5ForConditionalGeneration + from .backbone import T5Model + from .text2text_generation import T5ForConditionalGeneration else: _import_structure = { - 't5_for_text_generation': ['T5ForConditionalGeneration'], + 'backbone': ['T5Model'], + 'text2text_generation': ['T5ForConditionalGeneration'], } import sys diff --git a/modelscope/models/nlp/T5/modeling_t5.py b/modelscope/models/nlp/T5/backbone.py similarity index 73% rename from modelscope/models/nlp/T5/modeling_t5.py rename to modelscope/models/nlp/T5/backbone.py index da50741e..9a46d980 100644 --- a/modelscope/models/nlp/T5/modeling_t5.py +++ b/modelscope/models/nlp/T5/backbone.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,12 +22,8 @@ from typing import Optional, Tuple, Union import torch from torch import nn -from torch.nn import CrossEntropyLoss from torch.utils.checkpoint import checkpoint from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, Seq2SeqModelOutput) from transformers.modeling_utils import (PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer) @@ -36,30 +33,20 @@ from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, from transformers.utils.model_parallel_utils import (assert_device_map, get_device_map) +from modelscope.metainfo import Models +from modelscope.models.base import Model, Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import (BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqModelOutput) +from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger -from .configuration_t5 import T5Config +from .configuration import T5Config logger = get_logger(__name__) -_CONFIG_FOR_DOC = 'T5Config' -_TOKENIZER_FOR_DOC = 'T5Tokenizer' -_CHECKPOINT_FOR_DOC = 't5-small' -#################################################### -# This dict contains ids and associated url -# for the pretrained weights provided with the models -#################################################### -T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ - 't5-small', - 't5-base', - 't5-large', - 't5-3b', - 't5-11b', - # See all T5 models at https://huggingface.co/models?filter=t5 -] - - -#################################################### +################################################### # This is a conversion method from TF 1.0 to PyTorch # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 #################################################### @@ -173,65 +160,6 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): return model -#################################################### -# 
PyTorch Models are constructed by sub-classing -# - torch.nn.Module for the layers and -# - PreTrainedModel for the models (it-self a sub-class of nn.Module) -#################################################### -PARALLELIZE_DOCSTRING = r""" - This is an experimental feature and is a subject to change at a moment's notice. - - Uses a device map to distribute attention modules of the model across several devices. If no device map is given, - it will evenly distribute blocks across all devices. - - Args: - device_map (`Dict[int, list]`, optional, defaults to None): - A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always - automatically mapped to the first device (for esoteric reasons). That means that the first device should - have fewer attention modules mapped to it than other devices. For reference, the t5 models have the - following number of attention modules: - - - t5-small: 6 - - t5-base: 12 - - t5-large: 24 - - t5-3b: 24 - - t5-11b: 24 - - Example: - - ```python - # Here is an example of a device map on a machine with 4 GPUs - # using t5-3b, which has a total of 24 attention modules: - model = T5ForConditionalGeneration.from_pretrained("t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) - ``` -""" -DEPARALLELIZE_DOCSTRING = r""" - Moves the model to cpu from a model parallel state. - - Example: - - ```python - # On a 4 GPU machine with t5-3b: - model = T5ForConditionalGeneration.from_pretrained("t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) # Splits the model across several devices - model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() - ``` -""" - - class T5LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): @@ -261,23 +189,6 @@ class T5LayerNorm(nn.Module): return self.weight * hidden_states -try: - from apex.normalization import FusedRMSNorm - - T5LayerNorm = FusedRMSNorm # noqa - - logger.info( - 'Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm' - ) -except ImportError: - # using the normal T5LayerNorm - pass -except Exception: - logger.warning( - 'discovered apex but it failed to load, falling back to T5LayerNorm') - pass - - class T5DenseReluDense(nn.Module): def __init__(self, config: T5Config): @@ -791,7 +702,7 @@ class T5Block(nn.Module): return outputs -class T5PreTrainedModel(PreTrainedModel): +class T5PreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
@@ -803,6 +714,10 @@ class T5PreTrainedModel(PreTrainedModel): is_parallelizable = True supports_gradient_checkpointing = True + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + @property def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) @@ -819,8 +734,7 @@ class T5PreTrainedModel(PreTrainedModel): factor = self.config.initializer_factor # Used for testing weights initialization if isinstance(module, T5LayerNorm): module.weight.data.fill_(factor * 1.0) - elif isinstance(module, - (T5Model, T5ForConditionalGeneration, T5EncoderModel)): + elif isinstance(module, T5Model): # Mesh TensorFlow embeddings initialization See # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) @@ -902,6 +816,36 @@ class T5PreTrainedModel(PreTrainedModel): return shifted_input_ids + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the + label information. num_labels: An optional arg to tell the + model how many classes to initialize. + Method will call utils.parse_label_mapping + if num_labels not supplied. If num_labels is + not found, the model will use the default + setting (2 classes). + + Returns: + The loaded model, which is initialized by + transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir', None) + if model_dir is None: + config = T5Config(**kwargs) + model = cls(config) + else: + model_kwargs = {} + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + model.model_dir = model_dir + return model + class T5Stack(T5PreTrainedModel): @@ -926,8 +870,42 @@ class T5Stack(T5PreTrainedModel): self.device_map = None self.gradient_checkpointing = False - @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + r""" + This is an experimental feature and is a subject to change at a + moment's notice. + + Uses a device map to distribute attention modules of the model + across several devices. If no device map is given, it will evenly + distribute blocks across all devices. + + Args: + device_map (`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note + that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric + reasons). That means that the first device should have fewer + attention modules mapped to it than other devices. 
For + reference, the t5 models have the following number of + attention modules: + + - t5-small: 6 + - t5-base: 12 + - t5-large: 24 + - t5-3b: 24 + - t5-11b: 24 + + Example: + + ```python # Here is an example of a device map on a machine with 4 + GPUs # using t5-3b, which has a total of 24 attention modules: model + = T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = { + 0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14, + 15, 16], 3: [17, 18, 19, 20, 21, 22, 23], + } model.parallelize(device_map) ``` all of the parallelize methods + in this file are the same + + """ # Check validity of device_map self.device_map = ( get_device_map(len(self.block), range(torch.cuda.device_count())) @@ -948,8 +926,22 @@ class T5Stack(T5PreTrainedModel): # Set final layer norm to last device self.final_layer_norm = self.final_layer_norm.to(self.last_device) - @add_start_docstrings(PARALLELIZE_DOCSTRING) def deparallelize(self): + r""" + Moves the model to cpu from a model parallel state. + + Example: + + ```python # On a 4 GPU machine with t5-3b: model = + T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = { + 0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14, + 15, 16], 3: [17, 18, 19, 20, 21, 22, 23], + } model.parallelize(device_map) # Splits the model across several + devices model.deparallelize() # Put the model back on cpu and + cleans memory by calling torch.cuda.empty_cache() ``` + + all of the deparallelize methods in this file are the same + """ self.model_parallel = False self.device_map = None self.first_device = 'cpu' @@ -1199,7 +1191,20 @@ class T5Stack(T5PreTrainedModel): ) -T5_START_DOCSTRING = r""" +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and +`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, +but this feature is deprecated and will be removed in future versions. If you do +not want to use any `decoder_head_mask` now, please set `decoder_head_mask = +torch.ones(num_layers, num_heads)`. +""" + + +@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.T5) +class T5Model(T5PreTrainedModel): + """The bare T5 Model transformer outputting raw hidden-states without any + specific head on top. The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by @@ -1224,10 +1229,99 @@ T5_START_DOCSTRING = r""" with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" + """ + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) -T5_INPUTS_DOCSTRING = r""" - Args: + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.model_parallel = True + + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of + heads to prune in this layer} See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you should be able to pad the @@ -1343,244 +1437,84 @@ T5_INPUTS_DOCSTRING = r""" return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" + Returns: -T5_ENCODER_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. T5 is a model - with relative position embeddings so you should be able to pad the - inputs on both the right and the left. + Example: - Indices can be obtained using [`T5Tokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for detail. + ```python >>> from transformers import T5Tokenizer, T5Model - To know more on how to prepare `input_ids` for pretraining take a - look a [T5 Training](./t5#training). - attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5Model.from_pretrained("t5-small") - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. + >>> input_ids = tokenizer( + ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" + >>> ).input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, - num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask - values selected in `[0, 1]`: + >>> # forward pass + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you want - more control over how to convert `input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain - tuple. 
-""" + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] + if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] + if len(encoder_outputs) > 2 else None, + ) -# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask -__HEAD_MASK_WARNING_MSG = """ -The input argument `head_mask` was split into two arguments `head_mask` and -`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, -but this feature is deprecated and will be removed in future versions. If you do -not want to use any `decoder_head_mask` now, please set `decoder_head_mask = -torch.ones(num_layers, num_heads)`. -""" + hidden_states = encoder_outputs[0] + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to( + self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device) - -@add_start_docstrings( - 'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.', - T5_START_DOCSTRING, -) -class T5Model(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r'encoder\.embed_tokens\.weight', - r'decoder\.embed_tokens\.weight', - ] - _keys_to_ignore_on_load_unexpected = [ - r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', - ] - - def __init__(self, config: T5Config): - super().__init__(config) - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack(decoder_config, self.shared) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.decoder.parallelize(self.device_map) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.decoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.decoder = 
self.decoder.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - self.decoder.set_input_embeddings(new_embeddings) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of - heads to prune in this layer} See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - decoder_inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: - r""" - Returns: - - Example: - - ```python >>> from transformers import T5Tokenizer, T5Model - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5Model.from_pretrained("t5-small") - - >>> input_ids = tokenizer( - ... 
"Studies have been shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - - >>> # forward pass - >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - >>> last_hidden_states = outputs.last_hidden_state - ```""" - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask - if head_mask is not None and decoder_head_mask is None: - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] - if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] - if len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - hidden_states = hidden_states.to(self.decoder.first_device) - if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids.to( - self.decoder.first_device) - if attention_mask is not None: - attention_mask = attention_mask.to(self.decoder.first_device) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.to( - self.decoder.first_device) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) if not return_dict: return decoder_outputs + encoder_outputs @@ -1595,409 +1529,3 @@ class T5Model(T5PreTrainedModel): encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) - - -@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", - T5_START_DOCSTRING) -class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r'encoder\.embed_tokens\.weight', - 
r'decoder\.embed_tokens\.weight', - r'lm_head\.weight', - ] - _keys_to_ignore_on_load_unexpected = [ - r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', - ] - - def __init__(self, config: T5Config): - super().__init__(config) - self.model_dim = config.d_model - - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack(decoder_config, self.shared) - - self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.decoder.parallelize(self.device_map) - self.lm_head = self.lm_head.to(self.decoder.first_device) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.decoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.decoder = self.decoder.to('cpu') - self.lm_head = self.lm_head.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - self.decoder.set_input_embeddings(new_embeddings) - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. 
- Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All - labels set to `-100` are ignored (masked), the loss is only computed - for labels in `[0, ..., config.vocab_size]` - - Returns: - - Examples: - - ```python >>> from transformers import T5Tokenizer, - T5ForConditionalGeneration - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - - >>> # training - >>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids - >>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids - >>> outputs = model(input_ids=input_ids, labels=labels) - >>> loss = outputs.loss - >>> logits = outputs.logits - - >>> # inference - >>> input_ids = tokenizer( - ... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> outputs = model.generate(input_ids) - >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) - >>> # studies have shown that owning a dog is good for you. - ```""" - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask - if head_mask is not None and decoder_head_mask is None: - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - # Convert encoder inputs in embeddings if needed - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] - if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] - if len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - hidden_states = hidden_states.to(self.decoder.first_device) - if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids.to( - self.decoder.first_device) - if attention_mask is not None: - attention_mask = attention_mask.to(self.decoder.first_device) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.to( - self.decoder.first_device) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - 
sequence_output = decoder_outputs[0] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.encoder.first_device) - self.lm_head = self.lm_head.to(self.encoder.first_device) - sequence_output = sequence_output.to(self.lm_head.weight.device) - - if self.config.tie_word_embeddings: - # Rescale output before projecting on vocab See - # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.model_dim**-0.5) - - lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct( - lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) - # TODO(thom): Add z_loss - # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 - - if not return_dict: - output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs - return ((loss, ) + output) if loss is not None else output - - return Seq2SeqLMOutput( - loss=loss, - logits=lm_logits, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs): - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'decoder_input_ids': input_ids, - 'past_key_values': past, - 'encoder_outputs': encoder_outputs, - 'attention_mask': attention_mask, - 'head_mask': head_mask, - 'decoder_head_mask': decoder_head_mask, - 'cross_attn_head_mask': cross_attn_head_mask, - 'use_cache': use_cache, - } - - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return self._shift_right(labels) - - def _reorder_cache(self, past, beam_idx): - # if decoder past is not included in output - # speedy decoding is disabled and no need to reorder - if past is None: - logger.warning( - 'You might want to consider setting `use_cache=True` to speed up decoding' - ) - return past - - reordered_decoder_past = () - for layer_past_states in past: - # get the correct batch idx from layer past batch dim - # batch dim of `past` is at 2nd position - reordered_layer_past_states = () - for layer_past_state in layer_past_states: - # need to set correct `past` for each of the four key / value states - reordered_layer_past_states = reordered_layer_past_states + ( - layer_past_state.index_select( - 0, beam_idx.to(layer_past_state.device)), ) - - assert reordered_layer_past_states[0].shape == layer_past_states[ - 0].shape - assert len(reordered_layer_past_states) == len(layer_past_states) - - reordered_decoder_past = reordered_decoder_past + ( - reordered_layer_past_states, ) - return reordered_decoder_past - - -@add_start_docstrings( - "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", - T5_START_DOCSTRING, -) -class T5EncoderModel(T5PreTrainedModel): - authorized_missing_keys = [ - r'encoder\.embed_tokens\.weight', - ] - - def __init__(self, config: 
T5Config): - super().__init__(config) - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - - def get_encoder(self): - return self.encoder - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of - heads to prune in this layer} See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]: - r""" - Returns: - - Example: - - ```python - >>> from transformers import T5Tokenizer, T5EncoderModel - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5EncoderModel.from_pretrained("t5-small") - >>> input_ids = tokenizer( - ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids) - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - return encoder_outputs diff --git a/modelscope/models/nlp/T5/configuration_t5.py b/modelscope/models/nlp/T5/configuration.py similarity index 99% rename from modelscope/models/nlp/T5/configuration_t5.py rename to modelscope/models/nlp/T5/configuration.py index 117a6bc1..1f9a965e 100644 --- a/modelscope/models/nlp/T5/configuration_t5.py +++ b/modelscope/models/nlp/T5/configuration.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2020, The T5 Authors and HuggingFace Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/T5/t5_for_text_generation.py b/modelscope/models/nlp/T5/t5_for_text_generation.py deleted file mode 100644 index 27f077d8..00000000 --- a/modelscope/models/nlp/T5/t5_for_text_generation.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import Optional, Tuple - -import torch - -from modelscope.metainfo import Models -from modelscope.models.base import Tensor, TorchModel -from modelscope.models.builder import MODELS -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from .modeling_t5 import T5Config -from .modeling_t5 import T5ForConditionalGeneration as T5ForGeneration - - -@MODELS.register_module( - group_key=Tasks.text2text_generation, - module_name=Models.T5, -) -class T5ForConditionalGeneration(TorchModel): - - def __init__(self, model_dir=None, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__(model_dir, *args, **kwargs) - self.model = T5ForGeneration.from_pretrained(model_dir) - self.generate = self.model.generate - self.config = self.model.config - - def forward(self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs): - return self.model.forward( - self, input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, head_mask, decoder_head_mask, - cross_attn_head_mask, encoder_outputs, past_key_values, - inputs_embeds, decoder_inputs_embeds, labels, use_cache, - output_attentions, output_hidden_states, return_dict, **kwargs) diff --git a/modelscope/models/nlp/T5/text2text_generation.py b/modelscope/models/nlp/T5/text2text_generation.py new file mode 100644 index 00000000..c4dcdfdb --- /dev/null +++ b/modelscope/models/nlp/T5/text2text_generation.py @@ -0,0 +1,455 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
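The module added below registers `T5ForConditionalGeneration` under `Tasks.text2text_generation` (see the `@MODELS.register_module` decorator further down), which lets the model be resolved through ModelScope's pipeline factory. A minimal usage sketch, assuming only that a T5-style checkpoint is hosted on ModelScope; the model id is a placeholder, not something introduced by this change:

```python
# Sketch only: exercises the Tasks.text2text_generation registration added below.
# '<t5-model-id>' is a placeholder; substitute a T5 checkpoint available on ModelScope.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

text2text = pipeline(task=Tasks.text2text_generation, model='<t5-model-id>')
print(text2text('summarize: studies have shown that owning a dog is good for you'))
```

The pipeline is expected to drive the same `generate()` / `prepare_inputs_for_generation` path defined further down in this file.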
+import copy +import warnings +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.utils.model_parallel_utils import (assert_device_map, + get_device_map) + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import BaseModelOutput, Seq2SeqLMOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .backbone import T5PreTrainedModel, T5Stack +from .configuration import T5Config + +logger = get_logger(__name__) + +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and +`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, +but this feature is deprecated and will be removed in future versions. If you do +not want to use any `decoder_head_mask` now, please set `decoder_head_mask = +torch.ones(num_layers, num_heads)`. +""" + + +@MODELS.register_module( + group_key=Tasks.text2text_generation, + module_name=Models.T5, +) +class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + r'lm_head\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.decoder.first_device) + self.model_parallel = True + + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.lm_head = self.lm_head.to('cpu') + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + return self.lm_head + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + 
return self.decoder + + def forward(self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model + with relative position embeddings so you should be able to pad the + inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a + look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for + `decoder_input_ids` generation. If `past_key_values` is used, + optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining + take a look at [T5 Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in + `decoder_input_ids`. Causal mask will also be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in + the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, + `optional`: *attentions*) `last_hidden_state` of shape `(batch_size, + sequence_length, hidden_size)` is a sequence of hidden states at the + output of the last layer of the encoder. Used in the cross-attention + of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to + directly pass an embedded representation. If `past_key_values` is + used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more + control over how to convert `decoder_input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, + `decoder_inputs_embeds` takes the value of `inputs_embeds`. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain + tuple. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. 
All
+                labels set to `-100` are ignored (masked), the loss is only computed
+                for labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        >>> # training
+        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
+        >>> outputs = model(input_ids=input_ids, labels=labels)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+
+        >>> # inference
+        >>> input_ids = tokenizer(
+        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
+        >>> outputs = model.generate(input_ids)
+        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        >>> # studies have shown that owning a dog is good for you.
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:
+                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            # Convert encoder inputs in embeddings if needed
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1]
+                if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            hidden_states = hidden_states.to(self.decoder.first_device)
+            if decoder_input_ids is not None:
+                decoder_input_ids = decoder_input_ids.to(
+                    self.decoder.first_device)
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(self.decoder.first_device)
+            if decoder_attention_mask is not None:
+                decoder_attention_mask = decoder_attention_mask.to(
+                    self.decoder.first_device)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        # Set device for model
parallelism + if self.model_parallel: + torch.cuda.set_device(self.encoder.first_device) + self.lm_head = self.lm_head.to(self.encoder.first_device) + sequence_output = sequence_output.to(self.lm_head.weight.device) + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct( + lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs + return ((loss, ) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'decoder_input_ids': input_ids, + 'past_key_values': past, + 'encoder_outputs': encoder_outputs, + 'attention_mask': attention_mask, + 'head_mask': head_mask, + 'decoder_head_mask': decoder_head_mask, + 'cross_attn_head_mask': cross_attn_head_mask, + 'use_cache': use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning( + 'You might want to consider setting `use_cache=True` to speed up decoding' + ) + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select( + 0, beam_idx.to(layer_past_state.device)), ) + + assert reordered_layer_past_states[0].shape == layer_past_states[ + 0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + reordered_decoder_past = reordered_decoder_past + ( + reordered_layer_past_states, ) + return reordered_decoder_past diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 57222698..dff42d1c 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -4,80 +4,99 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .backbones import SbertModel - from 
.bart_for_text_error_correction import BartForTextErrorCorrection - from .bert_for_document_segmentation import BertForDocumentSegmentation - from .csanmt_for_translation import CsanmtForTranslation + from .bart import BartForTextErrorCorrection + from .csanmt import CsanmtForTranslation from .heads import SequenceClassificationHead from .gpt3 import GPT3ForTextGeneration - from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, - BertForMaskedLM, DebertaV2ForMaskedLM) - from .ponet_for_masked_language import PoNetForMaskedLM - from .nncrf_for_named_entity_recognition import ( - TransformerCRFForNamedEntityRecognition, - LSTMCRFForNamedEntityRecognition) from .palm_v2 import PalmForTextGeneration - from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering - from .star_text_to_sql import StarForTextToSql - from .sequence_classification import (VecoForSequenceClassification, - SbertForSequenceClassification, - BertForSequenceClassification) - from .space import SpaceForDialogIntent - from .space import SpaceForDialogModeling - from .space import SpaceForDialogStateTracking - from .table_question_answering import TableQuestionAnswering - from .task_models import (FeatureExtractionModel, - InformationExtractionModel, - SequenceClassificationModel, - SingleBackboneTaskModelBase, - TokenClassificationModel, - TaskModelForTextGeneration) - from .token_classification import SbertForTokenClassification - from .sentence_embedding import SentenceEmbedding - from .text_ranking import TextRanking - from .T5 import T5ForConditionalGeneration + from .space_T_en import StarForTextToSql + from .space_T_cn import TableQuestionAnswering + from .space import SpaceForDialogIntent, SpaceForDialogModeling, SpaceForDST + from .ponet import PoNetForMaskedLM, PoNetModel, PoNetConfig + from .structbert import ( + SbertForFaqQuestionAnswering, + SbertForMaskedLM, + SbertForSequenceClassification, + SbertForTokenClassification, + SbertTokenizer, + SbertTokenizerFast, + ) + from .bert import ( + BertForMaskedLM, + BertForTextRanking, + BertForSentenceEmbedding, + BertForSequenceClassification, + BertForTokenClassification, + BertForDocumentSegmentation, + BertModel, + BertConfig, + ) + from .veco import VecoModel, VecoConfig, VecoForTokenClassification, \ + VecoForSequenceClassification, VecoForMaskedLM, VecoTokenizer, VecoTokenizerFast + from .deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model + from .task_models import ( + FeatureExtractionModel, + InformationExtractionModel, + LSTMCRFForNamedEntityRecognition, + SequenceClassificationModel, + SingleBackboneTaskModelBase, + TaskModelForTextGeneration, + TokenClassificationModel, + TransformerCRFForNamedEntityRecognition, + ) + from .T5 import T5ForConditionalGeneration + from .gpt_neo import GPTNeoModel else: _import_structure = { 'backbones': ['SbertModel'], - 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], - 'csanmt_for_translation': ['CsanmtForTranslation'], + 'bart': ['BartForTextErrorCorrection'], + 'csanmt': ['CsanmtForTranslation'], 'heads': ['SequenceClassificationHead'], 'gpt3': ['GPT3ForTextGeneration'], - 'masked_language': [ - 'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM', - 'DebertaV2ForMaskedLM' + 'structbert': [ + 'SbertForFaqQuestionAnswering', + 'SbertForMaskedLM', + 'SbertForSequenceClassification', + 'SbertForTokenClassification', + 'SbertTokenizer', + 'SbertTokenizerFast', ], - 
'nncrf_for_named_entity_recognition': [ - 'TransformerCRFForNamedEntityRecognition', - 'LSTMCRFForNamedEntityRecognition' - ], - 'ponet_for_masked_language': ['PoNetForMaskedLM'], - 'palm_v2': ['PalmForTextGeneration'], - 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], - 'star_text_to_sql': ['StarForTextToSql'], - 'sequence_classification': [ - 'VecoForSequenceClassification', 'SbertForSequenceClassification', - 'BertForSequenceClassification' + 'veco': [ + 'VecoModel', 'VecoConfig', 'VecoForTokenClassification', + 'VecoForSequenceClassification', 'VecoForMaskedLM', + 'VecoTokenizer', 'VecoTokenizerFast' ], - 'space': [ - 'SpaceForDialogIntent', 'SpaceForDialogModeling', - 'SpaceForDialogStateTracking' + 'bert': [ + 'BertForMaskedLM', + 'BertForTextRanking', + 'BertForSentenceEmbedding', + 'BertForSequenceClassification', + 'BertForTokenClassification', + 'BertForDocumentSegmentation', + 'BertModel', + 'BertConfig', ], + 'ponet': ['PoNetForMaskedLM', 'PoNetModel', 'PoNetConfig'], + 'palm_v2': ['PalmForTextGeneration'], + 'deberta_v2': ['DebertaV2ForMaskedLM', 'DebertaV2Model'], + 'space_T_en': ['StarForTextToSql'], + 'space_T_cn': ['TableQuestionAnswering'], + 'space': + ['SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDST'], 'task_models': [ 'FeatureExtractionModel', 'InformationExtractionModel', + 'LSTMCRFForNamedEntityRecognition', 'SequenceClassificationModel', 'SingleBackboneTaskModelBase', - 'TokenClassificationModel', 'TaskModelForTextGeneration', + 'TokenClassificationModel', + 'TransformerCRFForNamedEntityRecognition', ], - 'token_classification': ['SbertForTokenClassification'], - 'table_question_answering': ['TableQuestionAnswering'], 'sentence_embedding': ['SentenceEmbedding'], - 'text_ranking': ['TextRanking'], 'T5': ['T5ForConditionalGeneration'], + 'gpt_neo': ['GPTNeoModel'], } import sys diff --git a/modelscope/models/nlp/backbones/bert.py b/modelscope/models/nlp/backbones/bert.py deleted file mode 100644 index aa513944..00000000 --- a/modelscope/models/nlp/backbones/bert.py +++ /dev/null @@ -1,7 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.models.nlp.bert import BertModel -from modelscope.utils.constant import Fields - -BACKBONES.register_module( - group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel) diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py deleted file mode 100644 index 74735520..00000000 --- a/modelscope/models/nlp/backbones/structbert.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import BACKBONES -from modelscope.models.nlp.structbert import SbertConfig -from modelscope.models.nlp.structbert import SbertModel as SbertModelTransform -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger - -logger = get_logger(__name__) - - -@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) -class SbertModel(TorchModel, SbertModelTransform): - - def __init__(self, model_dir=None, add_pooling_layer=True, **config): - """ - Args: - model_dir (str, optional): The model checkpoint directory. Defaults to None. - add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. 
- """ - config = SbertConfig(**config) - super().__init__(model_dir) - self.config = config - SbertModelTransform.__init__(self, config, add_pooling_layer) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return SbertModelTransform.forward( - self, input_ids, attention_mask, token_type_ids, position_ids, - head_mask, inputs_embeds, encoder_hidden_states, - encoder_attention_mask, past_key_values, use_cache, - output_attentions, output_hidden_states, return_dict, **kwargs) diff --git a/modelscope/models/nlp/bart/__init__.py b/modelscope/models/nlp/bart/__init__.py new file mode 100644 index 00000000..31912efc --- /dev/null +++ b/modelscope/models/nlp/bart/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .text_error_correction import BartForTextErrorCorrection diff --git a/modelscope/models/nlp/bart_for_text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py similarity index 100% rename from modelscope/models/nlp/bart_for_text_error_correction.py rename to modelscope/models/nlp/bart/text_error_correction.py diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py index cca79c2f..28a10f57 100644 --- a/modelscope/models/nlp/bert/__init__.py +++ b/modelscope/models/nlp/bert/__init__.py @@ -4,43 +4,33 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .modeling_bert import ( - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, + from .backbone import ( BertLayer, - BertLMHeadModel, BertModel, BertPreTrainedModel, - load_tf_weights_in_bert, ) - - from .configuration_bert import BertConfig, BertOnnxConfig - + from .configuration import BertConfig + from .fill_mask import BertForMaskedLM + from .text_ranking import BertForTextRanking + from .sentence_embedding import BertForSentenceEmbedding + from .text_classification import BertForSequenceClassification + from .token_classification import BertForTokenClassification + from .document_segmentation import BertForDocumentSegmentation else: _import_structure = { - 'configuration_bert': ['BertConfig', 'BertOnnxConfig'], + 'backbone': [ + 'BertModel', + 'BertPreTrainedModel', + ], + 'configuration': ['BertConfig'], + 'fill_mask': ['BertForMaskedLM'], + 'text_ranking': ['BertForTextRanking'], + 'sentence_embedding': ['BertForSentenceEmbedding'], + 'text_classification': ['BertForSequenceClassification'], + 'token_classification': ['BertForTokenClassification'], + 'document_segmentation': ['BertForDocumentSegmentation'], } - _import_structure['modeling_bert'] = [ - 'BertForMaskedLM', - 'BertForMultipleChoice', - 'BertForNextSentencePrediction', - 'BertForPreTraining', - 'BertForQuestionAnswering', - 'BertForSequenceClassification', - 'BertForTokenClassification', - 'BertLayer', - 'BertLMHeadModel', - 'BertModel', - 'BertPreTrainedModel', - 'load_tf_weights_in_bert', - ] - import sys sys.modules[__name__] = 
LazyImportModule( diff --git a/modelscope/models/nlp/bert/backbone.py b/modelscope/models/nlp/bert/backbone.py new file mode 100755 index 00000000..df0aebd2 --- /dev/null +++ b/modelscope/models/nlp/bert/backbone.py @@ -0,0 +1,952 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from transformers.activations import ACT2FN +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import (BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions) +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.logger import get_logger +from .configuration import BertConfig + +logger = get_logger(__name__) + +_CONFIG_FOR_DOC = 'BertConfig' + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model + # variable name and be able to load any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and + # exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + 
past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor + # where it is all zeros, which usually occurs when its auto-generated, + # registered buffer helps users when tracing the model without passing + # token_type_ids, solves issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, 'position_embedding_type', 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all + # cross attention key/value_states. Further calls to cross_attention + # layer can then reuse all cross-attention key/value_states (first + # "if" case) if uni-directional self-attention (decoder) save + # Tuple(torch.Tensor, torch.Tensor) of all previous decoder + # key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected + # key/value_states (third "elif" case) if encoder bi-directional + # self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention( + config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states 
= self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = BertAttention( + config, position_embedding_type='absolute') + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' + f'with cross-attention layers by setting `config.add_cross_attention=True`' + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. 
+ """ + + config_class = BertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir', None) + if model_dir is None: + config = BertConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + model.model_dir = model_dir + return model + + +@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.bert) +class BertModel(BertPreTrainedModel): + """The Bert Model transformer outputting raw hidden-states without any + specific head on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass + documentation for the generic methods the library implements for all its + model (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matter related to general usage and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the + parameters of the model. + Initializing with a config file does not load the weights associated + with the model, only the configuration. 
Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + The model can behave as an encoder (with only self-attention) as well as a + decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam + Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as a decoder the model needs to be initialized with the + `is_decoder` argument of the configuration set to `True`. To be used in a + Seq2Seq model, the model needs to be initialized with both `is_decoder` + argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` + is then expected as an input to the forward pass. + + + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): + config = BertConfig(**config) + model = cls(config, add_pooling_layer) + return model + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence token in the position + embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`.
+ + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, + *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a + plain tuple. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + Others (**kwargs): + Some additional parameters that might be passed in from an upstream + pipeline; they do not influence the results.
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] diff --git a/modelscope/models/nlp/bert/configuration_bert.py b/modelscope/models/nlp/bert/configuration.py similarity index 99% rename from modelscope/models/nlp/bert/configuration_bert.py rename to modelscope/models/nlp/bert/configuration.py index 2c9293ec..1e2cef95 100644 --- a/modelscope/models/nlp/bert/configuration_bert.py +++ b/modelscope/models/nlp/bert/configuration.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
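To make the backbone added above concrete, the following is a minimal sketch of how the renamed modules could be exercised directly. It assumes the configuration module keeps the usual HuggingFace-style BertConfig fields (vocab_size, hidden_size, num_hidden_layers, num_attention_heads, intermediate_size); the tiny sizes are illustrative only, and a real checkpoint would instead be loaded through Model.from_pretrained.

import torch

from modelscope.models.nlp.bert.backbone import BertModel
from modelscope.models.nlp.bert.configuration import BertConfig

# Toy, randomly initialized configuration; field names are assumed to match
# the usual HuggingFace BertConfig.
config = BertConfig(
    vocab_size=1000,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=256)
model = BertModel(config, add_pooling_layer=True)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (2, 16))  # (batch_size, sequence_length)
attention_mask = torch.ones_like(input_ids)

with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# (2, 16, 128) sequence states and (2, 128) pooled first-token states
print(model.extract_sequence_outputs(outputs).shape)
print(model.extract_pooled_outputs(outputs).shape)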
# diff --git a/modelscope/models/nlp/bert_for_document_segmentation.py b/modelscope/models/nlp/bert/document_segmentation.py similarity index 99% rename from modelscope/models/nlp/bert_for_document_segmentation.py rename to modelscope/models/nlp/bert/document_segmentation.py index dfa57597..b46c77e4 100644 --- a/modelscope/models/nlp/bert_for_document_segmentation.py +++ b/modelscope/models/nlp/bert/document_segmentation.py @@ -2,6 +2,7 @@ from typing import Any, Dict +import torch from torch import nn from torch.nn import CrossEntropyLoss from transformers.modeling_outputs import TokenClassifierOutput diff --git a/modelscope/models/nlp/bert/fill_mask.py b/modelscope/models/nlp/bert/fill_mask.py new file mode 100644 index 00000000..4f81f62d --- /dev/null +++ b/modelscope/models/nlp/bert/fill_mask.py @@ -0,0 +1,299 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel +from .configuration import BertConfig + +logger = logging.get_logger(__name__) + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) +class BertForMaskedLM(BertPreTrainedModel): + r"""Bert Model with a `language modeling` head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of BERT; the preprocessor of this model + is `modelscope.preprocessors.NLPPreprocessor`. + + Parameters: + config (:class:`BertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights.
+ """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: BertConfig, **kwargs): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail.
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. Indices + should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` + docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., + config.vocab_size]` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_bert_backbone_base_std') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_backbone_base_std') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError('The PAD token should be defined for generation') + + padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py deleted file mode 100755 index 7c1dfcf5..00000000 --- a/modelscope/models/nlp/bert/modeling_bert.py +++ /dev/null @@ -1,1961 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
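The docstring example above shows the checkpoint-based path through Model.from_pretrained and Preprocessor.from_pretrained. As a self-contained illustration of what BertForMaskedLM.forward returns, the sketch below runs the head on a toy, randomly initialized config; the BertConfig field names and the attribute-style access on AttentionFillMaskModelOutput are assumed to follow the usual HuggingFace/ModelScope conventions.

import torch

from modelscope.models.nlp.bert.configuration import BertConfig
from modelscope.models.nlp.bert.fill_mask import BertForMaskedLM

# Toy configuration with assumed HuggingFace-style field names.
config = BertConfig(
    vocab_size=1000,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=256)
model = BertForMaskedLM(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))
labels = input_ids.clone()
input_ids[0, 3] = 0  # pretend position 3 holds the mask token id

with torch.no_grad():
    out = model(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        labels=labels)

print(out.loss)  # cross-entropy over every position, since no label is set to -100
print(out.logits[0, 3].argmax().item())  # the model's guess for the masked position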
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model. """ - -import math -import warnings -from dataclasses import dataclass -from typing import Optional, Tuple - -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - MultipleChoiceModelOutput, NextSentencePredictorOutput, - QuestionAnsweringModelOutput, SequenceClassifierOutput, - TokenClassifierOutput) -from transformers.modeling_utils import (PreTrainedModel, - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.utils.logger import get_logger -from .configuration_bert import BertConfig - -logger = get_logger(__name__) - -_CONFIG_FOR_DOC = 'BertConfig' - - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model - # variable name and be able to load any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and - # exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - self.register_buffer( - 'position_ids', - torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros(self.position_ids.size(), dtype=torch.long), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor - # where it is all zeros, which usually occurs when its auto-generated, - # registered buffer helps users when tracing the model without passing - # token_type_ids, solves issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = 
self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - - def __init__(self, config, position_embedding_type=None): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, 'position_embedding_type', 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all - # cross attention key/value_states. Further calls to cross_attention - # layer can then reuse all cross-attention key/value_states (first - # "if" case) if uni-directional self-attention (decoder) save - # Tuple(torch.Tensor, torch.Tensor) of all previous decoder - # key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected - # key/value_states (third "elif" case) if encoder bi-directional - # self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class BertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - - def __init__(self, config, position_embedding_type=None): - super().__init__() - self.self = BertSelfAttention( - config, position_embedding_type=position_embedding_type) - self.output = BertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class BertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states 
= self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = BertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = BertAttention( - config, position_embedding_type='absolute') - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' - f'with cross-attention layers by setting `config.add_cross_attention=True`' - ) - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class BertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [BertLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - 
attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class BertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class BertLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = BertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class BertOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class BertPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface - for downloading and loading pretrained models. 
- """ - - config_class = BertConfig - base_model_prefix = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, BertEncoder): - module.gradient_checkpointing = value - - -@dataclass -class BertForPreTrainingOutput(ModelOutput): - """ - Output type of [`BertForPreTraining`]. - - Args: - loss (*optional*, returned when `labels` is provided, - `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the - next sequence prediction (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, - sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each - vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, - 2)`): - Prediction scores of the next sequence prediction (classification) - head (scores of True/False continuation before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + - one for the output of each layer) of shape `(batch_size, - sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -BERT_START_DOCSTRING = r""" - - This model inherits from [`PreTrainedModel`]. Check the superclass - documentation for the generic methods the library implements for all its - model (such as downloading or saving, resizing the input embeddings, pruning - heads etc.) - - This model is also a PyTorch - [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) - subclass. Use it as a regular PyTorch Module and refer to the PyTorch - documentation for all matter related to general usage and behavior. - - Parameters: - config ([`BertConfig`]): Model configuration class with all the - parameters of the model. 
- Initializing with a config file does not load the weights associated - with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model - weights. -""" - -BERT_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using [`BertTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the - inputs. Indices are selected in `[0, 1]`: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position - embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, - num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask - values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, - *optional*): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you want - more control over how to convert `input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a - plain tuple. -""" - - -@add_start_docstrings( - 'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.', - BERT_START_DOCSTRING, -) -class BertModel(BertPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a - decoder, in which case a layer of cross-attention is added between the - self-attention layers, following the architecture described in [Attention is - all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam - Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz - Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - `is_decoder` argument of the configuration set to `True`. To be used in a - Seq2Seq model, the model needs to initialized with both `is_decoder` - argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` - is then expected as an input to the forward pass. 
- """ - - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - - self.pooler = BertPooler(config) if add_pooling_layer else None - - # Initialize weights and apply final processing - self.post_init() - - @classmethod - def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): - config = BertConfig(**config) - model = cls(config, add_pooling_layer) - return model - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the - encoder. Used in the cross-attention if the model is configured as a - decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of - the encoder input. This mask is used in the cross-attention if the - model is configured as a decoder. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length - `config.n_layers` with each tuple having 4 tensors of shape - `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention - blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input only - the last `decoder_input_ids` (those that don't have their past key - value states given to this model) of shape `(batch_size, 1)` instead - of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned - and can be used to speed up decoding (see `past_key_values`). 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - -@add_start_docstrings( - """ - Bert Model with two heads on top as done during the pretraining: a `masked - language modeling` head and a `next sentence prediction (classification)` - head. 
- """, - BERT_START_DOCSTRING, -) -class BertForPreTraining(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the masked language modeling loss. Indices - should be in `[-100, 0, ..., config.vocab_size]` (see - `input_ids` docstring) Tokens with indices set to `-100` are - ignored (masked), the loss is only computed for the tokens with - labels in `[0, ..., config.vocab_size]` - next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, - *optional*): - Labels for computing the next sequence prediction - (classification) loss. Input should be a sequence pair (see - `input_ids` docstring) Indices should be in `[0, 1]`: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (`Dict[str, any]`, optional, defaults to *{}*): - Used to hide legacy arguments that have been deprecated. 
- - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, BertForPreTraining - >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.prediction_logits - >>> seq_relationship_logits = outputs.seq_relationship_logits - ``` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return BertForPreTrainingOutput( - loss=total_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, - BERT_START_DOCSTRING) -class BertLMHeadModel(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.bert = BertModel(config, add_pooling_layer=False) - self.cls = BertOnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the - encoder. Used in the cross-attention if the model is configured - as a decoder. 
- encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices - of the encoder input. This mask is used in the cross-attention - if the model is configured as a decoder. Mask values selected in - `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the left-to-right language modeling loss - (next word prediction). Indices should be in `[-100, 0, ..., - config.vocab_size]` (see `input_ids` docstring) Tokens with - indices set to `-100` are ignored (masked), the loss is only - computed for the tokens with labels n `[0, ..., - config.vocab_size]` - past_key_values (`tuple(tuple(torch.FloatTensor))` of length - `config.n_layers` with each tuple having 4 tensors of shape - `(batch_size, num_heads, sequence_length - 1, - embed_size_per_head)`): - Contains precomputed key and value hidden states of the - attention blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input - only the last `decoder_input_ids` (those that don't have their - past key value states given to this model) of shape - `(batch_size, 1)` instead of all `decoder_input_ids` of shape - `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are - returned and can be used to speed up decoding (see - `past_key_values`). - - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, BertLMHeadModel, - BertConfig >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') - >>> config = BertConfig.from_pretrained("bert-base-cased") - >>> config.is_decoder = True - >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.logits - ``` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def 
prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """Bert Model with a `language modeling` head on top. """, - BERT_START_DOCSTRING) -class BertForMaskedLM(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.bert = BertModel(config, add_pooling_layer=False) - self.cls = BertOnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the masked language modeling loss. 
Indices - should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` - docstring) Tokens with indices set to `-100` are ignored (masked), - the loss is only computed for the tokens with labels in `[0, ..., - config.vocab_size]` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - effective_batch_size = input_shape[0] - - # add a dummy token - if self.config.pad_token_id is None: - raise ValueError('The PAD token should be defined for generation') - - padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) - attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) - dummy_token = torch.full((effective_batch_size, 1), - self.config.pad_token_id, - dtype=torch.long, - device=input_ids.device) - input_ids = torch.cat([input_ids, dummy_token], dim=1) - - return {'input_ids': input_ids, 'attention_mask': attention_mask} - - -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, -) -class BertForNextSentencePrediction(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the next sequence prediction (classification) - loss. Input should be a sequence pair (see `input_ids` docstring). - Indices should be in `[0, 1]`: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. 
- - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, - BertForNextSentencePrediction >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') - - >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." - >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." - >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') - - >>> outputs = model(**encoding, labels=torch.LongTensor([1])) - >>> logits = outputs.logits - >>> assert logits[0, 0] < logits[0, 1] # next sentence was random - ``` - """ - - if 'next_sentence_label' in kwargs: - warnings.warn( - 'The `next_sentence_label` argument is deprecated, use `labels` instead.', - FutureWarning, - ) - labels = kwargs.pop('next_sentence_label') - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - seq_relationship_scores = self.cls(pooled_output) - - next_sentence_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct( - seq_relationship_scores.view(-1, 2), labels.view(-1)) - - if not return_dict: - output = (seq_relationship_scores, ) + outputs[2:] - return ((next_sentence_loss, ) - + output) if next_sentence_loss is not None else output - - return NextSentencePredictorOutput( - loss=next_sentence_loss, - logits=seq_relationship_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model transformer with a sequence classification/regression head on top - (a linear layer on top of the pooled output) e.g. for GLUE tasks. - """, - BERT_START_DOCSTRING, -) -class BertForSequenceClassification(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - - self.bert = BertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. - Indices should be in `[0, ..., config.num_labels - 1]`. If - `config.num_labels == 1` a regression loss is computed (Mean-Square - loss), If `config.num_labels > 1` a classification loss is computed - (Cross-Entropy). 
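The loss selection that follows this docstring can be summarised in a small standalone sketch (illustrative only; it paraphrases the `problem_type` branch in the code below):

```python
import torch

num_labels = 3
labels = torch.tensor([0, 2])                        # integer class ids
if num_labels == 1:
    problem_type = 'regression'                      # -> MSELoss
elif labels.dtype in (torch.long, torch.int):
    problem_type = 'single_label_classification'     # -> CrossEntropyLoss
else:
    problem_type = 'multi_label_classification'      # -> BCEWithLogitsLoss
print(problem_type)                                  # single_label_classification
```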
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a multiple choice classification head on top (a linear layer - on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. - """, - BERT_START_DOCSTRING, -) -class BertForMultipleChoice(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, 1) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format( - 'batch_size, num_choices, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. - Indices should be in `[0, ..., num_choices-1]` where `num_choices` - is the size of the second dimension of the input tensors. 
(See - `input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - """, - BERT_START_DOCSTRING, -) -class BertForTokenClassification(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config, add_pooling_layer=False) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the token classification loss. Indices should - be in `[0, ..., config.num_labels - 1]`. 
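The masking of padded positions performed in the token-classification loss below can be reproduced in isolation like this (a self-contained sketch with made-up shapes):

```python
import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(2, 4, 3)                        # (batch, seq_len, num_labels)
labels = torch.tensor([[1, 0, 2, 2], [0, 1, 1, 2]])
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

loss_fct = CrossEntropyLoss()                        # ignore_index defaults to -100
# Positions with attention_mask == 0 are remapped to ignore_index,
# so only "active" tokens contribute to the loss.
active_labels = torch.where(attention_mask.view(-1) == 1, labels.view(-1),
                            torch.tensor(loss_fct.ignore_index))
loss = loss_fct(logits.view(-1, 3), active_labels)
```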
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a span classification head on top for extractive - question-answering tasks like SQuAD (a linear layers on top of the - hidden-states output to compute `span start logits` and `span end logits`). - """, - BERT_START_DOCSTRING, -) -class BertForQuestionAnswering(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config, add_pooling_layer=False) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, - *optional*): - Labels for position (index) of the start of the labelled span for - computing the token classification loss. Positions are clamped to - the length of the sequence (`sequence_length`). Position outside of - the sequence are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for - computing the token classification loss. Positions are clamped to - the length of the sequence (`sequence_length`). Position outside of - the sequence are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/bert/sentence_embedding.py b/modelscope/models/nlp/bert/sentence_embedding.py new file mode 100644 index 00000000..f4c2620e --- /dev/null +++ b/modelscope/models/nlp/bert/sentence_embedding.py @@ -0,0 +1,113 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.metainfo import Models +from modelscope.models import Model +from modelscope.models.builder import MODELS +from modelscope.outputs import BackboneModelOutput +from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel + + +@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert) +class BertForSentenceEmbedding(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.config = config + setattr(self, self.base_model_prefix, + BertModel(config, add_pooling_layer=False)) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ) -> BackboneModelOutput: + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base') + >>> print(model(**preprocessor('This is a test'))) + """ + return self.base_model.forward( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_dir = kwargs.get('model_dir') + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + model.model_dir = model_dir + return model diff --git a/modelscope/models/nlp/bert/text_classification.py b/modelscope/models/nlp/bert/text_classification.py new file mode 100644 index 00000000..b1d18d0f --- /dev/null +++ b/modelscope/models/nlp/bert/text_classification.py @@ -0,0 +1,208 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel + +logger = logging.get_logger(__name__) + + +@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) +@MODELS.register_module(Tasks.nli, module_name=Models.bert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.bert) +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert) +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.bert) +class BertForSequenceClassification(BertPreTrainedModel): + r"""Bert Model transformer with a sequence classification/regression head on top + (a linear layer on top of the pooled output) e.g. for GLUE tasks. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of Bert, the preprocessor of this model + is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in ModelScope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. 
+ """ + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + setattr(self, self.base_model_prefix, BertModel(config)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.base_model.forward( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/bert/text_ranking.py b/modelscope/models/nlp/bert/text_ranking.py new file mode 100644 index 00000000..79a63045 --- /dev/null +++ b/modelscope/models/nlp/bert/text_ranking.py @@ -0,0 +1,89 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
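The ranking head added below scores each query's candidate passages jointly during training; since `target_label` is fixed at zero, the positive passage is expected to come first in every group. A hypothetical, self-contained sketch of that listwise loss:

```python
import torch

train_batch_size, group_size = 4, 8                       # assumed grouping, for illustration
logits = torch.randn(train_batch_size * group_size, 1)    # one relevance score per passage
scores = logits.view(train_batch_size, -1)                # (queries, passages per query)
target = torch.zeros(train_batch_size, dtype=torch.long)  # positive passage listed first
loss = torch.nn.CrossEntropyLoss()(scores, target)
```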
+ +import torch +import torch.utils.checkpoint + +from modelscope.metainfo import Models +from modelscope.models import Model +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import BertModel +from .text_classification import BertForSequenceClassification + +logger = logging.get_logger(__name__) + + +@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert) +class BertForTextRanking(BertForSequenceClassification): + + def __init__(self, config, **kwargs): + super().__init__(config) + self.train_batch_size = kwargs.get('train_batch_size', 4) + setattr(self, self.base_model_prefix, + BertModel(self.config, add_pooling_layer=True)) + self.register_buffer( + 'target_label', + torch.zeros(self.train_batch_size, dtype=torch.long)) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs) -> AttentionTextClassificationModelOutput: + outputs = self.base_model.forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + # backbone model should return pooled_output as its second output + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + if self.base_model.training: + scores = logits.view(self.train_batch_size, -1) + loss_fct = torch.nn.CrossEntropyLoss() + loss = loss_fct(scores, self.target_label) + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + ) + return AttentionTextClassificationModelOutput(logits=logits, ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (1 classes). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + num_labels = kwargs.get('num_labels', 1) + model_args = {} if num_labels is None else {'num_labels': num_labels} + + model_dir = kwargs.get('model_dir') + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_args) + model.model_dir = model_dir + return model diff --git a/modelscope/models/nlp/bert/token_classification.py b/modelscope/models/nlp/bert/token_classification.py new file mode 100644 index 00000000..5dc6b0ce --- /dev/null +++ b/modelscope/models/nlp/bert/token_classification.py @@ -0,0 +1,225 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import TokenClassifierOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel + +logger = logging.get_logger(__name__) + + +@MODELS.register_module(Tasks.token_classification, module_name=Models.bert) +@MODELS.register_module(Tasks.part_of_speech, module_name=Models.bert) +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) +class BertForTokenClassification(BertPreTrainedModel): + r"""Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks, word-segmentation. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of Bert, the preprocessor of this model + is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in ModelScope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + """ + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config, **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + + setattr(self, self.base_model_prefix, + BertModel(config, add_pooling_layer=False)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + offset_mapping=None, + label_mask=None, + ): + r""" + Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
+ + Indices can be obtained using + :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and + :meth:`transformers.PreTrainedTokenizer.__call__` for details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in ``[0, 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or + :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert :obj:`input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention + layers. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See + ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` + instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, + `optional`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If + :obj:`config.num_labels == 1` a regression loss is computed + (Mean-Square loss), If :obj:`config.num_labels > 1` a classification + loss is computed (Cross-Entropy). + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=offset_mapping, + ) diff --git a/modelscope/models/nlp/csanmt/__init__.py b/modelscope/models/nlp/csanmt/__init__.py new file mode 100644 index 00000000..85531617 --- /dev/null +++ b/modelscope/models/nlp/csanmt/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from .translation import CsanmtForTranslation diff --git a/modelscope/models/nlp/csanmt_for_translation.py b/modelscope/models/nlp/csanmt/translation.py similarity index 100% rename from modelscope/models/nlp/csanmt_for_translation.py rename to modelscope/models/nlp/csanmt/translation.py diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py index 830210ed..08b184e5 100644 --- a/modelscope/models/nlp/deberta_v2/__init__.py +++ b/modelscope/models/nlp/deberta_v2/__init__.py @@ -22,38 +22,28 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_deberta_v2 import DebertaV2Config - from .tokenization_deberta_v2 import DebertaV2Tokenizer - from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast - - from .modeling_deberta_v2 import ( - DebertaV2ForMaskedLM, - DebertaV2ForMultipleChoice, - DebertaV2ForQuestionAnswering, - DebertaV2ForSequenceClassification, - DebertaV2ForTokenClassification, + from .configuration import DebertaV2Config + from .tokenization import DebertaV2Tokenizer + from .tokenization_fast import DebertaV2TokenizerFast + from .backbone import ( DebertaV2Model, DebertaV2PreTrainedModel, ) + from .fill_mask import DebertaV2ForMaskedLM else: _import_structure = { - 'configuration_deberta_v2': - ['DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config'], - 'tokenization_deberta_v2': ['DebertaV2Tokenizer'] + 'configuration': ['DebertaV2Config'], + 'tokenization': ['DebertaV2Tokenizer'], + 'tokenization_fast': ['DebertaV2TokenizerFast'], + 'backbone': [ + 'DebertaV2Model', + 'DebertaV2PreTrainedModel', + ], + 'fill_mask': [ + 'DebertaV2ForMaskedLM', + ] } - _import_structure['tokenization_deberta_v2_fast'] = [ - 'DebertaV2TokenizerFast' - ] - _import_structure['modeling_deberta_v2'] = [ - 'DebertaV2ForMaskedLM', - 'DebertaV2ForMultipleChoice', - 'DebertaV2ForQuestionAnswering', - 'DebertaV2ForSequenceClassification', - 'DebertaV2ForTokenClassification', - 'DebertaV2Model', - 'DebertaV2PreTrainedModel', - ] import sys sys.modules[__name__] = LazyImportModule( diff --git a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py b/modelscope/models/nlp/deberta_v2/backbone.py similarity index 64% rename from modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py rename to modelscope/models/nlp/deberta_v2/backbone.py index 1c6b9071..cca38133 100644 --- a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py +++ b/modelscope/models/nlp/deberta_v2/backbone.py @@ -20,28 +20,22 @@ from typing import Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss +from torch.nn import LayerNorm from transformers.activations import ACT2FN -from transformers.file_utils import (add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward) -from transformers.modeling_outputs import (BaseModelOutput, MaskedLMOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, - SequenceClassifierOutput, - TokenClassifierOutput) +from transformers.modeling_outputs import BaseModelOutput from transformers.modeling_utils import PreTrainedModel from transformers.pytorch_utils import softmax_backward_data +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput from modelscope.utils import 
logger as logging -from .configuration_deberta_v2 import DebertaV2Config +from modelscope.utils.constant import Tasks +from .configuration import DebertaV2Config logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = 'DebertaV2Config' -_TOKENIZER_FOR_DOC = 'DebertaV2Tokenizer' -_CHECKPOINT_FOR_DOC = 'nlp_debertav2_fill-mask_chinese-lite' - # Copied from transformers.models.deberta.modeling_deberta.ContextPooler class ContextPooler(nn.Module): @@ -1006,7 +1000,7 @@ class DebertaV2Embeddings(nn.Module): # Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 -class DebertaV2PreTrainedModel(PreTrainedModel): +class DebertaV2PreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. @@ -1018,6 +1012,10 @@ class DebertaV2PreTrainedModel(PreTrainedModel): _keys_to_ignore_on_load_unexpected = ['position_embeddings'] supports_gradient_checkpointing = True + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): @@ -1037,8 +1035,24 @@ class DebertaV2PreTrainedModel(PreTrainedModel): if isinstance(module, DebertaV2Encoder): module.gradient_checkpointing = value + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = DebertaV2Config(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model + + +@MODELS.register_module(Tasks.backbone, module_name=Models.deberta_v2) +# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 +class DebertaV2Model(DebertaV2PreTrainedModel): + """The bare DeBERTa_v2 Model transformer outputting raw hidden-states without any specific head on top. -DEBERTA_START_DOCSTRING = r""" The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two @@ -1048,65 +1062,13 @@ DEBERTA_START_DOCSTRING = r""" Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - Parameters: - config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model. + config (`DebertaV2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -DEBERTA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, - 1]`: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert *input_ids* indices into associated vectors than the - model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - 'The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.', - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 -class DebertaV2Model(DebertaV2PreTrainedModel): + configuration. + """ - def __init__(self, config): + def __init__(self, config, **kwargs): super().__init__(config) self.embeddings = DebertaV2Embeddings(config) @@ -1130,14 +1092,6 @@ class DebertaV2Model(DebertaV2PreTrainedModel): raise NotImplementedError( 'The prune function is not implemented in DeBERTa model.') - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutput, - config_class=_CONFIG_FOR_DOC, - ) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1148,7 +1102,53 @@ class DebertaV2Model(DebertaV2PreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> Union[Tuple, AttentionBackboneModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`): + Indices of input sequence tokens in the vocabulary. + + attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. 
+ + position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a dataclass instead of a plain tuple. + + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite') + >>> print(model(**preprocessor('这是个测试'))) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else @@ -1216,574 +1216,9 @@ class DebertaV2Model(DebertaV2PreTrainedModel): return (sequence_output, ) + encoder_outputs[ (1 if output_hidden_states else 2):] - return BaseModelOutput( + return AttentionBackboneModelOutput( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, attentions=encoder_outputs.attentions, ) - - -@add_start_docstrings( - """DeBERTa Model with a `language modeling` head on top.""", - DEBERTA_START_DOCSTRING) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2 -class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - self.deberta = DebertaV2Model(config) - self.cls = DebertaV2OnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: 
Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., - config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the - loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[1:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta -class DebertaV2PredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta -class DebertaV2LMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = DebertaV2PredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta -class DebertaV2OnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = DebertaV2LMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -@add_start_docstrings( - """ - DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2 -class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - num_labels = getattr(config, 'num_labels', 2) - self.num_labels = num_labels - - self.deberta = DebertaV2Model(config) - self.pooler = ContextPooler(config) - output_dim = self.pooler.output_dim - - self.classifier = nn.Linear(output_dim, num_labels) - drop_out = getattr(config, 'cls_dropout', None) - drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.deberta.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - self.deberta.set_input_embeddings(new_embeddings) - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - encoder_layer = outputs[0] - pooled_output = self.pooler(encoder_layer) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - # regression task - loss_fn = nn.MSELoss() - logits = logits.view(-1).to(labels.dtype) - loss = loss_fn(logits, labels.view(-1)) - elif labels.dim() == 1 or labels.size(-1) == 1: - label_index = (labels >= 0).nonzero() - labels = labels.long() - if label_index.size(0) > 0: - labeled_logits = torch.gather( - logits, 0, - label_index.expand( - label_index.size(0), logits.size(1))) - labels = torch.gather(labels, 0, label_index.view(-1)) - loss_fct = CrossEntropyLoss() - loss = loss_fct( - labeled_logits.view(-1, self.num_labels).float(), - labels.view(-1)) - else: - loss = torch.tensor(0).to(logits) - else: - log_softmax = nn.LogSoftmax(-1) - loss = -((log_softmax(logits) * labels).sum(-1)).mean() - elif self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions) - - -@add_start_docstrings( - """ - DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 -class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.deberta = DebertaV2Model(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions) - - -@add_start_docstrings( - """ - DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2 -class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.deberta = DebertaV2Model(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - start_positions: Optional[torch.Tensor] = None, - end_positions: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. - """, - DEBERTA_START_DOCSTRING, -) -class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - num_labels = getattr(config, 'num_labels', 2) - self.num_labels = num_labels - - self.deberta = DebertaV2Model(config) - self.pooler = ContextPooler(config) - output_dim = self.pooler.output_dim - - self.classifier = nn.Linear(output_dim, 1) - drop_out = getattr(config, 'cls_dropout', None) - drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) - - self.init_weights() - - def get_input_embeddings(self): - return self.deberta.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - self.deberta.set_input_embeddings(new_embeddings) - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., - num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See - `input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - flat_input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - flat_position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - flat_token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - flat_attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - flat_inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self.deberta( - flat_input_ids, - position_ids=flat_position_ids, - token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, - inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - encoder_layer = outputs[0] - pooled_output = self.pooler(encoder_layer) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py b/modelscope/models/nlp/deberta_v2/configuration.py similarity index 98% rename from modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py rename to modelscope/models/nlp/deberta_v2/configuration.py index 65e8f0b7..7921ca2f 100644 --- a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py +++ b/modelscope/models/nlp/deberta_v2/configuration.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config""" -from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union from transformers import PretrainedConfig diff --git a/modelscope/models/nlp/deberta_v2/fill_mask.py b/modelscope/models/nlp/deberta_v2/fill_mask.py new file mode 100644 index 00000000..ed127d4c --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/fill_mask.py @@ -0,0 +1,230 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the Hugging Face Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
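
For readers tracing the `DebertaV2ForMultipleChoice` code removed above (these task heads now live only in the upstream `transformers` implementation), here is a toy, self-contained sketch of the flatten-then-regroup pattern it relies on. The tensor sizes and the random stand-in for the pooled encoder output are assumptions for illustration, not part of this patch.

import torch

batch_size, num_choices, seq_len, hidden = 2, 3, 5, 8
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

# (batch, choices, seq) -> (batch * choices, seq): the encoder scores every
# (example, choice) pair as an ordinary 2-D batch.
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
assert flat_input_ids.shape == (batch_size * num_choices, seq_len)

# Stand-in for "encode, pool, and classify each sequence to a single score".
pooled_output = torch.randn(batch_size * num_choices, hidden)
classifier = torch.nn.Linear(hidden, 1)
logits = classifier(pooled_output)                # (batch * choices, 1)

# Regroup so each row holds one example's choice scores; cross-entropy over
# dim=-1 then ranks the choices against the gold choice index.
reshaped_logits = logits.view(-1, num_choices)    # (batch, choices)
labels = torch.tensor([0, 2])                     # gold choice per example
loss = torch.nn.functional.cross_entropy(reshaped_logits, labels)
print(reshaped_logits.shape, loss.item())
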
+ +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils.constant import Tasks +from .backbone import DebertaV2Model, DebertaV2PreTrainedModel + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2 +@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2) +class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): + r"""DeBERTa_v2 Model with a `language modeling` head on top. + + The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled + Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build + on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two + improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Preprocessor: + This is the fill_mask model of Deberta_v2, the preprocessor of this model + is `modelscope.preprocessors.NLPPreprocessor`. + + Parameters: + config (`DebertaV2Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. + """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config, **kwargs): + super().__init__(config) + + self.deberta = DebertaV2Model(config) + self.cls = DebertaV2OnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, AttentionFillMaskModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`): + Indices of input sequence tokens in the vocabulary. + + attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. 
+ + position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range `[0, config.max_position_embeddings - 1]`. + + inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert *input_ids* indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a dataclass instead of a plain tuple. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。')) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[1:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + input_ids=input_ids, + attentions=outputs.attentions, + hidden_states=outputs.hidden_states) + + +# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta +class DebertaV2PredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = 
nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta +class DebertaV2LMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = DebertaV2PredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta +class DebertaV2OnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = DebertaV2LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py b/modelscope/models/nlp/deberta_v2/tokenization.py similarity index 100% rename from modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py rename to modelscope/models/nlp/deberta_v2/tokenization.py diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py b/modelscope/models/nlp/deberta_v2/tokenization_fast.py similarity index 99% rename from modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py rename to modelscope/models/nlp/deberta_v2/tokenization_fast.py index a1fcecf4..913ea5bd 100644 --- a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py +++ b/modelscope/models/nlp/deberta_v2/tokenization_fast.py @@ -24,7 +24,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast from modelscope.utils import logger as logging if is_sentencepiece_available(): - from .tokenization_deberta_v2 import DebertaV2Tokenizer + from .tokenization import DebertaV2Tokenizer else: DebertaV2Tokenizer = None diff --git a/modelscope/models/nlp/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py index 9cae8cc8..051cc8f2 100644 --- a/modelscope/models/nlp/gpt3/__init__.py +++ b/modelscope/models/nlp/gpt3/__init__.py @@ -4,16 +4,16 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_gpt3 import GPT3Config - from .modeling_gpt3 import GPT3Model - from .gpt3_for_text_generation import GPT3ForTextGeneration - from .tokenizer_gpt3 import JiebaBPETokenizer + from .configuration import GPT3Config + from .backbone import GPT3Model + from .text_generation import GPT3ForTextGeneration + from .tokenizer import JiebaBPETokenizer else: _import_structure = { - 'configuration_gpt3': ['GPT3Config'], - 'modeling_gpt3': ['GPT3Model'], - 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], - 'tokenizer_gpt3': ['JiebaBPETokenizer'], + 'configuration': ['GPT3Config'], + 'backbone': ['GPT3Model'], + 'text_generation': ['GPT3ForTextGeneration'], + 'tokenizer': ['JiebaBPETokenizer'], } import sys diff --git 
a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/backbone.py similarity index 99% rename from modelscope/models/nlp/gpt3/modeling_gpt3.py rename to modelscope/models/nlp/gpt3/backbone.py index 2c23f5db..587c7a9d 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/backbone.py @@ -24,7 +24,7 @@ from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel from modelscope.utils.constant import ModelFile -from .configuration_gpt3 import GPT3Config +from .configuration import GPT3Config class GPT3SelfAttention(nn.Module): diff --git a/modelscope/models/nlp/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration.py similarity index 100% rename from modelscope/models/nlp/gpt3/configuration_gpt3.py rename to modelscope/models/nlp/gpt3/configuration.py diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/text_generation.py similarity index 100% rename from modelscope/models/nlp/gpt3/gpt3_for_text_generation.py rename to modelscope/models/nlp/gpt3/text_generation.py diff --git a/modelscope/models/nlp/gpt3/tokenizer_gpt3.py b/modelscope/models/nlp/gpt3/tokenizer.py similarity index 100% rename from modelscope/models/nlp/gpt3/tokenizer_gpt3.py rename to modelscope/models/nlp/gpt3/tokenizer.py diff --git a/modelscope/models/nlp/backbones/__init__.py b/modelscope/models/nlp/gpt_neo/__init__.py similarity index 83% rename from modelscope/models/nlp/backbones/__init__.py rename to modelscope/models/nlp/gpt_neo/__init__.py index 749cf995..ef5fdee5 100644 --- a/modelscope/models/nlp/backbones/__init__.py +++ b/modelscope/models/nlp/gpt_neo/__init__.py @@ -4,14 +4,12 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .structbert import SbertModel + from .backbone import GPTNeoModel else: _import_structure = { - 'structbert': ['SbertModel'], + 'backbone': ['GPTNeoModel'], } - import sys - sys.modules[__name__] = LazyImportModule( __name__, globals()['__file__'], diff --git a/modelscope/models/nlp/backbones/gpt_neo.py b/modelscope/models/nlp/gpt_neo/backbone.py similarity index 74% rename from modelscope/models/nlp/backbones/gpt_neo.py rename to modelscope/models/nlp/gpt_neo/backbone.py index a2d0c374..a809bcde 100644 --- a/modelscope/models/nlp/backbones/gpt_neo.py +++ b/modelscope/models/nlp/gpt_neo/backbone.py @@ -4,10 +4,11 @@ from transformers import GPTNeoModel as GPTNeoModelTransform from modelscope.metainfo import Models from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Tasks -@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.gpt_neo) +@BACKBONES.register_module( + group_key=Tasks.backbone, module_name=Models.gpt_neo) class GPTNeoModel(GPTNeoModelTransform): def __init__(self, **kwargs): diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py index 3f19ca67..443f93df 100644 --- a/modelscope/models/nlp/heads/token_classification_head.py +++ b/modelscope/models/nlp/heads/token_classification_head.py @@ -37,9 +37,9 @@ class TokenClassificationHead(TorchHead): sequence_output = inputs sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - return {OutputKeys.LOGITS: logits} + return logits def compute_loss(self, outputs: Dict[str, torch.Tensor], labels) -> Dict[str, torch.Tensor]: 
logits = outputs[OutputKeys.LOGITS] - return {OutputKeys.LOSS: F.cross_entropy(logits, labels)} + return F.cross_entropy(logits, labels) diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py deleted file mode 100644 index b7a890c1..00000000 --- a/modelscope/models/nlp/masked_language.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import \ - BertForMaskedLM as BertForMaskedLMTransformer -from modelscope.models.nlp.deberta_v2 import \ - DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer -from modelscope.models.nlp.structbert import SbertForMaskedLM -from modelscope.models.nlp.veco import \ - VecoForMaskedLM as VecoForMaskedLMTransformer -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM'] - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) -class StructBertForMaskedLM(TorchModel, SbertForMaskedLM): - """Structbert for MLM model. - - Inherited from structbert.SbertForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - SbertForMaskedLM.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = SbertForMaskedLM.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(SbertForMaskedLM, StructBertForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) -class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer): - """Bert for MLM model. - - Inherited from transformers.BertForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - BertForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = BertForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(BertForMaskedLMTransformer, - BertForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) -class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): - """Veco for MLM model. - - Inherited from veco.VecoForMaskedLM and TorchModel, so this class can be registered into Model sets. 
- """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - VecoForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = VecoForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(VecoForMaskedLMTransformer, - VecoForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2) -class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer): - """Deberta v2 for MLM model. - - Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - DebertaV2ForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = DebertaV2ForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(DebertaV2ForMaskedLMTransformer, - DebertaV2ForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) diff --git a/modelscope/models/nlp/palm_v2/__init__.py b/modelscope/models/nlp/palm_v2/__init__.py index 3a9960ec..45ab6621 100644 --- a/modelscope/models/nlp/palm_v2/__init__.py +++ b/modelscope/models/nlp/palm_v2/__init__.py @@ -17,19 +17,19 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_palm import PalmConfig - from .modeling_palm import ( + from .configuration import PalmConfig + from .backbone import ( AbsSummarizer, PalmForConditionalGeneration, Translator, ) - from .palm_for_text_generation import PalmForTextGeneration + from .text_generation import PalmForTextGeneration else: _import_structure = { - 'configuration_palm': ['PalmConfig'], - 'modeling_palm': + 'configuration': ['PalmConfig'], + 'backbone': ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'], - 'palm_for_text_generation': ['PalmForTextGeneration'], + 'text_generation': ['PalmForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/backbone.py similarity index 99% rename from modelscope/models/nlp/palm_v2/modeling_palm.py rename to modelscope/models/nlp/palm_v2/backbone.py index f395ebd4..3e0ff805 100644 --- a/modelscope/models/nlp/palm_v2/modeling_palm.py +++ b/modelscope/models/nlp/palm_v2/backbone.py @@ -35,7 +35,7 @@ from transformers.activations import ACT2FN from transformers.modeling_utils import PreTrainedModel from modelscope.utils import logger as logging -from .configuration_palm import PalmConfig +from .configuration import PalmConfig from .dureader_eval import compute_bleu_rouge, normalize 
CONFIG_NAME = 'config.json' diff --git a/modelscope/models/nlp/palm_v2/configuration_palm.py b/modelscope/models/nlp/palm_v2/configuration.py similarity index 100% rename from modelscope/models/nlp/palm_v2/configuration_palm.py rename to modelscope/models/nlp/palm_v2/configuration.py diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py similarity index 100% rename from modelscope/models/nlp/palm_v2/palm_for_text_generation.py rename to modelscope/models/nlp/palm_v2/text_generation.py diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py index dbc20751..589a636a 100644 --- a/modelscope/models/nlp/plug/__init__.py +++ b/modelscope/models/nlp/plug/__init__.py @@ -4,13 +4,13 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_plug import PlugNLGConfig - from .modeling_plug import PlugModel + from .configuration import PlugNLGConfig + from .backbone import PlugModel from .distributed_plug import DistributedPlug else: _import_structure = { - 'configuration_plug': ['PlugNLGConfig'], - 'modeling_plug': ['PlugModel'], + 'configuration': ['PlugNLGConfig'], + 'backbone': ['PlugModel'], 'distributed_plug': ['DistributedPlug'], } diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/backbone.py similarity index 99% rename from modelscope/models/nlp/plug/modeling_plug.py rename to modelscope/models/nlp/plug/backbone.py index df00006b..7f3f12de 100644 --- a/modelscope/models/nlp/plug/modeling_plug.py +++ b/modelscope/models/nlp/plug/backbone.py @@ -28,7 +28,7 @@ from torch import nn from modelscope.utils.nlp.distributed import (normal_init_method, scaled_init_method) -from .configuration_plug import PlugNLGConfig, PlugNLUConfig +from .configuration import PlugNLGConfig, PlugNLUConfig logger = logging.getLogger(__name__) diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration.py similarity index 100% rename from modelscope/models/nlp/plug/configuration_plug.py rename to modelscope/models/nlp/plug/configuration.py diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py index 06009ba1..c72e92ba 100644 --- a/modelscope/models/nlp/plug/distributed_plug.py +++ b/modelscope/models/nlp/plug/distributed_plug.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Dict @@ -14,7 +15,7 @@ from modelscope.utils.nlp.distributed import initialize_distributed from modelscope.utils.nlp.load_checkpoint import pre_load from modelscope.utils.torch_utils import set_random_seed_mpu from . 
import PlugModel -from .configuration_plug import PlugNLGConfig +from .configuration import PlugNLGConfig logger = get_logger(__name__) diff --git a/modelscope/models/nlp/ponet/__init__.py b/modelscope/models/nlp/ponet/__init__.py index 6d26b194..df996167 100644 --- a/modelscope/models/nlp/ponet/__init__.py +++ b/modelscope/models/nlp/ponet/__init__.py @@ -18,16 +18,16 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_ponet import PoNetConfig - from .modeling_ponet import (PoNetForMaskedLM, PoNetModel, - PoNetPreTrainedModel) - from .tokenization_ponet import PoNetTokenizer + from .configuration import PoNetConfig + from .backbone import (PoNetModel, PoNetPreTrainedModel) + from .tokenization import PoNetTokenizer + from .fill_mask import PoNetForMaskedLM else: _import_structure = { - 'configuration_ponet': ['PoNetConfig'], - 'modeling_ponet': - ['PoNetForMaskedLM', 'PoNetModel', 'PoNetPreTrainedModel'], - 'tokenization_ponet': ['PoNetTokenizer'], + 'configuration': ['PoNetConfig'], + 'backbone': ['PoNetModel', 'PoNetPreTrainedModel'], + 'fill_mask': ['PoNetForMaskedLM'], + 'tokenization': ['PoNetTokenizer'], } import sys diff --git a/modelscope/models/nlp/ponet/modeling_ponet.py b/modelscope/models/nlp/ponet/backbone.py similarity index 55% rename from modelscope/models/nlp/ponet/modeling_ponet.py rename to modelscope/models/nlp/ponet/backbone.py index f37954db..fcc62fa2 100644 --- a/modelscope/models/nlp/ponet/modeling_ponet.py +++ b/modelscope/models/nlp/ponet/backbone.py @@ -16,43 +16,32 @@ """PyTorch PoNet model. """ import math -from dataclasses import dataclass from distutils.version import LooseVersion -from typing import Optional, Tuple import torch import torch.utils.checkpoint from packaging import version from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - SequenceClassifierOutput, TokenClassifierOutput) +from transformers.modeling_outputs import \ + BaseModelOutputWithPastAndCrossAttentions from transformers.modeling_utils import (PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer) -from transformers.models.bert.modeling_bert import \ - load_tf_weights_in_bert as load_tf_weights_in_ponet +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger -from .configuration_ponet import PoNetConfig +from .configuration import PoNetConfig logger = get_logger(__name__) is_pytorch_12plus = LooseVersion(torch.__version__) >= LooseVersion('1.12.0') -_CHECKPOINT_FOR_DOC = 'ponet-base-uncased' -_CONFIG_FOR_DOC = 'PoNetConfig' -_TOKENIZER_FOR_DOC = 'PoNetTokenizer' - CLS_ID = 101 EOS_ID = 102 @@ -609,82 +598,20 @@ class PoNetPooler(nn.Module): return pooled_output -class PoNetPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = 
nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class PoNetLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = PoNetPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class PoNetOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = PoNetLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class PoNetPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = PoNetLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 3) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class PoNetPreTrainedModel(PreTrainedModel): +class PoNetPreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = PoNetConfig - load_tf_weights = load_tf_weights_in_ponet base_model_prefix = 'ponet' _keys_to_ignore_on_load_missing = [r'position_ids'] + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + def _init_weights(self, module): """Initialize the weights""" if isinstance(module, nn.Linear): @@ -703,51 +630,22 @@ class PoNetPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) - -@dataclass -class PoNetForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~transformers.PoNetForPreTraining`. - - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - mlm_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Masked language modeling loss. - sop_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - sop loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states - (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed - or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed - or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - mlm_loss: Optional[torch.FloatTensor] = None - sop_loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = PoNetConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model -PONET_START_DOCSTRING = r""" +@MODELS.register_module(Tasks.backbone, module_name=Models.ponet) +class PoNetModel(PoNetPreTrainedModel): + """The bare PoNet Model transformer outputting raw hidden-states without any specific head on top. This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, @@ -763,65 +661,6 @@ PONET_START_DOCSTRING = r""" Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. -""" - -PONET_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, - 1]``: - - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. - - `What are token type IDs? 
<../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - config.max_position_embeddings - 1]``. - - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. -""" - - -@add_start_docstrings( - 'The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.', - PONET_START_DOCSTRING, -) -class PoNetModel(PoNetPreTrainedModel): - """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is @@ -834,8 +673,8 @@ class PoNetModel(PoNetPreTrainedModel): input to the forward pass. """ - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) + def __init__(self, config, add_pooling_layer=True, **kwargs): + super().__init__(config, **kwargs) self.config = config self.embeddings = PoNetEmbeddings(config) @@ -859,14 +698,6 @@ class PoNetModel(PoNetPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) def forward( self, input_ids=None, @@ -885,6 +716,49 @@ class PoNetModel(PoNetPreTrainedModel): return_dict=None, ): r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -906,6 +780,16 @@ class PoNetModel(PoNetPreTrainedModel): use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). + + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> print(model(**preprocessor('这是个测试'))) """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1006,7 +890,7 @@ class PoNetModel(PoNetPreTrainedModel): if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, @@ -1014,578 +898,3 @@ class PoNetModel(PoNetPreTrainedModel): attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) - - -@add_start_docstrings( - """ - PoNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next - sentence prediction (classification)` head. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForPreTraining(PoNetPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.ponet = PoNetModel(config) - self.cls = PoNetPreTrainingHeads(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=PoNetForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): - Used to hide legacy arguments that have been deprecated. 
- - Returns: - - Example:: - - >>> from transformers import PoNetTokenizer, PoNetForPreTraining - >>> import torch - - >>> tokenizer = PoNetTokenizer.from_pretrained('ponet-base-uncased') - >>> model = PoNetForPreTraining.from_pretrained('ponet-base-uncased') - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.prediction_logits - >>> seq_relationship_logits = outputs.seq_relationship_logits - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - masked_lm_loss = None - next_sentence_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 3), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss, masked_lm_loss, next_sentence_loss) - + output) if total_loss is not None else output - - return PoNetForPreTrainingOutput( - loss=total_loss, - mlm_loss=masked_lm_loss, - sop_loss=next_sentence_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """PoNet Model with a `language modeling` head on top for CLM fine-tuning. 
""", - PONET_START_DOCSTRING) -class PoNetLMHeadModel(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `PoNetLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.cls = PoNetOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj: - `(batch_size, sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are - ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of shape : - obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- - Returns: - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """PoNet Model with a `language modeling` head on top. 
""", - PONET_START_DOCSTRING) -class PoNetForMaskedLM(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.cls = PoNetOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - segment_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - PoNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled - output) e.g. for GLUE tasks. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForSequenceClassification(PoNetPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - - self.ponet = PoNetModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - PoNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForTokenClassification(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - - 1]``. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/ponet/configuration_ponet.py b/modelscope/models/nlp/ponet/configuration.py similarity index 96% rename from modelscope/models/nlp/ponet/configuration_ponet.py rename to modelscope/models/nlp/ponet/configuration.py index 70294fc2..7dfaba48 100644 --- a/modelscope/models/nlp/ponet/configuration_ponet.py +++ b/modelscope/models/nlp/ponet/configuration.py @@ -34,8 +34,7 @@ class PoNetConfig(PretrainedConfig): Args: vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or - :class:`~transformers.TFBertModel`. + :obj:`inputs_ids` passed. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
num_hidden_layers (:obj:`int`, `optional`, defaults to 12): @@ -55,8 +54,7 @@ class PoNetConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or - :class:`~transformers.TFBertModel`. + The vocabulary size of the :obj:`token_type_ids` passed. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): diff --git a/modelscope/models/nlp/ponet/fill_mask.py b/modelscope/models/nlp/ponet/fill_mask.py new file mode 100644 index 00000000..fb09efc0 --- /dev/null +++ b/modelscope/models/nlp/ponet/fill_mask.py @@ -0,0 +1,252 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .backbone import PoNetModel, PoNetPreTrainedModel + +logger = get_logger(__name__) + + +class PoNetPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class PoNetLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = PoNetPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class PoNetOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = PoNetLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet) +class PoNetForMaskedLM(PoNetPreTrainedModel): + r"""PoNet Model with a `language modeling` head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of PoNet, the preprocessor of this model + is `modelscope.preprocessors.FillMaskPoNetPreprocessor`. + + Parameters: + config (:class:`~modelscope.models.nlp.ponet.PoNetConfig`): + Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config, **kwargs): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.ponet = PoNetModel(config, add_pooling_layer=False) + self.cls = PoNetOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + segment_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`('batch_size, sequence_length', hidden_size)`, + `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。')) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ponet( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) diff --git a/modelscope/models/nlp/ponet/tokenization_ponet.py b/modelscope/models/nlp/ponet/tokenization.py similarity index 98% rename from modelscope/models/nlp/ponet/tokenization_ponet.py rename to modelscope/models/nlp/ponet/tokenization.py index 21544886..2da91545 100644 --- a/modelscope/models/nlp/ponet/tokenization_ponet.py +++ b/modelscope/models/nlp/ponet/tokenization.py @@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from transformers.file_utils import PaddingStrategy from transformers.models.bert.tokenization_bert import BertTokenizer +from transformers.tokenization_utils import BatchEncoding, EncodedInput from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger diff --git a/modelscope/models/nlp/ponet_for_masked_language.py b/modelscope/models/nlp/ponet_for_masked_language.py deleted file mode 100644 index 11f4bc11..00000000 --- a/modelscope/models/nlp/ponet_for_masked_language.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
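# Why the wrapper module below can be deleted: the new ponet/fill_mask.py registered above
# returns a structured AttentionFillMaskModelOutput that already carries input_ids, which is
# the only thing this adapter class added on top of the transformer implementation. A hedged
# sketch of consuming that output (model id and preprocessor taken from the docstring example
# above; illustrative only):
from modelscope.models import Model
from modelscope.preprocessors import Preprocessor

model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base')
preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base')
outputs = model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))
logits = outputs.logits        # (batch_size, sequence_length, vocab_size) prediction scores
input_ids = outputs.input_ids  # carried through so the fill-mask pipeline can locate the [MASK] positions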
- -from typing import Any, Dict - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.ponet import \ - PoNetForMaskedLM as PoNetForMaskedLMTransformer -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['PoNetForMaskedLM'] - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet) -class PoNetForMaskedLM(TorchModel, PoNetForMaskedLMTransformer): - """PoNet for MLM model.'. - - Inherited from ponet.PoNetForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - PoNetForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = PoNetForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(PoNetForMaskedLMTransformer, - PoNetForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py deleted file mode 100644 index 340c133f..00000000 --- a/modelscope/models/nlp/sentence_embedding.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
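# For reference while reading the deletion below: SentenceEmbedding.postprocess scored the
# first (query) sentence against the remaining sentences with a plain dot product over their
# [CLS] embeddings. A self-contained sketch of that scoring step with dummy data (the
# embedding size 768 is illustrative):
import numpy as np

embs = np.random.rand(3, 768).astype('float32')            # [CLS] embeddings: 1 query + 2 candidates
scores = np.dot(embs[0:1, :], embs[1:, :].T).tolist()[0]   # dot-product similarities, len(scores) == 2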
- -from typing import Any, Dict - -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.utils.constant import Tasks - -__all__ = ['SentenceEmbedding'] - - -@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert) -class SentenceEmbedding(TorchModel, SbertPreTrainedModel): - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=False) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'predictions': array([1]), # lable 0-negative 1-positive - 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - return self.base_model(**input) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - embs = inputs['last_hidden_state'][:, 0].cpu().numpy() - num_sent = embs.shape[0] - if num_sent >= 2: - scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ], - (1, 0))).tolist()[0] - else: - scores = [] - result = {'text_embedding': embs, 'scores': scores} - - return result - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_args = {} - - return super(SbertPreTrainedModel, SentenceEmbedding).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py deleted file mode 100644 index 156c615c..00000000 --- a/modelscope/models/nlp/sequence_classification.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from abc import abstractmethod - -from torch import nn - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import BertPreTrainedModel -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.models.nlp.veco import \ - VecoForSequenceClassification as VecoForSequenceClassificationTransform -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) - -__all__ = [ - 'SbertForSequenceClassification', 'VecoForSequenceClassification', - 'BertForSequenceClassification' -] - - -class SequenceClassificationBase(TorchModel): - """A sequence classification base class for all the fitted sequence classification models. 
- """ - base_model_prefix: str = 'bert' - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.num_labels = config.num_labels - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - @abstractmethod - def build_base_model(self): - """Build the backbone model. - - Returns: the backbone instance. - """ - pass - - @property - def base_model(self): - return getattr(self, self.base_model_prefix) - - def forward(self, **kwargs): - labels = None - if OutputKeys.LABEL in kwargs: - labels = kwargs.pop(OutputKeys.LABEL) - elif OutputKeys.LABELS in kwargs: - labels = kwargs.pop(OutputKeys.LABELS) - - outputs = self.base_model.forward(**kwargs) - - # backbone model should return pooled_output as its second output - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def postprocess(self, input, **kwargs): - logits = input[OutputKeys.LOGITS] - probs = torch_nested_numpify(torch_nested_detach(logits.softmax(-1))) - pred = torch_nested_numpify(torch_nested_detach(logits.argmax(-1))) - logits = torch_nested_numpify(torch_nested_detach(logits)) - res = { - OutputKeys.PREDICTIONS: pred, - OutputKeys.PROBABILITIES: probs, - OutputKeys.LOGITS: logits - } - return res - - -@MODELS.register_module( - Tasks.sentence_similarity, module_name=Models.structbert) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.structbert) -@MODELS.register_module(Tasks.nli, module_name=Models.structbert) -@MODELS.register_module( - Tasks.zero_shot_classification, module_name=Models.structbert) -class SbertForSequenceClassification(SequenceClassificationBase, - SbertPreTrainedModel): - """Sbert sequence classification model. - - Inherited from SequenceClassificationBase. - """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - SbertForSequenceClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - labels=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - labels=labels) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - cls.id2label = {id: label for label, id in label2id.items()} - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(SbertPreTrainedModel, - SbertForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.veco) -@MODELS.register_module(Tasks.nli, module_name=Models.veco) -class VecoForSequenceClassification(TorchModel, - VecoForSequenceClassificationTransform): - """Veco sequence classification model. - - Inherited from VecoForSequenceClassification and TorchModel, so this class can be registered into the model set. - This model cannot be inherited from SequenceClassificationBase, because Veco/XlmRoberta's classification structure - is different. - """ - - def __init__(self, config, model_dir): - super().__init__(model_dir) - VecoForSequenceClassificationTransform.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - **kwargs): - return VecoForSequenceClassificationTransform.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - labels=labels) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). - @return: The loaded model, which is initialized by veco.VecoForSequenceClassification.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(VecoForSequenceClassificationTransform, - VecoForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.bert) -@MODELS.register_module(Tasks.nli, module_name=Models.bert) -@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) -class BertForSequenceClassification(SequenceClassificationBase, - BertPreTrainedModel): - """Bert sequence classification model. - - Inherited from SequenceClassificationBase. 
- """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - BertForSequenceClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .bert import BertModel - return BertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(BertPreTrainedModel, - BertForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/space/__init__.py b/modelscope/models/nlp/space/__init__.py index 45f856c1..32713c34 100644 --- a/modelscope/models/nlp/space/__init__.py +++ b/modelscope/models/nlp/space/__init__.py @@ -1,20 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .model import SpaceGenerator - from .model import SpaceModelBase, SpaceTokenizer, SpaceConfig - from .space_for_dialog_intent_prediction import SpaceForDialogIntent - from .space_for_dialog_modeling import SpaceForDialogModeling - from .space_for_dialog_state_tracking import SpaceForDialogStateTracking + from .model import SpaceModelBase, SpaceTokenizer + from .dialog_intent_prediction import SpaceForDialogIntent + from .dialog_modeling import SpaceForDialogModeling + from .dialog_state_tracking import SpaceForDST + from .configuration import SpaceConfig else: _import_structure = { - 'model': - ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer', 'SpaceConfig'], - 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], - 'space_for_dialog_modeling': ['SpaceForDialogModeling'], - 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + 'model': ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer'], + 'dialog_intent_prediction': ['SpaceForDialogIntent'], + 'dialog_modeling': ['SpaceForDialogModeling'], + 'dialog_state_tracking': ['SpaceForDST'], + 'configuration': ['SpaceConfig'] } import sys diff --git a/modelscope/models/nlp/space/model/configuration_space.py b/modelscope/models/nlp/space/configuration.py similarity index 100% rename from modelscope/models/nlp/space/model/configuration_space.py rename to modelscope/models/nlp/space/configuration.py diff --git a/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py b/modelscope/models/nlp/space/dialog_intent_prediction.py similarity index 66% rename from modelscope/models/nlp/space/space_for_dialog_intent_prediction.py rename to modelscope/models/nlp/space/dialog_intent_prediction.py index b93a6d83..79ff01cd 100644 --- a/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py +++ b/modelscope/models/nlp/space/dialog_intent_prediction.py @@ -8,7 +8,7 @@ from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase -from modelscope.preprocessors.space import IntentBPETextField +from modelscope.preprocessors.nlp import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -24,6 +24,10 @@ class SpaceForDialogIntent(TorchModel): Args: model_dir (str): the model path. + text_field (`BPETextField`, *optional*, defaults to `IntentBPETextField`): + The text field. + config (`Config`, *optional*, defaults to config in model hub): + The config. 
""" super().__init__(model_dir, *args, **kwargs) @@ -72,10 +76,21 @@ class SpaceForDialogIntent(TorchModel): Example: { 'pred': array([2.62349960e-03 4.12110658e-03 4.12748595e-05 3.77560973e-05 - 1.08599677e-04 1.72710388e-05 2.95618793e-05 1.93638436e-04 - 6.45841064e-05 1.15997791e-04 5.11605394e-05 9.87020373e-01 - 2.66957268e-05 4.72324500e-05 9.74208378e-05], dtype=float32) + 1.08599677e-04 1.72710388e-05 2.95618793e-05 1.93638436e-04 + 6.45841064e-05 1.15997791e-04 5.11605394e-05 9.87020373e-01 + 2.66957268e-05 4.72324500e-05 9.74208378e-05], dtype=float32), } + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDialogIntent + >>> from modelscope.preprocessors import DialogIntentPredictionPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-intent-prediction') + >>> preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path) + >>> model = SpaceForDialogIntent( + model_dir=cache_path, + text_field=preprocessor.text_field, + config=preprocessor.config) + >>> print(model(preprocessor("What do I need to do for the card activation?"))) """ import numpy as np pred = self.trainer.forward(input) diff --git a/modelscope/models/nlp/space/space_for_dialog_modeling.py b/modelscope/models/nlp/space/dialog_modeling.py similarity index 73% rename from modelscope/models/nlp/space/space_for_dialog_modeling.py rename to modelscope/models/nlp/space/dialog_modeling.py index efa9b851..16e9dc53 100644 --- a/modelscope/models/nlp/space/space_for_dialog_modeling.py +++ b/modelscope/models/nlp/space/dialog_modeling.py @@ -8,7 +8,7 @@ from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase -from modelscope.preprocessors.space import MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -23,7 +23,12 @@ class SpaceForDialogModeling(TorchModel): """initialize the test generation model from the `model_dir` path. Args: - model_dir (str): the model path. + model_dir (`str`): + The model path. + text_field (`BPETextField`, *optional*, defaults to `MultiWOZBPETextField`): + The text field. + config (`Config`, *optional*, defaults to config in model hub): + The config. 
""" super().__init__(model_dir, *args, **kwargs) @@ -82,6 +87,19 @@ class SpaceForDialogModeling(TorchModel): 'aspn': array([47,8345,32,29,1983]), 'db': array([19, 24, 20]), } + Examples: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDialogModeling + >>> from modelscope.preprocessors import DialogModelingPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-modeling') + >>> preprocessor = DialogModelingPreprocessor(model_dir=cache_path) + >>> model = SpaceForDialogModeling(model_dir=cache_path, + text_field=preprocessor.text_field, + config=preprocessor.config) + >>> print(model(preprocessor({ + 'user_input': 'i would like a taxi from saint john \'s college to pizza hut fen ditton .', + 'history': {} + }))) """ first_turn = input['first_turn'] diff --git a/modelscope/models/nlp/space/model/modeling_space.py b/modelscope/models/nlp/space/dialog_state_tracking.py similarity index 57% rename from modelscope/models/nlp/space/model/modeling_space.py rename to modelscope/models/nlp/space/dialog_state_tracking.py index f093cbc5..9a713a59 100644 --- a/modelscope/models/nlp/space/model/modeling_space.py +++ b/modelscope/models/nlp/space/dialog_state_tracking.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,14 +16,22 @@ # limitations under the License. """PyTorch Space model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" +from typing import Dict + import torch from torch import nn from torch.nn import CrossEntropyLoss from transformers.file_utils import add_start_docstrings +from transformers.modeling_utils import PreTrainedModel -from modelscope.models.nlp.structbert.modeling_sbert import ( - SbertForMaskedLM, SbertModel, SbertPreTrainedModel) -from .configuration_space import SpaceConfig +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import (SbertForMaskedLM, SbertModel, + SbertPreTrainedModel) +from modelscope.utils.constant import Tasks +from .configuration import SpaceConfig SPACE_START_DOCSTRING = r""" @@ -57,6 +65,63 @@ class SpaceModel(SbertModel): config_class = SpaceConfig +class SpacePreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SpaceConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = SpaceConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model + + @add_start_docstrings( """ Space Model transformer with Dialog state tracking heads on top (a inform projection @@ -65,7 +130,9 @@ class SpaceModel(SbertModel): """, SPACE_START_DOCSTRING, ) -class SpaceForDST(SbertPreTrainedModel): +@MODELS.register_module( + Tasks.task_oriented_conversation, module_name=Models.space_dst) +class SpaceForDST(SpacePreTrainedModel): def __init__(self, config): super(SpaceForDST, self).__init__(config) @@ -113,18 +180,105 @@ class SpaceForDST(SbertPreTrainedModel): self.init_weights() - def forward(self, - input_ids, - input_mask=None, - segment_ids=None, - position_ids=None, - head_mask=None, - start_pos=None, - end_pos=None, - inform_slot_id=None, - refer_id=None, - class_label_id=None, - diag_state=None): + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + """return the result by the model + + Args: + input (Dict[str, Tensor]): the preprocessed data + + Returns: + Dict[str, Tensor]: results + Example: + { + 'inputs': dict(input_ids, input_masks,start_pos), # tracking states + 'outputs': dict(slots_logits), + 'unique_ids': str(test-example.json-0), # default value + 'input_ids_unmasked': array([101, 7632, 1010,0,0,0]) + 'values': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), + 'inform': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), + 'prefix': str('final'), #default value + 'ds': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]) + } + + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDST + >>> from modelscope.preprocessors import DialogStateTrackingPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-state-tracking') + >>> model 
= SpaceForDST.from_pretrained(cache_path) + >>> preprocessor = DialogStateTrackingPreprocessor(model_dir=cache_path) + >>> print(model(preprocessor({ + 'utter': { + 'User-1': "Hi, I'm looking for a train that is going" + "to cambridge and arriving there by 20:45, is there anything like that?" + }, + 'history_states': [{}] + }))) + """ + import numpy as np + import torch + + # self.model.eval() ???? + batch = input['batch'] + + features = input['features'] + diag_state = input['diag_state'] + turn_itrs = [features[i.item()].guid.split('-')[2] for i in batch[9]] + reset_diag_state = np.where(np.array(turn_itrs) == '0')[0] + for slot in self.config.dst_slot_list: + for i in reset_diag_state: + diag_state[slot][i] = 0 + + with torch.no_grad(): + inputs = { + 'input_ids': batch[0], + 'input_mask': batch[1], + 'segment_ids': batch[2], + 'start_pos': batch[3], + 'end_pos': batch[4], + 'inform_slot_id': batch[5], + 'refer_id': batch[6], + 'diag_state': diag_state, + 'class_label_id': batch[8] + } + unique_ids = [features[i.item()].guid for i in batch[9]] + values = [features[i.item()].values for i in batch[9]] + input_ids_unmasked = [ + features[i.item()].input_ids_unmasked for i in batch[9] + ] + inform = [features[i.item()].inform for i in batch[9]] + outputs = self._forward(**inputs) + + # Update dialog state for next turn. + for slot in self.config.dst_slot_list: + updates = outputs[2][slot].max(1)[1] + for i, u in enumerate(updates): + if u != 0: + diag_state[slot][i] = u + + return { + 'inputs': inputs, + 'outputs': outputs, + 'unique_ids': unique_ids, + 'input_ids_unmasked': input_ids_unmasked, + 'values': values, + 'inform': inform, + 'prefix': 'final', + 'ds': input['ds'] + } + + def _forward(self, + input_ids, + input_mask=None, + segment_ids=None, + position_ids=None, + head_mask=None, + start_pos=None, + end_pos=None, + inform_slot_id=None, + refer_id=None, + class_label_id=None, + diag_state=None): outputs = self.bert( input_ids, attention_mask=input_mask, @@ -132,8 +286,8 @@ class SpaceForDST(SbertPreTrainedModel): position_ids=position_ids, head_mask=head_mask) - sequence_output = outputs[0] - pooled_output = outputs[1] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output sequence_output = self.dropout(sequence_output) pooled_output = self.dropout(pooled_output) @@ -233,36 +387,6 @@ class SpaceForDST(SbertPreTrainedModel): per_slot_start_logits, per_slot_end_logits, per_slot_refer_logits, - ) + outputs[2:] + ) + (outputs.embedding_output, ) return outputs - - -@add_start_docstrings( - 'The Space Model Model with a `language modeling` head on tops', - SPACE_START_DOCSTRING, -) -class SpaceForMaskedLM(SbertForMaskedLM): - """ - This class overrides [`SbertForMaskedLM`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = SpaceConfig - - -@add_start_docstrings( - """ - Space Model with only one head on top as done during the pretraining: a `masked language modeling` head. 
- """, - SPACE_START_DOCSTRING, -) -class SpaceForPreTraining(SbertPreTrainedModel): - - def __init__(self, model_name_or_path: str): - super(SpaceForPreTraining, self).__init__() - self.bert_model = SpaceForMaskedLM.from_pretrained(model_name_or_path) - - def forward(self, input_ids: torch.tensor, mlm_labels: torch.tensor): - outputs = self.bert_model(input_ids, masked_lm_labels=mlm_labels) - return outputs[0] diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py index bb1d18e4..cfff335d 100644 --- a/modelscope/models/nlp/space/model/__init__.py +++ b/modelscope/models/nlp/space/model/__init__.py @@ -1,10 +1,8 @@ -from .configuration_space import SpaceConfig +# Copyright (c) Alibaba, Inc. and its affiliates. from .gen_unified_transformer import GenUnifiedTransformer from .generator import SpaceGenerator from .intent_unified_transformer import IntentUnifiedTransformer from .model_base import SpaceModelBase -from .modeling_space import (SpaceForDST, SpaceForMaskedLM, - SpaceForPreTraining, SpaceModel) from .tokenization_space import (BasicTokenizer, SpaceTokenizer, WordpieceTokenizer) from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py index 0e7833e6..2e05b545 100644 --- a/modelscope/models/nlp/space/model/generator.py +++ b/modelscope/models/nlp/space/model/generator.py @@ -71,14 +71,11 @@ class SpaceGenerator(object): return def __call__(self, step_fn, state): - """ - Running generation. - - @param : step_fn : decoding one step - @type : function + """Running generation. - @param : state : initial state - @type : dict + Args: + step_fn (`function`) : decoding one step + state(`dict`) : initial state """ raise NotImplementedError @@ -104,11 +101,9 @@ class BeamSearch(SpaceGenerator): """ Running beam search. - @param : step_fn : decoding one step - @type : function - - @param : state : initial state - @type : dict + Args: + step_fn(`function`) : decoding one step + state(`dict`) : initial state """ if prev_input is not None: diff --git a/modelscope/models/nlp/space/model/model_base.py b/modelscope/models/nlp/space/model/model_base.py index d3d0baa4..b7812182 100644 --- a/modelscope/models/nlp/space/model/model_base.py +++ b/modelscope/models/nlp/space/model/model_base.py @@ -64,8 +64,8 @@ class SpaceModelBase(nn.Module): """ Forward process, include real forward, collect metrices and optimize(optional) - @params : inputs : input data - @type : dict of numpy.ndarray/int/float/... + Args: + inputs(`dict` of numpy.ndarray/int/float/...) : input data """ if is_training: self.train() @@ -85,11 +85,10 @@ class SpaceModelBase(nn.Module): eos_id=None, max_gen_len=None, prev_input=None): - """ - Inference process. + """Inference process. - @params : inputs : input data - @type : dict of numpy.ndarray/int/float/... + Args: + inputs(`dict` of numpy.ndarray/int/float/...) : input data """ self.eval() results = self._infer( diff --git a/modelscope/models/nlp/space/model/tokenization_space.py b/modelscope/models/nlp/space/model/tokenization_space.py index 84712b7b..e3b358d4 100644 --- a/modelscope/models/nlp/space/model/tokenization_space.py +++ b/modelscope/models/nlp/space/model/tokenization_space.py @@ -1,5 +1,5 @@ -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. 
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/space/model/unified_transformer.py b/modelscope/models/nlp/space/model/unified_transformer.py index b0775541..19069971 100644 --- a/modelscope/models/nlp/space/model/unified_transformer.py +++ b/modelscope/models/nlp/space/model/unified_transformer.py @@ -119,15 +119,12 @@ class UnifiedTransformer(SpaceModelBase): input_mask, append_head=False, auto_regressive=False): - """ - Create attention mask. + """Create attention mask. from sequence to matrix:[batch_size, max_seq_len, 1] -> [batch_size, max_seq_len, max_seq_len] - @param : input_mask - @type : Variable(shape: [batch_size, max_seq_len]) - - @param : auto_regressive - @type : bool + Args: + input_mask (Variable(shape: [batch_size, max_seq_len])) + auto_regressive(bool) """ seq_len = input_mask.shape[1] @@ -150,15 +147,12 @@ class UnifiedTransformer(SpaceModelBase): return mask def _join_mask(self, mask1, mask2): - """ - Merge source attention mask and target attention mask. + """Merge source attention mask and target attention mask. There are four parts:left upper (lu) / right upper (ru) / left below (lb) / right below (rb) - @param : mask1 : source attention mask - @type : Variable(shape: [batch_size, max_src_len, max_src_len]) - - @param : mask1 : target attention mask - @type : Variable(shape: [batch_size, max_tgt_len, max_tgt_len]) + Args: + mask1(Variable(shape: [batch_size, max_src_len, max_src_len])) : source attention mask + mask2(Variable(shape: [batch_size, max_tgt_len, max_tgt_len])) : target attention mask """ batch_size = mask1.shape[0] seq_len1 = mask1.shape[1] diff --git a/modelscope/models/nlp/space/modules/transformer_block.py b/modelscope/models/nlp/space/modules/transformer_block.py index 37f968d9..3044963a 100644 --- a/modelscope/models/nlp/space/modules/transformer_block.py +++ b/modelscope/models/nlp/space/modules/transformer_block.py @@ -30,18 +30,13 @@ class TransformerBlock(nn.Module): return def forward(self, inp, mask=None, cache=None): - """ - Forward process on one transformer layer. - - @param : x - @type : Variable(shape: [batch_size, seq_len, hidden_size]) - - @param : memory - @type : Variable(shape: [batch_size, seq_len, hidden_size]) - - @param : mask + """Forward process on one transformer layer. 
- @param : cache + Args: + x(Variable(shape: [batch_size, seq_len, hidden_size])) + memory(Variable(shape: [batch_size, seq_len, hidden_size])) + mask + cache """ attn_out = self.attn(inp, mask, cache) attn_out = self.dropout_layer(attn_out) diff --git a/modelscope/models/nlp/space/space_for_dialog_state_tracking.py b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py deleted file mode 100644 index 4b9cf5c3..00000000 --- a/modelscope/models/nlp/space/space_for_dialog_state_tracking.py +++ /dev/null @@ -1,101 +0,0 @@ -from typing import Dict - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SpaceForDialogStateTracking'] - - -@MODELS.register_module( - Tasks.task_oriented_conversation, module_name=Models.space_dst) -class SpaceForDialogStateTracking(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the test generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - """ - - super().__init__(model_dir, *args, **kwargs) - - from modelscope.models.nlp.space.model import SpaceForDST, SpaceConfig - self.model_dir = model_dir - - self.config = SpaceConfig.from_pretrained(self.model_dir) - self.model = SpaceForDST.from_pretrained(self.model_dir) - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - """return the result by the model - - Args: - input (Dict[str, Tensor]): the preprocessed data - - Returns: - Dict[str, Tensor]: results - Example: - { - 'inputs': dict(input_ids, input_masks,start_pos), # tracking states - 'outputs': dict(slots_logits), - 'unique_ids': str(test-example.json-0), # default value - 'input_ids_unmasked': array([101, 7632, 1010,0,0,0]) - 'values': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), - 'inform': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), - 'prefix': str('final'), #default value - 'ds': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]) - } - """ - import numpy as np - import torch - - self.model.eval() - batch = input['batch'] - - features = input['features'] - diag_state = input['diag_state'] - turn_itrs = [features[i.item()].guid.split('-')[2] for i in batch[9]] - reset_diag_state = np.where(np.array(turn_itrs) == '0')[0] - for slot in self.config.dst_slot_list: - for i in reset_diag_state: - diag_state[slot][i] = 0 - - with torch.no_grad(): - inputs = { - 'input_ids': batch[0], - 'input_mask': batch[1], - 'segment_ids': batch[2], - 'start_pos': batch[3], - 'end_pos': batch[4], - 'inform_slot_id': batch[5], - 'refer_id': batch[6], - 'diag_state': diag_state, - 'class_label_id': batch[8] - } - unique_ids = [features[i.item()].guid for i in batch[9]] - values = [features[i.item()].values for i in batch[9]] - input_ids_unmasked = [ - features[i.item()].input_ids_unmasked for i in batch[9] - ] - inform = [features[i.item()].inform for i in batch[9]] - outputs = self.model(**inputs) - - # Update dialog state for next turn. 
- for slot in self.config.dst_slot_list: - updates = outputs[2][slot].max(1)[1] - for i, u in enumerate(updates): - if u != 0: - diag_state[slot][i] = u - - return { - 'inputs': inputs, - 'outputs': outputs, - 'unique_ids': unique_ids, - 'input_ids_unmasked': input_ids_unmasked, - 'values': values, - 'inform': inform, - 'prefix': 'final', - 'ds': input['ds'] - } diff --git a/modelscope/models/nlp/space_T_cn/__init__.py b/modelscope/models/nlp/space_T_cn/__init__.py index e69de29b..b9deb700 100644 --- a/modelscope/models/nlp/space_T_cn/__init__.py +++ b/modelscope/models/nlp/space_T_cn/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .table_question_answering import TableQuestionAnswering +else: + _import_structure = { + 'table_question_answering': ['TableQuestionAnswering'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py b/modelscope/models/nlp/space_T_cn/backbone.py similarity index 99% rename from modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py rename to modelscope/models/nlp/space_T_cn/backbone.py index 72c94724..5afde06e 100644 --- a/modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py +++ b/modelscope/models/nlp/space_T_cn/backbone.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,8 +27,7 @@ import numpy as np import torch from torch import nn -from modelscope.models.nlp.space_T_cn.configuration_space_T_cn import \ - SpaceTCnConfig +from modelscope.models.nlp.space_T_cn.configuration import SpaceTCnConfig from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger diff --git a/modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py b/modelscope/models/nlp/space_T_cn/configuration.py similarity index 100% rename from modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py rename to modelscope/models/nlp/space_T_cn/configuration.py index 553d8592..e698b310 100644 --- a/modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py +++ b/modelscope/models/nlp/space_T_cn/configuration.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/space_T_cn/table_question_answering.py similarity index 94% rename from modelscope/models/nlp/table_question_answering.py rename to modelscope/models/nlp/space_T_cn/table_question_answering.py index 8e05dd0f..a3f504b7 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/space_T_cn/table_question_answering.py @@ -11,11 +11,11 @@ from transformers import BertTokenizer from modelscope.metainfo import Models from modelscope.models.base import Model, Tensor from modelscope.models.builder import MODELS -from modelscope.preprocessors.space_T_cn.fields.struct import Constant +from modelscope.preprocessors.nlp.space_T_cn.fields.struct import Constant from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.device import verify_device -from .space_T_cn.configuration_space_T_cn import SpaceTCnConfig -from .space_T_cn.modeling_space_T_cn import Seq2SQL, SpaceTCnModel +from .backbone import Seq2SQL, SpaceTCnModel +from .configuration import SpaceTCnConfig __all__ = ['TableQuestionAnswering'] @@ -732,9 +732,41 @@ class TableQuestionAnswering(Model): Args: input (Dict[str, Tensor]): the preprocessed data + Returns: Dict[str, Tensor]: results Example: + { + 'result': + { + 'question_tok': ['有', '哪', '些', '风', '险', '类', '型', '?'], + 'question': '有哪些风险类型?', + 'table_id': 'fund', + 'sql': { + 'cond_conn_op': 0, + 'sel': [5], + 'agg': [0], + 'conds': [[10, 2, 'Nulll']] + }, + 'action': 10, + 'model_out': [ + [6, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [2, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0] + ] + }, + 'history_sql': None + } + + Example: + >>> from modelscope.models.nlp import TableQuestionAnswering + >>> from modelscope.preprocessors import TableQuestionAnsweringPreprocessor + >>> model = TableQuestionAnswering.from_pretrained('damo/nlp_convai_text2sql_pretrain_cn') + >>> preprocessor = TableQuestionAnsweringPreprocessor(model_dir=model.model_dir) + >>> print(model(preprocessor({'question': '有哪些风险类型?'}))) """ result = self.predict(input['datas'])[0] diff --git a/modelscope/models/nlp/space_T_en/__init__.py b/modelscope/models/nlp/space_T_en/__init__.py new file mode 100644 index 00000000..46c8b38c --- /dev/null +++ b/modelscope/models/nlp/space_T_en/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
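# [Editor's note -- illustrative sketch, not part of the patch.] The new
# space_T_cn/__init__.py above and the space_T_en/__init__.py that follows use
# the same lazy-import layout: _import_structure maps each submodule to its
# public symbols, and sys.modules[__name__] is swapped for a LazyImportModule,
# which (like the lazy modules in transformers) appears to defer the real
# submodule import until one of those symbols is first accessed. The
# TYPE_CHECKING branch keeps the eager imports visible to type checkers and
# IDEs without the runtime import cost. Caller code is unchanged:

from modelscope.models.nlp.space_T_cn import TableQuestionAnswering  # resolved lazily
from modelscope.models.nlp.space_T_en import StarForTextToSql  # resolved lazily

print(TableQuestionAnswering.__module__)  # ...space_T_cn.table_question_answering
print(StarForTextToSql.__module__)  # ...space_T_en.text_to_sql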
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .text_to_sql import StarForTextToSql +else: + _import_structure = { + 'text_to_sql': ['StarForTextToSql'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/star_text_to_sql.py b/modelscope/models/nlp/space_T_en/text_to_sql.py similarity index 59% rename from modelscope/models/nlp/star_text_to_sql.py rename to modelscope/models/nlp/space_T_en/text_to_sql.py index 089f1c89..ca2d2596 100644 --- a/modelscope/models/nlp/star_text_to_sql.py +++ b/modelscope/models/nlp/space_T_en/text_to_sql.py @@ -4,14 +4,13 @@ import os from typing import Dict, Optional import torch -import torch.nn as nn from text2sql_lgesql.asdl.asdl import ASDLGrammar from text2sql_lgesql.asdl.transition_system import TransitionSystem from text2sql_lgesql.model.model_constructor import Text2SQL -from text2sql_lgesql.utils.constants import GRAMMAR_FILEPATH from modelscope.metainfo import Models -from modelscope.models.base import Model, Tensor +from modelscope.models import TorchModel +from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -21,7 +20,7 @@ __all__ = ['StarForTextToSql'] @MODELS.register_module( Tasks.table_question_answering, module_name=Models.space_T_en) -class StarForTextToSql(Model): +class StarForTextToSql(TorchModel): def __init__(self, model_dir: str, *args, **kwargs): """initialize the star model from the `model_dir` path. @@ -59,6 +58,33 @@ class StarForTextToSql(Model): Returns: Dict[str, Tensor]: results Example: + + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import StarForTextToSql + >>> from modelscope.preprocessors import ConversationalTextToSqlPreprocessor + >>> test_case = { + 'database_id': 'employee_hire_evaluation', + 'local_db_path': None, + 'utterance': [ + "I'd like to see Shop names.", 'Which of these are hiring?', + 'Which shop is hiring the highest number of employees?' + ' | do you want the name of the shop ? 
| Yes' + ] + } + >>> cache_path = snapshot_download('damo/nlp_star_conversational-text-to-sql') + >>> preprocessor = ConversationalTextToSqlPreprocessor( + model_dir=cache_path, + database_id=test_case['database_id'], + db_content=True) + >>> model = StarForTextToSql(cache_path, config=preprocessor.config) + >>> print(model(preprocessor({ + 'utterance': "I'd like to see Shop names.", + 'history': [], + 'last_sql': '', + 'database_id': 'employee_hire_evaluation', + 'local_db_path': None + }))) """ self.model.eval() hyps = self.model.parse(input['batch'], self.beam_size) # diff --git a/modelscope/models/nlp/structbert/__init__.py b/modelscope/models/nlp/structbert/__init__.py index d42db83c..60d369e0 100644 --- a/modelscope/models/nlp/structbert/__init__.py +++ b/modelscope/models/nlp/structbert/__init__.py @@ -18,20 +18,26 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_sbert import SbertConfig - from .modeling_sbert import (SbertForMaskedLM, SbertModel, - SbertPreTrainedModel) - from .tokenization_sbert import (BasicTokenizer, SbertTokenizer, - WordpieceTokenizer) - from .tokenization_sbert_fast import SbertTokenizerFast + from .backbone import (SbertModel, SbertPreTrainedModel) + from .configuration import SbertConfig + from .faq_question_answering import SbertForFaqQuestionAnswering + from .fill_mask import SbertForMaskedLM + from .text_classification import SbertForSequenceClassification + from .token_classification import SbertForTokenClassification + from .tokenization import (BasicTokenizer, SbertTokenizer, + WordpieceTokenizer) + from .tokenization_fast import SbertTokenizerFast else: _import_structure = { - 'configuration_sbert': ['SbertConfig'], - 'modeling_sbert': - ['SbertForMaskedLM', 'SbertModel', 'SbertPreTrainedModel'], - 'tokenization_sbert': + 'backbone': ['SbertModel', 'SbertPreTrainedModel'], + 'configuration': ['SbertConfig'], + 'fill_mask': ['SbertForMaskedLM'], + 'faq_question_answering': ['SbertForFaqQuestionAnswering'], + 'text_classification': ['SbertForSequenceClassification'], + 'token_classification': ['SbertForTokenClassification'], + 'tokenization': ['BasicTokenizer', 'SbertTokenizer', 'WordpieceTokenizer'], - 'tokenization_sbert_fast': ['SbertTokenizerFast'], + 'tokenization_fast': ['SbertTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/structbert/backbone.py b/modelscope/models/nlp/structbert/backbone.py new file mode 100755 index 00000000..039db3ce --- /dev/null +++ b/modelscope/models/nlp/structbert/backbone.py @@ -0,0 +1,932 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch StructBERT model. 
mainly copied from :module:`~transformers.modeling_bert`""" + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from packaging import version +from transformers.activations import ACT2FN +from transformers.modeling_outputs import \ + BaseModelOutputWithPastAndCrossAttentions +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.logger import get_logger +from .configuration import SbertConfig + +logger = get_logger(__name__) + + +class SbertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + return_inputs_embeds=False): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users + # when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type 
== 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + if not return_inputs_embeds: + return embeddings + else: + return embeddings, inputs_embeds + + +class SbertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class SbertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = SbertSelfAttention(config) + self.output = SbertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class SbertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class SbertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + 
hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = SbertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = SbertAttention(config) + self.intermediate = SbertIntermediate(config) + self.output = SbertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' + f'layers by setting `config.add_cross_attention=True`') + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class SbertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [SbertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + 
past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class SbertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SbertPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SbertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, SbertEncoder): + module.gradient_checkpointing = value + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = SbertConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model + + +@dataclass +class AttentionBackboneModelOutputWithEmbedding(AttentionBackboneModelOutput): + embedding_output: torch.FloatTensor = None + logits: Optional[Union[tuple, torch.FloatTensor]] = None + kwargs: dict = None + + +@MODELS.register_module(Tasks.backbone, module_name=Models.structbert) +class SbertModel(SbertPreTrainedModel): + """The StructBERT Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. 
+ + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config: SbertConfig, add_pooling_layer=True, **kwargs): + super().__init__(config) + self.config = config + + self.embeddings = SbertEmbeddings(config) + self.encoder = SbertEncoder(config) + + self.pooler = SbertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. 
+ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple + having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
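+                This is only honoured when the model is configured as a decoder
+                (``config.is_decoder=True``); otherwise the forward pass forces
+                ``use_cache=False``.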
+ + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutputWithEmbedding` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_backbone_base_std', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_backbone_base_std') + >>> print(model(**preprocessor('这是个测试'))) + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
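+        # The heavy lifting below is done by the ``get_extended_attention_mask`` /
+        # ``invert_attention_mask`` helpers inherited from transformers. As a rough
+        # sketch of the idea (not the exact implementation), a [batch_size, seq_length]
+        # mask of ones and zeros is made broadcastable over heads and query positions,
+        # and masked slots are pushed to a large negative value so that softmax assigns
+        # them ~zero weight:
+        #
+        #     extended = attention_mask[:, None, None, :].to(hidden_dtype)
+        #     extended = (1.0 - extended) * large_negative_value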
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output, orignal_embeds = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + return_inputs_embeds=True, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, + pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + + return AttentionBackboneModelOutputWithEmbedding( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + embedding_output=orignal_embeds) diff --git a/modelscope/models/nlp/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration.py similarity index 94% rename from modelscope/models/nlp/structbert/configuration_sbert.py rename to modelscope/models/nlp/structbert/configuration.py index a727a978..8f095f9d 100644 --- a/modelscope/models/nlp/structbert/configuration_sbert.py +++ b/modelscope/models/nlp/structbert/configuration.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SBERT model configuration, mainly copied from :class:`~transformers.BertConfig` """ +""" StructBERT model configuration, mainly copied from :class:`~transformers.BertConfig` """ from transformers import PretrainedConfig from modelscope.utils import logger as logging @@ -26,7 +26,7 @@ class SbertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~modelscope.models.nlp.structbert.SbertModel`. - It is used to instantiate a SBERT model according to the specified arguments. 
+    It is used to instantiate a StructBERT model according to the specified arguments.
     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
     outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
@@ -74,15 +74,15 @@ class SbertConfig(PretrainedConfig):
             relevant if ``config.is_decoder=True``.
         classifier_dropout (:obj:`float`, `optional`):
             The dropout ratio for the classification head.
-        adv_grad_factor (:obj:`float`, `optional`): This factor will be multipled by the KL loss grad and then
+        adv_grad_factor (:obj:`float`, `optional`): This factor will be multiplied by the KL loss grad and then
            the result will be added to the original embedding. More details please check:https://arxiv.org/abs/1908.04577
-           The range of this value always be 1e-3~1e-7
+           The range of this value should be between 1e-3 and 1e-7
        adv_bound (:obj:`float`, `optional`): adv_bound is used to cut the top and the bottom bound of the produced embedding.
-           If not proveded, 2 * sigma will be used as the adv_bound factor
+           If not provided, 2 * sigma will be used as the adv_bound factor
       sigma (:obj:`float`, `optional`): The std factor used to produce a 0 mean normal distribution.
-           If adv_bound not proveded, 2 * sigma will be used as the adv_bound factor
+           If adv_bound not provided, 2 * sigma will be used as the adv_bound factor
     """
 
     model_type = 'structbert'
diff --git a/modelscope/models/nlp/sbert_for_faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py
similarity index 74%
rename from modelscope/models/nlp/sbert_for_faq_question_answering.py
rename to modelscope/models/nlp/structbert/faq_question_answering.py
index 23ccdcc5..c8dbf302 100644
--- a/modelscope/models/nlp/sbert_for_faq_question_answering.py
+++ b/modelscope/models/nlp/structbert/faq_question_answering.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+ import math import os from collections import namedtuple @@ -15,103 +17,6 @@ from modelscope.models.nlp.task_models.task_model import BaseTaskModel from modelscope.utils.config import Config, ConfigFields from modelscope.utils.constant import ModelFile, Tasks -__all__ = ['SbertForFaqQuestionAnswering'] - - -class SbertForFaqQuestionAnsweringBase(BaseTaskModel): - """base class for faq models - """ - - def __init__(self, model_dir, *args, **kwargs): - super(SbertForFaqQuestionAnsweringBase, - self).__init__(model_dir, *args, **kwargs) - - backbone_cfg = SbertConfig.from_pretrained(model_dir) - self.bert = SbertModel(backbone_cfg) - - model_config = Config.from_file( - os.path.join(model_dir, - ModelFile.CONFIGURATION)).get(ConfigFields.model, {}) - - metric = model_config.get('metric', 'cosine') - pooling_method = model_config.get('pooling', 'avg') - - Arg = namedtuple('args', [ - 'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling' - ]) - args = Arg( - metrics=metric, - proj_hidden_size=self.bert.config.hidden_size, - hidden_size=self.bert.config.hidden_size, - dropout=0.0, - pooling=pooling_method) - - self.metrics_layer = MetricsLayer(args) - self.pooling = PoolingLayer(args) - - def _get_onehot_labels(self, labels, support_size, num_cls): - labels_ = labels.view(support_size, 1) - target_oh = torch.zeros(support_size, num_cls).to(labels) - target_oh.scatter_(dim=1, index=labels_, value=1) - return target_oh.view(support_size, num_cls).float() - - def forward_sentence_embedding(self, inputs: Dict[str, Tensor]): - input_ids = inputs['input_ids'] - input_mask = inputs['attention_mask'] - if not isinstance(input_ids, Tensor): - input_ids = torch.IntTensor(input_ids) - if not isinstance(input_mask, Tensor): - input_mask = torch.IntTensor(input_mask) - rst = self.bert(input_ids, input_mask) - last_hidden_states = rst.last_hidden_state - if len(input_mask.shape) == 2: - input_mask = input_mask.unsqueeze(-1) - pooled_representation = self.pooling(last_hidden_states, input_mask) - return pooled_representation - - -@MODELS.register_module( - Tasks.faq_question_answering, module_name=Models.structbert) -class SbertForFaqQuestionAnswering(SbertForFaqQuestionAnsweringBase): - _backbone_prefix = '' - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - assert not self.training - query = input['query'] - support = input['support'] - if isinstance(query, list): - query = torch.stack(query) - if isinstance(support, list): - support = torch.stack(support) - n_query = query.shape[0] - n_support = support.shape[0] - query_mask = torch.ne(query, 0).view([n_query, -1]) - support_mask = torch.ne(support, 0).view([n_support, -1]) - - support_labels = input['support_labels'] - num_cls = torch.max(support_labels) + 1 - onehot_labels = self._get_onehot_labels(support_labels, n_support, - num_cls) - - input_ids = torch.cat([query, support]) - input_mask = torch.cat([query_mask, support_mask], dim=0) - pooled_representation = self.forward_sentence_embedding({ - 'input_ids': - input_ids, - 'attention_mask': - input_mask - }) - z_query = pooled_representation[:n_query] - z_support = pooled_representation[n_query:] - cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5 - protos = torch.matmul(onehot_labels.transpose(0, 1), - z_support) / cls_n_support.unsqueeze(-1) - scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) - if self.metrics_layer.name == 'relation': - scores = torch.sigmoid(scores) - return {'scores': scores} - - activations = { 'relu': F.relu, 'tanh': 
     torch.tanh,
@@ -247,3 +152,142 @@ class PoolingLayer(nn.Module):
 
     def forward(self, x, mask):
         return self.pooling(x, mask)
+
+
+@MODELS.register_module(
+    Tasks.faq_question_answering, module_name=Models.structbert)
+class SbertForFaqQuestionAnswering(BaseTaskModel):
+    _backbone_prefix = ''
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model = cls(kwargs.get('model_dir'))
+        model.load_checkpoint(kwargs.get('model_dir'))
+        return model
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+
+        backbone_cfg = SbertConfig.from_pretrained(model_dir)
+        self.bert = SbertModel(backbone_cfg)
+
+        model_config = Config.from_file(
+            os.path.join(model_dir,
+                         ModelFile.CONFIGURATION)).get(ConfigFields.model, {})
+
+        metric = model_config.get('metric', 'cosine')
+        pooling_method = model_config.get('pooling', 'avg')
+
+        Arg = namedtuple('args', [
+            'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling'
+        ])
+        args = Arg(
+            metrics=metric,
+            proj_hidden_size=self.bert.config.hidden_size,
+            hidden_size=self.bert.config.hidden_size,
+            dropout=0.0,
+            pooling=pooling_method)
+
+        self.metrics_layer = MetricsLayer(args)
+        self.pooling = PoolingLayer(args)
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data, which contains the following keys:
+                query(:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                    The query to be predicted.
+                support(:obj:`torch.LongTensor` of shape :obj:`(support_size, sequence_length)`):
+                    The support set.
+                support_labels(:obj:`torch.LongTensor` of shape :obj:`(support_size, )`):
+                    The labels of the support set.
+
+        Returns:
+            Dict[str, Tensor]: the result, which contains the following key:
+                scores(:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_cls)`):
+                    Predicted scores of all classes for each query.
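+
+        Scoring sketch (illustrative pseudo-code for the computation below, not a
+        public API): each class prototype is the mean pooled embedding of the support
+        examples sharing that label, and the scores are the metric-layer similarities
+        between every query embedding and every prototype (cosine by default,
+        sigmoid-squashed when the metric is ``relation``)::
+
+            protos[c] = mean(z_support[i] for all i with support_labels[i] == c)
+            scores = metrics_layer(z_query, protos)
+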
+ Examples: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor + >>> from modelscope.models.nlp import SbertForFaqQuestionAnswering + >>> cache_path = snapshot_download('damo/nlp_structbert_faq-question-answering_chinese-base') + >>> preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained(cache_path) + >>> model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) + >>> param = { + >>> 'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'], + >>> 'support_set': [{ + >>> 'text': '卖品代金券怎么用', + >>> 'label': '6527856' + >>> }, { + >>> 'text': '怎么使用优惠券', + >>> 'label': '6527856' + >>> }, { + >>> 'text': '这个可以一起领吗', + >>> 'label': '1000012000' + >>> }, { + >>> 'text': '付款时送的优惠券哪里领', + >>> 'label': '1000012000' + >>> }, { + >>> 'text': '购物等级怎么长', + >>> 'label': '13421097' + >>> }, { + >>> 'text': '购物等级二心', + >>> 'label': '13421097' + >>> }] + >>> } + >>> result = model(preprocessor(param)) + """ + assert not self.training + query = input['query'] + support = input['support'] + if isinstance(query, list): + query = torch.stack(query) + if isinstance(support, list): + support = torch.stack(support) + n_query = query.shape[0] + n_support = support.shape[0] + query_mask = torch.ne(query, 0).view([n_query, -1]) + support_mask = torch.ne(support, 0).view([n_support, -1]) + + support_labels = input['support_labels'] + num_cls = torch.max(support_labels) + 1 + onehot_labels = self._get_onehot_labels(support_labels, n_support, + num_cls) + + input_ids = torch.cat([query, support]) + input_mask = torch.cat([query_mask, support_mask], dim=0) + pooled_representation = self.forward_sentence_embedding({ + 'input_ids': + input_ids, + 'attention_mask': + input_mask + }) + z_query = pooled_representation[:n_query] + z_support = pooled_representation[n_query:] + cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5 + protos = torch.matmul(onehot_labels.transpose(0, 1), + z_support) / cls_n_support.unsqueeze(-1) + scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) + if self.metrics_layer.name == 'relation': + scores = torch.sigmoid(scores) + return {'scores': scores} + + def _get_onehot_labels(self, labels, support_size, num_cls): + labels_ = labels.view(support_size, 1) + target_oh = torch.zeros(support_size, num_cls).to(labels) + target_oh.scatter_(dim=1, index=labels_, value=1) + return target_oh.view(support_size, num_cls).float() + + def forward_sentence_embedding(self, inputs: Dict[str, Tensor]): + input_ids = inputs['input_ids'] + input_mask = inputs['attention_mask'] + if not isinstance(input_ids, Tensor): + input_ids = torch.IntTensor(input_ids) + if not isinstance(input_mask, Tensor): + input_mask = torch.IntTensor(input_mask) + rst = self.bert(input_ids, input_mask) + last_hidden_states = rst.last_hidden_state + if len(input_mask.shape) == 2: + input_mask = input_mask.unsqueeze(-1) + pooled_representation = self.pooling(last_hidden_states, input_mask) + return pooled_representation diff --git a/modelscope/models/nlp/structbert/fill_mask.py b/modelscope/models/nlp/structbert/fill_mask.py new file mode 100644 index 00000000..e611aa88 --- /dev/null +++ b/modelscope/models/nlp/structbert/fill_mask.py @@ -0,0 +1,284 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +class SbertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class SbertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = SbertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class SbertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class SbertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) +class SbertForMaskedLM(SbertPreTrainedModel): + r"""StructBERT Model with a `language modeling` head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. 
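+
+    The masked-language-modeling head (:class:`SbertOnlyMLMHead`) projects every final
+    hidden state back onto the vocabulary; when ``labels`` are provided, the loss is a
+    standard ``CrossEntropyLoss`` in which the label ``-100`` marks positions to ignore.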
+ + Preprocessor: + This is the fill_mask model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.NLPPreprocessor`. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = SbertModel(config) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. 
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor, NLPPreprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_fill-mask_chinese-large') + >>> preprocessor = NLPPreprocessor('damo/nlp_structbert_fill-mask_chinese-large') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。')) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' + attention_mask_zero = attention_mask.new_zeros( + (attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, attention_mask_zero], + dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': 
attention_mask} diff --git a/modelscope/models/nlp/structbert/modeling_sbert.py b/modelscope/models/nlp/structbert/modeling_sbert.py deleted file mode 100755 index e789037a..00000000 --- a/modelscope/models/nlp/structbert/modeling_sbert.py +++ /dev/null @@ -1,1963 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch SBERT model. mainly copied from :module:`~transformers.modeling_bert`""" - -import math -import warnings -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import numpy as np -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - MultipleChoiceModelOutput, NextSentencePredictorOutput, - QuestionAnsweringModelOutput, SequenceClassifierOutput, - TokenClassifierOutput) -from transformers.modeling_utils import (PreTrainedModel, - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger -from .adv_utils import compute_adv_loss, compute_adv_loss_pair -from .configuration_sbert import SbertConfig - -logger = get_logger(__name__) - -_CHECKPOINT_FOR_DOC = 'nlp_structbert_backbone_base_std' -_CONFIG_FOR_DOC = 'SbertConfig' -_TOKENIZER_FOR_DOC = 'SbertTokenizer' - - -class SbertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - 
self.register_buffer( - 'position_ids', - torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros( - self.position_ids.size(), - dtype=torch.long, - device=self.position_ids.device), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, - return_inputs_embeds=False): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users - # when tracing the model without passing token_type_ids, solves - # issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - if not return_inputs_embeds: - return embeddings - else: - return embeddings, inputs_embeds - - -class SbertSelfAttention(nn.Module): - - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - 
output_attentions=False, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. 
- attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class SbertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertAttention(nn.Module): - - def __init__(self, config): - super().__init__() - self.self = SbertSelfAttention(config) - self.output = SbertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class SbertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class SbertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - 
hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = SbertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = SbertAttention(config) - self.intermediate = SbertIntermediate(config) - self.output = SbertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' - f'layers by setting `config.add_cross_attention=True`') - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class SbertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [SbertLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - 
past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class SbertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class SbertPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class SbertLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = SbertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear(config.hidden_size, config.vocab_size) - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class SbertOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = SbertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class SbertOnlyNSPHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class SbertPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = SbertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class SbertPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = SbertConfig - base_model_prefix = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, SbertEncoder): - module.gradient_checkpointing = value - - -@dataclass -class SbertForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~transformers.BertForPreTraining`. 
- - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` - is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` - is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -SBERT_START_DOCSTRING = r""" - - This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch `torch.nn.Module `__ - subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. - - Parameters: - config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with - all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model - weights. -""" - -SBERT_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in ``[0, - 1]``: - - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. - - `What are token type IDs? <../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - config.max_position_embeddings - 1]``. - - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. -""" - - -@dataclass -class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - BaseModelOutputWithPoolingAndCrossAttentions): - embedding_output: torch.FloatTensor = None - logits: Optional[Union[tuple, torch.FloatTensor]] = None - kwargs: dict = None - - -@add_start_docstrings( - 'The Sbert Model transformer outputting raw hidden-states without any specific head on top.', - SBERT_START_DOCSTRING, -) -class SbertModel(SbertPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in `Attention is - all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, - Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration - set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` - argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an - input to the forward pass. - """ - - def __init__(self, config: SbertConfig, add_pooling_layer=True): - super().__init__(config) - self.config = config - - self.embeddings = SbertEmbeddings(config) - self.encoder = SbertEncoder(config) - - self.pooler = SbertPooler(config) if add_pooling_layer else None - - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, - `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple - having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
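# --- Illustrative sketch, not part of the diff ---
# What `get_extended_attention_mask` effectively does for a plain 2D padding mask in the
# encoder (non-decoder) case: broadcast it to [batch_size, 1, 1, seq_length] and convert
# masked positions into large negative additive biases. The shapes and the -10000.0
# constant below are illustrative assumptions, not values taken from this code.
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])          # (batch_size, seq_length); 0 marks padding
extended = attention_mask[:, None, None, :].float()    # (batch_size, 1, 1, seq_length)
extended = (1.0 - extended) * -10000.0                 # 0.0 where attended, large negative where masked
# `extended` is what gets added to the raw attention scores before the softmax.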
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output, orignal_embeds = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - return_inputs_embeds=True, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) - - return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) - - -@add_start_docstrings( - """ - Sbert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next - sentence prediction (classification)` head. 
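# --- Illustrative sketch, not part of the diff ---
# How the two pretraining heads described above combine into a single loss, mirroring the
# `SbertForPreTraining.forward` code that follows; the vocabulary size and batch shapes are
# hypothetical, chosen only for this example.
import torch
from torch.nn import CrossEntropyLoss

vocab_size, batch_size, seq_len = 30522, 2, 8
prediction_scores = torch.randn(batch_size, seq_len, vocab_size)   # masked-LM head logits
seq_relationship_score = torch.randn(batch_size, 2)                # next-sentence head logits
labels = torch.randint(0, vocab_size, (batch_size, seq_len))       # masked-LM targets
next_sentence_label = torch.randint(0, 2, (batch_size,))           # 0 = continuation, 1 = random

loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, vocab_size), labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss                   # sum used during pretraining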
- """, - SBERT_START_DOCSTRING, -) -class SbertForPreTraining(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - - self.bert = SbertModel(config) - self.cls = SbertPreTrainingHeads(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=SbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): - Used to hide legacy arguments that have been deprecated. - - Returns: - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, - seq_relationship_score) + outputs[2:-1] - return ((total_loss, ) - + output) if total_loss is not None else output - - return SbertForPreTrainingOutput( - loss=total_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """Sbert Model with a `language modeling` head on top for CLM fine-tuning. 
""", - SBERT_START_DOCSTRING) -class SbertLMHeadModel(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config: SbertConfig): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `SbertLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.bert = SbertModel(config, add_pooling_layer=False) - self.cls = SbertOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, - `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are - ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of - shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- - Returns: - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - if not return_dict: - output = (prediction_scores, ) + outputs[2:-1] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """Sbert Model with a `language modeling` head on top. 
""", - SBERT_START_DOCSTRING) -class SbertForMaskedLM(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config: SbertConfig): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.bert = SbertModel(config) - self.cls = SbertOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:-1] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - effective_batch_size = input_shape[0] - - # add a dummy token - assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' - attention_mask_zero = attention_mask.new_zeros( - (attention_mask.shape[0], 1)) - attention_mask = torch.cat([attention_mask, attention_mask_zero], - dim=-1) - dummy_token = torch.full((effective_batch_size, 1), - self.config.pad_token_id, - dtype=torch.long, - device=input_ids.device) - input_ids = torch.cat([input_ids, dummy_token], dim=1) - - return {'input_ids': input_ids, 'attention_mask': attention_mask} - - -@add_start_docstrings( - """Sbert Model 
with a `next sentence prediction (classification)` head on top. """, - SBERT_START_DOCSTRING, -) -class SbertForNextSentencePrediction(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - - self.bert = SbertModel(config) - self.cls = SbertOnlyNSPHead(config) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - - Returns: - - """ - - if 'next_sentence_label' in kwargs: - warnings.warn( - 'The `next_sentence_label` argument is deprecated and will be removed ' - 'in a future version, use `labels` instead.', - FutureWarning, - ) - labels = kwargs.pop('next_sentence_label') - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - seq_relationship_scores = self.cls(pooled_output) - - next_sentence_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct( - seq_relationship_scores.view(-1, 2), labels.view(-1)) - - if not return_dict: - output = (seq_relationship_scores, ) + outputs[2:-1] - return ((next_sentence_loss, ) - + output) if next_sentence_loss is not None else output - - return NextSentencePredictorOutput( - loss=next_sentence_loss, - logits=seq_relationship_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled - output) e.g. for GLUE tasks. 
- """, - SBERT_START_DOCSTRING, -) -class SbertForSequenceClassification(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - outputs['logits'] = logits - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
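# --- Illustrative sketch, not part of the diff ---
# How the `num_labels` / `problem_type` switch described above maps to a concrete loss,
# following the `compute_loss` logic below; every shape here is made up for illustration.
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

batch_size, num_labels = 4, 3
logits = torch.randn(batch_size, num_labels)

# single_label_classification: integer class ids -> cross-entropy
labels = torch.randint(0, num_labels, (batch_size,))
ce_loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))

# regression (num_labels == 1): float targets -> mean-squared error
reg_logits, reg_labels = torch.randn(batch_size, 1), torch.randn(batch_size)
mse_loss = MSELoss()(reg_logits.squeeze(), reg_labels.squeeze())

# multi_label_classification: independent 0/1 targets per label -> BCE with logits
multi_labels = torch.randint(0, 2, (batch_size, num_labels)).float()
bce_loss = BCEWithLogitsLoss()(logits, multi_labels)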
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - return self.compute_loss(outputs, labels, **outputs.kwargs) - - def compute_loss(self, outputs, labels, **kwargs): - logits = outputs.logits - embedding_output = outputs.embedding_output - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. 
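# --- Illustrative sketch, not part of the diff ---
# The reshaping trick the multiple-choice head relies on: the choice dimension is flattened
# into the batch for the encoder, and the 1-unit classifier's outputs are folded back to
# (batch_size, num_choices) before the cross-entropy. Shapes here are hypothetical.
import torch
import torch.nn as nn

batch_size, num_choices, seq_len, hidden_size = 2, 4, 10, 768
input_ids = torch.randint(0, 1000, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))            # (batch_size * num_choices, seq_len)
pooled_output = torch.randn(flat_input_ids.size(0), hidden_size)   # stand-in for the pooler output

classifier = nn.Linear(hidden_size, 1)
reshaped_logits = classifier(pooled_output).view(-1, num_choices)  # (batch_size, num_choices)
choice_labels = torch.randint(0, num_choices, (batch_size,))
loss = nn.CrossEntropyLoss()(reshaped_logits, choice_labels)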
- """, - SBERT_START_DOCSTRING, -) -class SbertForMultipleChoice(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, 1) - - self.init_weights() - - def _forward_call(self, num_choices, **kwargs): - outputs = self.bert(**kwargs) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - outputs['logits'] = logits.view(-1, num_choices) - kwargs['num_choices'] = num_choices - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format( - 'batch_size, num_choices, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., - num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See - :obj:`input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - num_choices=num_choices) - - reshaped_logits = outputs.logits - kwargs = outputs.kwargs - embedding_output = outputs.embedding_output - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=reshaped_logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - - return MultipleChoiceModelOutput( - loss=loss, - 
logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. - """, - SBERT_START_DOCSTRING, -) -class SbertForTokenClassification(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config, add_pooling_layer=False) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - outputs['logits'] = logits - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - - 1]``. 
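# --- Illustrative sketch, not part of the diff ---
# The padding-aware token-classification loss used in the forward body below: positions where
# `attention_mask` is 0 get their label replaced by the loss function's `ignore_index`, so
# padding never contributes to the per-token cross-entropy. All sizes are illustrative.
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.randint(0, num_labels, (batch_size, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

loss_fct = CrossEntropyLoss()                                       # ignore_index defaults to -100
active_loss = attention_mask.view(-1) == 1
active_labels = torch.where(active_loss, labels.view(-1),
                            torch.tensor(loss_fct.ignore_index).type_as(labels))
loss = loss_fct(logits.view(-1, num_labels), active_labels)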
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - - logits = outputs.logits - embedding_output = outputs.embedding_output - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - with_attention_mask=attention_mask is not None, - **outputs.kwargs) - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - """, - SBERT_START_DOCSTRING, -) -class SbertForQuestionAnswering(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config, add_pooling_layer=False) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - sequence_output = outputs[0] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - outputs['logits'] = (start_logits, end_logits) - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. 
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - return self.compute_loss(outputs, start_positions, end_positions, - **outputs.kwargs) - - def compute_loss(self, - outputs, - start_positions=None, - end_positions=None, - **kwargs): - start_logits, end_logits = outputs.logits - embedding_output = outputs.embedding_output - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - if self.config.adv_grad_factor is not None and self.training: - total_loss = compute_adv_loss_pair( - embedding=embedding_output, - model=self._forward_call, - start_logits=start_logits, - end_logits=end_logits, - ori_loss=total_loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/structbert/text_classification.py b/modelscope/models/nlp/structbert/text_classification.py new file mode 100644 index 00000000..044cf8d0 --- /dev/null +++ b/modelscope/models/nlp/structbert/text_classification.py @@ -0,0 +1,235 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .adv_utils import compute_adv_loss +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +@MODELS.register_module( + Tasks.text_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.nli, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentence_similarity, module_name=Models.structbert) +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.structbert) +class SbertForSequenceClassification(SbertPreTrainedModel): + r"""StructBERT Model transformer with a sequence classification/regression head on top + (a linear layer on top of the pooled output) e.g. for GLUE tasks. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the text classification model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in ModelScope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. 
+ """ + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + + SbertForSequenceClassification.base_model_prefix = getattr( + config, 'base_model_prefix', + SbertForSequenceClassification.base_model_prefix) + setattr(self, self.base_model_prefix, SbertModel(config)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.base_model(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. 
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor(('这是个测试', '这也是个测试')))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins(('这是个测试', '这也是个测试'))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, labels, **outputs.kwargs) + + def compute_loss(self, outputs, labels, **kwargs): + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/structbert/token_classification.py b/modelscope/models/nlp/structbert/token_classification.py new file mode 100644 index 00000000..a040ff3e --- /dev/null +++ b/modelscope/models/nlp/structbert/token_classification.py @@ -0,0 +1,229 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. 
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import TokenClassifierOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .adv_utils import compute_adv_loss +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +@MODELS.register_module( + Tasks.token_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) +@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert) +class SbertForTokenClassification(SbertPreTrainedModel): + r"""StructBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) + e.g. for Named-Entity-Recognition (NER) tasks. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the token-classification model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.TokenClassificationPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in modelscope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. 
+ """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + setattr(self, self.base_model_prefix, + SbertModel(config, add_pooling_layer=False)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + offset_mapping=None, + label_mask=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. 
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + with_attention_mask=attention_mask is not None, + **outputs.kwargs) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=offset_mapping, + ) diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py b/modelscope/models/nlp/structbert/tokenization.py similarity index 100% rename from modelscope/models/nlp/structbert/tokenization_sbert.py rename to modelscope/models/nlp/structbert/tokenization.py diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_fast.py similarity index 99% rename from modelscope/models/nlp/structbert/tokenization_sbert_fast.py rename to modelscope/models/nlp/structbert/tokenization_fast.py index a0a81121..6f7b7ba7 100644 --- a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py +++ 
b/modelscope/models/nlp/structbert/tokenization_fast.py @@ -24,7 +24,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger -from .tokenization_sbert import SbertTokenizer +from .tokenization import SbertTokenizer logger = get_logger(__name__) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 38359044..e733efe2 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -7,6 +7,9 @@ if TYPE_CHECKING: from .information_extraction import InformationExtractionModel from .feature_extraction import FeatureExtractionModel from .fill_mask import FillMaskModel + from .nncrf_for_named_entity_recognition import ( + TransformerCRFForNamedEntityRecognition, + LSTMCRFForNamedEntityRecognition) from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel @@ -17,6 +20,10 @@ else: 'information_extraction': ['InformationExtractionModel'], 'feature_extraction': ['FeatureExtractionModel'], 'fill_mask': ['FillMaskModel'], + 'nncrf_for_named_entity_recognition': [ + 'TransformerCRFForNamedEntityRecognition', + 'LSTMCRFForNamedEntityRecognition' + ], 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py index 069c37aa..9360ec08 100644 --- a/modelscope/models/nlp/task_models/feature_extraction.py +++ b/modelscope/models/nlp/task_models/feature_extraction.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict import numpy as np @@ -31,13 +32,8 @@ class FeatureExtractionModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: - # backbone do not need labels, only head need for loss compute - labels = input.pop(OutputKeys.LABELS, None) - + input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - if labels is not None: - input[OutputKeys.LABELS] = labels - + sequence_output = outputs.last_hidden_state return {OutputKeys.TEXT_EMBEDDING: sequence_output} diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py index f7ef1cc2..0f7d3345 100644 --- a/modelscope/models/nlp/task_models/fill_mask.py +++ b/modelscope/models/nlp/task_models/fill_mask.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
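The task-model forward methods in this change (feature extraction above, fill-mask and information extraction below) switch from tuple unpacking via extract_backbone_outputs to reading the backbone output by attribute, e.g. outputs.last_hidden_state. A toy illustration of the difference, using a local dataclass as a stand-in for the backbone's output object (not ModelScope's actual ModelOutputBase):

from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class BackboneOutput:
    last_hidden_state: torch.Tensor
    pooler_output: Optional[torch.Tensor] = None


outputs = BackboneOutput(last_hidden_state=torch.randn(2, 8, 16),
                         pooler_output=torch.randn(2, 16))

# Old style: sequence_output, pooled_output = outputs   (positions must stay stable)
# New style: fields are read by name and unused fields are simply ignored.
sequence_output = outputs.last_hidden_state
print(sequence_output.shape)   # torch.Size([2, 8, 16])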
from typing import Any, Dict import numpy as np @@ -36,7 +37,7 @@ class FillMaskModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + sequence_output = outputs.last_hidden_state outputs = self.head.forward(sequence_output) if labels is not None: diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index a206c2fc..ce0e21a3 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -33,7 +33,7 @@ class InformationExtractionModel(SingleBackboneTaskModelBase): def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + sequence_output = outputs.last_hidden_state outputs = self.head.forward(sequence_output, input['text'], input['offsets']) return {OutputKeys.SPO_LIST: outputs} diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py similarity index 83% rename from modelscope/models/nlp/nncrf_for_named_entity_recognition.py rename to modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py index 8b0c59b2..017e35e5 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py @@ -12,6 +12,7 @@ from transformers import AutoConfig, AutoModel from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.builder import MODELS +from modelscope.outputs import TokenClassifierWithPredictionsOutput from modelscope.utils.constant import ModelFile, Tasks __all__ = [ @@ -39,28 +40,116 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): def eval(self): return self.model.eval() - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + offset_mapping=None, + label_mask=None, + ) -> Dict[str, Any]: + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. 
+ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ input_tensor = { - 'input_ids': input['input_ids'], - 'attention_mask': input['attention_mask'], - 'label_mask': input['label_mask'], + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, } output = { - 'text': input['text'], - 'offset_mapping': input['offset_mapping'], + 'offset_mapping': offset_mapping, **input_tensor, **self.model(input_tensor) } return output - def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + def postprocess(self, input: Dict[str, Any], **kwargs): predicts = self.model.decode(input) - output = { - 'text': input['text'], - 'offset_mapping': input['offset_mapping'], - 'predicts': predicts['predicts'].squeeze(0).cpu().numpy(), - } - return output + offset_len = len(input['offset_mapping']) + predictions = torch.narrow( + predicts, 1, 0, + offset_len) # index_select only move loc, not resize + return TokenClassifierWithPredictionsOutput( + loss=None, + logits=None, + hidden_states=None, + attentions=None, + offset_mapping=input['offset_mapping'], + predictions=predictions, + ) @MODELS.register_module( @@ -133,8 +222,7 @@ class TransformerCRF(nn.Module): inputs['label_mask'].shape[1], device=seq_lens.device)[None, :] < seq_lens[:, None] predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) - outputs = {'predicts': predicts} - return outputs + return predicts class LSTMCRF(nn.Module): @@ -183,8 +271,7 @@ class LSTMCRF(nn.Module): inputs['label_mask'].shape[1], device=seq_lens.device)[None, :] < seq_lens[:, None] predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) - outputs = {'predicts': predicts} - return outputs + return predicts class CRF(nn.Module): diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 1f5e46c3..6c0c09a2 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -1,8 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
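Two small tensor idioms appear in the CRF decode and postprocess code above: building a padding mask from per-sample sequence lengths by broadcasting, and trimming predictions to the offset-mapping length with torch.narrow, which returns a view rather than a resized copy. A standalone sketch with illustrative values (no CRF involved here):

import torch

# 1) Boolean padding mask from per-sample lengths: positions [0, max_len)
#    are compared against each sample's length via broadcasting.
seq_lens = torch.tensor([3, 5])                  # two samples in the batch
max_len = 6
mask = torch.arange(max_len)[None, :] < seq_lens[:, None]
print(mask)
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])

# 2) Trim predictions to the offset-mapping length with torch.narrow,
#    which only adjusts the start/length of a view and does not copy data.
predicts = torch.arange(12).view(2, 6)           # fake (batch, seq_len) predictions
offset_len = 4
predictions = torch.narrow(predicts, 1, 0, offset_len)
print(predictions.shape)                         # torch.Size([2, 4])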
-import os from typing import Any, Dict -import json import numpy as np from modelscope.metainfo import TaskModels @@ -16,11 +14,6 @@ from modelscope.utils.hub import parse_label_mapping __all__ = ['SequenceClassificationModel'] -@MODELS.register_module( - Tasks.sentence_similarity, module_name=TaskModels.text_classification) -@MODELS.register_module(Tasks.nli, module_name=TaskModels.text_classification) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=TaskModels.text_classification) @MODELS.register_module( Tasks.text_classification, module_name=TaskModels.text_classification) class SequenceClassificationModel(SingleBackboneTaskModelBase): @@ -54,25 +47,10 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + pooled_output = outputs.pooler_output outputs = self.head.forward(pooled_output) if labels is not None: input[OutputKeys.LABELS] = labels loss = self.compute_loss(outputs, labels) outputs.update(loss) return outputs - - def extract_logits(self, outputs): - return outputs[OutputKeys.LOGITS].cpu().detach() - - def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - probs = logits.softmax(-1).numpy() - pred = logits.argmax(-1).numpy() - logits = logits.numpy() - res = { - OutputKeys.PREDICTIONS: pred, - OutputKeys.PROBABILITIES: probs, - OutputKeys.LOGITS: logits - } - return res diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py index 0b43044f..8c83517a 100644 --- a/modelscope/models/nlp/task_models/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -404,7 +404,7 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def build_backbone(self, cfg): if 'prefix' in cfg: self._backbone_prefix = cfg['prefix'] - backbone = build_backbone(cfg, field=Fields.nlp) + backbone = build_backbone(cfg) setattr(self, cfg['prefix'], backbone) def build_head(self, cfg): @@ -414,7 +414,7 @@ class SingleBackboneTaskModelBase(BaseTaskModel): ) if 'prefix' in cfg: self._head_prefix = cfg['prefix'] - head = build_head(cfg, group_key=self.group_key) + head = build_head(cfg, task_name=self.group_key) setattr(self, self._head_prefix, head) return head diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index a39f58bf..2739bf11 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -8,7 +8,7 @@ from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys +from modelscope.outputs import OutputKeys, TokenClassifierOutput from modelscope.utils.constant import Tasks from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, @@ -53,27 +53,20 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - outputs = self.head.forward(sequence_output) + sequence_output = outputs[0] + logits = self.head.forward(sequence_output) + loss = None if labels in input: loss = self.compute_loss(outputs, labels) - 
outputs.update(loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=input['offset_mapping'], + ) return outputs def extract_logits(self, outputs): return outputs[OutputKeys.LOGITS].cpu().detach() - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - return sequence_output, pooled_output - - def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - pred = torch.argmax(logits[0], dim=-1) - pred = torch_nested_numpify(torch_nested_detach(pred)) - logits = torch_nested_numpify(torch_nested_detach(logits)) - res = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} - return res diff --git a/modelscope/models/nlp/text_ranking.py b/modelscope/models/nlp/text_ranking.py deleted file mode 100644 index 5bc0635a..00000000 --- a/modelscope/models/nlp/text_ranking.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict - -import numpy as np -import torch - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp import SbertForSequenceClassification -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['TextRanking'] - - -@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert) -class TextRanking(SbertForSequenceClassification, SbertPreTrainedModel): - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir, *args, **kwargs): - if hasattr(config, 'base_model_prefix'): - TextRanking.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - self.train_batch_size = kwargs.get('train_batch_size', 4) - self.register_buffer( - 'target_label', - torch.zeros(self.train_batch_size, dtype=torch.long)) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=True) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - outputs = self.base_model.forward(**input) - - # backbone model should return pooled_output as its second output - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if self.base_model.training: - scores = logits.view(self.train_batch_size, -1) - loss_fct = torch.nn.CrossEntropyLoss() - loss = loss_fct(scores, self.target_label) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def sigmoid(self, logits): - return np.exp(logits) / (1 + np.exp(logits)) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - logits = inputs['logits'].squeeze(-1).detach().cpu().numpy() - logits = self.sigmoid(logits).tolist() - result = {OutputKeys.SCORES: logits} - return result - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. 
- Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (1 classes). - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - num_labels = kwargs.get('num_labels', 1) - model_args = {} if num_labels is None else {'num_labels': num_labels} - - return super(SbertPreTrainedModel, TextRanking).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py deleted file mode 100644 index e58967a5..00000000 --- a/modelscope/models/nlp/token_classification.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from abc import abstractmethod -from typing import Dict - -import numpy as np -import torch -from torch import nn - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import BertPreTrainedModel -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) - -__all__ = ['SbertForTokenClassification'] - - -class TokenClassification(TorchModel): - """A token classification base class for all the fitted token classification models. - """ - - base_model_prefix: str = 'bert' - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.num_labels = config.num_labels - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - @abstractmethod - def build_base_model(self): - """Build the backbone model. - - Returns: the backbone instance. - """ - pass - - @property - def base_model(self): - return getattr(self, self.base_model_prefix) - - def compute_loss(self, logits, labels, **kwargs): - """Compute loss. - - For example, if backbone is pretrained model, there will be a 'attention_mask' parameter to skip - useless tokens. - - Args: - logits: The logits from the classifier - labels: The labels - **kwargs: Other input params. - - Returns: The loss. 
- - """ - pass - - def forward(self, **kwargs): - labels = None - if OutputKeys.LABEL in kwargs: - labels = kwargs.pop(OutputKeys.LABEL) - elif OutputKeys.LABELS in kwargs: - labels = kwargs.pop(OutputKeys.LABELS) - - outputs = self.base_model(**kwargs) - # base model should return the sequence_output as its first output - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - if labels is not None: - loss = self.compute_loss(logits, labels, **kwargs) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def postprocess(self, input: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - logits = input[OutputKeys.LOGITS] - pred = torch.argmax(logits[0], dim=-1) - pred = torch_nested_numpify(torch_nested_detach(pred)) - logits = torch_nested_numpify(torch_nested_detach(logits)) - rst = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} - return rst - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) -@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert) -@MODELS.register_module( - Tasks.token_classification, module_name=Models.structbert) -class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): - """Sbert token classification model. - - Inherited from TokenClassification. - """ - - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - SbertForTokenClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=False) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - labels=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - labels=labels) - - def compute_loss(self, logits, labels, attention_mask=None, **kwargs): - """Compute the loss with an attention mask. - - @param logits: The logits output from the classifier. - @param labels: The labels. - @param attention_mask: The attention_mask. - @param kwargs: Unused input args. - @return: The loss - """ - loss_fct = nn.CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - return loss_fct(active_logits, active_labels) - else: - return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(SbertPreTrainedModel, - SbertForTokenClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) -@MODELS.register_module(Tasks.token_classification, module_name=Models.bert) -class BertForTokenClassification(TokenClassification, BertPreTrainedModel): - """Bert token classification model. - - Inherited from TokenClassificationBase. - """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - BertForTokenClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .bert import BertModel - return BertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(BertPreTrainedModel, - BertForTokenClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/veco/__init__.py b/modelscope/models/nlp/veco/__init__.py index 0fe786fd..0774e9b4 100644 --- a/modelscope/models/nlp/veco/__init__.py +++ b/modelscope/models/nlp/veco/__init__.py @@ -18,18 +18,22 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_veco import VecoConfig - from .modeling_veco import (VecoForMaskedLM, VecoForSequenceClassification, - VecoModel) - from .tokenization_veco import VecoTokenizer - from .tokenization_veco_fast import VecoTokenizerFast + from .configuration import VecoConfig + from .backbone import VecoModel + from .text_classification import VecoForSequenceClassification + from .token_classification import VecoForTokenClassification + from .fill_mask import VecoForMaskedLM + from .tokenization import VecoTokenizer + from .tokenization_fast import VecoTokenizerFast else: _import_structure = { - 'configuration_veco': ['VecoConfig'], - 'modeling_veco': - ['VecoForMaskedLM', 'VecoForSequenceClassification', 'VecoModel'], - 'tokenization_veco': ['VecoTokenizer'], - 'tokenization_veco_fast': ['VecoTokenizerFast'], + 'configuration': ['VecoConfig'], + 'backbone': ['VecoModel'], + 'text_classification': ['VecoForSequenceClassification'], + 'fill_mask': ['VecoForMaskedLM'], + 'token_classification': ['VecoForTokenClassification'], + 'tokenization': ['VecoTokenizer'], + 'tokenization_fast': ['VecoTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/veco/backbone.py b/modelscope/models/nlp/veco/backbone.py new file mode 100644 index 00000000..98d8c30a --- /dev/null +++ b/modelscope/models/nlp/veco/backbone.py @@ -0,0 +1,96 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Veco model. 
mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +from transformers import RobertaModel + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .configuration import VecoConfig + +logger = logging.get_logger(__name__) + +VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] + + +@MODELS.register_module(Tasks.backbone, module_name=Models.veco) +class VecoModel(TorchModel, RobertaModel): + """The bare Veco Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutputWithEmbedding` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> print(model(**preprocessor('这是个测试'))) + + """ + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionBackboneModelOutput( + last_hidden_state=outputs.last_hidden_state, + pooler_output=outputs.pooler_output, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = VecoConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model diff --git a/modelscope/models/nlp/veco/configuration_veco.py b/modelscope/models/nlp/veco/configuration.py similarity index 100% rename from modelscope/models/nlp/veco/configuration_veco.py rename to modelscope/models/nlp/veco/configuration.py diff --git a/modelscope/models/nlp/veco/fill_mask.py b/modelscope/models/nlp/veco/fill_mask.py new file mode 100644 index 00000000..de2cdb4a --- /dev/null +++ b/modelscope/models/nlp/veco/fill_mask.py @@ -0,0 +1,99 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. 
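VecoModel above inherits from both TorchModel and RobertaModel and runs two initializers: super().__init__(config.name_or_path, **kwargs) initializes the ModelScope side, while super(Model, self).__init__(config) presumably skips everything up to and including Model in the MRO so the transformers-style base is initialized with its config. A toy reproduction of that pattern with stand-in classes (Framework and HFStyleBackbone are illustrative, not real ModelScope or transformers classes, and the hierarchy is simplified):

class Framework:                       # stands in for the ModelScope Model/TorchModel side
    def __init__(self, model_dir):
        self.model_dir = model_dir


class HFStyleBackbone:                 # stands in for RobertaModel
    def __init__(self, config):
        self.config = config


class Combined(Framework, HFStyleBackbone):
    def __init__(self, config):
        super().__init__(config['name_or_path'])        # Framework.__init__
        super(Framework, self).__init__(config)         # skips Framework in the MRO -> HFStyleBackbone.__init__


m = Combined({'name_or_path': './model_dir'})
print(m.model_dir, m.config['name_or_path'])            # ./model_dir ./model_dir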
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import RobertaForMaskedLM + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils.constant import Tasks +from .configuration import VecoConfig + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) +class VecoForMaskedLM(TorchModel, RobertaForMaskedLM): + """Veco Model transformer with a masked language model head on top (a linear layer on top of the + pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.NLPPreprocessor`. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the + appropriate documentation alongside usage examples. 
+ """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可不动我。')) + """ + + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionFillMaskModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=kwargs['input_ids'], + ) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = VecoConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model diff --git a/modelscope/models/nlp/veco/modeling_veco.py b/modelscope/models/nlp/veco/modeling_veco.py deleted file mode 100644 index b519c236..00000000 --- a/modelscope/models/nlp/veco/modeling_veco.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Veco model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" - -from transformers import (RobertaForMaskedLM, RobertaForMultipleChoice, - RobertaForQuestionAnswering, - RobertaForSequenceClassification, - RobertaForTokenClassification, RobertaModel) -from transformers.file_utils import add_start_docstrings - -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.utils import logger as logging -from modelscope.utils.constant import Fields -from .configuration_veco import VecoConfig - -logger = logging.get_logger(__name__) - -VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] - -VECO_START_DOCSTRING = r""" - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) - subclass. 
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. - - Parameters: - config ([`VecoConfig`]): Model configuration class with all the parameters of the - model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model - weights. -""" - - -@add_start_docstrings( - 'The bare Veco Model transformer outputting raw hidden-states without any specific head on top.', - VECO_START_DOCSTRING, -) -class VecoModel(RobertaModel): - """ - This class overrides [`RobertaModel`]. Please check the superclass for the appropriate - documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForSequenceClassification(RobertaForSequenceClassification): - """ - This class overrides [`RobertaForSequenceClassification`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model transformer with a masked language model head on top (a linear layer on top of the - pooled output). - """, - VECO_START_DOCSTRING, -) -class VecoForMaskedLM(RobertaForMaskedLM): - """ - This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a multiple choice classification head on top (a linear layer on top of the pooled output and - a softmax) e.g. for RocStories/SWAG tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForMultipleChoice(RobertaForMultipleChoice): - """ - This class overrides [`RobertaForMultipleChoice`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. - for Named-Entity-Recognition (NER) tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForTokenClassification(RobertaForTokenClassification): - """ - This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a span classification head on top for extractive question-answering tasks like SQuAD (a - linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - """, - VECO_START_DOCSTRING, -) -class VecoForQuestionAnswering(RobertaForQuestionAnswering): - """ - This class overrides [`RobertaForQuestionAnswering`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig diff --git a/modelscope/models/nlp/veco/text_classification.py b/modelscope/models/nlp/veco/text_classification.py new file mode 100644 index 00000000..e4e74d8f --- /dev/null +++ b/modelscope/models/nlp/veco/text_classification.py @@ -0,0 +1,150 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. 
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import RobertaForSequenceClassification + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from .configuration import VecoConfig + + +@MODELS.register_module(Tasks.nli, module_name=Models.veco) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.veco) +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) +@MODELS.register_module(Tasks.text_classification, module_name=Models.veco) +class VecoForSequenceClassification(TorchModel, + RobertaForSequenceClassification): + """Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the text classification model of Veco, the preprocessor of this model + is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + + Trainer: + This model should be trained by dataset which has mixed languages, + and evaluated by datasets of languages one by one. + For example, if the training dataset is xnli (which has sub datasets of multiple languages), then you + should mix the sub-datasets with the languages you want to train to one training dataset, and evaluate + the model one sub-dataset by one sub-dataset of different languages. + This procedure can be done by custom code. If you are using trainer of ModelScope, + the `VecoTrainer` is suggested to use to train this model. This trainer overrides the basic evaluation + loop, and will call the evaluation dataset one by one. Besides, this trainer will use the `VecoTaskDataset` + to mix the input datasets to one, you can check the API Doc for the details. + + To check the complete example please + view the unittest `test_veco_xnli` in `tests.trainers.test_finetune_sequence_classification.py` + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaForSequenceClassification`]. 
Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large', + >>> task='text-classification', num_labels=2) + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large', + >>> label2id={'0': 0, '1': 1}) + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('这是个测试'))) + >>> # Call the pipeline, the result may be incorrect + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', pipeline_name='text-classification', + >>> model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('这是个测试')) + """ + + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionTextClassificationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = VecoConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model diff --git a/modelscope/models/nlp/veco/token_classification.py b/modelscope/models/nlp/veco/token_classification.py new file mode 100644 index 00000000..f6252209 --- /dev/null +++ b/modelscope/models/nlp/veco/token_classification.py @@ -0,0 +1,107 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
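The _instantiate method above reconciles label2id, id2label and num_labels, deriving whichever pieces are missing from the ones supplied. A minimal sketch of that reconciliation as a plain function (resolve_labels is a hypothetical helper, not a ModelScope API; the fallback argument stands in for parse_label_mapping(model_dir)):

def resolve_labels(label2id=None, id2label=None, fallback=None):
    if label2id is None:
        label2id = fallback
    if id2label is None and label2id is not None:
        id2label = {i: label for label, i in label2id.items()}
    if label2id is None and id2label is not None:
        label2id = {label: i for i, label in id2label.items()}
    num_labels = None if label2id is None else len(label2id)
    return label2id, id2label, num_labels


# Either direction of the mapping is enough to derive the other and num_labels:
print(resolve_labels(id2label={0: 'negative', 1: 'positive'}))
# ({'negative': 0, 'positive': 1}, {0: 'negative', 1: 'positive'}, 2)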
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import RobertaForTokenClassification + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTokenClassificationModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from .configuration import VecoConfig + + +@MODELS.register_module(Tasks.token_classification, module_name=Models.veco) +class VecoForTokenClassification(TorchModel, RobertaForTokenClassification): + """Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionTokenClassificationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). 
+ + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = VecoConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model diff --git a/modelscope/models/nlp/veco/tokenization_veco.py b/modelscope/models/nlp/veco/tokenization.py similarity index 100% rename from modelscope/models/nlp/veco/tokenization_veco.py rename to modelscope/models/nlp/veco/tokenization.py diff --git a/modelscope/models/nlp/veco/tokenization_veco_fast.py b/modelscope/models/nlp/veco/tokenization_fast.py similarity index 99% rename from modelscope/models/nlp/veco/tokenization_veco_fast.py rename to modelscope/models/nlp/veco/tokenization_fast.py index 3edae0e7..b41a5c3b 100644 --- a/modelscope/models/nlp/veco/tokenization_veco_fast.py +++ b/modelscope/models/nlp/veco/tokenization_fast.py @@ -27,7 +27,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast from modelscope.utils import logger as logging if is_sentencepiece_available(): - from .tokenization_veco import VecoTokenizer + from .tokenization import VecoTokenizer else: VecoTokenizer = None diff --git a/modelscope/msdatasets/task_datasets/torch_base_dataset.py b/modelscope/msdatasets/task_datasets/torch_base_dataset.py index 014e4faa..4d82b741 100644 --- a/modelscope/msdatasets/task_datasets/torch_base_dataset.py +++ b/modelscope/msdatasets/task_datasets/torch_base_dataset.py @@ -19,6 +19,7 @@ class TorchTaskDataset(TaskDataset, Dataset): preprocessor=None, **kwargs): TaskDataset.__init__(self, datasets, mode, preprocessor, **kwargs) + self.trainer = None def __getitem__(self, index) -> Any: return self.prepare_sample(self._inner_dataset[index]) diff --git a/modelscope/outputs/__init__.py b/modelscope/outputs/__init__.py new file mode 100644 index 00000000..47e66714 --- /dev/null +++ b/modelscope/outputs/__init__.py @@ -0,0 +1,2 @@ +from .nlp.model_outputs import * # noqa +from .outputs import TASK_OUTPUTS, ModelOutputBase, OutputKeys diff --git a/modelscope/preprocessors/space_T_cn/fields/__init__.py b/modelscope/outputs/nlp/__init__.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/fields/__init__.py rename to modelscope/outputs/nlp/__init__.py diff --git a/modelscope/outputs/nlp/model_outputs.py b/modelscope/outputs/nlp/model_outputs.py new file mode 100644 index 00000000..dcb37145 --- /dev/null +++ b/modelscope/outputs/nlp/model_outputs.py @@ -0,0 +1,543 @@ +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +from modelscope.outputs.outputs import ModelOutputBase + +Tensor = Union['torch.Tensor', 'tf.Tensor'] + + +@dataclass +class TextClassificationModelOutput(ModelOutputBase): + """The output class for text classification models. 
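# A short sketch of the reorganized import surface introduced above: modelscope/outputs.py
# becomes the modelscope.outputs package and the NLP output dataclasses from
# outputs/nlp/model_outputs.py are re-exported at package level. Assumes a modelscope build
# that already contains this change.
from modelscope.outputs import ModelOutputBase, OutputKeys, TASK_OUTPUTS
from modelscope.outputs import AttentionTokenClassificationModelOutput
from modelscope.utils.constant import Tasks

print(TASK_OUTPUTS[Tasks.named_entity_recognition])                           # [OutputKeys.OUTPUT]
print(issubclass(AttentionTokenClassificationModelOutput, ModelOutputBase))   # True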
+ + Args: + logits (`Tensor`): The logits output of the model. loss (`Tensor`, + *optional*) The loss of the model, available when training. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + + +@dataclass +class TokenClassificationModelOutput(ModelOutputBase): + """The output class for token classification models. + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + """ + + logits: Tensor = None + loss: Tensor = None + offset_mapping: Tensor = None + + +@dataclass +class FillMaskModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + input_ids (`Tensor`, *optional*) The input id tensor fed into the model. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + input_ids: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class TokenClassifierOutput(ModelOutputBase): + """ + Base class for outputs of token classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided) : + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + + """ + + loss: Tensor = None + logits: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + offset_mapping: Tensor = None + + +@dataclass +class TokenClassifierWithPredictionsOutput(ModelOutputBase): + """ + Base class for outputs of token classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided) : + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.num_labels)`): + Classification scores (before SoftMax). 
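# A quick usage sketch of the output dataclasses declared above, assuming torch and a
# modelscope build with this change are installed. Every field defaults to None, so only
# what the model actually produced needs to be filled in (toy tensor sizes here).
import torch
from modelscope.outputs import FillMaskModelOutput

output = FillMaskModelOutput(
    logits=torch.randn(1, 6, 21128),            # (batch, seq_len, vocab_size)
    input_ids=torch.randint(0, 21128, (1, 6)),
)
print(output.logits.shape, output.loss)          # torch.Size([1, 6, 21128]) None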
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + predictions: A PyTorch tensor of the best tag sequence for each batch of shape + (nbest, batch_size, seq_length) + + """ + + loss: Tensor = None + logits: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + offset_mapping: Tensor = None + predictions: Tensor = None + + +@dataclass +class BaseModelOutput(ModelOutputBase): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + last_hidden_state: Tensor = None + hidden_states: Optional[Tuple[Tensor]] = None + attentions: Optional[Tuple[Tensor]] = None + + +@dataclass +class BackboneModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + last_hidden_state (`Tensor`, *optional*): Sequence of hidden-states at + the output of the last layer of the model. + pooler_output (`Tensor`, *optional*) The tensor of the pooled hidden state. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at + the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Tensor = None + pooler_output: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionBackboneModelOutput(BackboneModelOutput): + """The output class for backbones of attention based models. 
+ + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + past_key_values: Tensor = None + cross_attentions: Tensor = None + + +@dataclass +class AttentionTextClassificationModelOutput(TextClassificationModelOutput): + """The output class for backbones of attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionTokenClassificationModelOutput(TokenClassificationModelOutput): + """The output class for backbones of attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the attention softmax, + used to compute the weighted average in the self-attention heads. + """ + attentions: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionFillMaskModelOutput(FillMaskModelOutput): + """The output class for the fill mask and attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutputBase): + """ + Base class for model's outputs that also contains a pooling of the last + hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, + hidden_size)`): + Last layer hidden-state of the first token of the sequence + (classification token) after further processing through the layers + used for the auxiliary pretraining task. E.g. for BERT-family of + models, this returns the classification token after processing + through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction + (classification) objective during pretraining. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` and `config.add_cross_attention=True` is passed + or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. 
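# A hedged sketch of the repacking the Veco heads' forward() methods perform earlier in this
# patch: a transformers-style output object is converted into the matching ModelScope
# dataclass so downstream pipelines see a uniform interface. The helper name `to_ms_output`
# is hypothetical, and `hf_output` is assumed to expose .loss/.logits/.hidden_states/.attentions.
from modelscope.outputs import AttentionTextClassificationModelOutput

def to_ms_output(hf_output):
    return AttentionTextClassificationModelOutput(
        loss=getattr(hf_output, 'loss', None),
        logits=hf_output.logits,
        hidden_states=getattr(hf_output, 'hidden_states', None),
        attentions=getattr(hf_output, 'attentions', None),
    )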
+ + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that + can be used (see `past_key_values` input) to speed up sequential + decoding. + """ + + last_hidden_state: Tensor = None + pooler_output: Tensor = None + hidden_states: Tensor = None + past_key_values: Tensor = None + attentions: Tensor = None + cross_attentions: Tensor = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutputBase): + """ + Base class for model's outputs that may also contain a past key/values (to + speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + model. + + If `past_key_values` is used only the last hidden-state of the + sequences of shape `(batch_size, 1, hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that + can be used (see `past_key_values` input) to speed up sequential + decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` and `config.add_cross_attention=True` is passed + or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. 
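# A small shape sketch (toy sizes, torch assumed, illustration only) of the past_key_values
# layout described above: one entry per layer, each holding the cached key and value tensors
# of shape (batch_size, num_heads, sequence_length, embed_size_per_head).
import torch

n_layers, batch, heads, seq_len, head_dim = 2, 1, 4, 5, 8
past_key_values = tuple(
    (torch.zeros(batch, heads, seq_len, head_dim),
     torch.zeros(batch, heads, seq_len, head_dim)) for _ in range(n_layers))
assert len(past_key_values) == n_layers and past_key_values[0][0].shape[-1] == head_dim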
+ + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + """ + + last_hidden_state: Tensor = None + past_key_values: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + cross_attentions: Tensor = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutputBase): + """ + Base class for model encoder's outputs that also contains : pre-computed + hidden states that can speed up sequential decoding. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + decoder of the model. + + If `past_key_values` is used only the last hidden-state of the + sequences of shape `(batch_size, 1, hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and in the cross-attention blocks) that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the + optional initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the + optional initial embedding outputs. 
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + """ + + last_hidden_state: Tensor = None + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tensor]] = None + decoder_attentions: Optional[Tuple[Tensor]] = None + cross_attentions: Optional[Tuple[Tensor]] = None + encoder_last_hidden_state: Optional[Tensor] = None + encoder_hidden_states: Optional[Tuple[Tensor]] = None + encoder_attentions: Optional[Tuple[Tensor]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutputBase): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.vocab_size)`): + Prediction scores of the language modeling head (scores for each + vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and in the cross-attention blocks) that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the + initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder of the model. 
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the + initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + """ + + loss: Optional[Tensor] = None + logits: Tensor = None + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tensor]] = None + decoder_attentions: Optional[Tuple[Tensor]] = None + cross_attentions: Optional[Tuple[Tensor]] = None + encoder_last_hidden_state: Optional[Tensor] = None + encoder_hidden_states: Optional[Tuple[Tensor]] = None + encoder_attentions: Optional[Tuple[Tensor]] = None diff --git a/modelscope/outputs.py b/modelscope/outputs/outputs.py similarity index 93% rename from modelscope/outputs.py rename to modelscope/outputs/outputs.py index 34bde76a..721fb271 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs/outputs.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from collections import OrderedDict, namedtuple +from dataclasses import dataclass, fields from modelscope.utils.constant import Tasks @@ -488,7 +490,6 @@ TASK_OUTPUTS = { # ] # } Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS], - Tasks.part_of_speech: [OutputKeys.OUTPUT, OutputKeys.LABELS], # TODO @wenmeng.zwm support list of result check # named entity recognition result for single sample @@ -499,6 +500,7 @@ TASK_OUTPUTS = { # ] # } Tasks.named_entity_recognition: [OutputKeys.OUTPUT], + Tasks.part_of_speech: [OutputKeys.OUTPUT], # text_error_correction result for a single sample # { @@ -779,3 +781,60 @@ TASK_OUTPUTS = { # } Tasks.product_segmentation: [OutputKeys.MASKS], } + + +class ModelOutputBase(list): + + def __post_init__(self): + self.reconstruct() + self.post_init = True + + def reconstruct(self): + # Low performance, but low frequency. 
+ self.clear() + for idx, key in enumerate(self.keys()): + self.append(getattr(self, key)) + + def __getitem__(self, item): + if isinstance(item, str): + if hasattr(self, item): + return getattr(self, item) + elif isinstance(item, (int, slice)): + return super().__getitem__(item) + raise IndexError(f'No Index {item} found in the dataclass.') + + def __setitem__(self, key, value): + if isinstance(key, str): + if key in [f.name for f in fields(self)]: + if key not in self.keys(): + super().__setattr__(key, value) + self.reconstruct() + elif id(getattr(self, key)) != id(value): + super().__setattr__(key, value) + super().__setitem__(self.keys().index(key), value) + else: + super().__setattr__(key, value) + elif isinstance(key, int): + super().__setitem__(key, value) + key_name = self.keys()[key] + super().__setattr__(key_name, value) + + def __setattr__(self, key, value): + if getattr(self, 'post_init', False): + return self.__setitem__(key, value) + else: + return super().__setattr__(key, value) + + def keys(self): + return [ + f.name for f in fields(self) if getattr(self, f.name) is not None + ] + + def items(self): + return self.to_dict().items() + + def to_dict(self): + output = OrderedDict() + for key in self.keys(): + output[key] = getattr(self, key) + return output diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 644749fc..bca80502 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -33,7 +33,7 @@ if is_tf_available(): Tensor = Union['torch.Tensor', 'tf.Tensor'] Input = Union[str, tuple, MsDataset, 'Image.Image', 'numpy.ndarray'] -InputModel = Union[str, Model] +InputModel = Union[str, Model, 'torch.nn.Module'] logger = get_logger() @@ -49,13 +49,7 @@ class Pipeline(ABC): return Model.from_pretrained( model, model_prefetched=True, device=self.device_name) if is_model(model) else model - elif isinstance(model, Model): - return model else: - if model and not isinstance(model, str): - raise ValueError( - f'model type for single model is either str or Model, but got type {type(model)}' - ) return model def initiate_multiple_models(self, input_models: List[InputModel]): @@ -139,12 +133,10 @@ class Pipeline(ABC): def _get_framework(self) -> str: frameworks = [] for m in self.models: - if isinstance(m, Model): - model_dir = m.model_dir - else: - assert isinstance(m, - str), 'model should be either str or Model.' + if isinstance(m, str): model_dir = m + else: + model_dir = m.model_dir cfg_file = osp.join(model_dir, ModelFile.CONFIGURATION) cfg = Config.from_file(cfg_file) frameworks.append(cfg.framework) @@ -387,10 +379,13 @@ class DistributedPipeline(Pipeline): def _instantiate_one(cls, rank, model_dir, **kwargs): """Instantiate one model piece. - @param rank: The model rank. - @param model_dir: The model_dir in the node. - @param kwargs: Any extra args. - @return: None. The model handler should be kept in the class field. + Args: + rank: The model rank. + model_dir: The model_dir in the node. + kwargs: Any extra args. + + Returns: + None. The model handler should be kept in the class field. """ pass @@ -410,8 +405,11 @@ class DistributedPipeline(Pipeline): Use the model handler kept in the class field to forward. - @param inputs: The inputs after the preprocessing. - @return: The forward results. + Args: + inputs: The inputs after the preprocessing. + + Returns: + The forward results. 
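# A short usage sketch of the ModelOutputBase container implemented above, assuming torch and
# a modelscope build with this change. Fields behave as attributes, string keys and positional
# items at the same time, and fields left as None are dropped from keys() and iteration.
import torch
from modelscope.outputs import TextClassificationModelOutput

out = TextClassificationModelOutput(logits=torch.randn(2, 3))   # loss is left as None
print(out.keys())                      # ['logits'] - None fields are skipped
print(out['logits'] is out.logits)     # True - string indexing
print(out[0] is out.logits)            # True - positional indexing (list subclass)
logits, = out                          # tuple-style unpacking also works
out.loss = torch.tensor(0.1)           # assigning a new field re-synchronizes the list view
print(list(out.to_dict().keys()))      # ['logits', 'loss']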
""" pass @@ -429,7 +427,7 @@ def collate_fn(data, device): """ from torch.utils.data.dataloader import default_collate - from modelscope.preprocessors import InputFeatures + from modelscope.preprocessors.nlp import InputFeatures if isinstance(data, dict) or isinstance(data, Mapping): return type(data)({k: collate_fn(v, device) for k, v in data.items()}) elif isinstance(data, (tuple, list)): diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index e1583387..498c9ed8 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -285,9 +285,6 @@ def pipeline(task: str = None, if task is None and pipeline_name is None: raise ValueError('task or pipeline_name is required') - assert isinstance(model, (type(None), str, Model, list)), \ - f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}' - model = normalize_model_input(model, model_revision) if pipeline_name is None: # get default pipeline for this task @@ -304,8 +301,7 @@ def pipeline(task: str = None, else: # used for test case, when model is str and is not hub path pipeline_name = get_pipeline_by_model_name(task, model) - elif isinstance(model, Model) or \ - (isinstance(model, list) and isinstance(model[0], Model)): + elif model is not None: # get pipeline info from Model object first_model = model[0] if isinstance(model, list) else model if not hasattr(first_model, 'pipeline'): diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 677151c0..73bd0d8c 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -6,6 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .automatic_post_editing_pipeline import AutomaticPostEditingPipeline from .conversational_text_to_sql_pipeline import ConversationalTextToSqlPipeline + from .table_question_answering_pipeline import TableQuestionAnsweringPipeline from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline from .dialog_modeling_pipeline import DialogModelingPipeline from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline @@ -14,16 +15,13 @@ if TYPE_CHECKING: from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline - from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .text_ranking_pipeline import TextRankingPipeline from .sentence_embedding_pipeline import SentenceEmbeddingPipeline - from .sequence_classification_pipeline import SequenceClassificationPipeline + from .text_classification_pipeline import TextClassificationPipeline from .summarization_pipeline import SummarizationPipeline - from .table_question_answering_pipeline import TableQuestionAnsweringPipeline from .translation_quality_estimation_pipeline import TranslationQualityEstimationPipeline - from .text_classification_pipeline import TextClassificationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline from .text_generation_pipeline import TextGenerationPipeline from .text2text_generation_pipeline import Text2TextGenerationPipeline @@ -47,13 +45,11 @@ else: 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], 'feature_extraction_pipeline': 
['FeatureExtractionPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], - 'fill_mask_ponet_pipeline': ['FillMaskPoNetPipeline'], 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'named_entity_recognition_pipeline': ['NamedEntityRecognitionPipeline'], 'text_ranking_pipeline': ['TextRankingPipeline'], 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], - 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], 'table_question_answering_pipeline': ['TableQuestionAnsweringPipeline'], diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py index 73c6429d..48df0c40 100644 --- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py +++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py @@ -11,8 +11,6 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import ConversationalTextToSqlPreprocessor -from modelscope.preprocessors.star.fields import (SubPreprocessor, - process_tables) from modelscope.utils.constant import Tasks __all__ = ['ConversationalTextToSqlPipeline'] @@ -39,17 +37,6 @@ class ConversationalTextToSqlPipeline(Pipeline): if preprocessor is None: preprocessor = ConversationalTextToSqlPreprocessor(model.model_dir) - preprocessor.device = 'cuda' if \ - ('device' not in kwargs or kwargs['device'] == 'gpu') \ - and torch.cuda.is_available() else 'cpu' - use_device = True if preprocessor.device == 'cuda' else False - preprocessor.processor = \ - SubPreprocessor(model_dir=model.model_dir, - db_content=True, - use_gpu=use_device) - preprocessor.output_tables = \ - process_tables(preprocessor.processor, - preprocessor.tables) super().__init__(model=model, preprocessor=preprocessor, **kwargs) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index 79d32ace..9520c06f 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -4,7 +4,7 @@ from typing import Any, Dict, Union from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SpaceForDialogStateTracking +from modelscope.models.nlp import SpaceForDST from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES @@ -20,7 +20,7 @@ __all__ = ['DialogStateTrackingPipeline'] class DialogStateTrackingPipeline(Pipeline): def __init__(self, - model: Union[SpaceForDialogStateTracking, str], + model: Union[SpaceForDST, str], preprocessor: DialogStateTrackingPreprocessor = None, **kwargs): """use `model` and `preprocessor` to create a dialog state tracking pipeline for @@ -33,8 +33,7 @@ class DialogStateTrackingPipeline(Pipeline): """ model = model if isinstance( - model, - SpaceForDialogStateTracking) else Model.from_pretrained(model) + model, SpaceForDST) else Model.from_pretrained(model) self.model = model if preprocessor is None: preprocessor = DialogStateTrackingPreprocessor(model.model_dir) diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py index e5c05e86..8499f7ff 100644 --- 
a/modelscope/pipelines/nlp/distributed_plug_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py @@ -27,7 +27,8 @@ class DistributedPlugPipeline(DistributedPipeline): **kwargs): """Create a plug pipeline instance. - @param model: The model_id of plug(damo/nlp_plug_text-generation_27B). + Args: + model: The model_id of plug(damo/nlp_plug_text-generation_27B). The default path to damo/nlp_plug_text-generation_27B can be obtained by function get_cache_dir("damo/nlp_plug_text-generation_27B"), the model should be downloaded to this path before calling this class by model_id. @@ -52,11 +53,11 @@ class DistributedPlugPipeline(DistributedPipeline): |_ mp_rank_05_model_states.pt |_ mp_rank_06_model_states.pt |_ mp_rank_07_model_states.pt - @param preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will + preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will be used as default. - @param first_sequence: The first_sequence key name if the input format is a dict. - @param kwargs: - sequence_length: The input sequence_length. + first_sequence: The first_sequence key name if the input format is a dict. + kwargs: + sequence_length: The input sequence_length. """ if preprocessor is None: preprocessor = TextGenerationPreprocessor( diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 1d46d8fd..fd614e91 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -2,15 +2,12 @@ from typing import Any, Dict, Union -import torch - from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForFaqQuestionAnswering from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['FaqQuestionAnsweringPipeline'] @@ -21,19 +18,19 @@ __all__ = ['FaqQuestionAnsweringPipeline'] class FaqQuestionAnsweringPipeline(Pipeline): def __init__(self, - model: Union[str, SbertForFaqQuestionAnswering], - preprocessor: FaqQuestionAnsweringPreprocessor = None, + model: Union[str, Model], + preprocessor: Preprocessor = None, **kwargs): - model = model if isinstance( - model, - SbertForFaqQuestionAnswering) else Model.from_pretrained(model) - model.eval() + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = FaqQuestionAnsweringPreprocessor( + preprocessor = Preprocessor.from_pretrained( model.model_dir, **kwargs) - self.preprocessor = preprocessor - super(FaqQuestionAnsweringPipeline, self).__init__( - model=model, preprocessor=preprocessor, **kwargs) + if preprocessor is None: + from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor + preprocessor = FaqQuestionAnsweringPreprocessor( + model.model_dir, **kwargs) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, pipeline_parameters, pipeline_parameters @@ -46,8 +43,7 @@ class FaqQuestionAnsweringPipeline(Pipeline): def forward(self, inputs: [list, Dict[str, Any]], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return 
self.model(inputs) + return self.model(inputs) def postprocess(self, inputs: [list, Dict[str, Any]], **postprocess_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 3d515e2d..0f3446e6 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -1,145 +1,103 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NLPPreprocessor, Preprocessor -from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Tasks __all__ = ['FillMaskPipeline'] -_type_map = { - 'veco': 'roberta', - 'sbert': 'bert', -} @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask) +@PIPELINES.register_module( + Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet) class FillMaskPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence='sentence', + first_sequence: str = 'sentence', **kwargs): - """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction + """The inference pipeline for all the fill mask sub-tasks. Args: - model (str or Model): Supply either a local model dir which supported mlm task, or a - mlm model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + model (`str` or `Model` or module instance): A model instance or a model local dir + or a model id in the model hub. + preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. + first_sequence (`str`, `optional`): The key to read the sentence in. + sequence_length (`int`, `optional`): Max sequence length in the user's custom scenario, default 128. + + NOTE1: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' param will have no effect. - Example: + Example1: >>> from modelscope.pipelines import pipeline >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_structbert_fill-mask_english-large') >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' >>> print(pipeline_ins(input)) + Example2: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_ponet_fill-mask_english-base') + >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' + >>> print(pipeline_ins(input)) NOTE2: Please pay attention to the model's special tokens. If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is ''. To view other examples plese check the tests/pipelines/test_fill_mask.py. 
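# A condensed sketch of the construction pattern the refactored pipelines above now share,
# taken from the FaqQuestionAnsweringPipeline hunk: resolve string ids with Model.from_pretrained,
# try the generic Preprocessor.from_pretrained, and only fall back to the task-specific
# preprocessor class when no preprocessor config is found. The helper name is hypothetical.
from modelscope.models import Model
from modelscope.preprocessors import Preprocessor

def build_model_and_preprocessor(model, **kwargs):
    model = Model.from_pretrained(model) if isinstance(model, str) else model
    preprocessor = Preprocessor.from_pretrained(model.model_dir, **kwargs)
    if preprocessor is None:
        from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
        preprocessor = FaqQuestionAnsweringPreprocessor(model.model_dir, **kwargs)
    return model, preprocessor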
""" - fill_mask_model = model if isinstance( - model, Model) else Model.from_pretrained(model) + + fill_mask_model = Model.from_pretrained(model) if isinstance( + model, str) else model if preprocessor is None: - preprocessor = NLPPreprocessor( + preprocessor = Preprocessor.from_pretrained( fill_mask_model.model_dir, first_sequence=first_sequence, second_sequence=None, sequence_length=kwargs.pop('sequence_length', 128)) fill_mask_model.eval() + assert hasattr( + preprocessor, 'mask_id' + ), 'The input preprocessor should have the mask_id attribute.' super().__init__( model=fill_mask_model, preprocessor=preprocessor, **kwargs) - self.preprocessor = preprocessor - self.config = Config.from_file( - os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) - self.tokenizer = preprocessor.tokenizer - self.mask_id = {'roberta': 250001, 'bert': 103, 'deberta_v2': 4} - - self.rep_map = { - 'bert': { - '[unused0]': '', - '[PAD]': '', - '[unused1]': '', - r' +': ' ', - '[SEP]': '', - '[unused2]': '', - '[CLS]': '', - '[UNK]': '' - }, - 'roberta': { - r' +': ' ', - '': '', - '': '', - '': '', - '': '', - '': ' ' - }, - 'deberta_v2': { - '[PAD]': '', - r' +': ' ', - '[SEP]': '', - '[CLS]': '', - '[UNK]': '' - }, - } - def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: """process the prediction results Args: - inputs (Dict[str, Any]): _description_ - + inputs (Dict[str, Any]): The model outputs. + The output should follow some rules: + 1. Values can be retrieved by keys(dict-like, or the __getitem__ method is overriden) + 2. 'logits' and 'input_ids' key exists. + Models in modelscope will return the output dataclass `modelscope.outputs.FillMaskModelOutput`. Returns: Dict[str, str]: the prediction results """ - import numpy as np logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) - if hasattr(self.model.config, 'backbone'): - model_type = self.model.config.backbone.type - else: - model_type = self.model.config.model_type - process_type = model_type if model_type in self.mask_id else _type_map[ - model_type] - rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, + rst_ids = np.where(input_ids == self.preprocessor.mask_id, pred_ids, input_ids) - def rep_tokens(string, rep_map): - for k, v in rep_map.items(): - string = string.replace(k, v) - return string.strip() - pred_strings = [] for ids in rst_ids: # batch - if 'language' in self.config.model and self.config.model.language == 'zh': - pred_string = self.tokenizer.convert_ids_to_tokens(ids) - pred_string = ''.join(pred_string) - else: - pred_string = self.tokenizer.decode(ids) - pred_string = rep_tokens(pred_string, self.rep_map[process_type]) + pred_string = self.preprocessor.decode( + ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=True) pred_strings.append(pred_string) return {OutputKeys.TEXT: pred_strings} diff --git a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py deleted file mode 100644 index 9770fc38..00000000 --- a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -import os -from typing import Any, Dict, Optional, Union - -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPoNetPreprocessor, Preprocessor -from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile, Tasks - -__all__ = ['FillMaskPonetPipeline'] -_type_map = {'ponet': 'bert'} - - -@PIPELINES.register_module( - Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet) -class FillMaskPonetPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - first_sequence='sentence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction - - Args: - model (str or Model): Supply either a local model dir which supported fill-mask task, - or a fill-mask model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the sentence in. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline( - 'fill-mask', model='damo/nlp_ponet_fill-mask_english-base') - >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' - >>> print(pipeline_ins(input)) - - NOTE2: Please pay attention to the model's special tokens. - If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. - If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is ''. - To view other examples plese check the tests/pipelines/test_fill_mask.py. 
- """ - fill_mask_model = model if isinstance( - model, Model) else Model.from_pretrained(model) - - self.config = Config.from_file( - os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) - - if preprocessor is None: - preprocessor = FillMaskPoNetPreprocessor( - fill_mask_model.model_dir, - first_sequence=first_sequence, - second_sequence=None, - sequence_length=kwargs.pop('sequence_length', 512)) - - fill_mask_model.eval() - super().__init__( - model=fill_mask_model, preprocessor=preprocessor, **kwargs) - - self.preprocessor = preprocessor - - self.tokenizer = preprocessor.tokenizer - self.mask_id = {'roberta': 250001, 'bert': 103} - - self.rep_map = { - 'bert': { - '[unused0]': '', - '[PAD]': '', - '[unused1]': '', - r' +': ' ', - '[SEP]': '', - '[unused2]': '', - '[CLS]': '', - '[UNK]': '' - }, - 'roberta': { - r' +': ' ', - '': '', - '': '', - '': '', - '': '', - '': ' ' - } - } - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - import numpy as np - logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() - input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() - pred_ids = np.argmax(logits, axis=-1) - model_type = self.model.config.model_type - process_type = model_type if model_type in self.mask_id else _type_map[ - model_type] - rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, - input_ids) - - def rep_tokens(string, rep_map): - for k, v in rep_map.items(): - string = string.replace(k, v) - return string.strip() - - pred_strings = [] - for ids in rst_ids: # batch - if 'language' in self.config.model and self.config.model.language == 'zh': - pred_string = self.tokenizer.convert_ids_to_tokens(ids) - pred_string = ''.join(pred_string) - else: - pred_string = self.tokenizer.decode(ids) - pred_string = rep_tokens(pred_string, self.rep_map[process_type]) - pred_strings.append(pred_string) - - return {OutputKeys.TEXT: pred_strings} diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 7275feca..8d8c4542 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -12,6 +12,8 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['NamedEntityRecognitionPipeline'] @@ -59,37 +61,68 @@ class NamedEntityRecognitionPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: + text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): - return super().forward(inputs, **forward_params) + return { + **self.model(**inputs, **forward_params), OutputKeys.TEXT: text + } def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): should be tensors from model + + Returns: + Dict[str, str]: the prediction results + """ text = inputs['text'] + if OutputKeys.PREDICTIONS not in inputs: + logits = 
inputs[OutputKeys.LOGITS] + predictions = torch.argmax(logits[0], dim=-1) + else: + predictions = inputs[OutputKeys.PREDICTIONS].squeeze( + 0).cpu().numpy() + predictions = torch_nested_numpify(torch_nested_detach(predictions)) offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] - labels = [self.id2label[x] for x in inputs['predicts']] - entities = [] - entity = {} + + labels = [self.id2label[x] for x in predictions] + chunks = [] + chunk = {} for label, offsets in zip(labels, offset_mapping): if label[0] in 'BS': - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - entity = { + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = { 'type': label[2:], 'start': offsets[0], 'end': offsets[1] } if label[0] in 'IES': - if entity: - entity['end'] = offsets[1] + if chunk: + chunk['end'] = offsets[1] + if label[0] in 'ES': - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - entity = {} - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - outputs = {OutputKeys.OUTPUT: entities} + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = {} + + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + + # for cws output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + # for ner outpus + else: + outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py index 16dedb2e..cfa5c2f1 100644 --- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -2,15 +2,14 @@ from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - SentenceEmbeddingPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['SentenceEmbeddingPipeline'] @@ -33,20 +32,18 @@ class SentenceEmbeddingPipeline(Pipeline): the model if supplied. sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. 
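# A tiny worked example of the BIOES chunk decoding added to NamedEntityRecognitionPipeline.postprocess
# above, with toy labels and character offsets (no model involved): B-/E- labels are merged into one
# span, and an S- label forms a span on its own.
text = 'Alan Turing was born in London'
labels = ['B-PER', 'E-PER', 'O', 'O', 'O', 'S-LOC']
offset_mapping = [(0, 4), (5, 11), (12, 15), (16, 20), (21, 23), (24, 30)]

chunks, chunk = [], {}
for label, (start, end) in zip(labels, offset_mapping):
    if label[0] in 'BS':
        if chunk:
            chunk['span'] = text[chunk['start']:chunk['end']]
            chunks.append(chunk)
        chunk = {'type': label[2:], 'start': start, 'end': end}
    if label[0] in 'IES' and chunk:
        chunk['end'] = end
    if label[0] in 'ES' and chunk:
        chunk['span'] = text[chunk['start']:chunk['end']]
        chunks.append(chunk)
        chunk = {}
if chunk:
    chunk['span'] = text[chunk['start']:chunk['end']]
    chunks.append(chunk)
print(chunks)   # [{'type': 'PER', ..., 'span': 'Alan Turing'}, {'type': 'LOC', ..., 'span': 'London'}]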
""" - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = SentenceEmbeddingPreprocessor( + preprocessor = Preprocessor.from_pretrained( model.model_dir if isinstance(model, Model) else model, first_sequence=first_sequence, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return {**self.model(inputs, **forward_params)} + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """process the prediction results @@ -57,6 +54,11 @@ class SentenceEmbeddingPipeline(Pipeline): Returns: Dict[str, Any]: the predicted text representation """ - embs = inputs[OutputKeys.TEXT_EMBEDDING] - scores = inputs[OutputKeys.SCORES] + embs = inputs['last_hidden_state'][:, 0].cpu().numpy() + num_sent = embs.shape[0] + if num_sent >= 2: + scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ], + (1, 0))).tolist()[0] + else: + scores = [] return {OutputKeys.TEXT_EMBEDDING: embs, OutputKeys.SCORES: scores} diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py deleted file mode 100644 index 69f6217a..00000000 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models.base import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - SequenceClassificationPreprocessor) -from modelscope.utils.constant import Tasks - - -@PIPELINES.register_module( - Tasks.text_classification, module_name=Pipelines.sentiment_analysis) -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -@PIPELINES.register_module( - Tasks.text_classification, module_name=Pipelines.sentiment_classification) -class SequenceClassificationPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - **kwargs): - """This is the base class for all the sequence classification sub-tasks. - - Args: - model (str or Model): A model instance or a model local dir or a model id in the model hub. - preprocessor (Preprocessor): a preprocessor instance, must not be None. 
- """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - first_sequence = kwargs.pop('first_sequence', 'first_sequence') - second_sequence = kwargs.pop('second_sequence', None) - - if preprocessor is None: - preprocessor = SequenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - second_sequence=second_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - - assert preprocessor is not None - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - topk (int): The topk probs to take - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs[OutputKeys.PROBABILITIES][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index fc0d07b1..826e35a9 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -13,9 +13,9 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import TableQuestionAnsweringPreprocessor -from modelscope.preprocessors.space_T_cn.fields.database import Database -from modelscope.preprocessors.space_T_cn.fields.struct import (Constant, - SQLQuery) +from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database +from modelscope.preprocessors.nlp.space_T_cn.fields.struct import (Constant, + SQLQuery) from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index 13d9964d..9e00ad7f 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -1,43 +1,124 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict, Union +import numpy as np + from modelscope.metainfo import Pipelines +from modelscope.models.base import Model from modelscope.models.multi_modal import OfaForAllTasks -from modelscope.pipelines.base import Model, Pipeline +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import OfaPreprocessor, Preprocessor from modelscope.utils.constant import Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentiment_analysis) +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +@PIPELINES.register_module( + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) @PIPELINES.register_module( Tasks.text_classification, module_name=Pipelines.text_classification) +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentiment_classification) +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentence_similarity) +@PIPELINES.register_module( + Tasks.sentiment_classification, + module_name=Pipelines.sentiment_classification) class TextClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: [Preprocessor] = None, + preprocessor: Preprocessor = None, **kwargs): + """The inference pipeline for all the text classification sub-tasks. + + Args: + model (`str` or `Model` or module instance): A model instance or a model local dir + or a model id in the model hub. + preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. + first_sequence (`str`, `optional`): The key of the first sentence. + second_sequence (`str`, `optional`): The key of the second sentence. + sequence_length (`int`, `optional`): The sequence length. + id2label (`dict`, `optional`): The id-label mapping. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', + model='damo/nlp_structbert_sentence-similarity_chinese-base') + >>> input = ('这是个测试', '这也是个测试') + >>> print(pipeline_ins(input)) + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' and 'second_sequence' + param will have no affection. 
""" - use `model` and `preprocessor` to create a kws pipeline for prediction + model = Model.from_pretrained(model) if isinstance(model, + str) else model + + if preprocessor is None: + if isinstance(model, OfaForAllTasks): + preprocessor = OfaPreprocessor(model_dir=model.model_dir) + else: + first_sequence = kwargs.pop('first_sequence', 'first_sequence') + second_sequence = kwargs.pop('second_sequence', None) + preprocessor = Preprocessor.from_pretrained( + model if isinstance(model, str) else model.model_dir, + first_sequence=first_sequence, + second_sequence=second_sequence, + sequence_length=kwargs.pop('sequence_length', 512)) + + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + if isinstance(self.model, OfaForAllTasks): + return super().forward(inputs, **forward_params) + return self.model(**inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = 5) -> Dict[str, str]: + """process the prediction results + Args: - model: model id on modelscope hub. + inputs (`Dict[str, Any]` or `TextClassificationModelOutput`): The model output, please check + the `TextClassificationModelOutput` class for details. + topk (int): The topk probs to take + Returns: + Dict[str, str]: the prediction results. + scores: The probabilities of each label. + labels: The real labels. + Label at index 0 is the smallest probability. """ - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model + if isinstance(self.model, OfaForAllTasks): + return inputs else: - raise NotImplementedError - pipe_model.model.eval() - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' 
+ logits = inputs[OutputKeys.LOGITS].cpu().numpy() + if logits.shape[0] == 1: + logits = logits[0] + + def softmax(logits): + exp = np.exp(logits - np.max(logits, axis=-1, keepdims=True)) + return exp / exp.sum(axis=-1, keepdims=True) + + probs = softmax(logits) + num_classes = probs.shape[-1] + topk = min(topk, num_classes) + top_indices = np.argpartition(probs, -topk)[-topk:] + probs = np.take_along_axis(probs, top_indices, axis=-1).tolist() + + def map_to_label(id): + return self.id2label[id] + + v_func = np.vectorize(map_to_label) + return { + OutputKeys.SCORES: probs, + OutputKeys.LABELS: v_func(top_indices).tolist() + } diff --git a/modelscope/pipelines/nlp/text_ranking_pipeline.py b/modelscope/pipelines/nlp/text_ranking_pipeline.py index 4aa57238..9cee327b 100644 --- a/modelscope/pipelines/nlp/text_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/text_ranking_pipeline.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model @@ -32,20 +32,18 @@ class TextRankingPipeline(Pipeline): the model if supplied. sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. """ - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = TextRankingPreprocessor( - model.model_dir if isinstance(model, Model) else model, + preprocessor = Preprocessor.from_pretrained( + model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return {**self.model(inputs, **forward_params)} + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """process the prediction results @@ -55,6 +53,10 @@ class TextRankingPipeline(Pipeline): Returns: Dict[str, Any]: the predicted text representation """ - pred_list = inputs[OutputKeys.SCORES] + def sigmoid(logits): + return np.exp(logits) / (1 + np.exp(logits)) + + logits = inputs[OutputKeys.LOGITS].squeeze(-1).detach().cpu().numpy() + pred_list = sigmoid(logits).tolist() return {OutputKeys.SCORES: pred_list} diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 055a4b8a..c36f0dfc 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -7,17 +7,22 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - TokenClassificationPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['TokenClassificationPipeline'] @PIPELINES.register_module( Tasks.token_classification, module_name=Pipelines.part_of_speech) +@PIPELINES.register_module( + Tasks.token_classification, module_name=Pipelines.word_segmentation) +@PIPELINES.register_module( + 
Tasks.token_classification, module_name=Pipelines.named_entity_recognition) @PIPELINES.register_module( Tasks.part_of_speech, module_name=Pipelines.part_of_speech) class TokenClassificationPipeline(Pipeline): @@ -32,24 +37,18 @@ class TokenClassificationPipeline(Pipeline): model (str or Model): A model instance or a model local dir or a model id in the model hub. preprocessor (Preprocessor): a preprocessor instance, must not be None. """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + model = Model.from_pretrained(model) if isinstance(model, + str) else model + if preprocessor is None: - preprocessor = TokenClassificationPreprocessor( + preprocessor = Model.from_pretrained( model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if hasattr(model, 'id2label'): - self.id2label = getattr(model, 'id2label') - else: - model_config = getattr(model, 'config') - self.id2label = getattr(model_config, 'id2label') - - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: @@ -64,38 +63,59 @@ class TokenClassificationPipeline(Pipeline): """process the prediction results Args: - inputs (Dict[str, Any]): _description_ + inputs (Dict[str, Any]): should be tensors from model Returns: Dict[str, str]: the prediction results """ + text = inputs['text'] + if not hasattr(inputs, 'predictions'): + logits = inputs[OutputKeys.LOGITS] + predictions = torch.argmax(logits[0], dim=-1) + else: + predictions = inputs[OutputKeys.PREDICTIONS].squeeze( + 0).cpu().numpy() + predictions = torch_nested_numpify(torch_nested_detach(predictions)) + offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] - pred_list = inputs['predictions'] - labels = [] - for pre in pred_list: - labels.append(self.id2label[pre]) - labels = labels[1:-1] + labels = [self.id2label[x] for x in predictions] + if len(labels) > len(offset_mapping): + labels = labels[1:-1] chunks = [] - tags = [] - chunk = '' - assert len(inputs['text']) == len(labels) - for token, label in zip(inputs['text'], labels): - if label[0] == 'B' or label[0] == 'I': - chunk += token - else: - chunk += token - chunks.append(chunk) - chunk = '' - tags.append(label.split('-')[-1]) + chunk = {} + for label, offsets in zip(labels, offset_mapping): + if label[0] in 'BS': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = { + 'type': label[2:], + 'start': offsets[0], + 'end': offsets[1] + } + if label[0] in 'IES': + if chunk: + chunk['end'] = offsets[1] + + if label[0] in 'ES': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = {} + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - tags.append(label.split('-')[-1]) - pos_result = [] - seg_result = ' '.join(chunks) - for chunk, tag in zip(chunks, tags): - pos_result.append({OutputKeys.WORD: chunk, OutputKeys.LABEL: tag}) - outputs = { - OutputKeys.OUTPUT: seg_result, - OutputKeys.LABELS: pos_result - } + + # for cws 
output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for ner outputs + else: + outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index eb7f7f74..68a03631 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -34,7 +34,8 @@ class TranslationPipeline(Pipeline): def __init__(self, model: Model, **kwargs): """Build a translation pipeline with a model dir or a model id in the model hub. - @param model: A Model instance. + Args: + model: A Model instance. """ super().__init__(model=model, **kwargs) model = self.model.model_dir diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 9d4bb67f..0df8f1ad 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -12,6 +12,8 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['WordSegmentationPipeline'] @@ -72,28 +74,56 @@ class WordSegmentationPipeline(Pipeline): """process the prediction results Args: - inputs (Dict[str, Any]): _description_ + inputs (Dict[str, Any]): should be tensors from model Returns: Dict[str, str]: the prediction results """ - - pred_list = inputs['predictions'] - labels = [] - for pre in pred_list: - labels.append(self.id2label[pre]) - labels = labels[1:-1] + text = inputs['text'] + logits = inputs[OutputKeys.LOGITS] + predictions = torch.argmax(logits[0], dim=-1) + logits = torch_nested_numpify(torch_nested_detach(logits)) + predictions = torch_nested_numpify(torch_nested_detach(predictions)) + offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] + + labels = [self.id2label[x] for x in predictions] + if len(labels) > len(offset_mapping): + labels = labels[1:-1] chunks = [] - chunk = '' - assert len(inputs['text']) == len(labels) - for token, label in zip(inputs['text'], labels): - if label[0] == 'B' or label[0] == 'I': - chunk += token - else: - chunk += token - chunks.append(chunk) - chunk = '' + chunk = {} + for label, offsets in zip(labels, offset_mapping): + if label[0] in 'BS': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = { + 'type': label[2:], + 'start': offsets[0], + 'end': offsets[1] + } + if label[0] in 'IES': + if chunk: + chunk['end'] = offsets[1] + + if label[0] in 'ES': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = {} + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - seg_result = ' '.join(chunks) - return {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for cws output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for ner outpus + else: + outputs = {OutputKeys.OUTPUT: chunks} + return outputs diff --git 
a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index fc7051c7..88792b45 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -86,8 +86,7 @@ class ZeroShotClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -99,7 +98,7 @@ class ZeroShotClassificationPipeline(Pipeline): Returns: Dict[str, Any]: the prediction results """ - logits = inputs[OutputKeys.LOGITS] + logits = inputs[OutputKeys.LOGITS].cpu().numpy() if multi_label or len(candidate_labels) == 1: logits = logits[..., [self.contradiction_id, self.entailment_id]] scores = softmax(logits, axis=-1)[..., 1] diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 423b3f46..76c6d877 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -16,31 +16,20 @@ if TYPE_CHECKING: from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import ( - DocumentSegmentationPreprocessor, - FaqQuestionAnsweringPreprocessor, - FillMaskPoNetPreprocessor, - NLPPreprocessor, - NLPTokenizerPreprocessorBase, - TextRankingPreprocessor, - RelationExtractionPreprocessor, - SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, - TokenClassificationPreprocessor, - TextErrorCorrectionPreprocessor, - TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, - Tokenize, + DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, NLPPreprocessor, + NLPTokenizerPreprocessorBase, TextRankingPreprocessor, + RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, + SequenceClassificationPreprocessor, TokenClassificationPreprocessor, + TextErrorCorrectionPreprocessor, TextGenerationPreprocessor, + Text2TextGenerationPreprocessor, Tokenize, WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, - TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, - ) - from .space import (DialogIntentPredictionPreprocessor, - DialogModelingPreprocessor, - DialogStateTrackingPreprocessor) + ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor, + SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, + DialogModelingPreprocessor, DialogStateTrackingPreprocessor, + ConversationalTextToSqlPreprocessor, + TableQuestionAnsweringPreprocessor) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor - from .star import ConversationalTextToSqlPreprocessor - from .space_T_cn import TableQuestionAnsweringPreprocessor else: _import_structure = { @@ -58,30 +47,22 @@ else: 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'FillMaskPoNetPreprocessor', - 'NLPPreprocessor', - 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', + 'FaqQuestionAnsweringPreprocessor', 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', + 'TextRankingPreprocessor', 'RelationExtractionPreprocessor', 'SentenceEmbeddingPreprocessor', 'SequenceClassificationPreprocessor', 'TokenClassificationPreprocessor', - 
'TextErrorCorrectionPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'Text2TextGenerationPreprocessor', + 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', - 'TextGenerationJiebaPreprocessor', - 'SentencePiecePreprocessor', - ], - 'space': [ + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', - 'DialogStateTrackingPreprocessor', 'InputFeatures' + 'DialogStateTrackingPreprocessor', + 'ConversationalTextToSqlPreprocessor', + 'TableQuestionAnsweringPreprocessor' ], - 'star': ['ConversationalTextToSqlPreprocessor'], - 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], } import sys diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index 6360a907..c2716a13 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -1,15 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os from abc import ABC, abstractmethod -from typing import Any, Dict +from copy import deepcopy +from typing import Any, Dict, Optional, Sequence -from modelscope.utils.constant import ModeKeys +from modelscope.utils.config import Config +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModeKeys, Tasks +from modelscope.utils.hub import read_config, snapshot_download +from modelscope.utils.logger import get_logger +from .builder import build_preprocessor + +logger = get_logger(__name__) class Preprocessor(ABC): - def __init__(self, *args, **kwargs): - self._mode = ModeKeys.INFERENCE + def __init__(self, mode=ModeKeys.INFERENCE, *args, **kwargs): + self._mode = mode self.device = int( os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None pass @@ -25,3 +32,61 @@ class Preprocessor(ABC): @mode.setter def mode(self, value): self._mode = value + + @classmethod + def from_pretrained(cls, + model_name_or_path: str, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cfg_dict: Config = None, + preprocessor_mode=ModeKeys.INFERENCE, + **kwargs): + """ Instantiate a model from local directory or remote model repo. Note + that when loading from remote, the model revision can be specified. 
+ """ + if not os.path.exists(model_name_or_path): + model_dir = snapshot_download( + model_name_or_path, revision=revision) + else: + model_dir = model_name_or_path + if cfg_dict is None: + cfg = read_config(model_dir) + else: + cfg = cfg_dict + task = cfg.task + if 'task' in kwargs: + task = kwargs.pop('task') + field_name = Tasks.find_field_by_task(task) + if not hasattr(cfg, 'preprocessor'): + logger.error('No preprocessor field found in cfg.') + return None + + sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val' + + if 'type' not in cfg.preprocessor: + if sub_key in cfg.preprocessor: + sub_cfg = getattr(cfg.preprocessor, sub_key) + else: + logger.error( + f'No {sub_key} key and type key found in ' + f'preprocessor domain of configuration.json file.') + return None + else: + sub_cfg = cfg.preprocessor + + if len(sub_cfg): + if isinstance(sub_cfg, Sequence): + # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, + # and add mode for Compose or other plans + raise NotImplementedError('Not supported yet!') + sub_cfg = deepcopy(sub_cfg) + sub_cfg.update({'model_dir': model_dir}) + sub_cfg.update(kwargs) + preprocessor = build_preprocessor(sub_cfg, field_name) + else: + logger.error( + f'Cannot find available config to build preprocessor at mode {preprocessor_mode}, ' + f'please check the preprocessor field in the configuration.json file.' + ) + return None + preprocessor.mode = preprocessor_mode + return preprocessor diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index b95048ba..ea7b6bf4 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -5,50 +5,68 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor - from .nlp_base import ( - DocumentSegmentationPreprocessor, - FaqQuestionAnsweringPreprocessor, - FillMaskPoNetPreprocessor, - NLPPreprocessor, - NLPTokenizerPreprocessorBase, - TextRankingPreprocessor, - RelationExtractionPreprocessor, - SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, - TokenClassificationPreprocessor, - TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, - Tokenize, - WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, - TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, - ) - + from .nlp_base import (NLPTokenizerPreprocessorBase, NLPBasePreprocessor) + from .text_generation_jieba_preprocessor import TextGenerationJiebaPreprocessor + from .sentence_piece_preprocessor import SentencePiecePreprocessor + from .bert_seq_cls_tokenizer import Tokenize + from .document_segmentation_preprocessor import DocumentSegmentationPreprocessor + from .faq_question_answering_preprocessor import FaqQuestionAnsweringPreprocessor + from .fill_mask_preprocessor import FillMaskPoNetPreprocessor, NLPPreprocessor + from .text_ranking_preprocessor import TextRankingPreprocessor + from .relation_extraction_preprocessor import RelationExtractionPreprocessor + from .sentence_classification_preprocessor import SequenceClassificationPreprocessor + from .sentence_embedding_preprocessor import SentenceEmbeddingPreprocessor + from .text_generation_preprocessor import TextGenerationPreprocessor + from .text2text_generation_preprocessor import Text2TextGenerationPreprocessor + from .token_classification_preprocessor import TokenClassificationPreprocessor, \ + WordSegmentationBlankSetToLabelPreprocessor + from 
.zero_shot_classification_reprocessor import ZeroShotClassificationPreprocessor + from .space import (DialogIntentPredictionPreprocessor, + DialogModelingPreprocessor, + DialogStateTrackingPreprocessor, InputFeatures, + MultiWOZBPETextField, IntentBPETextField) + from .space_T_en import ConversationalTextToSqlPreprocessor + from .space_T_cn import TableQuestionAnsweringPreprocessor else: _import_structure = { 'nlp_base': [ - 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'FillMaskPoNetPreprocessor', - 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'SequenceClassificationPreprocessor', + 'NLPBasePreprocessor', + ], + 'text_generation_jieba_preprocessor': + ['TextGenerationJiebaPreprocessor'], + 'sentence_piece_preprocessor': ['SentencePiecePreprocessor'], + 'bert_seq_cls_tokenizer': ['Tokenize'], + 'document_segmentation_preprocessor': + ['DocumentSegmentationPreprocessor'], + 'faq_question_answering_preprocessor': + ['FaqQuestionAnsweringPreprocessor'], + 'fill_mask_preprocessor': + ['FillMaskPoNetPreprocessor', 'NLPPreprocessor'], + 'text_ranking_preprocessor': ['TextRankingPreprocessor'], + 'relation_extraction_preprocessor': ['RelationExtractionPreprocessor'], + 'sentence_classification_preprocessor': + ['SequenceClassificationPreprocessor'], + 'sentence_embedding_preprocessor': ['SentenceEmbeddingPreprocessor'], + 'text_generation_preprocessor': ['TextGenerationPreprocessor'], + 'text2text_generation_preprocessor': + ['Text2TextGenerationPreprocessor'], + 'token_classification_preprocessor': [ 'TokenClassificationPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'Text2TextGenerationPreprocessor', - 'WordSegmentationBlankSetToLabelPreprocessor', - 'ZeroShotClassificationPreprocessor', - 'TextGenerationJiebaPreprocessor', - 'SentencePiecePreprocessor', + 'WordSegmentationBlankSetToLabelPreprocessor' ], + 'zero_shot_classification_reprocessor': + ['ZeroShotClassificationPreprocessor'], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], + 'space': [ + 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', + 'DialogStateTrackingPreprocessor', 'InputFeatures', + 'MultiWOZBPETextField', 'IntentBPETextField' + ], + 'space_T_en': ['ConversationalTextToSqlPreprocessor'], + 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], } import sys diff --git a/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py b/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py new file mode 100644 index 00000000..576687ce --- /dev/null +++ b/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
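A usage sketch for the Preprocessor.from_pretrained classmethod introduced in preprocessors/base.py above; the configuration field names mirror that code, while the concrete preprocessor type and values in the comment are hypothetical:

# Expected shape of configuration.json (hypothetical values):
#   "task": "text-classification",
#   "preprocessor": {"type": "sen-cls-tokenizer",
#                    "first_sequence": "sentence",
#                    "sequence_length": 128}
# or, with mode-specific settings, "preprocessor": {"train": {...}, "val": {...}}.
from modelscope.preprocessors import Preprocessor

preprocessor = Preprocessor.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base',  # model id or local dir
    sequence_length=256)  # extra kwargs override the config before build_preprocessor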
+ +from typing import Any, Dict, Union + +from transformers import AutoTokenizer + +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, InputFields + + +@PREPROCESSORS.register_module(Fields.nlp) +class Tokenize(Preprocessor): + + def __init__(self, tokenizer_name) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + + def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: + if isinstance(data, str): + data = {InputFields.text: data} + token_dict = self.tokenizer(data[InputFields.text]) + data.update(token_dict) + return data diff --git a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py new file mode 100644 index 00000000..5ab0a0c6 --- /dev/null +++ b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py @@ -0,0 +1,220 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger +from .nlp_base import NLPBasePreprocessor + +logger = get_logger() + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.document_segmentation) +class DocumentSegmentationPreprocessor(NLPBasePreprocessor): + + def __init__(self, model_dir: str, config, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(model_dir, *args, **kwargs) + from transformers import BertTokenizerFast + self.tokenizer = BertTokenizerFast.from_pretrained( + model_dir, + use_fast=True, + ) + self.question_column_name = 'labels' + self.context_column_name = 'sentences' + self.example_id_column_name = 'example_id' + self.label_to_id = {'B-EOP': 0, 'O': 1} + self.target_specical_ids = set() + self.target_specical_ids.add(self.tokenizer.eos_token_id) + self.max_seq_length = config.max_position_embeddings + self.label_list = ['B-EOP', 'O'] + + def __call__(self, examples) -> Dict[str, Any]: + questions = examples[self.question_column_name] + contexts = examples[self.context_column_name] + example_ids = examples[self.example_id_column_name] + num_examples = len(questions) + + sentences = [] + for sentence_list in contexts: + sentence_list = [_ + '[EOS]' for _ in sentence_list] + sentences.append(sentence_list) + + try: + tokenized_examples = self.tokenizer( + sentences, + is_split_into_words=True, + add_special_tokens=False, + return_token_type_ids=True, + return_attention_mask=True, + ) + except Exception as e: + logger.error(e) + return {} + + segment_ids = [] + token_seq_labels = [] + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_labels = questions[example_index] + example_labels = [ + self.label_to_id[_] if _ in self.label_to_id else -100 + for _ in example_labels + ] + example_token_labels = [] + segment_id = [] + cur_seg_id = 1 + for token_index in range(len(example_input_ids)): + if example_input_ids[token_index] in self.target_specical_ids: + example_token_labels.append(example_labels[cur_seg_id - 1]) + segment_id.append(cur_seg_id) + cur_seg_id += 1 + else: + example_token_labels.append(-100) + segment_id.append(cur_seg_id) + + segment_ids.append(segment_id) + token_seq_labels.append(example_token_labels) + + 
tokenized_examples['segment_ids'] = segment_ids + tokenized_examples['token_seq_labels'] = token_seq_labels + + new_segment_ids = [] + new_token_seq_labels = [] + new_input_ids = [] + new_token_type_ids = [] + new_attention_mask = [] + new_example_ids = [] + new_sentences = [] + + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_token_type_ids = tokenized_examples['token_type_ids'][ + example_index] + example_attention_mask = tokenized_examples['attention_mask'][ + example_index] + example_segment_ids = tokenized_examples['segment_ids'][ + example_index] + example_token_seq_labels = tokenized_examples['token_seq_labels'][ + example_index] + example_sentences = contexts[example_index] + example_id = example_ids[example_index] + example_total_num_sentences = len(questions[example_index]) + example_total_num_tokens = len( + tokenized_examples['input_ids'][example_index]) + accumulate_length = [ + i for i, x in enumerate(tokenized_examples['input_ids'] + [example_index]) + if x == self.tokenizer.eos_token_id + ] + samples_boundary = [] + left_index = 0 + sent_left_index = 0 + sent_i = 0 + + # for sent_i, length in enumerate(accumulate_length): + while sent_i < len(accumulate_length): + length = accumulate_length[sent_i] + right_index = length + 1 + sent_right_index = sent_i + 1 + if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: + samples_boundary.append([left_index, right_index]) + + sample_input_ids = [ + self.tokenizer.cls_token_id + ] + example_input_ids[left_index:right_index] + sample_input_ids = sample_input_ids[:self.max_seq_length] + + sample_token_type_ids = [ + 0 + ] + example_token_type_ids[left_index:right_index] + sample_token_type_ids = sample_token_type_ids[:self. + max_seq_length] + + sample_attention_mask = [ + 1 + ] + example_attention_mask[left_index:right_index] + sample_attention_mask = sample_attention_mask[:self. + max_seq_length] + + sample_segment_ids = [ + 0 + ] + example_segment_ids[left_index:right_index] + sample_segment_ids = sample_segment_ids[:self. + max_seq_length] + + sample_token_seq_labels = [ + -100 + ] + example_token_seq_labels[left_index:right_index] + sample_token_seq_labels = sample_token_seq_labels[:self. 
+ max_seq_length] + + if sent_right_index - 1 == sent_left_index: + left_index = right_index + sample_input_ids[-1] = self.tokenizer.eos_token_id + sample_token_seq_labels[-1] = -100 + else: + left_index = accumulate_length[sent_i - 1] + 1 + if sample_token_seq_labels[-1] != -100: + sample_token_seq_labels[-1] = -100 + + if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index] + sent_left_index = sent_right_index + sent_i += 1 + else: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index - 1] + sent_left_index = sent_right_index - 1 + + if (len([_ for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences) - 1 and (len([ + _ + for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences): + tmp = [] + for w_i, w, l in zip( + sample_input_ids, + self.tokenizer.decode(sample_input_ids).split( + ' '), sample_token_seq_labels): + tmp.append((w_i, w, l)) + while len(sample_input_ids) < self.max_seq_length: + sample_input_ids.append(self.tokenizer.pad_token_id) + sample_token_type_ids.append(0) + sample_attention_mask.append(0) + sample_segment_ids.append(example_total_num_sentences + + 1) + sample_token_seq_labels.append(-100) + + new_input_ids.append(sample_input_ids) + new_token_type_ids.append(sample_token_type_ids) + new_attention_mask.append(sample_attention_mask) + new_segment_ids.append(sample_segment_ids) + new_token_seq_labels.append(sample_token_seq_labels) + new_example_ids.append(example_id) + new_sentences.append(sample_sentences) + else: + sent_i += 1 + continue + + output_samples = {} + + output_samples['input_ids'] = new_input_ids + output_samples['token_type_ids'] = new_token_type_ids + output_samples['attention_mask'] = new_attention_mask + + output_samples['segment_ids'] = new_segment_ids + output_samples['example_id'] = new_example_ids + output_samples['labels'] = new_token_seq_labels + output_samples['sentences'] = new_sentences + + return output_samples diff --git a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py new file mode 100644 index 00000000..72c8ed99 --- /dev/null +++ b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py @@ -0,0 +1,90 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
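A standalone sketch (toy ids, not the real tokenizer) of the segment-id bookkeeping in DocumentSegmentationPreprocessor above: every sentence is suffixed with the tokenizer's EOS token, and the segment counter advances exactly at those EOS positions while all other tokens get the ignore label -100.

eos_id = 102                       # hypothetical eos_token_id
input_ids = [7, 8, 102, 9, 102]    # 'sent1 [EOS] sent2 [EOS]' after tokenization
labels = [1, 0]                    # per-sentence labels, e.g. 'O' and 'B-EOP'

segment_id, token_labels, cur = [], [], 1
for tok in input_ids:
    if tok == eos_id:
        token_labels.append(labels[cur - 1])   # sentence-level label sits on the EOS token
        segment_id.append(cur)
        cur += 1
    else:
        token_labels.append(-100)              # ignored by the loss
        segment_id.append(cur)

print(segment_id)     # [1, 1, 1, 2, 2]
print(token_labels)   # [-100, -100, 1, -100, 0]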
+ +import os +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) +class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + super(FaqQuestionAnsweringPreprocessor, self).__init__( + model_dir, mode=ModeKeys.INFERENCE, **kwargs) + from transformers import BertTokenizer + self.tokenizer = BertTokenizer.from_pretrained(model_dir) + preprocessor_config = Config.from_file( + os.path.join(model_dir, ModelFile.CONFIGURATION)).get( + ConfigFields.preprocessor, {}) + self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) + self.label_dict = None + + def pad(self, samples, max_len): + result = [] + for sample in samples: + pad_len = max_len - len(sample[:max_len]) + result.append(sample[:max_len] + + [self.tokenizer.pad_token_id] * pad_len) + return result + + def set_label_dict(self, label_dict): + self.label_dict = label_dict + + def get_label(self, label_id): + assert self.label_dict is not None and label_id < len(self.label_dict) + return self.label_dict[label_id] + + def encode_plus(self, text): + return [ + self.tokenizer.cls_token_id + ] + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id] + + @type_assert(object, Dict) + def __call__(self, data: Dict[str, Any], + **preprocessor_param) -> Dict[str, Any]: + TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) + queryset = data['query_set'] + if not isinstance(queryset, list): + queryset = [queryset] + supportset = data['support_set'] + supportset = sorted(supportset, key=lambda d: d['label']) + + queryset_tokenized = [self.encode_plus(text) for text in queryset] + supportset_tokenized = [ + self.encode_plus(item['text']) for item in supportset + ] + + max_len = max( + [len(seq) for seq in queryset_tokenized + supportset_tokenized]) + max_len = min(TMP_MAX_LEN, max_len) + queryset_padded = self.pad(queryset_tokenized, max_len) + supportset_padded = self.pad(supportset_tokenized, max_len) + + supportset_labels_ori = [item['label'] for item in supportset] + label_dict = [] + for label in supportset_labels_ori: + if label not in label_dict: + label_dict.append(label) + self.set_label_dict(label_dict) + supportset_labels_ids = [ + label_dict.index(label) for label in supportset_labels_ori + ] + return { + 'query': queryset_padded, + 'support': supportset_padded, + 'support_labels': supportset_labels_ids + } + + def batch_encode(self, sentence_list: list, max_length=None): + if not max_length: + max_length = self.MAX_LEN + return self.tokenizer.batch_encode_plus( + sentence_list, padding=True, max_length=max_length) diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py new file mode 100644 index 00000000..b0638dbc --- /dev/null +++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py @@ -0,0 +1,142 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
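An input-format sketch for FaqQuestionAnsweringPreprocessor above; the dictionary keys come from its __call__ implementation, while the model directory and texts are hypothetical:

from modelscope.preprocessors.nlp import FaqQuestionAnsweringPreprocessor

preprocessor = FaqQuestionAnsweringPreprocessor('/path/to/faq_model_dir')  # hypothetical dir
features = preprocessor({
    'query_set': ['How do I reset my password?'],
    'support_set': [
        {'text': 'password reset steps', 'label': 'account'},
        {'text': 'refund policy', 'label': 'billing'},
    ],
})
# -> {'query': [...], 'support': [...], 'support_labels': [0, 1]}
# support_set is sorted by label and labels are re-indexed; get_label(0) maps back to 'account'.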
+ +import os.path as osp +import re +from typing import Any, Dict, Tuple, Union + +import numpy as np +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config +from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.nlp import import_external_nltk_data +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.feature_extraction) +class NLPPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in MLM task. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + @property + def mask_id(self): + return self.tokenizer.mask_token_id + + def decode(self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + return self.tokenizer.decode(token_ids, skip_special_tokens, + clean_up_tokenization_spaces, **kwargs) + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.fill_mask_ponet) +class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in PoNet model's MLM task. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 512) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + self.cfg = Config.from_file( + osp.join(model_dir, ModelFile.CONFIGURATION)) + self.language = self.cfg.model.get('language', 'en') + if self.language == 'en': + from nltk.tokenize import sent_tokenize + import_external_nltk_data( + osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') + elif self.language in ['zh', 'cn']: + + def sent_tokenize(para): + para = re.sub(r'([。!!?\?])([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', + para) # noqa * + para = para.rstrip() + return [_ for _ in para.split('\n') if _] + else: + raise NotImplementedError + + self.sent_tokenize = sent_tokenize + self.max_length = kwargs['max_length'] + + def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' 
+ Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = self.parse_text_and_label(data) + output = self.tokenizer( + text_a, + text_b, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + max_seq_length = self.max_length + + if text_b is None: + segment_ids = [] + seg_lens = list( + map( + len, + self.tokenizer( + self.sent_tokenize(text_a), + add_special_tokens=False, + truncation=True)['input_ids'])) + segment_id = [0] + sum( + [[i] * sl for i, sl in enumerate(seg_lens, start=1)], []) + segment_id = segment_id[:max_seq_length - 1] + segment_ids.append(segment_id + [segment_id[-1] + 1] + * (max_seq_length - len(segment_id))) + if self.mode == ModeKeys.INFERENCE: + segment_ids = torch.tensor(segment_ids) + output['segment_ids'] = segment_ids + + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + + self.labels_to_id(labels, output) + return output + + @property + def mask_id(self): + return self.tokenizer.mask_token_id + + def decode(self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + return self.tokenizer.decode(token_ids, skip_special_tokens, + clean_up_tokenization_spaces, **kwargs) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 6075a4b3..48a04d7a 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -1,67 +1,41 @@ # Copyright (c) Alibaba, Inc. and its affiliates. + import os -import os.path as osp -import re -from typing import Any, Dict, Optional, Tuple, Union +from abc import ABC +from collections.abc import Mapping +from typing import Any, Dict, List, Tuple, Union import json import numpy as np -import sentencepiece as spm import torch from transformers import AutoTokenizer -from modelscope.metainfo import Models, Preprocessors +from modelscope.metainfo import Models from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import Config, ConfigFields -from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile +from modelscope.utils.constant import ModeKeys from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.logger import get_logger -from modelscope.utils.nlp import import_external_nltk_data -from modelscope.utils.type_assert import type_assert logger = get_logger() __all__ = [ - 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'NLPPreprocessor', - 'FillMaskPoNetPreprocessor', + 'NLPBasePreprocessor', 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'SequenceClassificationPreprocessor', - 'TokenClassificationPreprocessor', - 'Text2TextGenerationPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'WordSegmentationBlankSetToLabelPreprocessor', - 'ZeroShotClassificationPreprocessor', ] -@PREPROCESSORS.register_module(Fields.nlp) -class Tokenize(Preprocessor): - - def __init__(self, tokenizer_name) -> None: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - - def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: - if isinstance(data, str): - data = {InputFields.text: data} - token_dict = self.tokenizer(data[InputFields.text]) - data.update(token_dict) - return data - - 
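A standalone sketch (toy lengths instead of a real tokenizer) of how FillMaskPoNetPreprocessor above derives segment_ids: the text is sentence-tokenized, each sentence is tokenized separately, and every token inherits the 1-based index of its sentence, with position 0 reserved for [CLS] and the padding tail assigned a fresh id.

seg_lens = [3, 2]        # tokens per sentence, from tokenizer(sent_tokenize(text), ...)
max_seq_length = 8

segment_id = [0] + sum([[i] * sl for i, sl in enumerate(seg_lens, start=1)], [])
segment_id = segment_id[:max_seq_length - 1]
segment_id = segment_id + [segment_id[-1] + 1] * (max_seq_length - len(segment_id))
print(segment_id)        # [0, 1, 1, 1, 2, 2, 3, 3]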
-class NLPTokenizerPreprocessorBase(Preprocessor): - - def __init__(self, model_dir: str, mode: str, **kwargs): - """The NLP tokenizer preprocessor base class. +class NLPBasePreprocessor(Preprocessor, ABC): - Any nlp preprocessor which uses the hf tokenizer can inherit from this class. + def __init__(self, + model_dir: str, + first_sequence=None, + second_sequence=None, + label=None, + label2id=None, + mode=ModeKeys.INFERENCE, + **kwargs): + """The NLP preprocessor base class. Args: model_dir (str): The local model path @@ -71,18 +45,12 @@ class NLPTokenizerPreprocessorBase(Preprocessor): label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode - kwargs: These kwargs will be directly fed into the tokenizer. """ + self.model_dir = model_dir + self.first_sequence = first_sequence + self.second_sequence = second_sequence + self.label = label - super().__init__(**kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self._mode = mode - self.label = kwargs.pop('label', OutputKeys.LABEL) self.use_fast = kwargs.pop('use_fast', None) if self.use_fast is None and os.path.isfile( os.path.join(model_dir, 'tokenizer_config.json')): @@ -92,15 +60,82 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.use_fast = json_config.get('use_fast') self.use_fast = False if self.use_fast is None else self.use_fast - self.label2id = None - if 'label2id' in kwargs: - self.label2id = kwargs.pop('label2id') + self.label2id = label2id if self.label2id is None: self.label2id = parse_label_mapping(self.model_dir) + super().__init__(mode, **kwargs) - self.tokenize_kwargs = kwargs + @property + def mask_id(self): + """Child preprocessor can override this property to return the id of mask token. + Returns: + The id of mask token, default None. + """ + return None + + def decode(self, + token_ids: Union[int, List[int], 'np.ndarray', 'torch.Tensor', + 'tf.Tensor'], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. + """ + raise NotImplementedError() + + +class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): + + def __init__(self, + model_dir: str, + first_sequence: str = None, + second_sequence: str = None, + label: str = 'label', + label2id: dict = None, + mode: str = ModeKeys.INFERENCE, + **kwargs): + """The NLP tokenizer preprocessor base class. + + Any nlp preprocessor which uses the hf tokenizer can inherit from this class. 
+ + Args: + model_dir (str): The local model path + first_sequence: The key for the first sequence + second_sequence: The key for the second sequence + label: The key for the label + label2id: An optional label2id dict. + If label2id is None, the preprocessor will try to parse label-id mapping from: + - configuration.json model.label2id/model.id2label + - config.json label2id/id2label + - label_mapping.json + mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. + kwargs: These kwargs will be directly fed into the tokenizer. + """ + + super().__init__(model_dir, first_sequence, second_sequence, label, + label2id, mode) + self.model_dir = model_dir + self.tokenize_kwargs = kwargs self.tokenizer = self.build_tokenizer(model_dir) + logger.info(f'The key of sentence1: {self.first_sequence}, ' + f'The key of sentence2: {self.second_sequence}, ' + f'The key of label: {self.label}') + if self.first_sequence is None: + logger.warning('[Important] first_sequence attribute is not set, ' + 'this will cause an error if your input is a dict.') @property def id2label(self): @@ -118,8 +153,11 @@ class NLPTokenizerPreprocessorBase(Preprocessor): NOTE: This default implementation only returns slow tokenizer, because the fast tokenizers have a multi-thread problem. - @param model_dir: The local model dir. - @return: The initialized tokenizer. + Args: + model_dir: The local model dir. + + Returns: + The initialized tokenizer. """ self.is_transformer_based_model = 'lstm' not in model_dir # fast version lead to parallel inference failed @@ -180,8 +218,11 @@ class NLPTokenizerPreprocessorBase(Preprocessor): If the pair param is False, data will be parsed as the first_sentence and the label, else it will be parsed as the first_sentence and the second_sentence. - @param data: The input data. - @return: The sentences and labels tuple. + Args: + data: The input data. + + Returns: + The sentences and labels tuple. """ text_a, text_b, labels = None, None, None if isinstance(data, str): @@ -194,7 +235,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): text_a, text_b = data else: text_a, labels = data - elif isinstance(data, dict): + elif isinstance(data, Mapping): text_a = data.get(self.first_sequence) text_b = data.get(self.second_sequence) labels = data.get(self.label) @@ -208,1007 +249,34 @@ class NLPTokenizerPreprocessorBase(Preprocessor): If the original label's type is float, or the label2id mapping does not exist, the original label will be returned. - @param labels: The input labels. - @param output: The label id. - @return: The final labels. + Args: + labels: The input labels. + output: The label id. + + Returns: + The final labels. 
""" def label_can_be_mapped(label): return isinstance(label, str) or isinstance(label, int) - if labels is not None: + try: if isinstance(labels, (tuple, list)) and all([label_can_be_mapped(label) for label in labels]) \ and self.label2id is not None: output[OutputKeys.LABELS] = [ - self.label2id[str(label)] for label in labels + self.label2id[label] + if label in self.label2id else self.label2id[str(label)] + for label in labels ] elif label_can_be_mapped(labels) and self.label2id is not None: - output[OutputKeys.LABELS] = self.label2id[str(labels)] - else: + output[OutputKeys.LABELS] = self.label2id[ + labels] if labels in self.label2id else self.label2id[str( + labels)] + elif labels is not None: output[OutputKeys.LABELS] = labels - - -@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.feature_extraction) -class NLPPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, mode=mode, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_ranking) -class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text-ranking model. - """ - - def __init__(self, - model_dir: str, - mode=ModeKeys.INFERENCE, - *args, - **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - super().__init__(model_dir, pair=True, mode=mode, *args, **kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'source_sentence') - self.second_sequence = kwargs.pop('second_sequence', - 'sentences_to_compare') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) - - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: - if isinstance(data, tuple): - sentence1, sentence2 = data - elif isinstance(data, dict): - sentence1 = data.get(self.first_sequence) - sentence2 = data.get(self.second_sequence) - if isinstance(sentence2, str): - sentence2 = [sentence2] - if isinstance(sentence1, str): - sentence1 = [sentence1] - sentence1 = sentence1 * len(sentence2) - - max_seq_length = self.sequence_length - feature = self.tokenizer( - sentence1, - sentence2, - padding='max_length', - truncation=True, - max_length=max_seq_length, - return_tensors='pt') - if 'labels' in data: - labels = data['labels'] - feature['labels'] = labels - if 'qid' in data: - qid = data['qid'] - feature['qid'] = qid - return feature - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in sequence classification. 
- """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sentence_embedding) -class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in sentence embedding. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data Dict: - keys: "source_sentence" && "sentences_to_compare" - values: list of sentences - Example: - {"source_sentence": ["how long it take to get a master's degree"], - "sentences_to_compare": ["On average, students take about 18 to 24 months - to complete a master's degree.", - "On the other hand, some students prefer to go at a slower pace - and choose to take several years to complete their studies.", - "It can take anywhere from two semesters"]} - Returns: - Dict[str, Any]: the preprocessed data - """ - source_sentence = data['source_sentence'] - compare_sentences = data['sentences_to_compare'] - sentences = [] - sentences.append(source_sentence[0]) - for sent in compare_sentences: - sentences.append(sent) - - tokenized_inputs = self.tokenizer( - sentences, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - padding=True, - truncation=True) - return tokenized_inputs - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in zero shot classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[str, Dict], hypothesis_template: str, - candidate_labels: list) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str or dict): a sentence - Example: - 'you are so handsome.' - - Returns: - Dict[str, Any]: the preprocessed data - """ - if isinstance(data, dict): - data = data.get(self.first_sequence) - - pairs = [[data, hypothesis_template.format(label)] - for label in candidate_labels] - - features = self.tokenizer( - pairs, - padding=True, - truncation=True, - max_length=self.sequence_length, - truncation_strategy='only_first', - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None) - return features - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) -class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. 
- """ - - def __init__(self, - model_dir: str, - tokenizer=None, - mode=ModeKeys.INFERENCE, - **kwargs): - kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') - kwargs['padding'] = kwargs.get('padding', False) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - text_a, _, _ = self.parse_text_and_label(data) - - inputs = self.tokenizer( - text_a, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - - # This is produced by tokenizers but is an invalid generate kwargs - if 'token_type_ids' in inputs: - del inputs['token_type_ids'] - return inputs - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) -class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. - """ - - def __init__(self, - model_dir: str, - tokenizer=None, - mode=ModeKeys.INFERENCE, - **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - @staticmethod - def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: - import os - for name in os.listdir(model_dir): - full_name = os.path.join(model_dir, name) - if 'roberta' in name and os.path.isdir(full_name): - return full_name - - def build_tokenizer(self, model_dir: str): - roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir) - if roberta_tokenizer_dir: - from transformers import RobertaTokenizer - return RobertaTokenizer.from_pretrained( - roberta_tokenizer_dir, do_lower_case=False) - return super().build_tokenizer(model_dir) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - if self._mode == ModeKeys.INFERENCE: - return super().__call__(data) - src_rst = super().__call__(data['src_txt']) - src_input_ids = src_rst['input_ids'] - src_attention_mask = src_rst['attention_mask'] - if 'tgt_txt' in data: - labels = super().__call__(data['tgt_txt'])['input_ids'] - else: - labels = src_input_ids[1:] - src_input_ids = src_input_ids[:-1] - src_attention_mask = src_attention_mask[:-1] - - return { - 'input_ids': src_input_ids, - 'attention_mask': src_attention_mask, - 'labels': labels, - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) -class TextGenerationJiebaPreprocessor(Preprocessor): - """The jieba tokenizer preprocessor used in text generation. 
- """ - - def __init__(self, model_dir: str, *args, **kwargs): - from modelscope.models.nlp.gpt3 import JiebaBPETokenizer - super().__init__(*args, **kwargs) - self.tokenizer = JiebaBPETokenizer( - osp.join(model_dir, 'tokenizer.json')) - - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' - Returns: - Dict[str, Any]: the preprocessed data - Example: - {'net_input': - {'src_tokens':tensor([1,2,3,4]), - 'src_lengths': tensor([4])} - } - """ - import torch - - return { - 'input_ids': - torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) - } - - -@PREPROCESSORS.register_module( - Fields.nlp, - module_name=Preprocessors.word_segment_text_to_label_preprocessor) -class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): - """The preprocessor used to turn a single sentence to a labeled token-classification dict. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.label = kwargs.pop('label', OutputKeys.LABELS) - - def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: - data = data.split(' ') - data = list(filter(lambda x: len(x) > 0, data)) - - def produce_train_sample(words): - chars = [] - labels = [] - for word in words: - chars.extend(list(word)) - if len(word) == 1: - labels.append('S-CWS') - else: - labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) - + ['E-CWS']) - assert len(chars) == len(labels) - return chars, labels - - chars, labels = produce_train_sample(data) - return { - self.first_sequence: chars, - self.label: labels, - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.ner_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) -class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in normal NER task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - self.label_all_tokens = kwargs.pop('label_all_tokens', False) - super().__init__(model_dir, mode=mode, **kwargs) - - if 'is_split_into_words' in kwargs: - self.is_split_into_words = kwargs.pop('is_split_into_words') - else: - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = None - labels_list = None - if isinstance(data, str): - text = data - elif isinstance(data, dict): - text = data.get(self.first_sequence) - labels_list = data.get(self.label) - - input_ids = [] - label_mask = [] - offset_mapping = [] - if self.is_split_into_words: - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)]) - else: - if self.tokenizer.is_fast: - encodings = self.tokenizer( - text, - add_special_tokens=False, - return_offsets_mapping=True, - **self.tokenize_kwargs) - input_ids = encodings['input_ids'] - word_ids = encodings.word_ids() - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = ( - offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) - else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) - else: - encodings = self.tokenizer( - text, add_special_tokens=False, **self.tokenize_kwargs) - input_ids = encodings['input_ids'] - label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( - text) - - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - offset_mapping = offset_mapping[:sum(label_mask)] - - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - - if self._mode == ModeKeys.INFERENCE: - input_ids = torch.tensor(input_ids).unsqueeze(0) - attention_mask = torch.tensor(attention_mask).unsqueeze(0) - label_mask = torch.tensor( - label_mask, dtype=torch.bool).unsqueeze(0) - - # the token classification - output = { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - # align the labels with tokenized text - if labels_list is not None: - assert self.label2id is not None - # Map that sends B-Xxx label to its I-Xxx counterpart - b_to_i_label = [] - label_enumerate_values = [ - k for k, v in sorted( - self.label2id.items(), key=lambda item: item[1]) - ] - for idx, label in enumerate(label_enumerate_values): - if label.startswith('B-') and label.replace( - 'B-', 'I-') in label_enumerate_values: - b_to_i_label.append( - label_enumerate_values.index( - label.replace('B-', 'I-'))) - else: - b_to_i_label.append(idx) - - label_row = [self.label2id[lb] for lb in labels_list] - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - if word_idx is None: - label_ids.append(-100) - elif word_idx != previous_word_idx: - label_ids.append(label_row[word_idx]) - else: - if self.label_all_tokens: - label_ids.append(b_to_i_label[label_row[word_idx]]) - else: - label_ids.append(-100) - previous_word_idx = word_idx - labels = label_ids - output['labels'] = labels - return output - - def get_tokenizer_class(self): - tokenizer_class = self.tokenizer.__class__.__name__ - if 
tokenizer_class.endswith( - 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': - tokenizer_class = tokenizer_class[:-4] - return tokenizer_class - - def get_label_mask_and_offset_mapping(self, text): - label_mask = [] - offset_mapping = [] - tokens = self.tokenizer.tokenize(text) - offset = 0 - if self.get_tokenizer_class() == 'BertTokenizer': - for token in tokens: - is_start = (token[:2] != '##') - if is_start: - label_mask.append(True) - else: - token = token[2:] - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': - last_is_blank = False - for token in tokens: - is_start = (token[0] == '▁') - if is_start: - token = token[1:] - label_mask.append(True) - if len(token) == 0: - last_is_blank = True - continue - else: - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if last_is_blank or is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - last_is_blank = False - else: - raise NotImplementedError - - return label_mask, offset_mapping - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.re_tokenizer) -class RelationExtractionPreprocessor(Preprocessor): - """The relation extraction preprocessor used in normal RE task. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=True) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = data - output = self.tokenizer([text], return_tensors='pt') - return { - 'text': text, - 'input_ids': output['input_ids'], - 'attention_mask': output['attention_mask'], - 'offsets': output[0].offsets - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) -class FaqQuestionAnsweringPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - super(FaqQuestionAnsweringPreprocessor, self).__init__( - model_dir, mode=ModeKeys.INFERENCE, **kwargs) - import os - from transformers import BertTokenizer - - from modelscope.utils.config import Config - from modelscope.utils.constant import ModelFile - self.tokenizer = BertTokenizer.from_pretrained(model_dir) - preprocessor_config = Config.from_file( - os.path.join(model_dir, ModelFile.CONFIGURATION)).get( - ConfigFields.preprocessor, {}) - self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) - self.label_dict = None - - def pad(self, samples, max_len): - result = [] - for sample in samples: - pad_len = max_len - len(sample[:max_len]) - result.append(sample[:max_len] - + [self.tokenizer.pad_token_id] * pad_len) - return result - - def set_label_dict(self, label_dict): - self.label_dict = label_dict - - def get_label(self, label_id): - assert self.label_dict is not None and label_id < len(self.label_dict) - return self.label_dict[label_id] - - def encode_plus(self, text): - return [ - self.tokenizer.cls_token_id - ] + self.tokenizer.convert_tokens_to_ids( - self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id] - - @type_assert(object, Dict) - def __call__(self, data: Dict[str, Any], - **preprocessor_param) -> Dict[str, Any]: - TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) - queryset = data['query_set'] - if not isinstance(queryset, list): - queryset = [queryset] - supportset = data['support_set'] - supportset = sorted(supportset, key=lambda d: d['label']) - - queryset_tokenized = [self.encode_plus(text) for text in queryset] - supportset_tokenized = [ - self.encode_plus(item['text']) for item in supportset - ] - - max_len = max( - [len(seq) for seq in queryset_tokenized + supportset_tokenized]) - max_len = min(TMP_MAX_LEN, max_len) - queryset_padded = self.pad(queryset_tokenized, max_len) - supportset_padded = self.pad(supportset_tokenized, max_len) - - supportset_labels_ori = [item['label'] for item in supportset] - label_dict = [] - for label in supportset_labels_ori: - if label not in label_dict: - label_dict.append(label) - self.set_label_dict(label_dict) - supportset_labels_ids = [ - label_dict.index(label) for label in supportset_labels_ori - ] - return { - 'query': queryset_padded, - 'support': supportset_padded, - 'support_labels': supportset_labels_ids - } - - def batch_encode(self, sentence_list: list, max_length=None): - if not max_length: - max_length = self.MAX_LEN - return self.tokenizer.batch_encode_plus( - sentence_list, padding=True, max_length=max_length) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.document_segmentation) -class DocumentSegmentationPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, config, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - from transformers import BertTokenizerFast - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, - 
use_fast=True, - ) - self.question_column_name = 'labels' - self.context_column_name = 'sentences' - self.example_id_column_name = 'example_id' - self.label_to_id = {'B-EOP': 0, 'O': 1} - self.target_specical_ids = set() - self.target_specical_ids.add(self.tokenizer.eos_token_id) - self.max_seq_length = config.max_position_embeddings - self.label_list = ['B-EOP', 'O'] - - def __call__(self, examples) -> Dict[str, Any]: - questions = examples[self.question_column_name] - contexts = examples[self.context_column_name] - example_ids = examples[self.example_id_column_name] - num_examples = len(questions) - - sentences = [] - for sentence_list in contexts: - sentence_list = [_ + '[EOS]' for _ in sentence_list] - sentences.append(sentence_list) - - try: - tokenized_examples = self.tokenizer( - sentences, - is_split_into_words=True, - add_special_tokens=False, - return_token_type_ids=True, - return_attention_mask=True, - ) - except Exception as e: - logger.error(e) - return {} - - segment_ids = [] - token_seq_labels = [] - for example_index in range(num_examples): - example_input_ids = tokenized_examples['input_ids'][example_index] - example_labels = questions[example_index] - example_labels = [ - self.label_to_id[_] if _ in self.label_to_id else -100 - for _ in example_labels - ] - example_token_labels = [] - segment_id = [] - cur_seg_id = 1 - for token_index in range(len(example_input_ids)): - if example_input_ids[token_index] in self.target_specical_ids: - example_token_labels.append(example_labels[cur_seg_id - 1]) - segment_id.append(cur_seg_id) - cur_seg_id += 1 - else: - example_token_labels.append(-100) - segment_id.append(cur_seg_id) - - segment_ids.append(segment_id) - token_seq_labels.append(example_token_labels) - - tokenized_examples['segment_ids'] = segment_ids - tokenized_examples['token_seq_labels'] = token_seq_labels - - new_segment_ids = [] - new_token_seq_labels = [] - new_input_ids = [] - new_token_type_ids = [] - new_attention_mask = [] - new_example_ids = [] - new_sentences = [] - - for example_index in range(num_examples): - example_input_ids = tokenized_examples['input_ids'][example_index] - example_token_type_ids = tokenized_examples['token_type_ids'][ - example_index] - example_attention_mask = tokenized_examples['attention_mask'][ - example_index] - example_segment_ids = tokenized_examples['segment_ids'][ - example_index] - example_token_seq_labels = tokenized_examples['token_seq_labels'][ - example_index] - example_sentences = contexts[example_index] - example_id = example_ids[example_index] - example_total_num_sentences = len(questions[example_index]) - example_total_num_tokens = len( - tokenized_examples['input_ids'][example_index]) - accumulate_length = [ - i for i, x in enumerate(tokenized_examples['input_ids'] - [example_index]) - if x == self.tokenizer.eos_token_id - ] - samples_boundary = [] - left_index = 0 - sent_left_index = 0 - sent_i = 0 - - # for sent_i, length in enumerate(accumulate_length): - while sent_i < len(accumulate_length): - length = accumulate_length[sent_i] - right_index = length + 1 - sent_right_index = sent_i + 1 - if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: - samples_boundary.append([left_index, right_index]) - - sample_input_ids = [ - self.tokenizer.cls_token_id - ] + example_input_ids[left_index:right_index] - sample_input_ids = sample_input_ids[:self.max_seq_length] - - sample_token_type_ids = [ - 0 - ] + example_token_type_ids[left_index:right_index] - sample_token_type_ids = 
sample_token_type_ids[:self. - max_seq_length] - - sample_attention_mask = [ - 1 - ] + example_attention_mask[left_index:right_index] - sample_attention_mask = sample_attention_mask[:self. - max_seq_length] - - sample_segment_ids = [ - 0 - ] + example_segment_ids[left_index:right_index] - sample_segment_ids = sample_segment_ids[:self. - max_seq_length] - - sample_token_seq_labels = [ - -100 - ] + example_token_seq_labels[left_index:right_index] - sample_token_seq_labels = sample_token_seq_labels[:self. - max_seq_length] - - if sent_right_index - 1 == sent_left_index: - left_index = right_index - sample_input_ids[-1] = self.tokenizer.eos_token_id - sample_token_seq_labels[-1] = -100 - else: - left_index = accumulate_length[sent_i - 1] + 1 - if sample_token_seq_labels[-1] != -100: - sample_token_seq_labels[-1] = -100 - - if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: - sample_sentences = example_sentences[ - sent_left_index:sent_right_index] - sent_left_index = sent_right_index - sent_i += 1 - else: - sample_sentences = example_sentences[ - sent_left_index:sent_right_index - 1] - sent_left_index = sent_right_index - 1 - - if (len([_ for _ in sample_token_seq_labels if _ != -100 - ])) != len(sample_sentences) - 1 and (len([ - _ - for _ in sample_token_seq_labels if _ != -100 - ])) != len(sample_sentences): - tmp = [] - for w_i, w, l in zip( - sample_input_ids, - self.tokenizer.decode(sample_input_ids).split( - ' '), sample_token_seq_labels): - tmp.append((w_i, w, l)) - while len(sample_input_ids) < self.max_seq_length: - sample_input_ids.append(self.tokenizer.pad_token_id) - sample_token_type_ids.append(0) - sample_attention_mask.append(0) - sample_segment_ids.append(example_total_num_sentences - + 1) - sample_token_seq_labels.append(-100) - - new_input_ids.append(sample_input_ids) - new_token_type_ids.append(sample_token_type_ids) - new_attention_mask.append(sample_attention_mask) - new_segment_ids.append(sample_segment_ids) - new_token_seq_labels.append(sample_token_seq_labels) - new_example_ids.append(example_id) - new_sentences.append(sample_sentences) - else: - sent_i += 1 - continue - - output_samples = {} - - output_samples['input_ids'] = new_input_ids - output_samples['token_type_ids'] = new_token_type_ids - output_samples['attention_mask'] = new_attention_mask - - output_samples['segment_ids'] = new_segment_ids - output_samples['example_id'] = new_example_ids - output_samples['labels'] = new_token_seq_labels - output_samples['sentences'] = new_sentences - - return output_samples - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.fill_mask_ponet) -class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. 
- """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 512) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - self.cfg = Config.from_file( - osp.join(model_dir, ModelFile.CONFIGURATION)) - self.language = self.cfg.model.get('language', 'en') - if self.language == 'en': - from nltk.tokenize import sent_tokenize - import_external_nltk_data( - osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') - elif self.language in ['zh', 'cn']: - - def sent_tokenize(para): - para = re.sub(r'([。!!?\?])([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', - para) # noqa * - para = para.rstrip() - return [_ for _ in para.split('\n') if _] - else: - raise NotImplementedError - - self.sent_tokenize = sent_tokenize - self.max_length = kwargs['max_length'] - - def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' - Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b, labels = self.parse_text_and_label(data) - output = self.tokenizer( - text_a, - text_b, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - max_seq_length = self.max_length - - if text_b is None: - segment_ids = [] - seg_lens = list( - map( - len, - self.tokenizer( - self.sent_tokenize(text_a), - add_special_tokens=False, - truncation=True)['input_ids'])) - segment_id = [0] + sum( - [[i] * sl for i, sl in enumerate(seg_lens, start=1)], []) - segment_id = segment_id[:max_seq_length - 1] - segment_ids.append(segment_id + [segment_id[-1] + 1] - * (max_seq_length - len(segment_id))) - output['segment_ids'] = segment_ids - - output = { - k: np.array(v) if isinstance(v, list) else v - for k, v in output.items() - } - - self.labels_to_id(labels, output) - return output - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sentence_piece) -class SentencePiecePreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - import os - - super().__init__(*args, **kwargs) - self.tokenizer = None - for file_name in os.listdir(model_dir): - if file_name.endswith('.model'): - m_file = osp.join(model_dir, file_name) - self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) - break - assert self.tokenizer is not None, 'Can not find .model file' - - def __call__(self, data: str) -> Dict[str, Any]: - return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) + except KeyError as e: + logger.error( + f'Label {labels} cannot be found in the label mapping {self.label2id},' + f'which comes from the user input or the configuration files. 
' + f'Please consider matching your labels with this mapping.') + raise e diff --git a/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py new file mode 100644 index 00000000..9a426ab7 --- /dev/null +++ b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py @@ -0,0 +1,55 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +from transformers import AutoTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.re_tokenizer) +class RelationExtractionPreprocessor(NLPBasePreprocessor): + """The relation extraction preprocessor used in normal RE task. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(model_dir, *args, **kwargs) + + self.model_dir: str = model_dir + self.sequence_length = kwargs.pop('sequence_length', 512) + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=True) + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + + # preprocess the data for the model input + text = data + output = self.tokenizer([text], return_tensors='pt') + return { + 'text': text, + 'input_ids': output['input_ids'], + 'attention_mask': output['attention_mask'], + 'offsets': output[0].offsets + } diff --git a/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py b/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py new file mode 100644 index 00000000..f1295c50 --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sequence classification. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py new file mode 100644 index 00000000..519de60c --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
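The reworked `labels_to_id` above now tries a label under its original type, falls back to its string form, and raises a `KeyError` with a logged hint when neither key exists. Below is a minimal standalone sketch of that fallback; the function and variable names are illustrative rather than the actual modelscope implementation, and the float/regression branch mentioned in the docstring is omitted for brevity.

```python
from typing import Any, Dict, List, Union


def map_labels(labels: Union[str, int, List[Union[str, int]]],
               label2id: Dict[str, int]) -> Any:
    """Illustrative sketch of the label-lookup fallback used above."""

    def lookup(label):
        # Prefer the raw label (e.g. an int key), then its string form.
        if label in label2id:
            return label2id[label]
        if str(label) in label2id:
            return label2id[str(label)]
        raise KeyError(
            f'Label {label!r} cannot be found in the label mapping {label2id}')

    if isinstance(labels, (tuple, list)):
        return [lookup(label) for label in labels]
    return lookup(labels)


if __name__ == '__main__':
    label2id = {'0': 0, '1': 1}
    print(map_labels(1, label2id))         # -> 1, via the str() fallback
    print(map_labels(['0', 1], label2id))  # -> [0, 1]
```

The real method additionally leaves labels untouched when no label2id mapping is available, as its docstring states.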
+ +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_embedding) +class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sentence embedding. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data Dict: + keys: "source_sentence" && "sentences_to_compare" + values: list of sentences + Example: + {"source_sentence": ["how long it take to get a master's degree"], + "sentences_to_compare": ["On average, students take about 18 to 24 months + to complete a master's degree.", + "On the other hand, some students prefer to go at a slower pace + and choose to take several years to complete their studies.", + "It can take anywhere from two semesters"]} + Returns: + Dict[str, Any]: the preprocessed data + """ + source_sentence = data['source_sentence'] + compare_sentences = data['sentences_to_compare'] + sentences = [] + sentences.append(source_sentence[0]) + for sent in compare_sentences: + sentences.append(sent) + + tokenized_inputs = self.tokenizer( + sentences, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + padding=True, + truncation=True) + return tokenized_inputs diff --git a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py new file mode 100644 index 00000000..1d1ef19d --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py @@ -0,0 +1,32 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
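For orientation, the new `SentenceEmbeddingPreprocessor.__call__` above batches the first source sentence together with every candidate sentence before tokenizing. A small plain-Python sketch of that batch assembly (no tokenizer involved; the helper name is made up):

```python
from typing import Dict, List


def build_embedding_batch(data: Dict[str, List[str]]) -> List[str]:
    # One batch: the source sentence first, the candidates after it, so the
    # downstream model can compare row 0 of the embeddings against rows 1..N.
    return [data['source_sentence'][0]] + list(data['sentences_to_compare'])


if __name__ == '__main__':
    data = {
        'source_sentence': ["how long it take to get a master's degree"],
        'sentences_to_compare': [
            "On average, students take about 18 to 24 months to complete a master's degree.",
            'It can take anywhere from two semesters',
        ],
    }
    print(build_embedding_batch(data))
```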
+ +import os.path as osp +from typing import Any, Dict + +import sentencepiece as spm +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_piece) +class SentencePiecePreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + import os + + super().__init__(*args, **kwargs) + self.tokenizer = None + for file_name in os.listdir(model_dir): + if file_name.endswith('.model'): + m_file = osp.join(model_dir, file_name) + self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) + break + assert self.tokenizer is not None, 'Can not find .model file' + + def __call__(self, data: str) -> Dict[str, Any]: + return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) diff --git a/modelscope/preprocessors/space/__init__.py b/modelscope/preprocessors/nlp/space/__init__.py similarity index 100% rename from modelscope/preprocessors/space/__init__.py rename to modelscope/preprocessors/nlp/space/__init__.py diff --git a/modelscope/preprocessors/space/args.py b/modelscope/preprocessors/nlp/space/args.py similarity index 97% rename from modelscope/preprocessors/space/args.py rename to modelscope/preprocessors/nlp/space/args.py index d9e91e74..17c6828b 100644 --- a/modelscope/preprocessors/space/args.py +++ b/modelscope/preprocessors/nlp/space/args.py @@ -1,7 +1,4 @@ -""" -Parse argument. -""" - +# Copyright (c) Alibaba, Inc. and its affiliates. import argparse import json diff --git a/modelscope/preprocessors/space/batch.py b/modelscope/preprocessors/nlp/space/batch.py similarity index 96% rename from modelscope/preprocessors/space/batch.py rename to modelscope/preprocessors/nlp/space/batch.py index fe0ad0ec..d27776f5 100644 --- a/modelscope/preprocessors/space/batch.py +++ b/modelscope/preprocessors/nlp/space/batch.py @@ -1,3 +1,6 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + + def batch(reader, batch_size, drop_last=False): """ This operator creates a batched reader which combines the data from the diff --git a/modelscope/preprocessors/space/data_loader.py b/modelscope/preprocessors/nlp/space/data_loader.py similarity index 87% rename from modelscope/preprocessors/space/data_loader.py rename to modelscope/preprocessors/nlp/space/data_loader.py index bd04a79c..290b64f3 100644 --- a/modelscope/preprocessors/space/data_loader.py +++ b/modelscope/preprocessors/nlp/space/data_loader.py @@ -1,18 +1,16 @@ -""" -DataLoader class -""" +# Copyright (c) Alibaba, Inc. and its affiliates. 
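A hedged usage sketch for the relocated `SentencePiecePreprocessor`: the model directory below is hypothetical and must contain a SentencePiece `*.model` file, and the import path simply mirrors the new module location in this diff (the class may also be re-exported from the package `__init__`).

```python
from modelscope.preprocessors.nlp.sentence_piece_preprocessor import \
    SentencePiecePreprocessor

model_dir = '/path/to/local/model_dir'  # hypothetical; must contain a *.model file
preprocessor = SentencePiecePreprocessor(model_dir)

# __call__ wraps sentencepiece's encode() output in a LongTensor of token ids,
# shaped (1, sequence_length) because the input is passed as a one-element list.
ids = preprocessor('今天天气不错')
print(ids.dtype, ids.shape)
```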
import math import os import numpy as np -from modelscope.preprocessors.space.args import str2bool -from modelscope.preprocessors.space.batch import batch -from modelscope.preprocessors.space.lazy_dataset import LazyDataset -from modelscope.preprocessors.space.sampler import (RandomSampler, - SequentialSampler, - SortedSampler) +from modelscope.preprocessors.nlp.space.args import str2bool +from modelscope.preprocessors.nlp.space.batch import batch +from modelscope.preprocessors.nlp.space.lazy_dataset import LazyDataset +from modelscope.preprocessors.nlp.space.sampler import (RandomSampler, + SequentialSampler, + SortedSampler) def get_data_loader(batch_size, reader, hparams, file, collate_fn, is_test): diff --git a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py similarity index 64% rename from modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py index e2602eaa..2923157e 100644 --- a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py @@ -8,8 +8,7 @@ import json from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space.fields.intent_field import \ - IntentBPETextField +from modelscope.preprocessors.nlp import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert @@ -47,10 +46,25 @@ class DialogIntentPredictionPreprocessor(Preprocessor): Args: data (str): a sentence Example: - 'you are so handsome.' + 'What do I need to do for the card activation?' 
Returns: Dict[str, Any]: the preprocessed data + Example: + { + 'src_token': array([[13, 2054, 2079, 1045...]]), + 'src_pos': array([[ 0, 1, 2, 3...]]), + 'src_type': array([[1, 1, 1, 1...]]), + 'src_turn': array([[1, 1, 1, 1...]]), + 'src_mask': array([[1, 1, 1, 1...]]), + 'mlm_token': array([[13, 2054, 2079, 1045...]]), + 'mlm_label': array([[0, 0, 0, 0...]]), + 'mlm_mask': array([[0, 0, 0, 0...]]), + 'tgt_token': array([[29, 30, 31, 32...]]), + 'tgt_mask': array([[1, 1, 1, 1...]]), + 'ids': array([0]), + 'intent_label': array([-1]) + } """ samples = self.text_field.preprocessor([data]) samples, _ = self.text_field.collate_fn_multi_turn(samples) diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py similarity index 75% rename from modelscope/preprocessors/space/dialog_modeling_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py index c461ade1..ae3c214a 100644 --- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py @@ -6,8 +6,7 @@ from typing import Any, Dict from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space.fields.gen_field import \ - MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert @@ -42,9 +41,19 @@ class DialogModelingPreprocessor(Preprocessor): """process the raw input data Args: - data (str): a sentence + data (Dict[str, Any]): A sentence and dialogue history info. Example: - 'you are so handsome.' + { + 'user_input': 'i want to leave after 17:15 .', + 'history': { + 'labels': [[13, 1045, 2052, 2066...]], + 'resp': [14, 1045, 2064, 2393...], + 'bspn': [15, 43, 7688, 10733...], + 'db': [19, 24, 20], + 'aspn': [16, 43, 48, 2681, 7180, 10], + 'output': ['i', 'can', 'help', 'with'...] + } + } Returns: Dict[str, Any]: the preprocessed data diff --git a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py similarity index 92% rename from modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py index 6eb17288..cff39577 100644 --- a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py @@ -31,13 +31,17 @@ class DialogStateTrackingPreprocessor(Preprocessor): self.processor = multiwoz22Processor() @type_assert(object, dict) - def __call__(self, data: Dict) -> Dict[str, Any]: + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data Args: - data (str): a sentence + data (Dict[str, Any]): a sentence Example: - 'you are so handsome.' 
+ { + 'utter': {'User-1': "Hi, I'm looking for a train that is going" + "to cambridge and arriving there by 20:45, is there anything like that?"}, + 'history_states': [{}] + } Returns: Dict[str, Any]: the preprocessed data diff --git a/modelscope/preprocessors/space/dst_processors.py b/modelscope/preprocessors/nlp/space/dst_processors.py similarity index 100% rename from modelscope/preprocessors/space/dst_processors.py rename to modelscope/preprocessors/nlp/space/dst_processors.py diff --git a/modelscope/preprocessors/nlp/space/fields/__init__.py b/modelscope/preprocessors/nlp/space/fields/__init__.py new file mode 100644 index 00000000..475a99dc --- /dev/null +++ b/modelscope/preprocessors/nlp/space/fields/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .gen_field import MultiWOZBPETextField + from .intent_field import IntentBPETextField +else: + _import_structure = { + 'gen_field': ['MultiWOZBPETextField'], + 'intent_field': ['IntentBPETextField'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/nlp/space/fields/gen_field.py similarity index 99% rename from modelscope/preprocessors/space/fields/gen_field.py rename to modelscope/preprocessors/nlp/space/fields/gen_field.py index 32346bd5..1d1879fe 100644 --- a/modelscope/preprocessors/space/fields/gen_field.py +++ b/modelscope/preprocessors/nlp/space/fields/gen_field.py @@ -9,7 +9,7 @@ from itertools import chain import json import numpy as np -from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.preprocessors.nlp.space.tokenizer import Tokenizer from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.nlp.space import ontology, utils diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/nlp/space/fields/intent_field.py similarity index 99% rename from modelscope/preprocessors/space/fields/intent_field.py rename to modelscope/preprocessors/nlp/space/fields/intent_field.py index 6d3b5fff..29ea915e 100644 --- a/modelscope/preprocessors/space/fields/intent_field.py +++ b/modelscope/preprocessors/nlp/space/fields/intent_field.py @@ -13,7 +13,7 @@ import json import numpy as np from tqdm import tqdm -from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.preprocessors.nlp.space.tokenizer import Tokenizer from modelscope.utils.constant import ModelFile from modelscope.utils.nlp.space import ontology from modelscope.utils.nlp.space.scores import hierarchical_set_score diff --git a/modelscope/preprocessors/space/lazy_dataset.py b/modelscope/preprocessors/nlp/space/lazy_dataset.py similarity index 93% rename from modelscope/preprocessors/space/lazy_dataset.py rename to modelscope/preprocessors/nlp/space/lazy_dataset.py index 8da21db7..536d9341 100644 --- a/modelscope/preprocessors/space/lazy_dataset.py +++ b/modelscope/preprocessors/nlp/space/lazy_dataset.py @@ -1,11 +1,6 @@ -""" -Dataset class -""" - +# Copyright (c) Alibaba, Inc. and its affiliates. 
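The updated docstrings above replace the old placeholder examples with the dict payloads the Space dialog preprocessors actually consume. For quick reference, a sketch of those documented inputs; the keys come from the docstrings, while the shortened values are illustrative only.

```python
# Dialog intent prediction: a single user utterance as a plain string.
intent_input = 'What do I need to do for the card activation?'

# Dialog state tracking: the latest utterance plus the tracked states so far.
dst_input = {
    'utter': {
        'User-1': "Hi, I'm looking for a train that is going to cambridge "
                  'and arriving there by 20:45, is there anything like that?'
    },
    'history_states': [{}],  # no accumulated state before the first turn
}

# Dialog modeling: the new user turn plus decoded artifacts of the previous turn.
modeling_input = {
    'user_input': 'i want to leave after 17:15 .',
    'history': {
        'labels': [[13, 1045, 2052, 2066]],   # token ids, shortened
        'resp': [14, 1045, 2064, 2393],
        'bspn': [15, 43, 7688, 10733],
        'db': [19, 24, 20],
        'aspn': [16, 43, 48, 2681, 7180, 10],
        'output': ['i', 'can', 'help', 'with'],
    },
}
```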
import json -from modelscope.preprocessors.space.args import str2bool - class LazyDataset(object): """ diff --git a/modelscope/preprocessors/space/preprocess.py b/modelscope/preprocessors/nlp/space/preprocess.py similarity index 92% rename from modelscope/preprocessors/space/preprocess.py rename to modelscope/preprocessors/nlp/space/preprocess.py index bd8d64d1..8aab4711 100644 --- a/modelscope/preprocessors/space/preprocess.py +++ b/modelscope/preprocessors/nlp/space/preprocess.py @@ -1,12 +1,9 @@ -""" -Preprocess script. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import glob import os -from modelscope.preprocessors.space.args import parse_args -from modelscope.preprocessors.space.fields.intent_field import \ +from modelscope.preprocessors.nlp.space.fields.intent_field import \ IntentBPETextField FILE_NAME = 'train.json' diff --git a/modelscope/preprocessors/space/sampler.py b/modelscope/preprocessors/nlp/space/sampler.py similarity index 96% rename from modelscope/preprocessors/space/sampler.py rename to modelscope/preprocessors/nlp/space/sampler.py index 49a216d1..e549c343 100644 --- a/modelscope/preprocessors/space/sampler.py +++ b/modelscope/preprocessors/nlp/space/sampler.py @@ -1,6 +1,4 @@ -""" -Sampler class. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import numpy as np diff --git a/modelscope/preprocessors/space/tensorlistdataset.py b/modelscope/preprocessors/nlp/space/tensorlistdataset.py similarity index 100% rename from modelscope/preprocessors/space/tensorlistdataset.py rename to modelscope/preprocessors/nlp/space/tensorlistdataset.py diff --git a/modelscope/preprocessors/space/tokenizer.py b/modelscope/preprocessors/nlp/space/tokenizer.py similarity index 99% rename from modelscope/preprocessors/space/tokenizer.py rename to modelscope/preprocessors/nlp/space/tokenizer.py index 87f7e8c3..1bd0ce11 100644 --- a/modelscope/preprocessors/space/tokenizer.py +++ b/modelscope/preprocessors/nlp/space/tokenizer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from __future__ import (absolute_import, division, print_function, unicode_literals) import collections diff --git a/modelscope/preprocessors/space_T_cn/__init__.py b/modelscope/preprocessors/nlp/space_T_cn/__init__.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/__init__.py rename to modelscope/preprocessors/nlp/space_T_cn/__init__.py diff --git a/modelscope/preprocessors/nlp/space_T_cn/fields/__init__.py b/modelscope/preprocessors/nlp/space_T_cn/fields/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/preprocessors/space_T_cn/fields/database.py b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py similarity index 98% rename from modelscope/preprocessors/space_T_cn/fields/database.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/database.py index 7ae38ee2..2fef8d7e 100644 --- a/modelscope/preprocessors/space_T_cn/fields/database.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py @@ -4,7 +4,7 @@ import sqlite3 import json import tqdm -from modelscope.preprocessors.space_T_cn.fields.struct import Trie +from .struct import Trie class Database: diff --git a/modelscope/preprocessors/space_T_cn/fields/schema_link.py b/modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py similarity index 99% rename from modelscope/preprocessors/space_T_cn/fields/schema_link.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py index 4b8f9d31..b62d03e4 100644 --- a/modelscope/preprocessors/space_T_cn/fields/schema_link.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import re -from modelscope.preprocessors.space_T_cn.fields.struct import TypeInfo +from .struct import TypeInfo class SchemaLinker: diff --git a/modelscope/preprocessors/space_T_cn/fields/struct.py b/modelscope/preprocessors/nlp/space_T_cn/fields/struct.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/fields/struct.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/struct.py diff --git a/modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py similarity index 96% rename from modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py rename to modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py index 63e6fd57..3aabc6a9 100644 --- a/modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py +++ b/modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py @@ -8,8 +8,9 @@ from transformers import BertTokenizer from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space_T_cn.fields.database import Database -from modelscope.preprocessors.space_T_cn.fields.schema_link import SchemaLinker +from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database +from modelscope.preprocessors.nlp.space_T_cn.fields.schema_link import \ + SchemaLinker from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert diff --git a/modelscope/preprocessors/star/__init__.py b/modelscope/preprocessors/nlp/space_T_en/__init__.py similarity index 100% rename from modelscope/preprocessors/star/__init__.py rename to 
modelscope/preprocessors/nlp/space_T_en/__init__.py diff --git a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py similarity index 84% rename from modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py rename to modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py index b5dd73a9..00c7bcd7 100644 --- a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py +++ b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py @@ -12,9 +12,10 @@ from text2sql_lgesql.utils.example import Example from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.star.fields.preprocess_dataset import \ +from modelscope.preprocessors.nlp.space_T_en.fields import SubPreprocessor +from modelscope.preprocessors.nlp.space_T_en.fields.preprocess_dataset import \ preprocess_dataset -from modelscope.preprocessors.star.fields.process_dataset import ( +from modelscope.preprocessors.nlp.space_T_en.fields.process_dataset import ( process_dataset, process_tables) from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile @@ -56,6 +57,18 @@ class ConversationalTextToSqlPreprocessor(Preprocessor): model_dir=self.model_dir, db_dir=os.path.join(model_dir, 'db')) + self.device = 'cuda' if \ + ('device' not in kwargs or kwargs['device'] == 'gpu') \ + and torch.cuda.is_available() else 'cpu' + use_device = True if self.device == 'cuda' else False + self.processor = \ + SubPreprocessor(model_dir=model_dir, + db_content=True, + use_gpu=use_device) + self.output_tables = \ + process_tables(self.processor, + self.tables) + @type_assert(object, dict) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data diff --git a/modelscope/preprocessors/star/fields/__init__.py b/modelscope/preprocessors/nlp/space_T_en/fields/__init__.py similarity index 100% rename from modelscope/preprocessors/star/fields/__init__.py rename to modelscope/preprocessors/nlp/space_T_en/fields/__init__.py diff --git a/modelscope/preprocessors/star/fields/common_utils.py b/modelscope/preprocessors/nlp/space_T_en/fields/common_utils.py similarity index 100% rename from modelscope/preprocessors/star/fields/common_utils.py rename to modelscope/preprocessors/nlp/space_T_en/fields/common_utils.py diff --git a/modelscope/preprocessors/star/fields/parse.py b/modelscope/preprocessors/nlp/space_T_en/fields/parse.py similarity index 100% rename from modelscope/preprocessors/star/fields/parse.py rename to modelscope/preprocessors/nlp/space_T_en/fields/parse.py diff --git a/modelscope/preprocessors/star/fields/preprocess_dataset.py b/modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py similarity index 95% rename from modelscope/preprocessors/star/fields/preprocess_dataset.py rename to modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py index 6c84c0e7..a0fd13d1 100644 --- a/modelscope/preprocessors/star/fields/preprocess_dataset.py +++ b/modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py @@ -3,7 +3,7 @@ from text2sql_lgesql.preprocess.parse_raw_json import Schema, get_schemas from text2sql_lgesql.process_sql import get_sql -from modelscope.preprocessors.star.fields.parse import get_label +from .parse import get_label def 
preprocess_dataset(processor, dataset, output_tables, database_id, tables): diff --git a/modelscope/preprocessors/star/fields/process_dataset.py b/modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py similarity index 94% rename from modelscope/preprocessors/star/fields/process_dataset.py rename to modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py index d8ac094a..88059351 100644 --- a/modelscope/preprocessors/star/fields/process_dataset.py +++ b/modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py @@ -1,17 +1,12 @@ # Copyright (c) rhythmcao modified from https://github.com/rhythmcao/text2sql-lgesql. -import argparse import os import pickle import sys -import time -import json from text2sql_lgesql.asdl.asdl import ASDLGrammar from text2sql_lgesql.asdl.transition_system import TransitionSystem -from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor - sys.path.append(os.path.dirname(os.path.dirname(__file__))) diff --git a/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py new file mode 100644 index 00000000..5693d36e --- /dev/null +++ b/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py @@ -0,0 +1,40 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) +class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. + """ + + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): + kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') + kwargs['padding'] = kwargs.get('padding', False) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + text_a, _, _ = self.parse_text_and_label(data) + + inputs = self.tokenizer( + text_a, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + + # This is produced by tokenizers but is an invalid generate kwargs + if 'token_type_ids' in inputs: + del inputs['token_type_ids'] + return inputs diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py index 357a946f..4e5ba3bd 100644 --- a/modelscope/preprocessors/nlp/text_error_correction.py +++ b/modelscope/preprocessors/nlp/text_error_correction.py @@ -7,11 +7,12 @@ from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields +from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_error_correction) -class TextErrorCorrectionPreprocessor(Preprocessor): +class TextErrorCorrectionPreprocessor(NLPBasePreprocessor): """The preprocessor used in text correction task. 
""" @@ -22,7 +23,7 @@ class TextErrorCorrectionPreprocessor(Preprocessor): Args: model_dir (str): model path """ - super().__init__(*args, **kwargs) + super().__init__(model_dir, *args, **kwargs) self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt')) def __call__(self, data: str) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py new file mode 100644 index 00000000..1e972d64 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os.path as osp +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) +class TextGenerationJiebaPreprocessor(Preprocessor): + """The jieba tokenizer preprocessor used in text generation. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + from modelscope.models.nlp.gpt3 import JiebaBPETokenizer + super().__init__(*args, **kwargs) + self.tokenizer = JiebaBPETokenizer( + osp.join(model_dir, 'tokenizer.json')) + + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' + Returns: + Dict[str, Any]: the preprocessed data + Example: + {'net_input': + {'src_tokens':tensor([1,2,3,4]), + 'src_lengths': tensor([4])} + } + """ + import torch + + return { + 'input_ids': + torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) + } diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py new file mode 100644 index 00000000..238e2972 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Optional, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) +class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + @staticmethod + def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: + import os + for name in os.listdir(model_dir): + full_name = os.path.join(model_dir, name) + if 'roberta' in name and os.path.isdir(full_name): + return full_name + + def build_tokenizer(self, model_dir: str): + roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir) + if roberta_tokenizer_dir: + from transformers import RobertaTokenizer + return RobertaTokenizer.from_pretrained( + roberta_tokenizer_dir, do_lower_case=False) + return super().build_tokenizer(model_dir) + + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + if self._mode == ModeKeys.INFERENCE: + return super().__call__(data) + src_rst = super().__call__(data['src_txt']) + src_input_ids = src_rst['input_ids'] + src_attention_mask = src_rst['attention_mask'] + if 'tgt_txt' in data: + labels = super().__call__(data['tgt_txt'])['input_ids'] + else: + labels = src_input_ids[1:] + src_input_ids = src_input_ids[:-1] + src_attention_mask = src_attention_mask[:-1] + + return { + 'input_ids': src_input_ids, + 'attention_mask': src_attention_mask, + 'labels': labels, + } diff --git a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py new file mode 100644 index 00000000..2ada6892 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py @@ -0,0 +1,67 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from transformers import AutoTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_ranking) +class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in passage ranking model. 
+ """ + + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + super().__init__(model_dir, mode=mode, *args, **kwargs) + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'source_sentence') + self.second_sequence = kwargs.pop('second_sequence', + 'sentences_to_compare') + self.sequence_length = kwargs.pop('sequence_length', 128) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: + if isinstance(data, tuple): + sentence1, sentence2 = data + elif isinstance(data, dict): + sentence1 = data.get(self.first_sequence) + sentence2 = data.get(self.second_sequence) + if isinstance(sentence2, str): + sentence2 = [sentence2] + if isinstance(sentence1, str): + sentence1 = [sentence1] + sentence1 = sentence1 * len(sentence2) + + max_seq_length = self.sequence_length + feature = self.tokenizer( + sentence1, + sentence2, + padding='max_length', + truncation=True, + max_length=max_seq_length, + return_tensors='pt') + if 'labels' in data: + labels = data['labels'] + feature['labels'] = labels + if 'qid' in data: + qid = data['qid'] + feature['qid'] = qid + return feature diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py new file mode 100644 index 00000000..2de0c806 --- /dev/null +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -0,0 +1,261 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Tuple, Union + +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor, NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, + module_name=Preprocessors.word_segment_text_to_label_preprocessor) +class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): + """The preprocessor used to turn a single sentence to a labeled token-classification dict. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.label = kwargs.pop('label', OutputKeys.LABELS) + + def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: + data = data.split(' ') + data = list(filter(lambda x: len(x) > 0, data)) + + def produce_train_sample(words): + chars = [] + labels = [] + for word in words: + chars.extend(list(word)) + if len(word) == 1: + labels.append('S-CWS') + else: + labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) + + ['E-CWS']) + assert len(chars) == len(labels) + return chars, labels + + chars, labels = produce_train_sample(data) + return { + self.first_sequence: chars, + self.label: labels, + } + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.ner_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) +class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in normal NER task. 
+ """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + self.sequence_length = kwargs['max_length'] + self.label_all_tokens = kwargs.pop('label_all_tokens', False) + super().__init__(model_dir, mode=mode, **kwargs) + + if 'is_split_into_words' in kwargs: + self.is_split_into_words = kwargs.pop('is_split_into_words') + else: + self.is_split_into_words = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) + if 'label2id' in kwargs: + kwargs.pop('label2id') + self.tokenize_kwargs = kwargs + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + + # preprocess the data for the model input + text = None + labels_list = None + if isinstance(data, str): + text = data + elif isinstance(data, dict): + text = data.get(self.first_sequence) + labels_list = data.get(self.label) + + input_ids = [] + label_mask = [] + offset_mapping = [] + if self.is_split_into_words: + for offset, token in enumerate(list(data)): + subtoken_ids = self.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)]) + else: + if self.tokenizer.is_fast: + encodings = self.tokenizer( + text, + add_special_tokens=False, + return_offsets_mapping=True, + **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + word_ids = encodings.word_ids() + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(0) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(0) + offset_mapping[-1] = ( + offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) + else: + label_mask.append(1) + offset_mapping.append(encodings['offset_mapping'][i]) + else: + encodings = self.tokenizer( + text, add_special_tokens=False, **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( + text) + + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] + + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] + + if self._mode == ModeKeys.INFERENCE: + input_ids = torch.tensor(input_ids).unsqueeze(0) + attention_mask = torch.tensor(attention_mask).unsqueeze(0) + label_mask = torch.tensor( + label_mask, dtype=torch.bool).unsqueeze(0) + + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + + # align the labels with tokenized text + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label 
to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) + else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + output['labels'] = labels + return output + + def get_tokenizer_class(self): + tokenizer_class = self.tokenizer.__class__.__name__ + if tokenizer_class.endswith( + 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': + tokenizer_class = tokenizer_class[:-4] + return tokenizer_class + + def get_label_mask_and_offset_mapping(self, text): + label_mask = [] + offset_mapping = [] + tokens = self.tokenizer.tokenize(text) + offset = 0 + if self.get_tokenizer_class() == 'BertTokenizer': + for token in tokens: + is_start = (token[:2] != '##') + if is_start: + label_mask.append(True) + else: + token = token[2:] + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': + last_is_blank = False + for token in tokens: + is_start = (token[0] == '▁') + if is_start: + token = token[1:] + label_mask.append(True) + if len(token) == 0: + last_is_blank = True + continue + else: + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if last_is_blank or is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + last_is_blank = False + else: + raise NotImplementedError + + return label_mask, offset_mapping diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py new file mode 100644 index 00000000..eb3c4b37 --- /dev/null +++ b/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in zero shot classification. 
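Illustrative inference-time sketch for the new TokenClassificationPreprocessor above (not part of the patch; './my_ner_model' is a placeholder for a directory with a HuggingFace-style tokenizer and config):

from modelscope.preprocessors.nlp.token_classification_preprocessor import \
    TokenClassificationPreprocessor

preprocessor = TokenClassificationPreprocessor('./my_ner_model')

# In inference mode the tensors are batched (shape [1, seq_len]) and 'label_mask' /
# 'offset_mapping' let downstream NER pipelines map sub-token predictions back to
# character spans of the original text.
outputs = preprocessor('Alibaba Group was founded in Hangzhou.')
print(sorted(outputs.keys()))
# ['attention_mask', 'input_ids', 'label_mask', 'offset_mapping', 'text']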
+ """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + self.sequence_length = kwargs.pop('sequence_length', 512) + super().__init__(model_dir, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict], hypothesis_template: str, + candidate_labels: list) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str or dict): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + if isinstance(data, dict): + data = data.get(self.first_sequence) + + pairs = [[data, hypothesis_template.format(label)] + for label in candidate_labels] + + features = self.tokenizer( + pairs, + padding=True, + truncation=True, + max_length=self.sequence_length, + truncation_strategy='only_first', + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None) + return features diff --git a/modelscope/preprocessors/space/fields/__init__.py b/modelscope/preprocessors/space/fields/__init__.py deleted file mode 100644 index 925eac71..00000000 --- a/modelscope/preprocessors/space/fields/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .gen_field import MultiWOZBPETextField -from .intent_field import IntentBPETextField diff --git a/modelscope/preprocessors/space/fields/dst_processors.py b/modelscope/preprocessors/space/fields/dst_processors.py deleted file mode 100644 index 22e06eec..00000000 --- a/modelscope/preprocessors/space/fields/dst_processors.py +++ /dev/null @@ -1,1523 +0,0 @@ -# -# Copyright 2020 Heinrich Heine University Duesseldorf -# -# Part of this code is based on the source code of BERT-DST -# (arXiv:1907.03040) -# Part of this code is based on the source code of Transformers -# (arXiv:1910.03771) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import re - -import json -import numpy as np -import six -from tqdm import tqdm - -logger = logging.getLogger(__name__) -USER_NAME = 'User' -SYSTEM_NAME = 'System' -DIALOG_ACT = 'Dialog_Act' - -utter1 = { - 'User-1': - "I'd really like to take my client out to a nice restaurant that serves indian food." -} -history_states1 = [ - {}, -] -utter2 = { - 'User-1': - "I'd really like to take my client out to a nice restaurant that serves indian food.", - 'System-1': - 'I show many restaurants that serve Indian food in that price range. 
What area would you like to travel to?', - 'Dialog_Act-1': { - 'Restaurant-Inform': [['choice', 'many'], ['food', 'Indian'], - ['pricerange', 'that price range']] - }, - 'User-2': - 'I am looking for an expensive indian restaurant in the area of centre.', -} - -history_states2 = [{}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}] - -utter3 = { - 'User-1': - "I'd really like to take my client out to a nice restaurant that serves indian food.", - 'System-1': - 'I show many restaurants that serve Indian food in that price range. What area would you like to travel to?', - 'Dialog_Act-1': { - 'Restaurant-Inform': [['choice', 'many'], ['food', 'Indian'], - ['pricerange', 'that price range']] - }, - 'User-2': - 'I am looking for an expensive indian restaurant in the area of centre.', - 'System-2': - 'Might I recommend Saffron Brasserie? That is an expensive Indian restaurant ' - 'in the center of town. I can book a table for you, if you like.', - 'Dialog_Act-2': { - 'Restaurant-Recommend': [['area', 'center of town'], - ['food', 'Indian'], - ['name', 'Saffron Brasserie'], - ['pricerange', 'expensive']] - }, - 'User-3': - 'Sure thing, please book for 6 people at 19:30 on Saturday.' 
-} - -history_states3 = [{}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}] - - -class DSTProcessor(object): - ACTS_DICT = { - 'taxi-depart': 'taxi-departure', - 'taxi-dest': 'taxi-destination', - 'taxi-leaveat': 'taxi-leaveAt', - 'taxi-arriveby': 'taxi-arriveBy', - 'train-depart': 'train-departure', - 'train-dest': 'train-destination', - 'train-leaveat': 'train-leaveAt', - 'train-arriveby': 'train-arriveBy', - 'train-bookpeople': 'train-book_people', - 'restaurant-price': 'restaurant-pricerange', - 'restaurant-bookpeople': 'restaurant-book_people', - 'restaurant-bookday': 'restaurant-book_day', - 'restaurant-booktime': 'restaurant-book_time', - 'hotel-price': 'hotel-pricerange', - 'hotel-bookpeople': 'hotel-book_people', - 'hotel-bookday': 'hotel-book_day', - 'hotel-bookstay': 'hotel-book_stay', - 'booking-bookpeople': 'booking-book_people', - 'booking-bookday': 'booking-book_day', - 'booking-bookstay': 'booking-book_stay', - 'booking-booktime': 'booking-book_time', - } - - LABEL_MAPS = {} # Loaded from file - - def __init__(self): - # Required for mapping slot names in dialogue_acts.json file - # to proper designations. 
- pass - - def _convert_inputs_to_utterances(self, inputs: dict, - history_states: list): - """This method is to generate the utterances with user, sys, dialog_acts and metadata, - while metadata is from the history_states or the output from the inference pipline""" - - utterances = [] - user_inputs = [] - sys_gen_inputs = [] - dialog_acts_inputs = [] - for i, item in enumerate(inputs): - name, turn = item.split('-') - if name == USER_NAME: - user_inputs.insert(int(turn) - 1, inputs[item]) - elif name == SYSTEM_NAME: - sys_gen_inputs.insert(int(turn) - 1, inputs[item]) - else: - dialog_acts_inputs.insert(int(turn) - 1, inputs[item]) - - # user is leading the topic should aways larger than sys and dialog acts - assert len(user_inputs) - 1 == len(sys_gen_inputs) - assert len(user_inputs) - 1 == len(dialog_acts_inputs) - # the history states record both user and sys states - assert len(history_states) == len(user_inputs) + len(sys_gen_inputs) - - # the dialog_act at user turn is useless - for i, item in enumerate(history_states): - utterance = {} - # the dialog_act at user turn is useless - utterance['dialog_act'] = dialog_acts_inputs[ - i // 2] if i % 2 == 1 else {} - utterance['text'] = sys_gen_inputs[ - i // 2] if i % 2 == 1 else user_inputs[i // 2] - utterance['metadata'] = item - utterance['span_info'] = [] - utterances.append(utterance) - - return utterances - - def _load_acts(self, inputs: dict, dialog_id='example.json'): - dialog_acts_inputs = [] - for i, item in enumerate(inputs): - name, turn = item.split('-') - if name == DIALOG_ACT: - dialog_acts_inputs.insert(int(turn) - 1, inputs[item]) - s_dict = {} - - for j, item in enumerate(dialog_acts_inputs): - if isinstance(item, dict): - for a in item: - aa = a.lower().split('-') - if aa[1] == 'inform' or aa[1] == 'recommend' or \ - aa[1] == 'select' or aa[1] == 'book': - for i in item[a]: - s = i[0].lower() - v = i[1].lower().strip() - if s == 'none' or v == '?' or v == 'none': - continue - slot = aa[0] + '-' + s - if slot in self.ACTS_DICT: - slot = self.ACTS_DICT[slot] - key = dialog_id, str(int(j) + 1), slot - # In case of multiple mentioned values... - # ... Option 1: Keep first informed value - if key not in s_dict: - s_dict[key] = list([v]) - # ... Option 2: Keep last informed value - # s_dict[key] = list([v]) - - return s_dict - - -class multiwoz22Processor(DSTProcessor): - - def __init__(self): - super().__init__() - - def normalize_time(self, text): - text = re.sub(r'(\d{1})(a\.?m\.?|p\.?m\.?)', r'\1 \2', - text) # am/pm without space - text = re.sub(r'(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)', r'\1\2:00 \3', - text) # am/pm short to long form - text = re.sub( - r'(^| )(at|from|by|until|after) ?(\d{1,2}) ?(\d{2})([^0-9]|$)', - r'\1\2 \3:\4\5', text) # Missing separator - text = re.sub(r'(^| )(\d{2})[;.,](\d{2})', r'\1\2:\3', - text) # Wrong separator - text = re.sub(r'(^| )(at|from|by|until|after) ?(\d{1,2})([;., ]|$)', - r'\1\2 \3:00\4', text) # normalize simple full hour time - text = re.sub(r'(^| )(\d{1}:\d{2})', r'\g<1>0\2', - text) # Add missing leading 0 - # Map 12 hour times to 24 hour times - text = \ - re.sub( - r'(\d{2})(:\d{2}) ?p\.?m\.?', - lambda x: str(int(x.groups()[0]) + 12 - if int(x.groups()[0]) < 12 else int(x.groups()[0])) + x.groups()[1], text) - text = re.sub(r'(^| )24:(\d{2})', r'\g<1>00:\2', - text) # Correct times that use 24 as hour - return text - - def normalize_text(self, text): - text = self.normalize_time(text) - text = re.sub("n't", ' not', text) - text = re.sub('(^| )zero(-| )star([s.,? 
]|$)', r'\g<1>0 star\3', text) - text = re.sub('(^| )one(-| )star([s.,? ]|$)', r'\g<1>1 star\3', text) - text = re.sub('(^| )two(-| )star([s.,? ]|$)', r'\g<1>2 star\3', text) - text = re.sub('(^| )three(-| )star([s.,? ]|$)', r'\g<1>3 star\3', text) - text = re.sub('(^| )four(-| )star([s.,? ]|$)', r'\g<1>4 star\3', text) - text = re.sub('(^| )five(-| )star([s.,? ]|$)', r'\g<1>5 star\3', text) - text = re.sub('archaelogy', 'archaeology', text) # Systematic typo - text = re.sub('guesthouse', 'guest house', text) # Normalization - text = re.sub('(^| )b ?& ?b([.,? ]|$)', r'\1bed and breakfast\2', - text) # Normalization - text = re.sub('bed & breakfast', 'bed and breakfast', - text) # Normalization - return text - - # Loads the dialogue_acts.json and returns a list - # of slot-value pairs. - def load_acts(self, input_file): - with open(input_file) as f: - acts = json.load(f) - s_dict = {} - for d in acts: - for t in acts[d]: - if int(t) % 2 == 0: - continue - # Only process, if turn has annotation - if isinstance(acts[d][t]['dialog_act'], dict): - for a in acts[d][t]['dialog_act']: - aa = a.lower().split('-') - if aa[1] == 'inform' or aa[1] == 'recommend' \ - or aa[1] == 'select' or aa[1] == 'book': - for i in acts[d][t]['dialog_act'][a]: - s = i[0].lower() - v = i[1].lower().strip() - if s == 'none' or v == '?' or v == 'none': - continue - slot = aa[0] + '-' + s - if slot in self.ACTS_DICT: - slot = self.ACTS_DICT[slot] - key = d, str(int(t) // 2 + 1), slot - # In case of multiple mentioned values... - # ... Option 1: Keep first informed value - if key not in s_dict: - s_dict[key] = list([v]) - # ... Option 2: Keep last informed value - # s_dict[key] = list([v]) - return s_dict - - # This should only contain label normalizations. All other mappings should - # be defined in LABEL_MAPS. 
- def normalize_label(self, slot, value_label): - # Normalization of empty slots - if value_label == '' or value_label == 'not mentioned': - return 'none' - - # Normalization of time slots - if 'leaveAt' in slot or 'arriveBy' in slot or slot == 'restaurant-book_time': - return self.normalize_time(value_label) - - # Normalization - if 'type' in slot or 'name' in slot or 'destination' in slot or 'departure' in slot: - value_label = re.sub('guesthouse', 'guest house', value_label) - - # Map to boolean slots - if slot == 'hotel-parking' or slot == 'hotel-internet': - if value_label == 'yes' or value_label == 'free': - return 'true' - if value_label == 'no': - return 'false' - if slot == 'hotel-type': - if value_label == 'hotel': - return 'true' - if value_label == 'guest house': - return 'false' - - return value_label - - def tokenize(self, utt): - utt_lower = convert_to_unicode(utt).lower() - utt_lower = self.normalize_text(utt_lower) - utt_tok = [ - tok for tok in map(str.strip, re.split(r'(\W+)', utt_lower)) - if len(tok) > 0 - ] - return utt_tok - - def delex_utt(self, utt, values, unk_token='[UNK]'): - utt_norm = self.tokenize(utt) - for s, vals in values.items(): - for v in vals: - if v != 'none': - v_norm = self.tokenize(v) - v_len = len(v_norm) - for i in range(len(utt_norm) + 1 - v_len): - if utt_norm[i:i + v_len] == v_norm: - utt_norm[i:i + v_len] = [unk_token] * v_len - return utt_norm - - def get_token_pos(self, tok_list, value_label): - find_pos = [] - found = False - label_list = [ - item for item in map(str.strip, re.split(r'(\W+)', value_label)) - if len(item) > 0 - ] - len_label = len(label_list) - for i in range(len(tok_list) + 1 - len_label): - if tok_list[i:i + len_label] == label_list: - find_pos.append((i, i + len_label)) # start, exclusive_end - found = True - return found, find_pos - - def check_label_existence(self, value_label, usr_utt_tok): - in_usr, usr_pos = self.get_token_pos(usr_utt_tok, value_label) - # If no hit even though there should be one, check for value label variants - if not in_usr and value_label in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[value_label]: - in_usr, usr_pos = self.get_token_pos(usr_utt_tok, - value_label_variant) - if in_usr: - break - return in_usr, usr_pos - - def check_slot_referral(self, value_label, slot, seen_slots): - referred_slot = 'none' - if slot == 'hotel-stars' or slot == 'hotel-internet' or slot == 'hotel-parking': - return referred_slot - for s in seen_slots: - # Avoid matches for slots that share values with different meaning. - # hotel-internet and -parking are handled separately as Boolean slots. 
- if s == 'hotel-stars' or s == 'hotel-internet' or s == 'hotel-parking': - continue - if re.match('(hotel|restaurant)-book_people', - s) and slot == 'hotel-book_stay': - continue - if re.match('(hotel|restaurant)-book_people', - slot) and s == 'hotel-book_stay': - continue - if slot != s and (slot not in seen_slots - or seen_slots[slot] != value_label): - if seen_slots[s] == value_label: - referred_slot = s - break - elif value_label in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[value_label]: - if seen_slots[s] == value_label_variant: - referred_slot = s - break - return referred_slot - - def is_in_list(self, tok, value): - found = False - tok_list = [ - item for item in map(str.strip, re.split(r'(\W+)', tok)) - if len(item) > 0 - ] - value_list = [ - item for item in map(str.strip, re.split(r'(\W+)', value)) - if len(item) > 0 - ] - tok_len = len(tok_list) - value_len = len(value_list) - for i in range(tok_len + 1 - value_len): - if tok_list[i:i + value_len] == value_list: - found = True - break - return found - - # Fuzzy matching to label informed slot values - def check_slot_inform(self, value_label, inform_label): - result = False - informed_value = 'none' - vl = ' '.join(self.tokenize(value_label)) - for il in inform_label: - if vl == il: - result = True - elif self.is_in_list(il, vl): - result = True - elif self.is_in_list(vl, il): - result = True - elif il in self.LABEL_MAPS: - for il_variant in self.LABEL_MAPS[il]: - if vl == il_variant: - result = True - break - elif self.is_in_list(il_variant, vl): - result = True - break - elif self.is_in_list(vl, il_variant): - result = True - break - elif vl in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[vl]: - if value_label_variant == il: - result = True - break - elif self.is_in_list(il, value_label_variant): - result = True - break - elif self.is_in_list(value_label_variant, il): - result = True - break - if result: - informed_value = il - break - return result, informed_value - - def get_turn_label(self, value_label, inform_label, sys_utt_tok, - usr_utt_tok, slot, seen_slots, slot_last_occurrence): - usr_utt_tok_label = [0 for _ in usr_utt_tok] - informed_value = 'none' - referred_slot = 'none' - if value_label == 'none' or value_label == 'dontcare' or value_label == 'true' or value_label == 'false': - class_type = value_label - else: - in_usr, usr_pos = self.check_label_existence( - value_label, usr_utt_tok) - is_informed, informed_value = self.check_slot_inform( - value_label, inform_label) - if in_usr: - class_type = 'copy_value' - if slot_last_occurrence: - (s, e) = usr_pos[-1] - for i in range(s, e): - usr_utt_tok_label[i] = 1 - else: - for (s, e) in usr_pos: - for i in range(s, e): - usr_utt_tok_label[i] = 1 - elif is_informed: - class_type = 'inform' - else: - referred_slot = self.check_slot_referral( - value_label, slot, seen_slots) - if referred_slot != 'none': - class_type = 'refer' - else: - class_type = 'unpointable' - return informed_value, referred_slot, usr_utt_tok_label, class_type - - def _create_example(self, - utterances, - sys_inform_dict, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False, - dialog_id='example.json'): - - # Collects all slot changes throughout the dialog - cumulative_labels = {slot: 'none' for slot in slot_list} - - # First system utterance is empty, since multiwoz starts with user input - 
utt_tok_list = [[]] - mod_slots_list = [] - - # Collect all utterances and their metadata - usr_sys_switch = True - turn_itr = 0 - - for utt in utterances: - # Assert that system and user utterances alternate - is_sys_utt = utt['metadata'] != {} - if usr_sys_switch == is_sys_utt: - print( - 'WARN: Wrong order of system and user utterances. Skipping rest of the dialog %s' - % (dialog_id)) - break - usr_sys_switch = is_sys_utt - - if is_sys_utt: - turn_itr += 1 - - # Delexicalize sys utterance - if delexicalize_sys_utts and is_sys_utt: - inform_dict = {slot: 'none' for slot in slot_list} - for slot in slot_list: - if (str(dialog_id), str(turn_itr), - slot) in sys_inform_dict: - inform_dict[slot] = sys_inform_dict[(str(dialog_id), - str(turn_itr), - slot)] - utt_tok_list.append( - self.delex_utt(utt['text'], inform_dict, - unk_token)) # normalize utterances - else: - utt_tok_list.append(self.tokenize( - utt['text'])) # normalize utterances - - modified_slots = {} - - # If sys utt, extract metadata (identify and collect modified slots) - if is_sys_utt: - for d in utt['metadata']: - booked = utt['metadata'][d]['book']['booked'] - booked_slots = {} - # Check the booked section - if booked != []: - for s in booked[0]: - booked_slots[s] = self.normalize_label( - '%s-%s' % (d, s), - booked[0][s]) # normalize labels - # Check the semi and the inform slots - for category in ['book', 'semi']: - for s in utt['metadata'][d][category]: - cs = '%s-book_%s' % ( - d, s) if category == 'book' else '%s-%s' % (d, - s) - value_label = self.normalize_label( - cs, utt['metadata'][d][category] - [s]) # normalize labels - # Prefer the slot value as stored in the booked section - if s in booked_slots: - value_label = booked_slots[s] - # Remember modified slots and entire dialog state - if cs in slot_list and cumulative_labels[ - cs] != value_label: - modified_slots[cs] = value_label - cumulative_labels[cs] = value_label - - mod_slots_list.append(modified_slots.copy()) - - # Form proper (usr, sys) turns - turn_itr = 0 - diag_seen_slots_dict = {} - diag_seen_slots_value_dict = {slot: 'none' for slot in slot_list} - diag_state = {slot: 'none' for slot in slot_list} - sys_utt_tok = [] - usr_utt_tok = [] - hst_utt_tok = [] - hst_utt_tok_label_dict = {slot: [] for slot in slot_list} - new_hst_utt_tok_label_dict = hst_utt_tok_label_dict.copy() - new_diag_state = diag_state.copy() - - for i in range(0, len(utt_tok_list) - 1, 2): - sys_utt_tok_label_dict = {} - usr_utt_tok_label_dict = {} - value_dict = {} - inform_dict = {} - inform_slot_dict = {} - referral_dict = {} - class_type_dict = {} - - # Collect turn data - if append_history: - if swap_utterances: - hst_utt_tok = usr_utt_tok + sys_utt_tok + hst_utt_tok - else: - hst_utt_tok = sys_utt_tok + usr_utt_tok + hst_utt_tok - sys_utt_tok = utt_tok_list[i] - usr_utt_tok = utt_tok_list[i + 1] - turn_slots = mod_slots_list[ - i + 1] if len(mod_slots_list) > 1 else {} - - guid = '%s-%s-%s' % (set_type, str(dialog_id), str(turn_itr)) - - if analyze: - print('%15s %2s %s ||| %s' % - (dialog_id, turn_itr, ' '.join(sys_utt_tok), - ' '.join(usr_utt_tok))) - print('%15s %2s [' % (dialog_id, turn_itr), end='') - - new_hst_utt_tok_label_dict = hst_utt_tok_label_dict.copy() - new_diag_state = diag_state.copy() - for slot in slot_list: - value_label = 'none' - if slot in turn_slots: - value_label = turn_slots[slot] - # We keep the original labels so as to not - # overlook unpointable values, as well as to not - # modify any of the original labels for test sets, - # since this would make 
comparison difficult. - value_dict[slot] = value_label - elif label_value_repetitions and slot in diag_seen_slots_dict: - value_label = diag_seen_slots_value_dict[slot] - - # Get dialog act annotations - inform_label = list(['none']) - inform_slot_dict[slot] = 0 - if (str(dialog_id), str(turn_itr), slot) in sys_inform_dict: - inform_label = list([ - self.normalize_label(slot, i) - for i in sys_inform_dict[(str(dialog_id), - str(turn_itr), slot)] - ]) - inform_slot_dict[slot] = 1 - elif (str(dialog_id), str(turn_itr), - 'booking-' + slot.split('-')[1]) in sys_inform_dict: - inform_label = list([ - self.normalize_label(slot, i) - for i in sys_inform_dict[(str(dialog_id), - str(turn_itr), 'booking-' - + slot.split('-')[1])] - ]) - inform_slot_dict[slot] = 1 - - (informed_value, referred_slot, usr_utt_tok_label, - class_type) = self.get_turn_label( - value_label, - inform_label, - sys_utt_tok, - usr_utt_tok, - slot, - diag_seen_slots_value_dict, - slot_last_occurrence=True) - - inform_dict[slot] = informed_value - - # Generally don't use span prediction on sys utterance (but inform prediction instead). - sys_utt_tok_label = [0 for _ in sys_utt_tok] - - # Determine what to do with value repetitions. - # If value is unique in seen slots, then tag it, otherwise not, - # since correct slot assignment can not be guaranteed anymore. - if label_value_repetitions and slot in diag_seen_slots_dict: - if class_type == 'copy_value' and list( - diag_seen_slots_value_dict.values()).count( - value_label) > 1: - class_type = 'none' - usr_utt_tok_label = [0 for _ in usr_utt_tok_label] - - sys_utt_tok_label_dict[slot] = sys_utt_tok_label - usr_utt_tok_label_dict[slot] = usr_utt_tok_label - - if append_history: - if use_history_labels: - if swap_utterances: - new_hst_utt_tok_label_dict[ - slot] = usr_utt_tok_label + sys_utt_tok_label + new_hst_utt_tok_label_dict[ - slot] - else: - new_hst_utt_tok_label_dict[ - slot] = sys_utt_tok_label + usr_utt_tok_label + new_hst_utt_tok_label_dict[ - slot] - else: - new_hst_utt_tok_label_dict[slot] = [ - 0 for _ in sys_utt_tok_label + usr_utt_tok_label - + new_hst_utt_tok_label_dict[slot] - ] - - # For now, we map all occurences of unpointable slot values - # to none. However, since the labels will still suggest - # a presence of unpointable slot values, the task of the - # DST is still to find those values. It is just not - # possible to do that via span prediction on the current input. - if class_type == 'unpointable': - class_type_dict[slot] = 'none' - referral_dict[slot] = 'none' - if analyze: - if slot not in diag_seen_slots_dict or value_label != diag_seen_slots_value_dict[ - slot]: - print('(%s): %s, ' % (slot, value_label), end='') - elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[slot] \ - and class_type != 'copy_value' and class_type != 'inform': - # If slot has seen before and its class type did not change, label this slot a not present, - # assuming that the slot has not actually been mentioned in this turn. - # Exceptions are copy_value and inform. If a seen slot has been tagged as copy_value or inform, - # this must mean there is evidence in the original labels, therefore consider - # them as mentioned again. - class_type_dict[slot] = 'none' - referral_dict[slot] = 'none' - else: - class_type_dict[slot] = class_type - referral_dict[slot] = referred_slot - # Remember that this slot was mentioned during this dialog already. 
- if class_type != 'none': - diag_seen_slots_dict[slot] = class_type - diag_seen_slots_value_dict[slot] = value_label - new_diag_state[slot] = class_type - # Unpointable is not a valid class, therefore replace with - # some valid class for now... - if class_type == 'unpointable': - new_diag_state[slot] = 'copy_value' - - if analyze: - print(']') - - if swap_utterances: - txt_a = usr_utt_tok - txt_b = sys_utt_tok - txt_a_lbl = usr_utt_tok_label_dict - txt_b_lbl = sys_utt_tok_label_dict - else: - txt_a = sys_utt_tok - txt_b = usr_utt_tok - txt_a_lbl = sys_utt_tok_label_dict - txt_b_lbl = usr_utt_tok_label_dict - - example = DSTExample( - guid=guid, - text_a=txt_a, - text_b=txt_b, - history=hst_utt_tok, - text_a_label=txt_a_lbl, - text_b_label=txt_b_lbl, - history_label=hst_utt_tok_label_dict, - values=diag_seen_slots_value_dict.copy(), - inform_label=inform_dict, - inform_slot_label=inform_slot_dict, - refer_label=referral_dict, - diag_state=diag_state, - class_label=class_type_dict) - # Update some variables. - hst_utt_tok_label_dict = new_hst_utt_tok_label_dict.copy() - diag_state = new_diag_state.copy() - - turn_itr += 1 - return example - - def create_example(self, - inputs, - history_states, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False, - dialog_id='0'): - utterances = self._convert_inputs_to_utterances(inputs, history_states) - sys_inform_dict = self._load_acts(inputs) - self.LABEL_MAPS = label_maps - example = self._create_example(utterances, sys_inform_dict, set_type, - slot_list, label_maps, append_history, - use_history_labels, swap_utterances, - label_value_repetitions, - delexicalize_sys_utts, unk_token, - analyze) - - return example - - def create_examples(self, - input_file, - acts_file, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False): - """Read a DST json file into a list of DSTExample.""" - - sys_inform_dict = self.load_acts(acts_file) - - with open(input_file, 'r', encoding='utf-8') as reader: - input_data = json.load(reader) - - self.LABEL_MAPS = label_maps - - examples = [] - for dialog_id in tqdm(input_data): - entry = input_data[dialog_id] - utterances = entry['log'] - - example = self._create_example( - utterances, sys_inform_dict, set_type, slot_list, label_maps, - append_history, use_history_labels, swap_utterances, - label_value_repetitions, delexicalize_sys_utts, unk_token, - analyze) - examples.append(example) - - return examples - - -class DSTExample(object): - """ - A single training/test example for the DST dataset. 
- """ - - def __init__(self, - guid, - text_a, - text_b, - history, - text_a_label=None, - text_b_label=None, - history_label=None, - values=None, - inform_label=None, - inform_slot_label=None, - refer_label=None, - diag_state=None, - class_label=None): - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.history = history - self.text_a_label = text_a_label - self.text_b_label = text_b_label - self.history_label = history_label - self.values = values - self.inform_label = inform_label - self.inform_slot_label = inform_slot_label - self.refer_label = refer_label - self.diag_state = diag_state - self.class_label = class_label - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = '' - s += 'guid: %s' % (self.guid) - s += ', text_a: %s' % (self.text_a) - s += ', text_b: %s' % (self.text_b) - s += ', history: %s' % (self.history) - if self.text_a_label: - s += ', text_a_label: %d' % (self.text_a_label) - if self.text_b_label: - s += ', text_b_label: %d' % (self.text_b_label) - if self.history_label: - s += ', history_label: %d' % (self.history_label) - if self.values: - s += ', values: %d' % (self.values) - if self.inform_label: - s += ', inform_label: %d' % (self.inform_label) - if self.inform_slot_label: - s += ', inform_slot_label: %d' % (self.inform_slot_label) - if self.refer_label: - s += ', refer_label: %d' % (self.refer_label) - if self.diag_state: - s += ', diag_state: %d' % (self.diag_state) - if self.class_label: - s += ', class_label: %d' % (self.class_label) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - input_ids, - input_ids_unmasked, - input_mask, - segment_ids, - start_pos=None, - end_pos=None, - values=None, - inform=None, - inform_slot=None, - refer_id=None, - diag_state=None, - class_label_id=None, - guid='NONE'): - self.guid = guid - self.input_ids = input_ids - self.input_ids_unmasked = input_ids_unmasked - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_pos = start_pos - self.end_pos = end_pos - self.values = values - self.inform = inform - self.inform_slot = inform_slot - self.refer_id = refer_id - self.diag_state = diag_state - self.class_label_id = class_label_id - - -def convert_examples_to_features(examples, - slot_list, - class_types, - model_type, - tokenizer, - max_seq_length, - slot_value_dropout=0.0): - """Loads a data file into a list of `InputBatch`s.""" - - if model_type == 'bert': - model_specs = { - 'MODEL_TYPE': 'bert', - 'CLS_TOKEN': '[CLS]', - 'UNK_TOKEN': '[UNK]', - 'SEP_TOKEN': '[SEP]', - 'TOKEN_CORRECTION': 4 - } - else: - logger.error('Unknown model type (%s). Aborting.' 
% (model_type)) - exit(1) - - def _tokenize_text_and_label(text, text_label_dict, slot, tokenizer, - model_specs, slot_value_dropout): - joint_text_label = [0 for _ in text_label_dict[slot] - ] # joint all slots' label - for slot_text_label in text_label_dict.values(): - for idx, label in enumerate(slot_text_label): - if label == 1: - joint_text_label[idx] = 1 - - text_label = text_label_dict[slot] - tokens = [] - tokens_unmasked = [] - token_labels = [] - for token, token_label, joint_label in zip(text, text_label, - joint_text_label): - token = convert_to_unicode(token) - sub_tokens = tokenizer.tokenize(token) # Most time intensive step - tokens_unmasked.extend(sub_tokens) - if slot_value_dropout == 0.0 or joint_label == 0: - tokens.extend(sub_tokens) - else: - rn_list = np.random.random_sample((len(sub_tokens), )) - for rn, sub_token in zip(rn_list, sub_tokens): - if rn > slot_value_dropout: - tokens.append(sub_token) - else: - tokens.append(model_specs['UNK_TOKEN']) - token_labels.extend([token_label for _ in sub_tokens]) - assert len(tokens) == len(token_labels) - assert len(tokens_unmasked) == len(token_labels) - return tokens, tokens_unmasked, token_labels - - def _truncate_seq_pair(tokens_a, tokens_b, history, max_length): - """Truncates a sequence pair in place to the maximum length. - Copied from bert/run_classifier.py - """ - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) + len(history) - if total_length <= max_length: - break - if len(history) > 0: - history.pop() - elif len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - def _truncate_length_and_warn(tokens_a, tokens_b, history, max_seq_length, - model_specs, guid): - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP], [SEP] with "- 4" (BERT) - if len(tokens_a) + len(tokens_b) + len( - history) > max_seq_length - model_specs['TOKEN_CORRECTION']: - logger.info('Truncate Example %s. Total len=%d.' % - (guid, len(tokens_a) + len(tokens_b) + len(history))) - input_text_too_long = True - else: - input_text_too_long = False - _truncate_seq_pair(tokens_a, tokens_b, history, - max_seq_length - model_specs['TOKEN_CORRECTION']) - return input_text_too_long - - def _get_token_label_ids(token_labels_a, token_labels_b, - token_labels_history, max_seq_length, - model_specs): - token_label_ids = [] - token_label_ids.append(0) # [CLS] - for token_label in token_labels_a: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - for token_label in token_labels_b: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - for token_label in token_labels_history: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - while len(token_label_ids) < max_seq_length: - token_label_ids.append(0) # padding - assert len(token_label_ids) == max_seq_length - return token_label_ids - - def _get_start_end_pos(class_type, token_label_ids, max_seq_length): - if class_type == 'copy_value' and 1 not in token_label_ids: - # logger.warn("copy_value label, but token_label not detected. 
Setting label to 'none'.") - class_type = 'none' - start_pos = 0 - end_pos = 0 - if 1 in token_label_ids: - start_pos = token_label_ids.index(1) - # Parsing is supposed to find only first location of wanted value - if 0 not in token_label_ids[start_pos:]: - end_pos = len(token_label_ids[start_pos:]) + start_pos - 1 - else: - end_pos = token_label_ids[start_pos:].index(0) + start_pos - 1 - for i in range(max_seq_length): - if i >= start_pos and i <= end_pos: - assert token_label_ids[i] == 1 - return class_type, start_pos, end_pos - - def _get_transformer_input(tokens_a, tokens_b, history, max_seq_length, - tokenizer, model_specs): - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - segment_ids = [] - tokens.append(model_specs['CLS_TOKEN']) - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(0) - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(1) - for token in history: - tokens.append(token) - segment_ids.append(1) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(1) - input_ids = tokenizer.convert_tokens_to_ids(tokens) - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - # Zero-pad up to the sequence length. 
- while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - return tokens, input_ids, input_mask, segment_ids - - total_cnt = 0 - too_long_cnt = 0 - - refer_list = ['none'] + slot_list - - features = [] - # Convert single example - for (example_index, example) in enumerate(examples): - if example_index % 1000 == 0: - logger.info('Writing example %d of %d' % - (example_index, len(examples))) - - total_cnt += 1 - - value_dict = {} - inform_dict = {} - inform_slot_dict = {} - refer_id_dict = {} - diag_state_dict = {} - class_label_id_dict = {} - start_pos_dict = {} - end_pos_dict = {} - for slot in slot_list: - tokens_a, tokens_a_unmasked, token_labels_a = _tokenize_text_and_label( - example.text_a, example.text_a_label, slot, tokenizer, - model_specs, slot_value_dropout) - tokens_b, tokens_b_unmasked, token_labels_b = _tokenize_text_and_label( - example.text_b, example.text_b_label, slot, tokenizer, - model_specs, slot_value_dropout) - tokens_history, tokens_history_unmasked, token_labels_history = _tokenize_text_and_label( - example.history, example.history_label, slot, tokenizer, - model_specs, slot_value_dropout) - - input_text_too_long = _truncate_length_and_warn( - tokens_a, tokens_b, tokens_history, max_seq_length, - model_specs, example.guid) - - if input_text_too_long: - if example_index < 10: - if len(token_labels_a) > len(tokens_a): - logger.info(' tokens_a truncated labels: %s' - % str(token_labels_a[len(tokens_a):])) - if len(token_labels_b) > len(tokens_b): - logger.info(' tokens_b truncated labels: %s' - % str(token_labels_b[len(tokens_b):])) - if len(token_labels_history) > len(tokens_history): - logger.info( - ' tokens_history truncated labels: %s' - % str(token_labels_history[len(tokens_history):])) - - token_labels_a = token_labels_a[:len(tokens_a)] - token_labels_b = token_labels_b[:len(tokens_b)] - token_labels_history = token_labels_history[:len(tokens_history - )] - tokens_a_unmasked = tokens_a_unmasked[:len(tokens_a)] - tokens_b_unmasked = tokens_b_unmasked[:len(tokens_b)] - tokens_history_unmasked = tokens_history_unmasked[:len( - tokens_history)] - - assert len(token_labels_a) == len(tokens_a) - assert len(token_labels_b) == len(tokens_b) - assert len(token_labels_history) == len(tokens_history) - assert len(token_labels_a) == len(tokens_a_unmasked) - assert len(token_labels_b) == len(tokens_b_unmasked) - assert len(token_labels_history) == len(tokens_history_unmasked) - token_label_ids = _get_token_label_ids(token_labels_a, - token_labels_b, - token_labels_history, - max_seq_length, model_specs) - - value_dict[slot] = example.values[slot] - inform_dict[slot] = example.inform_label[slot] - - class_label_mod, start_pos_dict[slot], end_pos_dict[ - slot] = _get_start_end_pos(example.class_label[slot], - token_label_ids, max_seq_length) - if class_label_mod != example.class_label[slot]: - example.class_label[slot] = class_label_mod - inform_slot_dict[slot] = example.inform_slot_label[slot] - refer_id_dict[slot] = refer_list.index(example.refer_label[slot]) - diag_state_dict[slot] = class_types.index(example.diag_state[slot]) - class_label_id_dict[slot] = class_types.index( - example.class_label[slot]) - - if input_text_too_long: - too_long_cnt += 1 - - tokens, input_ids, input_mask, segment_ids = _get_transformer_input( - tokens_a, tokens_b, tokens_history, max_seq_length, tokenizer, - 
model_specs) - if slot_value_dropout > 0.0: - _, input_ids_unmasked, _, _ = _get_transformer_input( - tokens_a_unmasked, tokens_b_unmasked, tokens_history_unmasked, - max_seq_length, tokenizer, model_specs) - else: - input_ids_unmasked = input_ids - - assert (len(input_ids) == len(input_ids_unmasked)) - - if example_index < 10: - logger.info('*** Example ***') - logger.info('guid: %s' % (example.guid)) - logger.info('tokens: %s' % ' '.join(tokens)) - logger.info('input_ids: %s' % ' '.join([str(x) - for x in input_ids])) - logger.info('input_mask: %s' - % ' '.join([str(x) for x in input_mask])) - logger.info('segment_ids: %s' - % ' '.join([str(x) for x in segment_ids])) - logger.info('start_pos: %s' % str(start_pos_dict)) - logger.info('end_pos: %s' % str(end_pos_dict)) - logger.info('values: %s' % str(value_dict)) - logger.info('inform: %s' % str(inform_dict)) - logger.info('inform_slot: %s' % str(inform_slot_dict)) - logger.info('refer_id: %s' % str(refer_id_dict)) - logger.info('diag_state: %s' % str(diag_state_dict)) - logger.info('class_label_id: %s' % str(class_label_id_dict)) - - features.append( - InputFeatures( - guid=example.guid, - input_ids=input_ids, - input_ids_unmasked=input_ids_unmasked, - input_mask=input_mask, - segment_ids=segment_ids, - start_pos=start_pos_dict, - end_pos=end_pos_dict, - values=value_dict, - inform=inform_dict, - inform_slot=inform_slot_dict, - refer_id=refer_id_dict, - diag_state=diag_state_dict, - class_label_id=class_label_id_dict)) - - logger.info('========== %d out of %d examples have text too long' % - (too_long_cnt, total_cnt)) - - return features - - -# From bert.tokenization (TF code) -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode('utf-8', 'ignore') - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode('utf-8', 'ignore') - elif isinstance(text, unicode): - return text - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - else: - raise ValueError('Not running on Python2 or Python 3?') - - -if __name__ == '__main__': - processor = multiwoz22Processor() - set_type = 'test' - slot_list = [ - 'taxi-leaveAt', 'taxi-destination', 'taxi-departure', 'taxi-arriveBy', - 'restaurant-book_people', 'restaurant-book_day', - 'restaurant-book_time', 'restaurant-food', 'restaurant-pricerange', - 'restaurant-name', 'restaurant-area', 'hotel-book_people', - 'hotel-book_day', 'hotel-book_stay', 'hotel-name', 'hotel-area', - 'hotel-parking', 'hotel-pricerange', 'hotel-stars', 'hotel-internet', - 'hotel-type', 'attraction-type', 'attraction-name', 'attraction-area', - 'train-book_people', 'train-leaveAt', 'train-destination', 'train-day', - 'train-arriveBy', 'train-departure' - ] - append_history = True - use_history_labels = True - swap_utterances = True - label_value_repetitions = True - delexicalize_sys_utts = True, - unk_token = '[UNK]' - analyze = False - example = processor.create_example(utter1, history_states1, set_type, - slot_list, {}, append_history, - use_history_labels, swap_utterances, - label_value_repetitions, - delexicalize_sys_utts, unk_token, - analyze) - print(f'utterances is {example}') diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index dbfe5ba7..d914489c 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ 
-12,7 +12,7 @@ if TYPE_CHECKING: MovieSceneSegmentationTrainer, ImageInpaintingTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer, TextRankingTrainer - from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer + from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer, NlpTrainerArguments from .trainer import EpochBasedTrainer else: @@ -27,7 +27,8 @@ else: ], 'multi_modal': ['CLIPTrainer'], 'nlp': ['SequenceClassificationTrainer', 'TextRankingTrainer'], - 'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'], + 'nlp_trainer': + ['NlpEpochBasedTrainer', 'VecoTrainer', 'NlpTrainerArguments'], 'trainer': ['EpochBasedTrainer'] } diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py index c8f0c7b0..a02478b9 100644 --- a/modelscope/trainers/default_config.py +++ b/modelscope/trainers/default_config.py @@ -22,7 +22,8 @@ def merge_cfg(cfg: Config): This function will pop the default CheckpointHook when the BestCkptSaverHook exists in the input cfg. - @param cfg: The input cfg to be merged into. + Args: + cfg: The input cfg to be merged into. """ cfg.merge_from_dict(DEFAULT_CONFIG, force=False) # pop duplicate hook diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py index 32fb0250..ed018fef 100644 --- a/modelscope/trainers/hooks/lr_scheduler_hook.py +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -47,7 +47,8 @@ class LrSchedulerHook(Hook): return lr def before_train_iter(self, trainer): - if not self.by_epoch and trainer.iter > 0: + if not self.by_epoch and trainer.iter >= getattr( + trainer, 'cumulative_iters', 1): if self.warmup_lr_scheduler is not None: self.warmup_lr_scheduler.step() else: diff --git a/modelscope/trainers/hooks/optimizer/base.py b/modelscope/trainers/hooks/optimizer/base.py index 8c61dfdb..0f38c67a 100644 --- a/modelscope/trainers/hooks/optimizer/base.py +++ b/modelscope/trainers/hooks/optimizer/base.py @@ -44,6 +44,7 @@ class OptimizerHook(Hook): def before_run(self, trainer): trainer.optimizer.zero_grad() + trainer.cumulative_iters = self.cumulative_iters def after_train_iter(self, trainer): for k in self.loss_keys: diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py index 7f1bcd63..22f2cfe6 100644 --- a/modelscope/trainers/nlp/__init__.py +++ b/modelscope/trainers/nlp/__init__.py @@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .sequence_classification_trainer import SequenceClassificationTrainer from .csanmt_translation_trainer import CsanmtTranslationTrainer - from .text_ranking_trainer import TextRankingTranier + from .text_ranking_trainer import TextRankingTrainer else: _import_structure = { 'sequence_classification_trainer': ['SequenceClassificationTrainer'], diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py index 2e59cd80..4baaddfe 100644 --- a/modelscope/trainers/nlp/space/dialog_intent_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py @@ -1,23 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates.
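# A minimal sketch (plain PyTorch; illustrative names, not the trainer's API) of
# the interaction targeted by the LrSchedulerHook/OptimizerHook changes above:
# with gradient accumulation, a warmup schedule should advance once per effective
# optimizer step rather than once per forward/backward sub-iteration.
import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Simple linear warmup over the first 10 effective steps.
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda step: min(1.0, (step + 1) / 10))

cumulative_iters = 4  # accumulate gradients over 4 sub-iterations
for it in range(40):
    loss = model(torch.randn(8, 4)).sum()
    (loss / cumulative_iters).backward()
    if (it + 1) % cumulative_iters == 0:
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()  # one schedule step per effective optimizer step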
import os -import time -from typing import Callable, Dict, Optional, Tuple, Union +from typing import Callable, Dict, Optional import numpy as np from modelscope.metainfo import Trainers from modelscope.models.nlp.space.model.generator import SpaceGenerator from modelscope.models.nlp.space.model.model_base import SpaceModelBase -from modelscope.preprocessors.space.data_loader import \ +from modelscope.preprocessors.nlp.space.data_loader import \ get_sequential_data_loader -from modelscope.preprocessors.space.fields.intent_field import \ +from modelscope.preprocessors.nlp.space.fields.intent_field import \ IntentBPETextField -from modelscope.preprocessors.space.preprocess import intent_preprocess +from modelscope.preprocessors.nlp.space.preprocess import intent_preprocess from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp.space.trainer.intent_trainer import IntentTrainer -from modelscope.utils.config import Config +from modelscope.utils.config import Config, ModelFile from modelscope.utils.logger import get_logger PATH = None @@ -34,14 +33,6 @@ class DialogIntentTrainer(BaseTrainer): **kwargs): super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name'])) - def to_tensor(array): - """ - numpy array -> tensor - """ - import torch - array = torch.tensor(array) - return array.cuda() if self.cfg.use_gpu else array - def setup_seed(seed): import random import torch @@ -59,56 +50,70 @@ class DialogIntentTrainer(BaseTrainer): # preprocess data intent_preprocess(self.cfg.Model.init_checkpoint, self.cfg) # set reader and evaluator - bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg) + self.bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg) - self.cfg.Model.num_token_embeddings = bpe.vocab_size - self.cfg.Model.num_turn_embeddings = bpe.max_ctx_turn + 1 + self.cfg.Model.num_token_embeddings = self.bpe.vocab_size + self.cfg.Model.num_turn_embeddings = self.bpe.max_ctx_turn + 1 dataset_paths = [ os.path.join(self.cfg.Dataset.data_dir, self.cfg.Dataset.trigger_data) ] # set data and data status - collate_fn = bpe.collate_fn_multi_turn + collate_fn = self.bpe.collate_fn_multi_turn self.train_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='train') self.valid_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='valid') self.test_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='test') # set generator - generator = SpaceGenerator.create(self.cfg, reader=bpe) + self.generator = SpaceGenerator.create(self.cfg, reader=self.bpe) + self._load_model(**kwargs) + + def _load_model(self, **kwargs): + + def to_tensor(array): + """ + numpy array -> tensor + """ + import torch + array = torch.tensor(array) + return array.cuda() if self.cfg.use_gpu else array + # construct model - self.model = SpaceModelBase.create( - self.cfg.Model.init_checkpoint, - self.cfg, - reader=bpe, - generator=generator) + if 'model' in kwargs: + self.model = kwargs['model'] + else: + self.model = SpaceModelBase.create( + kwargs['model_dir'], + self.cfg, + reader=self.bpe, + 
generator=self.generator) import torch - # multi-gpu if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1: self.model = torch.nn.DataParallel(self.model) # construct trainer self.trainer = IntentTrainer( - self.model, to_tensor, self.cfg, reader=bpe) + self.model, to_tensor, self.cfg, reader=self.bpe) num_batches = len(self.train_label_loader) self.trainer.set_optimizers(num_training_steps_per_epoch=num_batches) # load model, optimizer and lr_scheduler @@ -131,6 +136,16 @@ class DialogIntentTrainer(BaseTrainer): *args, **kwargs) -> Dict[str, float]: logger.info('Evaluate') + self.cfg.do_infer = True + + # get best checkpoint path + pos = checkpoint_path.rfind('/') + checkpoint_name = checkpoint_path[pos + 1:] + checkpoint_dir = checkpoint_path[:pos] + + assert checkpoint_name == ModelFile.TORCH_MODEL_BIN_FILE + kwargs['model_dir'] = checkpoint_dir + self._load_model(**kwargs) self.trainer.infer( data_iter=self.test_label_loader, ex_data_iter=self.train_label_loader) diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py index 726404d4..aa6bb69d 100644 --- a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py @@ -9,8 +9,7 @@ import numpy as np from modelscope.metainfo import Trainers from modelscope.models.nlp.space.model.generator import SpaceGenerator from modelscope.models.nlp.space.model.model_base import SpaceModelBase -from modelscope.preprocessors.space.fields.gen_field import \ - MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp.space.eval import MultiWOZEvaluator diff --git a/modelscope/trainers/nlp/space/trainer/gen_trainer.py b/modelscope/trainers/nlp/space/trainer/gen_trainer.py index 34cd2f9b..05efa138 100644 --- a/modelscope/trainers/nlp/space/trainer/gen_trainer.py +++ b/modelscope/trainers/nlp/space/trainer/gen_trainer.py @@ -1,9 +1,6 @@ -""" -Trainer class. -""" -import logging +# Copyright (c) Alibaba, Inc. and its affiliates. + import os -import sys import time from collections import OrderedDict @@ -61,7 +58,7 @@ class Trainer(object): self.evaluator = evaluator self.tokenizer = reader.tokenizer - self.logger = get_logger() + self.logger = logger or get_logger() self.batch_metrics_tracker = MetricsTracker() self.token_metrics_tracker = MetricsTracker() diff --git a/modelscope/trainers/nlp/space/trainer/intent_trainer.py b/modelscope/trainers/nlp/space/trainer/intent_trainer.py index 1e6f4a2d..dc6b317b 100644 --- a/modelscope/trainers/nlp/space/trainer/intent_trainer.py +++ b/modelscope/trainers/nlp/space/trainer/intent_trainer.py @@ -1,10 +1,6 @@ -""" -Trainer class. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. 
-import logging import os -import sys import time from collections import OrderedDict @@ -16,24 +12,8 @@ from transformers.optimization import AdamW, get_linear_schedule_with_warmup from modelscope.trainers.nlp.space.metrics.metrics_tracker import \ MetricsTracker - - -def get_logger(log_path, name='default'): - logger = logging.getLogger(name) - logger.propagate = False - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter('%(message)s') - - sh = logging.StreamHandler(sys.stdout) - sh.setFormatter(formatter) - logger.addHandler(sh) - - fh = logging.FileHandler(log_path, mode='w') - fh.setFormatter(formatter) - logger.addHandler(fh) - - return logger +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger class Trainer(object): @@ -76,11 +56,7 @@ class Trainer(object): self.lr_scheduler = lr_scheduler self.optimizer = optimizer - # if not os.path.exists(self.save_dir): - # os.makedirs(self.save_dir) - - # self.logger = logger or get_logger(os.path.join(self.save_dir, "trainer.log"), "trainer") - self.logger = logger or get_logger('trainer.log', 'trainer') + self.logger = logger or get_logger() self.batch_metrics_tracker_label = MetricsTracker() self.token_metrics_tracker_label = MetricsTracker() @@ -201,9 +177,12 @@ class Trainer(object): # Save current best model if is_best: - best_model_file = os.path.join(self.save_dir, 'best.model') + best_model_file = os.path.join(self.save_dir, + ModelFile.TORCH_MODEL_BIN_FILE) torch.save(self.model.state_dict(), best_model_file) - best_train_file = os.path.join(self.save_dir, 'best.train') + best_train_file = os.path.join( + self.save_dir, + '{}.train'.format(ModelFile.TORCH_MODEL_BIN_FILE)) torch.save(train_state, best_train_file) self.logger.info( f"Saved best model state to '{best_model_file}' with new best valid metric " @@ -215,7 +194,7 @@ class Trainer(object): def _load_model_state(): model_state_dict = torch.load( - f'{self.func_model.init_checkpoint}.model', + f'{self.func_model.init_checkpoint}', map_location=lambda storage, loc: storage) if 'module.' in list(model_state_dict.keys())[0]: @@ -303,8 +282,13 @@ class Trainer(object): self.logger.info('Loaded no model !!!') return - _load_model_state() - _load_train_state() + if self.do_train: + _load_model_state() + return + + if self.do_infer: + _load_model_state() + _load_train_state() class IntentTrainer(Trainer): @@ -719,104 +703,3 @@ class IntentTrainer(Trainer): assert 'loss' in metrics return metrics['loss'], metrics - - def load(self): - """ load """ - - def _load_model_state(): - model_state_dict = torch.load( - f'{self.func_model.init_checkpoint}', - map_location=lambda storage, loc: storage) - - if 'module.' in list(model_state_dict.keys())[0]: - new_model_state_dict = OrderedDict() - for k, v in model_state_dict.items(): - assert k[:7] == 'module.' 
- new_model_state_dict[k[7:]] = v - model_state_dict = new_model_state_dict - - new_model_state_dict = OrderedDict() - parameters = { - name: param - for name, param in self.func_model.named_parameters() - } - for name, param in model_state_dict.items(): - if name in parameters: - if param.shape != parameters[name].shape: - assert hasattr(param, 'numpy') - arr = param.numpy() - z = np.random.normal( - scale=self.func_model.initializer_range, - size=parameters[name].shape).astype('float32') - if name == 'embedder.token_embedding.weight': - z[-param.shape[0]:] = arr - print( - f'part of parameter({name}) random normlize initialize' - ) - else: - if z.shape[0] < param.shape[0]: - z = arr[:z.shape[0]] - print(f'part of parameter({name}) are dropped') - else: - z[:param.shape[0]] = arr - print( - f'part of parameter({name}) random normlize initialize' - ) - dtype, device = param.dtype, param.device - z = torch.tensor(z, dtype=dtype, device=device) - new_model_state_dict[name] = z - else: - new_model_state_dict[name] = param - else: - print(f'parameter({name}) are dropped') - model_state_dict = new_model_state_dict - - for name in parameters: - if name not in model_state_dict: - if parameters[name].requires_grad: - print(f'parameter({name}) random normlize initialize') - z = np.random.normal( - scale=self.func_model.initializer_range, - size=parameters[name].shape).astype('float32') - dtype, device = parameters[name].dtype, parameters[ - name].device - model_state_dict[name] = torch.tensor( - z, dtype=dtype, device=device) - else: - model_state_dict[name] = parameters[name] - - self.func_model.load_state_dict(model_state_dict) - self.logger.info( - f"Loaded model state from '{self.func_model.init_checkpoint}.model'" - ) - - def _load_train_state(): - train_file = f'{self.func_model.init_checkpoint}.train' - if os.path.exists(train_file): - train_state_dict = torch.load( - train_file, map_location=lambda storage, loc: storage) - self.epoch = train_state_dict['epoch'] - self.best_valid_metric = train_state_dict['best_valid_metric'] - if self.optimizer is not None and 'optimizer' in train_state_dict: - self.optimizer.load_state_dict( - train_state_dict['optimizer']) - if self.lr_scheduler is not None and 'lr_scheduler' in train_state_dict: - self.lr_scheduler.load_state_dict( - train_state_dict['lr_scheduler']) - self.logger.info( - f"Loaded train state from '{train_file}' with (epoch-{self.epoch} " - f'best_valid_metric={self.best_valid_metric:.3f})') - else: - self.logger.info('Loaded no train state') - - if self.func_model.init_checkpoint is None: - self.logger.info('Loaded no model !!!') - return - - if self.do_train: - _load_model_state() - return - - if self.do_infer: - _load_model_state() - _load_train_state() diff --git a/modelscope/trainers/nlp/text_ranking_trainer.py b/modelscope/trainers/nlp/text_ranking_trainer.py index 5da9c76a..610c36b5 100644 --- a/modelscope/trainers/nlp/text_ranking_trainer.py +++ b/modelscope/trainers/nlp/text_ranking_trainer.py @@ -12,9 +12,9 @@ from tqdm import tqdm from modelscope.metainfo import Trainers from modelscope.models.base import Model, TorchModel +from modelscope.models.nlp import BertForTextRanking from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.preprocessors.base import Preprocessor -from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer from modelscope.utils.constant import DEFAULT_MODEL_REVISION @@ -118,7 +118,6 @@ 
class TextRankingTrainer(NlpEpochBasedTrainer): Example: {"accuracy": 0.5091743119266054, "f1": 0.673780487804878} """ - from modelscope.models.nlp import TextRanking # get the raw online dataset self.eval_dataloader = self._build_dataloader_with_dataset( self.eval_dataset, @@ -127,7 +126,7 @@ class TextRankingTrainer(NlpEpochBasedTrainer): # generate a standard dataloader # generate a model if checkpoint_path is not None: - model = TextRanking.from_pretrained(checkpoint_path) + model = BertForTextRanking.from_pretrained(checkpoint_path) else: model = self.model @@ -156,13 +155,16 @@ class TextRankingTrainer(NlpEpochBasedTrainer): with torch.no_grad(): label_ids = batch.pop('labels').detach().cpu().numpy() qids = batch.pop('qid').detach().cpu().numpy() - outputs = model(batch) + outputs = model(**batch) infer_end_time = time.time() total_spent_time += infer_end_time - infer_start_time total_samples += self.eval_dataloader.batch_size - assert 'scores' in outputs - logits = outputs['scores'] + def sigmoid(logits): + return np.exp(logits) / (1 + np.exp(logits)) + + logits = outputs['logits'].squeeze(-1).detach().cpu().numpy() + logits = sigmoid(logits).tolist() label_list.extend(label_ids) logits_list.extend(logits) diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index b54aa666..a19e7c7b 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -1,7 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os -from typing import Callable, Optional, Tuple, Union +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -13,15 +15,416 @@ from modelscope.metainfo import Trainers from modelscope.metrics.builder import build_metric from modelscope.models.base import Model, TorchModel from modelscope.msdatasets import MsDataset -from modelscope.preprocessors import Preprocessor, build_preprocessor -from modelscope.utils.config import Config +from modelscope.preprocessors import Preprocessor +from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys, - ModelFile, Tasks) + ModelFile) from modelscope.utils.hub import parse_label_mapping from .base import TRAINERS from .trainer import EpochBasedTrainer +@dataclass +class NlpTrainerArguments: + """The arguments for the nlp trainer. + + All the arguments listed here have None default values, which means follow the default value in the input + cfg dict. + """ + + work_dir: Optional[str] = field( + default=None, metadata={'help': 'The work dir(key: train.work_dir)'}) + + task: Optional[str] = field( + default=None, metadata={'help': 'The task type(key: task)'}) + + preprocessor_type: Optional[str] = field( + default=None, + metadata={'help': 'The preprocessor type(key: preprocessor.type)'}) + + train_first_sequence: str = field( + default=None, + metadata={ + 'help': + 'The key of first sentence for the training dataset(key:preprocessor.train.' + 'first_sequence/dataset.train.first_sequence)' + }) + + train_second_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of second sentence for the training dataset(key:preprocessor.train.' + 'second_sequence/dataset.train.second_sequence)' + }) + + train_label: str = field( + default=None, + metadata={ + 'help': + 'The key of label for the training dataset(key:preprocessor.train.' 
+ 'label/dataset.train.label)' + }) + + eval_first_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of first sentence for the eval dataset(key:preprocessor.val.' + 'first_sequence/dataset.val.first_sequence), ' + 'if not provided, the trainer will use the train_first_sequence for evaluation' + }) + + eval_second_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of second sentence for the eval dataset(key:preprocessor.val.' + 'second_sequence/dataset.val.second_sequence),' + 'if not provided, the trainer will use the train_second_sequence for evaluation' + }) + + eval_label: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of label for the eval dataset(key:preprocessor.val.' + 'label/dataset.val.label),' + 'if not provided, the trainer will use the train_label for evaluation' + }) + + labels: Optional[List] = field( + default=None, + metadata={ + 'help': + 'The labels list of the dataset(key:dataset.train.labels),' + 'This parameter has the same effect as "label2id"' + }) + + max_epochs: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The max_epochs of the training loop(key: train.max_epochs)' + }) + + train_batch_size_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The train batch size per gpu(key: train.dataloader.batch_size_per_gpu)' + }) + + train_workers_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The number of workers per gpu(key: train.dataloader.workers_per_gpu)' + }) + + train_shuffle: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Shuffle the train dataset or not(key: train.dataloader.shuffle)' + }) + + eval_batch_size_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The eval batch size per gpu(key: evaluation.dataloader.batch_size_per_gpu)' + }) + + eval_workers_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The number of workers per gpu(key: evaluation.dataloader.workers_per_gpu)' + }) + + eval_shuffle: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Shuffle the eval dataset or not(key: evaluation.dataloader.shuffle)' + }) + + optimizer_args: Optional[Dict] = field( + default=None, + metadata={'help': 'The optimizer config dict(key: train.optimizer)'}) + + lr_scheduler_args: Optional[Dict] = field( + default=None, + metadata={ + 'help': 'The lr_scheduler config dict(key: train.lr_scheduler)' + }) + + checkpoint_saving_type: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The checkpoint saving type(key: The ckpt hook dict in train.hooks), ' + 'valid options: "BestCkptSaverHook", "CheckpointHook"' + }) + + checkpoint_by_epoch: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Saving checkpoint by epoch or not(key: The by_epoch key in ' + 'ckpt hook dict in train.hooks)' + }) + + checkpoint_interval: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The checkpoint saving interval(key: The interval key in ' + 'ckpt hook dict in train.hooks)' + }) + + metric_key: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The metric key for the BestCkptSaverHook(key: The metric_key key in ' + 'ckpt hook dict in train.hooks), if the checkpoint_saving_type is "CheckpointHook" or ' + '"None", the metric_key key has no effect' + }) + + evaluation_type: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The evaluation type(key: The 
evaluation hook dict in train.hooks), ' + 'valid options: "EvaluationHook", "None"' + }) + + evaluation_by_epoch: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Evaluating by epoch or not(key: The by_epoch key in ' + 'evaluation hook dict in train.hooks)' + }) + + evaluation_interval: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The evaluating interval(key: The interval key in ' + 'evaluation hook dict in train.hooks)' + }) + + metrics: Optional[List[str]] = field( + default=None, + metadata={'help': 'The metrics class keys(key: evaluation.metrics)'}) + + default_train_config = ConfigDict({ + 'work_dir': + '/tmp', + 'max_epochs': + 5, + 'dataloader': { + 'batch_size_per_gpu': 32, + 'workers_per_gpu': 0 + }, + 'optimizer': { + 'type': 'AdamW', + 'lr': 2e-5, + 'options': {} + }, + 'lr_scheduler': { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': 10000, + 'options': { + 'by_epoch': False + } + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'by_epoch': False, + 'interval': 100 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + }) + + def __call__(self, cfg): + """ + + Args: + cfg(`Config`): The cfg to be modified. + + Returns: + The cfg after modification. + """ + + if self.task is not None: + cfg.task = self.task + + if self.preprocessor_type is not None: + if not hasattr(cfg, 'preprocessor'): + cfg.preprocessor = ConfigDict() + cfg.preprocessor.type = self.preprocessor_type + + if self.train_first_sequence is not None or self.train_second_sequence \ + is not None or self.train_label is not None or self.labels is not None: + if not hasattr(cfg, 'dataset'): + cfg.dataset = ConfigDict() + if not hasattr(cfg.dataset, 'train'): + cfg.dataset.train = ConfigDict() + if self.train_first_sequence is not None: + cfg.dataset.train.first_sequence = self.train_first_sequence + if self.train_second_sequence is not None: + cfg.dataset.train.second_sequence = self.train_second_sequence + if self.train_label is not None: + cfg.dataset.train.label = self.train_label + if self.labels is not None: + cfg.dataset.train.labels = self.labels + + if self.eval_first_sequence is not None or self.eval_second_sequence \ + is not None or self.eval_label is not None: + if not hasattr(cfg, 'dataset'): + cfg.dataset = ConfigDict() + if not hasattr(cfg.dataset, 'val'): + cfg.dataset.val = ConfigDict() + if self.eval_first_sequence is not None: + cfg.dataset.val.first_sequence = self.eval_first_sequence + if self.eval_second_sequence is not None: + cfg.dataset.val.second_sequence = self.eval_second_sequence + if self.eval_label is not None: + cfg.dataset.val.label = self.eval_label + + if self.max_epochs is not None or self.train_batch_size_per_gpu is not None \ + or self.train_shuffle is not None or self.optimizer_args is not None \ + or self.work_dir is not None or self.lr_scheduler_args is not None\ + or self.train_workers_per_gpu is not None: + if not hasattr(cfg, 'train'): + cfg.train = deepcopy(self.default_train_config) + if not hasattr(cfg.train, 'dataloader'): + cfg.train.dataloader = deepcopy( + self.default_train_config.dataloader) + if not hasattr(cfg.train, 'optimizer'): + cfg.train.optimizer = deepcopy( + self.default_train_config.optimizer) + if not hasattr(cfg.train, 'lr_scheduler'): + cfg.train.lr_scheduler = deepcopy( + self.default_train_config.lr_scheduler) + if self.work_dir is not None: + cfg.train.work_dir = 
self.work_dir + if self.max_epochs is not None: + cfg.train.max_epochs = self.max_epochs + if self.train_batch_size_per_gpu is not None: + cfg.train.dataloader.batch_size_per_gpu = self.train_batch_size_per_gpu + if self.train_workers_per_gpu is not None: + cfg.train.dataloader.workers_per_gpu = self.train_workers_per_gpu + if self.train_shuffle is not None: + cfg.train.dataloader.shuffle = self.train_shuffle + if self.optimizer_args is not None: + if cfg.train.optimizer.type != self.optimizer_args.get( + 'type', cfg.train.optimizer.type): + cfg.train.optimizer = ConfigDict( + deepcopy(self.optimizer_args)) + else: + cfg.train.optimizer = Config._merge_a_into_b( + self.optimizer_args, cfg.train.optimizer, force=True) + if self.lr_scheduler_args is not None: + if cfg.train.lr_scheduler.type != self.lr_scheduler_args.get( + 'type', cfg.train.lr_scheduler.type): + cfg.train.lr_scheduler = ConfigDict( + deepcopy(self.lr_scheduler_args)) + else: + cfg.train.lr_scheduler = Config._merge_a_into_b( + self.lr_scheduler_args, + cfg.train.lr_scheduler, + force=True) + + if self.checkpoint_saving_type is not None or self.checkpoint_by_epoch is not None \ + or self.checkpoint_interval is not None or self.metric_key is not None: + if not any([ + self.checkpoint_saving_type == hook['type'] + for hook in cfg.train.hooks + ]): + cfg.train.hooks = list( + filter( + lambda hook: hook['type'] not in + ['CheckpointHook', 'BestCkptSaverHook'], + cfg.train.hooks)) + cfg.train.hooks.append( + deepcopy(self.default_train_config.hooks[0])) + cfg.train.hooks[-1].type = self.checkpoint_saving_type + checkpoint_hook = list( + filter( + lambda hook: hook[ + 'type'] in ['CheckpointHook', 'BestCkptSaverHook'], + cfg.train.hooks))[0] + if self.checkpoint_by_epoch is not None: + checkpoint_hook['by_epoch'] = self.checkpoint_by_epoch + if self.checkpoint_interval is not None: + checkpoint_hook['interval'] = self.checkpoint_interval + if checkpoint_hook['type'] == 'BestCkptSaverHook': + assert self.metric_key is not None, 'The metric_key must be provided ' \ + 'if the ckpt saving hook is "BestCkptSaverHook"' + checkpoint_hook['metric_key'] = self.metric_key + + if self.evaluation_type is not None or self.evaluation_by_epoch is not None \ + or self.evaluation_interval is not None or self.eval_batch_size_per_gpu is not None or \ + self.eval_shuffle is not None or self.metrics is not None: + if self.evaluation_type is not None and not any([ + self.evaluation_type == hook['type'] + for hook in cfg.train.hooks + ]): + cfg.train.hooks = list( + filter(lambda hook: hook['type'] not in ['EvaluationHook'], + cfg.train.hooks)) + if self.evaluation_type != 'None': + cfg.train.hooks.append( + deepcopy(self.default_train_config.hooks[3])) + cfg.train.hooks[-1].type = self.evaluation_type + + evaluation_hook = list( + filter(lambda hook: hook['type'] in ['EvaluationHook'], + cfg.train.hooks)) + evaluation_hook = evaluation_hook[0] if len( + evaluation_hook) > 0 else None + + if evaluation_hook is not None and self.evaluation_by_epoch is not None: + evaluation_hook['by_epoch'] = self.evaluation_by_epoch + if evaluation_hook is not None and self.evaluation_interval is not None: + evaluation_hook['interval'] = self.evaluation_interval + + if not hasattr(cfg, 'evaluation'): + cfg.evaluation = ConfigDict({ + 'dataloader': { + 'batch_size_per_gpu': 32, + 'workers_per_gpu': 0, + 'shuffle': False + } + }) + + if self.metrics is not None: + cfg.evaluation.metrics = self.metrics + if self.eval_batch_size_per_gpu is not None: + 
cfg.evaluation.dataloader.batch_size_per_gpu = self.eval_batch_size_per_gpu + if self.eval_workers_per_gpu is not None: + cfg.evaluation.dataloader.workers_per_gpu = self.eval_workers_per_gpu + if self.eval_shuffle is not None: + cfg.evaluation.dataloader.shuffle = self.eval_shuffle + + return cfg + + @TRAINERS.register_module(module_name=Trainers.nlp_base_trainer) class NlpEpochBasedTrainer(EpochBasedTrainer): @@ -80,9 +483,10 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): model) else: model_dir = snapshot_download(model, revision=model_revision) - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + if cfg_file is None: + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) else: - assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' model_dir = os.path.dirname(cfg_file) self.label2id = None @@ -91,26 +495,17 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.cfg_modify_fn = cfg_modify_fn self.cfg = self.rebuild_config(Config.from_file(cfg_file)) - label2id = parse_label_mapping(model_dir) - if label2id is not None: - self.label2id = label2id - self.id2label = {id: label for label, id in label2id.items()} - self.num_labels = len(label2id) - else: - try: - labels = self.cfg.dataset.train.labels - if labels is not None and len(labels) > 0: - self.label2id = { - label: idx - for idx, label in enumerate(labels) - } - self.id2label = { - idx: label - for idx, label in enumerate(labels) - } - self.num_labels = len(labels) - except AttributeError: - pass + try: + labels = self.cfg.dataset.train.labels + self.label2id = {label: idx for idx, label in enumerate(labels)} + self.id2label = {idx: label for idx, label in enumerate(labels)} + self.num_labels = len(labels) + except AttributeError: + label2id = parse_label_mapping(model_dir) + if label2id is not None: + self.label2id = label2id + self.id2label = {id: label for label, id in label2id.items()} + self.num_labels = len(label2id) def build_dataset_keys(cfg): if cfg is not None: @@ -185,36 +580,20 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): 'label2id': self.label2id } - field_name = Tasks.find_field_by_task(self.cfg.task) - train_preprocessor, eval_preprocessor = None, None - _train_cfg, _eval_cfg = {}, {} - - if 'type' not in self.cfg.preprocessor and ( - 'train' in self.cfg.preprocessor - or 'val' in self.cfg.preprocessor): - if 'train' in self.cfg.preprocessor: - _train_cfg = self.cfg.preprocessor.train - if 'val' in self.cfg.preprocessor: - _eval_cfg = self.cfg.preprocessor.val - else: - _train_cfg = self.cfg.preprocessor - _eval_cfg = self.cfg.preprocessor - - if len(_train_cfg): - _train_cfg.update({ - 'model_dir': self.model_dir, - **model_args, - **self.train_keys, 'mode': ModeKeys.TRAIN - }) - train_preprocessor = build_preprocessor(_train_cfg, field_name) - if len(_eval_cfg): - _eval_cfg.update({ - 'model_dir': self.model_dir, - **model_args, - **self.eval_keys, 'mode': ModeKeys.EVAL - }) - eval_preprocessor = build_preprocessor(_eval_cfg, field_name) - + train_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.TRAIN, + **model_args, + **self.train_keys, + mode=ModeKeys.TRAIN) + eval_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.EVAL, + **model_args, + **self.eval_keys, + mode=ModeKeys.EVAL) return train_preprocessor, eval_preprocessor diff 
--git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 61d11aa6..0dc6ece4 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -4,7 +4,7 @@ import time from collections.abc import Mapping from distutils.version import LooseVersion from functools import partial -from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import json import torch @@ -22,18 +22,18 @@ from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.msdatasets.task_datasets.builder import build_task_dataset from modelscope.msdatasets.task_datasets.torch_base_dataset import \ TorchTaskDataset +from modelscope.outputs import ModelOutputBase from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import build_preprocessor from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, - ConfigKeys, Hubs, ModeKeys, ModelFile, - Tasks, TrainerStages) + ConfigKeys, ModeKeys, ModelFile, + TrainerStages) from modelscope.utils.data_utils import to_device -from modelscope.utils.device import create_device, verify_device +from modelscope.utils.device import create_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg @@ -146,7 +146,8 @@ class EpochBasedTrainer(BaseTrainer): if ConfigKeys.val in preprocessor: assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) self.eval_preprocessor = preprocessor[ConfigKeys.val] - elif hasattr(self.cfg, ConfigFields.preprocessor): + elif hasattr(self.cfg, ConfigFields.preprocessor + ) and self.cfg.preprocessor is not None: self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( ) @@ -344,23 +345,32 @@ class EpochBasedTrainer(BaseTrainer): preprocessors=preprocessor) for d in datasets ] cfg = ConfigDict( - type=self.cfg.task, mode=mode, datasets=datasets) - return build_task_dataset(cfg, self.cfg.task) + type=self.cfg.model.type, mode=mode, datasets=datasets) + task_dataset = build_task_dataset(cfg, self.cfg.task) + task_dataset.trainer = self + return task_dataset else: # avoid add no str value datasets, preprocessors in cfg task_data_build_config = ConfigDict( - mode=mode, datasets=datasets, preprocessor=preprocessor) + type=self.cfg.model.type, + mode=mode, + datasets=datasets, + preprocessor=preprocessor) task_data_build_config.update(task_data_config) - return build_task_dataset(task_data_build_config, - self.cfg.task) + task_dataset = build_task_dataset(task_data_build_config, + self.cfg.task) + task_dataset.trainer = self + return task_dataset except Exception: if isinstance(datasets, (List, Tuple)) or preprocessor is not None: - return TorchTaskDataset( + task_dataset = TorchTaskDataset( datasets, mode=mode, preprocessor=preprocessor, **(dict(type=self.cfg.model.type) if hasattr( self.cfg, 'model') else {})) + task_dataset.trainer = self + return task_dataset else: return datasets @@ -372,35 +382,12 @@ class EpochBasedTrainer(BaseTrainer): Returns: The train preprocessor and eval preprocessor instance. 
""" - field_name = Tasks.find_field_by_task(self.cfg.task) - train_preprocessor, eval_preprocessor = None, None - _train_cfg, _eval_cfg = {}, {} - _dafault_args = {'model_dir': self.model_dir} - - if 'type' not in self.cfg.preprocessor and ( - 'train' in self.cfg.preprocessor - or 'val' in self.cfg.preprocessor): - if 'train' in self.cfg.preprocessor: - _train_cfg = self.cfg.preprocessor.train - if 'val' in self.cfg.preprocessor: - _eval_cfg = self.cfg.preprocessor.val - else: - _train_cfg = self.cfg.preprocessor - _eval_cfg = self.cfg.preprocessor - - if len(_train_cfg): - if isinstance(_train_cfg, Sequence): - # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, - # and add mode for Compose or other plans - raise NotImplementedError('Not supported yet!') - _train_cfg.update(_dafault_args) - train_preprocessor = build_preprocessor(_train_cfg, field_name) - if len(_eval_cfg): - if isinstance(_eval_cfg, Sequence): - raise NotImplementedError('Not supported yet!') - _eval_cfg.update(_dafault_args) - eval_preprocessor = build_preprocessor(_eval_cfg, field_name) - + train_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.TRAIN) + eval_preprocessor = Preprocessor.from_pretrained( + self.model_dir, cfg_dict=self.cfg, preprocessor_mode=ModeKeys.EVAL) return train_preprocessor, eval_preprocessor def get_metrics(self) -> List[Union[str, Dict]]: @@ -547,6 +534,8 @@ class EpochBasedTrainer(BaseTrainer): else: train_outputs = model.forward(inputs) + if isinstance(train_outputs, ModelOutputBase): + train_outputs = train_outputs.to_dict() if not isinstance(train_outputs, dict): raise TypeError('"model.forward()" must return a dict') @@ -650,8 +639,9 @@ class EpochBasedTrainer(BaseTrainer): """ # TODO: support MsDataset load for cv if hasattr(data_cfg, 'name'): + dataset_name = data_cfg.pop('name') dataset = MsDataset.load( - dataset_name=data_cfg.pop('name'), + dataset_name=dataset_name, **data_cfg, ) cfg = ConfigDict(type=self.cfg.model.type, mode=mode) diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index a9d7f396..2a7520f2 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -207,6 +207,6 @@ def save_pretrained(model, # Dump the config to the configuration.json if ConfigFields.pipeline not in config: config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]} - cfg_str = json.dumps(config, cls=JSONIteratorEncoder) + cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder) config_file = os.path.join(target_folder, ModelFile.CONFIGURATION) storage.write(cfg_str.encode(), config_file) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 50a1c016..6a9d6fd5 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -115,7 +115,6 @@ class NLPTasks(object): dialog_intent_prediction = 'dialog-intent-prediction' dialog_state_tracking = 'dialog-state-tracking' table_question_answering = 'table-question-answering' - sentence_embedding = 'sentence-embedding' fill_mask = 'fill-mask' text_summarization = 'text-summarization' question_answering = 'question-answering' diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 2dbe7045..105b3ffa 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -82,7 +82,8 @@ def get_model_type(model_dir): this file does not exist, the method will try to get the 'model_type' field from the config.json. - @param model_dir: The local model dir to use. 
@return: The model type + Args: + model_dir: The local model dir to use. @return: The model type string, returns None if nothing is found. """ try: @@ -112,8 +113,11 @@ def parse_label_mapping(model_dir): 2. Try to read label-id mapping from the configuration.json 3. Try to read label-id mapping from the config.json - @param model_dir: The local model dir to use. - @return: The label2id mapping if found. + Args: + model_dir: The local model dir to use. + + Returns: + The label2id mapping if found. """ import json import os diff --git a/modelscope/utils/nlp/space/args.py b/modelscope/utils/nlp/space/args.py index d9e91e74..c92401c5 100644 --- a/modelscope/utils/nlp/space/args.py +++ b/modelscope/utils/nlp/space/args.py @@ -1,6 +1,4 @@ -""" -Parse argument. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import argparse diff --git a/modelscope/utils/nlp/space/clean_dataset.py b/modelscope/utils/nlp/space/clean_dataset.py index 4578ccc4..2c971b10 100644 --- a/modelscope/utils/nlp/space/clean_dataset.py +++ b/modelscope/utils/nlp/space/clean_dataset.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import re diff --git a/modelscope/utils/nlp/space/criterions.py b/modelscope/utils/nlp/space/criterions.py index 60f98457..82ef4ba5 100644 --- a/modelscope/utils/nlp/space/criterions.py +++ b/modelscope/utils/nlp/space/criterions.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import torch import torch.nn.functional as F from torch.nn.modules.loss import _Loss diff --git a/modelscope/utils/nlp/space/db_ops.py b/modelscope/utils/nlp/space/db_ops.py index 880b018b..d1d14ef9 100644 --- a/modelscope/utils/nlp/space/db_ops.py +++ b/modelscope/utils/nlp/space/db_ops.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import random import sqlite3 diff --git a/modelscope/utils/nlp/space/ontology.py b/modelscope/utils/nlp/space/ontology.py index 99b084bb..c55d12e1 100644 --- a/modelscope/utils/nlp/space/ontology.py +++ b/modelscope/utils/nlp/space/ontology.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + all_domains = [ 'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'police', 'hospital' ] diff --git a/modelscope/utils/nlp/space/scores.py b/modelscope/utils/nlp/space/scores.py index fe0a8a17..eb6dd41c 100644 --- a/modelscope/utils/nlp/space/scores.py +++ b/modelscope/utils/nlp/space/scores.py @@ -1,3 +1,6 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + + def hierarchical_set_score(frame1, frame2): # deal with empty frame if not (frame1 and frame2): diff --git a/modelscope/utils/nlp/space/utils.py b/modelscope/utils/nlp/space/utils.py index 81d1b1c5..56e67671 100644 --- a/modelscope/utils/nlp/space/utils.py +++ b/modelscope/utils/nlp/space/utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import logging from collections import OrderedDict diff --git a/modelscope/utils/nlp/space/utils_dst.py b/modelscope/utils/nlp/space/utils_dst.py index 2a7e67d7..6277172e 100644 --- a/modelscope/utils/nlp/space/utils_dst.py +++ b/modelscope/utils/nlp/space/utils_dst.py @@ -1,3 +1,29 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import List + +from modelscope.outputs import OutputKeys +from modelscope.pipelines.nlp import DialogStateTrackingPipeline + + +def tracking_and_print_dialog_states( + test_case, pipelines: List[DialogStateTrackingPipeline]): + import json + pipelines_len = len(pipelines) + history_states = [{}] + utter = {} + for step, item in enumerate(test_case): + utter.update(item) + result = pipelines[step % pipelines_len]({ + 'utter': + utter, + 'history_states': + history_states + }) + print(json.dumps(result)) + + history_states.extend([result[OutputKeys.OUTPUT], {}]) + + def batch_to_device(batch, device): batch_on_device = [] for element in batch: diff --git a/modelscope/utils/nlp/space_T_en/__init__.py b/modelscope/utils/nlp/space_T_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/space_T_en/utils.py similarity index 52% rename from modelscope/utils/nlp/nlp_utils.py rename to modelscope/utils/nlp/space_T_en/utils.py index bfeaf924..d884c241 100644 --- a/modelscope/utils/nlp/nlp_utils.py +++ b/modelscope/utils/nlp/space_T_en/utils.py @@ -1,8 +1,9 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import List from modelscope.outputs import OutputKeys -from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline, - DialogStateTrackingPipeline) +from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline def text2sql_tracking_and_print_results( @@ -22,22 +23,3 @@ def text2sql_tracking_and_print_results( print(results) last_sql = results[OutputKeys.OUTPUT][OutputKeys.TEXT] history.append(item) - - -def tracking_and_print_dialog_states( - test_case, pipelines: List[DialogStateTrackingPipeline]): - import json - pipelines_len = len(pipelines) - history_states = [{}] - utter = {} - for step, item in enumerate(test_case): - utter.update(item) - result = pipelines[step % pipelines_len]({ - 'utter': - utter, - 'history_states': - history_states - }) - print(json.dumps(result)) - - history_states.extend([result[OutputKeys.OUTPUT], {}]) diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index d6994bd3..5284aa43 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -74,6 +74,7 @@ class Registry(object): raise KeyError(f'{module_name} is already registered in ' f'{self._name}[{group_key}]') self._modules[group_key][module_name] = module_cls + module_cls.group_key = group_key def register_module(self, group_key: str = default_group, diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index 3c1e5c1c..8045d3e9 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -7,6 +7,7 @@ import pickle import random import shutil import tempfile +from collections import OrderedDict from collections.abc import Mapping from pathlib import Path from types import FunctionType @@ -14,6 +15,7 @@ from typing import Any, Dict, Union import json import numpy as np +import torch import torch.optim from torch import nn @@ -69,9 +71,10 @@ class RegressTool: **kwargs): """Monitor a pytorch module in a single forward. - @param module: A torch module - @param file_name: The file_name to store or load file - @param compare_fn: A custom fn used to compare the results manually. + Args: + module: A torch module + file_name: The file_name to store or load file + compare_fn: A custom fn used to compare the results manually. 
>>> def compare_fn(v1, v2, key, type): >>> return None @@ -80,6 +83,10 @@ class RegressTool: v2 is the value of current version key is the key of submodules type is in one of 'input', 'output' + + kwargs: + atol: The absolute gap between two np arrays. + rtol: The relative gap between two np arrays. """ baseline = os.getenv('REGRESSION_BASELINE') if baseline is None or self.baseline is None: @@ -144,20 +151,24 @@ This is usually useful when you try to change some dangerous code which has the risk of affecting the training loop. - @param trainer: A dict or an object contains the model/optimizer/lr_scheduler - @param file_name: The file_name to store or load file - @param level: The regression level. + Args: + trainer: A dict or an object containing the model/optimizer/lr_scheduler + file_name: The file_name to store or load file + level: The regression level. 'strict' for matching every single tensor. Please make sure the parameters of head are fixed and the drop-out rate is zero. 'config' for matching the initial config, like cfg file, optimizer param_groups, lr_scheduler params and the random seed. 'metric' for compare the best metrics in the evaluation loop. - @param compare_fn: A custom fn used to compare the results manually. - @param ignore_keys: The keys to ignore of the named_parameters. - @param compare_random: If to compare random setttings, default True. - @param reset_dropout: Reset all dropout modules to 0.0. - @param lazy_stop_callback: A callback passed in, when the moniting is over, this callback will be called. + compare_fn: A custom fn used to compare the results manually. + ignore_keys: The keys of the named_parameters to ignore. + compare_random: Whether to compare random settings, default True. + reset_dropout: Reset all dropout modules to 0.0. + lazy_stop_callback: A callback passed in; it will be called when the monitoring is over. + kwargs: + atol: The absolute gap between two np arrays. + rtol: The relative gap between two np arrays. >>> def compare_fn(v1, v2, key, type): >>> return None @@ -353,16 +364,22 @@ def compare_module(module1: nn.Module, module2: nn.Module): def numpify_tensor_nested(tensors, reduction=None, clip_value=10000): - import torch + try: + from modelscope.outputs import ModelOutputBase + except ImportError: + ModelOutputBase = dict "Numpify `tensors` (even if it's a nested list/tuple of tensors)." - if isinstance(tensors, (list, tuple)): - return type(tensors)( - numpify_tensor_nested(t, reduction, clip_value) for t in tensors) - if isinstance(tensors, Mapping): - return { + if isinstance(tensors, (Mapping, ModelOutputBase)): + return OrderedDict({ k: numpify_tensor_nested(t, reduction, clip_value) for k, t in tensors.items() - } + }) + if isinstance(tensors, list): + return list( + numpify_tensor_nested(t, reduction, clip_value) for t in tensors) + if isinstance(tensors, tuple): + return tuple( + numpify_tensor_nested(t, reduction, clip_value) for t in tensors) if isinstance(tensors, torch.Tensor): t: np.ndarray = tensors.cpu().numpy() if clip_value is not None: @@ -377,12 +394,19 @@ def numpify_tensor_nested(tensors, reduction=None, clip_value=10000): def detach_tensor_nested(tensors): - import torch + try: + from modelscope.outputs import ModelOutputBase + except ImportError: + ModelOutputBase = dict "Detach `tensors` (even if it's a nested list/tuple of tensors)."
- if isinstance(tensors, (list, tuple)): - return type(tensors)(detach_tensor_nested(t) for t in tensors) - if isinstance(tensors, Mapping): - return {k: detach_tensor_nested(t) for k, t in tensors.items()} + if isinstance(tensors, (Mapping, ModelOutputBase)): + return OrderedDict( + {k: detach_tensor_nested(t) + for k, t in tensors.items()}) + if isinstance(tensors, list): + return list(detach_tensor_nested(t) for t in tensors) + if isinstance(tensors, tuple): + return tuple(detach_tensor_nested(t) for t in tensors) if isinstance(tensors, torch.Tensor): return tensors.detach() return tensors diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index 406d671f..8f580d19 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -8,8 +8,11 @@ def torch_nested_numpify(tensors): NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict. - @param tensors: Nested torch tensors. - @return: The numpify tensors. + Args: + tensors: Nested torch tensors. + + Returns: + The numpify tensors. """ import torch @@ -30,8 +33,11 @@ def torch_nested_detach(tensors): NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict. - @param tensors: Nested torch tensors. - @return: The detached tensors. + Args: + tensors: Nested torch tensors. + + Returns: + The detached tensors. """ import torch diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py index 97926539..0e4f8349 100644 --- a/tests/export/test_export_sbert_sequence_classification.py +++ b/tests/export/test_export_sbert_sequence_classification.py @@ -3,9 +3,10 @@ import os import shutil import tempfile import unittest +from collections import OrderedDict from modelscope.exporters import Exporter, TorchModelExporter -from modelscope.models.base import Model +from modelscope.models import Model from modelscope.utils.test_utils import test_level @@ -27,10 +28,42 @@ class TestExportSbertSequenceClassification(unittest.TestCase): model = Model.from_pretrained(self.model_id) print( Exporter.from_model(model).export_onnx( - shape=(2, 256), outputs=self.tmp_dir)) + shape=(2, 256), output_dir=self.tmp_dir)) print( TorchModelExporter.from_model(model).export_torch_script( - shape=(2, 256), outputs=self.tmp_dir)) + shape=(2, 256), output_dir=self.tmp_dir)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_export_outer_module(self): + from transformers import BertForSequenceClassification, BertTokenizerFast + model = BertForSequenceClassification.from_pretrained( + 'bert-base-uncased') + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') + dummy_inputs = tokenizer( + tokenizer.unk_token, + padding='max_length', + max_length=256, + return_tensors='pt') + dynamic_axis = {0: 'batch', 1: 'sequence'} + inputs = OrderedDict([ + ('input_ids', dynamic_axis), + ('attention_mask', dynamic_axis), + ('token_type_ids', dynamic_axis), + ]) + outputs = OrderedDict({'logits': {0: 'batch'}}) + output_files = TorchModelExporter().export_onnx( + model=model, + dummy_inputs=dummy_inputs, + inputs=inputs, + outputs=outputs, + output_dir='/tmp') + print(output_files) + output_files = TorchModelExporter().export_torch_script( + model=model, + dummy_inputs=dummy_inputs, + output_dir='/tmp', + strict=False) + print(output_files) if __name__ == '__main__': diff --git 
a/tests/hub/test_download_dataset.py b/tests/hub/test_download_dataset.py new file mode 100644 index 00000000..29b5d1ab --- /dev/null +++ b/tests/hub/test_download_dataset.py @@ -0,0 +1,709 @@ +import unittest + +from modelscope.msdatasets import MsDataset +from modelscope.utils.test_utils import test_level + + +class DownloadDatasetTest(unittest.TestCase): + + def setUp(self): + self.subset_count = 10 + + def download_subset(self, dataset, subset_name): + dataset = MsDataset.load(dataset, subset_name=subset_name) + if isinstance(dataset, MsDataset): + lens = len(dataset) + print(f'dataset {subset_name} len: {lens}') + self.assertTrue(lens > 0) + else: + assert isinstance(dataset, dict) + lens = {key: len(subset) for key, subset in dataset.items()} + print(f'dataset {subset_name} len: {lens}') + self.assertTrue(all([_len > 0 for _len in lens.values()])) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_glue(self): + subset = [ + 'cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', + 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax' + ] + for subset_name in subset: + self.download_subset('glue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_super_glue(self): + subset = [ + 'boolq', 'cb', 'copa', 'multirc', 'record', 'rte', 'wic', 'wsc', + 'wsc.fixed', 'axb', 'axg' + ] + for subset_name in subset: + self.download_subset('super_glue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_nllb(self): + subset = [ + 'ace_Latn-ban_Latn', 'ace_Latn-bjn_Latn', 'ace_Latn-bug_Latn', + 'ace_Latn-ceb_Latn', 'ace_Latn-eng_Latn', 'ace_Latn-fij_Latn', + 'ace_Latn-ilo_Latn', 'ace_Latn-jav_Latn', 'ace_Latn-min_Latn', + 'ace_Latn-mri_Latn', 'ace_Latn-pag_Latn', 'ace_Latn-plt_Latn', + 'ace_Latn-smo_Latn', 'ace_Latn-sun_Latn', 'ace_Latn-war_Latn', + 'afr_Latn-aka_Latn', 'afr_Latn-amh_Ethi', 'afr_Latn-bam_Latn', + 'afr_Latn-bem_Latn', 'afr_Latn-cjk_Latn', 'afr_Latn-dik_Latn', + 'afr_Latn-dyu_Latn', 'afr_Latn-eng_Latn', 'afr_Latn-ewe_Latn', + 'afr_Latn-fon_Latn', 'afr_Latn-fra_Latn', 'afr_Latn-fuv_Latn', + 'afr_Latn-gaz_Latn', 'afr_Latn-hau_Latn', 'afr_Latn-ibo_Latn', + 'afr_Latn-kam_Latn', 'afr_Latn-kik_Latn', 'afr_Latn-kin_Latn', + 'afr_Latn-kmb_Latn', 'afr_Latn-knc_Arab', 'afr_Latn-knc_Latn', + 'afr_Latn-kon_Latn', 'afr_Latn-lin_Latn', 'afr_Latn-lua_Latn', + 'afr_Latn-lug_Latn', 'afr_Latn-luo_Latn', 'afr_Latn-nso_Latn', + 'afr_Latn-nus_Latn', 'afr_Latn-nya_Latn', 'afr_Latn-run_Latn', + 'afr_Latn-sna_Latn', 'afr_Latn-som_Latn', 'afr_Latn-sot_Latn', + 'afr_Latn-ssw_Latn', 'afr_Latn-swh_Latn', 'afr_Latn-tir_Ethi', + 'afr_Latn-tsn_Latn', 'afr_Latn-tso_Latn', 'afr_Latn-tum_Latn', + 'afr_Latn-twi_Latn', 'afr_Latn-umb_Latn', 'afr_Latn-wol_Latn', + 'afr_Latn-xho_Latn', 'afr_Latn-yor_Latn', 'afr_Latn-zul_Latn', + 'aka_Latn-amh_Ethi', 'aka_Latn-bam_Latn', 'aka_Latn-bem_Latn', + 'aka_Latn-cjk_Latn', 'aka_Latn-dik_Latn', 'aka_Latn-dyu_Latn', + 'aka_Latn-eng_Latn', 'aka_Latn-ewe_Latn', 'aka_Latn-fon_Latn', + 'aka_Latn-fra_Latn', 'aka_Latn-fuv_Latn', 'aka_Latn-gaz_Latn', + 'aka_Latn-hau_Latn', 'aka_Latn-ibo_Latn', 'aka_Latn-kam_Latn', + 'aka_Latn-kik_Latn', 'aka_Latn-kin_Latn', 'aka_Latn-kmb_Latn', + 'aka_Latn-knc_Arab', 'aka_Latn-knc_Latn', 'aka_Latn-kon_Latn', + 'aka_Latn-lin_Latn', 'aka_Latn-lua_Latn', 'aka_Latn-lug_Latn', + 'aka_Latn-luo_Latn', 'aka_Latn-nso_Latn', 'aka_Latn-nus_Latn', + 'aka_Latn-nya_Latn', 'aka_Latn-run_Latn', 
'aka_Latn-sna_Latn', + 'aka_Latn-som_Latn', 'aka_Latn-sot_Latn', 'aka_Latn-ssw_Latn', + 'aka_Latn-swh_Latn', 'aka_Latn-tir_Ethi', 'aka_Latn-tsn_Latn', + 'aka_Latn-tso_Latn', 'aka_Latn-tum_Latn', 'aka_Latn-twi_Latn', + 'aka_Latn-umb_Latn', 'aka_Latn-wol_Latn', 'aka_Latn-xho_Latn', + 'aka_Latn-yor_Latn', 'aka_Latn-zul_Latn', 'amh_Ethi-bam_Latn', + 'amh_Ethi-bem_Latn', 'amh_Ethi-cjk_Latn', 'amh_Ethi-dik_Latn', + 'amh_Ethi-dyu_Latn', 'amh_Ethi-eng_Latn', 'amh_Ethi-ewe_Latn', + 'amh_Ethi-fon_Latn', 'amh_Ethi-fra_Latn', 'amh_Ethi-fuv_Latn', + 'amh_Ethi-gaz_Latn', 'amh_Ethi-hau_Latn', 'amh_Ethi-ibo_Latn', + 'amh_Ethi-kam_Latn', 'amh_Ethi-kik_Latn', 'amh_Ethi-kin_Latn', + 'amh_Ethi-kmb_Latn', 'amh_Ethi-knc_Arab', 'amh_Ethi-knc_Latn', + 'amh_Ethi-kon_Latn', 'amh_Ethi-lin_Latn', 'amh_Ethi-lua_Latn', + 'amh_Ethi-lug_Latn', 'amh_Ethi-luo_Latn', 'amh_Ethi-nso_Latn', + 'amh_Ethi-nus_Latn', 'amh_Ethi-nya_Latn', 'amh_Ethi-run_Latn', + 'amh_Ethi-sna_Latn', 'amh_Ethi-som_Latn', 'amh_Ethi-sot_Latn', + 'amh_Ethi-ssw_Latn', 'amh_Ethi-swh_Latn', 'amh_Ethi-tir_Ethi', + 'amh_Ethi-tsn_Latn', 'amh_Ethi-tso_Latn', 'amh_Ethi-tum_Latn', + 'amh_Ethi-twi_Latn', 'amh_Ethi-umb_Latn', 'amh_Ethi-wol_Latn', + 'amh_Ethi-xho_Latn', 'amh_Ethi-yor_Latn', 'amh_Ethi-zul_Latn', + 'arb_Arab-ckb_Arab', 'arb_Arab-crh_Latn', 'arb_Arab-dik_Latn', + 'arb_Arab-diq_Latn', 'arb_Arab-fuv_Latn', 'arb_Arab-kmr_Latn', + 'arb_Arab-knc_Latn', 'arb_Arab-nus_Latn', 'arb_Arab-som_Latn', + 'arb_Arab-tat_Cyrl', 'arb_Arab-tzm_Tfng', 'arb_Arab-urd_Arab', + 'arb_Arab-wol_Latn', 'asm_Beng-awa_Deva', 'asm_Beng-ben_Beng', + 'asm_Beng-bho_Deva', 'asm_Beng-eng_Latn', 'asm_Beng-guj_Gujr', + 'asm_Beng-hin_Deva', 'asm_Beng-hne_Deva', 'asm_Beng-kan_Knda', + 'asm_Beng-kas_Arab', 'asm_Beng-kas_Deva', 'asm_Beng-mag_Deva', + 'asm_Beng-mai_Deva', 'asm_Beng-mal_Mlym', 'asm_Beng-mar_Deva', + 'asm_Beng-npi_Deva', 'asm_Beng-ory_Orya', 'asm_Beng-pan_Guru', + 'asm_Beng-san_Deva', 'asm_Beng-sat_Beng', 'asm_Beng-sin_Sinh', + 'asm_Beng-snd_Arab', 'asm_Beng-tam_Taml', 'asm_Beng-tel_Telu', + 'asm_Beng-urd_Arab', 'awa_Deva-ben_Beng', 'awa_Deva-bho_Deva', + 'awa_Deva-eng_Latn', 'awa_Deva-guj_Gujr', 'awa_Deva-hin_Deva', + 'awa_Deva-hne_Deva', 'awa_Deva-kan_Knda', 'awa_Deva-kas_Arab', + 'awa_Deva-kas_Deva', 'awa_Deva-mag_Deva', 'awa_Deva-mai_Deva', + 'awa_Deva-mal_Mlym', 'awa_Deva-mar_Deva', 'awa_Deva-npi_Deva', + 'awa_Deva-ory_Orya', 'awa_Deva-pan_Guru', 'awa_Deva-san_Deva', + 'awa_Deva-sat_Beng', 'awa_Deva-sin_Sinh', 'awa_Deva-snd_Arab', + 'awa_Deva-tam_Taml', 'awa_Deva-tel_Telu', 'awa_Deva-urd_Arab', + 'ayr_Latn-eng_Latn', 'ayr_Latn-spa_Latn', 'azb_Arab-eng_Latn', + 'azj_Latn-eng_Latn', 'azj_Latn-rus_Cyrl', 'bak_Cyrl-crh_Latn', + 'bak_Cyrl-eng_Latn', 'bak_Cyrl-kir_Cyrl', 'bak_Cyrl-rus_Cyrl', + 'bak_Cyrl-tat_Cyrl', 'bak_Cyrl-tuk_Latn', 'bak_Cyrl-uig_Arab', + 'bak_Cyrl-uzn_Latn', 'bam_Latn-bem_Latn', 'bam_Latn-cjk_Latn', + 'bam_Latn-dik_Latn', 'bam_Latn-dyu_Latn', 'bam_Latn-eng_Latn', + 'bam_Latn-ewe_Latn', 'bam_Latn-fon_Latn', 'bam_Latn-fra_Latn', + 'bam_Latn-fuv_Latn', 'bam_Latn-gaz_Latn', 'bam_Latn-hau_Latn', + 'bam_Latn-ibo_Latn', 'bam_Latn-kam_Latn', 'bam_Latn-kik_Latn', + 'bam_Latn-kin_Latn', 'bam_Latn-kmb_Latn', 'bam_Latn-knc_Arab', + 'bam_Latn-knc_Latn', 'bam_Latn-kon_Latn', 'bam_Latn-lin_Latn', + 'bam_Latn-lua_Latn', 'bam_Latn-lug_Latn', 'bam_Latn-luo_Latn', + 'bam_Latn-nso_Latn', 'bam_Latn-nus_Latn', 'bam_Latn-nya_Latn', + 'bam_Latn-run_Latn', 'bam_Latn-sna_Latn', 'bam_Latn-som_Latn', + 'bam_Latn-sot_Latn', 'bam_Latn-ssw_Latn', 'bam_Latn-swh_Latn', + 'bam_Latn-tir_Ethi', 
'bam_Latn-tsn_Latn', 'bam_Latn-tso_Latn', + 'bam_Latn-tum_Latn', 'bam_Latn-twi_Latn', 'bam_Latn-umb_Latn', + 'bam_Latn-wol_Latn', 'bam_Latn-xho_Latn', 'bam_Latn-yor_Latn', + 'bam_Latn-zul_Latn', 'ban_Latn-bjn_Latn', 'ban_Latn-bug_Latn', + 'ban_Latn-ceb_Latn', 'ban_Latn-eng_Latn', 'ban_Latn-fij_Latn', + 'ban_Latn-ilo_Latn', 'ban_Latn-jav_Latn', 'ban_Latn-min_Latn', + 'ban_Latn-mri_Latn', 'ban_Latn-pag_Latn', 'ban_Latn-plt_Latn', + 'ban_Latn-smo_Latn', 'ban_Latn-sun_Latn', 'ban_Latn-war_Latn', + 'bel_Cyrl-eng_Latn', 'bel_Cyrl-rus_Cyrl', 'bem_Latn-cjk_Latn', + 'bem_Latn-dik_Latn', 'bem_Latn-dyu_Latn', 'bem_Latn-eng_Latn', + 'bem_Latn-ewe_Latn', 'bem_Latn-fon_Latn', 'bem_Latn-fra_Latn', + 'bem_Latn-fuv_Latn', 'bem_Latn-gaz_Latn', 'bem_Latn-hau_Latn', + 'bem_Latn-ibo_Latn', 'bem_Latn-kam_Latn', 'bem_Latn-kik_Latn', + 'bem_Latn-kin_Latn', 'bem_Latn-kmb_Latn', 'bem_Latn-knc_Arab', + 'bem_Latn-knc_Latn', 'bem_Latn-kon_Latn', 'bem_Latn-lin_Latn', + 'bem_Latn-lua_Latn', 'bem_Latn-lug_Latn', 'bem_Latn-luo_Latn', + 'bem_Latn-nso_Latn', 'bem_Latn-nus_Latn', 'bem_Latn-nya_Latn', + 'bem_Latn-run_Latn', 'bem_Latn-sna_Latn', 'bem_Latn-som_Latn', + 'bem_Latn-sot_Latn', 'bem_Latn-ssw_Latn', 'bem_Latn-swh_Latn', + 'bem_Latn-tir_Ethi', 'bem_Latn-tsn_Latn', 'bem_Latn-tso_Latn', + 'bem_Latn-tum_Latn', 'bem_Latn-twi_Latn', 'bem_Latn-umb_Latn', + 'bem_Latn-wol_Latn', 'bem_Latn-xho_Latn', 'bem_Latn-yor_Latn', + 'bem_Latn-zul_Latn', 'ben_Beng-bho_Deva', 'ben_Beng-eng_Latn', + 'ben_Beng-guj_Gujr', 'ben_Beng-hin_Deva', 'ben_Beng-hne_Deva', + 'ben_Beng-kan_Knda', 'ben_Beng-kas_Arab', 'ben_Beng-kas_Deva', + 'ben_Beng-mag_Deva', 'ben_Beng-mai_Deva', 'ben_Beng-mal_Mlym', + 'ben_Beng-mar_Deva', 'ben_Beng-npi_Deva', 'ben_Beng-ory_Orya', + 'ben_Beng-pan_Guru', 'ben_Beng-pbt_Arab', 'ben_Beng-san_Deva', + 'ben_Beng-sat_Beng', 'ben_Beng-sin_Sinh', 'ben_Beng-snd_Arab', + 'ben_Beng-tam_Taml', 'ben_Beng-tel_Telu', 'ben_Beng-urd_Arab', + 'bho_Deva-eng_Latn', 'bho_Deva-guj_Gujr', 'bho_Deva-hin_Deva', + 'bho_Deva-hne_Deva', 'bho_Deva-kan_Knda', 'bho_Deva-kas_Arab', + 'bho_Deva-kas_Deva', 'bho_Deva-mag_Deva', 'bho_Deva-mai_Deva', + 'bho_Deva-mal_Mlym', 'bho_Deva-mar_Deva', 'bho_Deva-npi_Deva', + 'bho_Deva-ory_Orya', 'bho_Deva-pan_Guru', 'bho_Deva-san_Deva', + 'bho_Deva-sat_Beng', 'bho_Deva-sin_Sinh', 'bho_Deva-snd_Arab', + 'bho_Deva-tam_Taml', 'bho_Deva-tel_Telu', 'bho_Deva-urd_Arab', + 'bjn_Latn-bug_Latn', 'bjn_Latn-ceb_Latn', 'bjn_Latn-eng_Latn', + 'bjn_Latn-fij_Latn', 'bjn_Latn-ilo_Latn', 'bjn_Latn-ind_Latn', + 'bjn_Latn-jav_Latn', 'bjn_Latn-min_Latn', 'bjn_Latn-mri_Latn', + 'bjn_Latn-pag_Latn', 'bjn_Latn-plt_Latn', 'bjn_Latn-smo_Latn', + 'bjn_Latn-sun_Latn', 'bjn_Latn-war_Latn', 'bod_Tibt-eng_Latn', + 'bos_Latn-eng_Latn', 'bug_Latn-ceb_Latn', 'bug_Latn-eng_Latn', + 'bug_Latn-fij_Latn', 'bug_Latn-ilo_Latn', 'bug_Latn-jav_Latn', + 'bug_Latn-min_Latn', 'bug_Latn-mri_Latn', 'bug_Latn-pag_Latn', + 'bug_Latn-plt_Latn', 'bug_Latn-smo_Latn', 'bug_Latn-sun_Latn', + 'bug_Latn-war_Latn', 'ceb_Latn-eng_Latn', 'ceb_Latn-fij_Latn', + 'ceb_Latn-ilo_Latn', 'ceb_Latn-jav_Latn', 'ceb_Latn-min_Latn', + 'ceb_Latn-mri_Latn', 'ceb_Latn-pag_Latn', 'ceb_Latn-plt_Latn', + 'ceb_Latn-smo_Latn', 'ceb_Latn-sun_Latn', 'ceb_Latn-war_Latn', + 'cjk_Latn-dik_Latn', 'cjk_Latn-dyu_Latn', 'cjk_Latn-eng_Latn', + 'cjk_Latn-ewe_Latn', 'cjk_Latn-fon_Latn', 'cjk_Latn-fra_Latn', + 'cjk_Latn-fuv_Latn', 'cjk_Latn-gaz_Latn', 'cjk_Latn-hau_Latn', + 'cjk_Latn-ibo_Latn', 'cjk_Latn-kam_Latn', 'cjk_Latn-kik_Latn', + 'cjk_Latn-kin_Latn', 'cjk_Latn-kmb_Latn', 'cjk_Latn-knc_Arab', + 
'cjk_Latn-knc_Latn', 'cjk_Latn-kon_Latn', 'cjk_Latn-lin_Latn', + 'cjk_Latn-lua_Latn', 'cjk_Latn-lug_Latn', 'cjk_Latn-luo_Latn', + 'cjk_Latn-nso_Latn', 'cjk_Latn-nus_Latn', 'cjk_Latn-nya_Latn', + 'cjk_Latn-por_Latn', 'cjk_Latn-run_Latn', 'cjk_Latn-sna_Latn', + 'cjk_Latn-som_Latn', 'cjk_Latn-sot_Latn', 'cjk_Latn-ssw_Latn', + 'cjk_Latn-swh_Latn', 'cjk_Latn-tir_Ethi', 'cjk_Latn-tsn_Latn', + 'cjk_Latn-tso_Latn', 'cjk_Latn-tum_Latn', 'cjk_Latn-twi_Latn', + 'cjk_Latn-umb_Latn', 'cjk_Latn-wol_Latn', 'cjk_Latn-xho_Latn', + 'cjk_Latn-yor_Latn', 'cjk_Latn-zul_Latn', 'ckb_Arab-diq_Latn', + 'ckb_Arab-eng_Latn', 'ckb_Arab-kmr_Latn', 'ckb_Arab-pbt_Arab', + 'ckb_Arab-prs_Arab', 'ckb_Arab-tgk_Cyrl', 'crh_Latn-eng_Latn', + 'crh_Latn-kir_Cyrl', 'crh_Latn-rus_Cyrl', 'crh_Latn-tat_Cyrl', + 'crh_Latn-tuk_Latn', 'crh_Latn-uig_Arab', 'crh_Latn-uzn_Latn', + 'cym_Latn-eng_Latn', 'dik_Latn-dyu_Latn', 'dik_Latn-eng_Latn', + 'dik_Latn-ewe_Latn', 'dik_Latn-fon_Latn', 'dik_Latn-fra_Latn', + 'dik_Latn-fuv_Latn', 'dik_Latn-gaz_Latn', 'dik_Latn-hau_Latn', + 'dik_Latn-ibo_Latn', 'dik_Latn-kam_Latn', 'dik_Latn-kik_Latn', + 'dik_Latn-kin_Latn', 'dik_Latn-kmb_Latn', 'dik_Latn-knc_Arab', + 'dik_Latn-knc_Latn', 'dik_Latn-kon_Latn', 'dik_Latn-lin_Latn', + 'dik_Latn-lua_Latn', 'dik_Latn-lug_Latn', 'dik_Latn-luo_Latn', + 'dik_Latn-nso_Latn', 'dik_Latn-nus_Latn', 'dik_Latn-nya_Latn', + 'dik_Latn-run_Latn', 'dik_Latn-sna_Latn', 'dik_Latn-som_Latn', + 'dik_Latn-sot_Latn', 'dik_Latn-ssw_Latn', 'dik_Latn-swh_Latn', + 'dik_Latn-tir_Ethi', 'dik_Latn-tsn_Latn', 'dik_Latn-tso_Latn', + 'dik_Latn-tum_Latn', 'dik_Latn-twi_Latn', 'dik_Latn-umb_Latn', + 'dik_Latn-wol_Latn', 'dik_Latn-xho_Latn', 'dik_Latn-yor_Latn', + 'dik_Latn-zul_Latn', 'diq_Latn-eng_Latn', 'diq_Latn-kmr_Latn', + 'diq_Latn-pbt_Arab', 'diq_Latn-prs_Arab', 'diq_Latn-tgk_Cyrl', + 'dyu_Latn-eng_Latn', 'dyu_Latn-ewe_Latn', 'dyu_Latn-fon_Latn', + 'dyu_Latn-fra_Latn', 'dyu_Latn-fuv_Latn', 'dyu_Latn-gaz_Latn', + 'dyu_Latn-hau_Latn', 'dyu_Latn-ibo_Latn', 'dyu_Latn-kam_Latn', + 'dyu_Latn-kik_Latn', 'dyu_Latn-kin_Latn', 'dyu_Latn-kmb_Latn', + 'dyu_Latn-knc_Arab', 'dyu_Latn-knc_Latn', 'dyu_Latn-kon_Latn', + 'dyu_Latn-lin_Latn', 'dyu_Latn-lua_Latn', 'dyu_Latn-lug_Latn', + 'dyu_Latn-luo_Latn', 'dyu_Latn-nso_Latn', 'dyu_Latn-nus_Latn', + 'dyu_Latn-nya_Latn', 'dyu_Latn-run_Latn', 'dyu_Latn-sna_Latn', + 'dyu_Latn-som_Latn', 'dyu_Latn-sot_Latn', 'dyu_Latn-ssw_Latn', + 'dyu_Latn-swh_Latn', 'dyu_Latn-tir_Ethi', 'dyu_Latn-tsn_Latn', + 'dyu_Latn-tso_Latn', 'dyu_Latn-tum_Latn', 'dyu_Latn-twi_Latn', + 'dyu_Latn-umb_Latn', 'dyu_Latn-wol_Latn', 'dyu_Latn-xho_Latn', + 'dyu_Latn-yor_Latn', 'dyu_Latn-zul_Latn', 'dzo_Tibt-eng_Latn', + 'eng_Latn-als_Latn', 'eng_Latn-epo_Latn', 'eng_Latn-ewe_Latn', + 'eng_Latn-fao_Latn', 'eng_Latn-fij_Latn', 'eng_Latn-fon_Latn', + 'eng_Latn-fur_Latn', 'eng_Latn-fuv_Latn', 'eng_Latn-gaz_Latn', + 'eng_Latn-gla_Latn', 'eng_Latn-gle_Latn', 'eng_Latn-grn_Latn', + 'eng_Latn-guj_Gujr', 'eng_Latn-hat_Latn', 'eng_Latn-hau_Latn', + 'eng_Latn-hin_Deva', 'eng_Latn-hne_Deva', 'eng_Latn-hye_Armn', + 'eng_Latn-ibo_Latn', 'eng_Latn-ilo_Latn', 'eng_Latn-jav_Latn', + 'eng_Latn-kab_Latn', 'eng_Latn-kac_Latn', 'eng_Latn-kam_Latn', + 'eng_Latn-kan_Knda', 'eng_Latn-kas_Arab', 'eng_Latn-kas_Deva', + 'eng_Latn-kat_Geor', 'eng_Latn-kaz_Cyrl', 'eng_Latn-kbp_Latn', + 'eng_Latn-kea_Latn', 'eng_Latn-khk_Cyrl', 'eng_Latn-khm_Khmr', + 'eng_Latn-kik_Latn', 'eng_Latn-kin_Latn', 'eng_Latn-kir_Cyrl', + 'eng_Latn-kmb_Latn', 'eng_Latn-kmr_Latn', 'eng_Latn-knc_Arab', + 'eng_Latn-knc_Latn', 'eng_Latn-kon_Latn', 
'eng_Latn-lao_Laoo', + 'eng_Latn-lij_Latn', 'eng_Latn-lim_Latn', 'eng_Latn-lin_Latn', + 'eng_Latn-lmo_Latn', 'eng_Latn-ltg_Latn', 'eng_Latn-ltz_Latn', + 'eng_Latn-lua_Latn', 'eng_Latn-lug_Latn', 'eng_Latn-luo_Latn', + 'eng_Latn-lus_Latn', 'eng_Latn-mag_Deva', 'eng_Latn-mai_Deva', + 'eng_Latn-mal_Mlym', 'eng_Latn-mar_Deva', 'eng_Latn-min_Latn', + 'eng_Latn-mlt_Latn', 'eng_Latn-mni_Beng', 'eng_Latn-mos_Latn', + 'eng_Latn-mri_Latn', 'eng_Latn-mya_Mymr', 'eng_Latn-npi_Deva', + 'eng_Latn-nso_Latn', 'eng_Latn-nus_Latn', 'eng_Latn-nya_Latn', + 'eng_Latn-ory_Orya', 'eng_Latn-pag_Latn', 'eng_Latn-pan_Guru', + 'eng_Latn-pap_Latn', 'eng_Latn-pbt_Arab', 'eng_Latn-plt_Latn', + 'eng_Latn-prs_Arab', 'eng_Latn-quy_Latn', 'eng_Latn-run_Latn', + 'eng_Latn-sag_Latn', 'eng_Latn-san_Deva', 'eng_Latn-sat_Beng', + 'eng_Latn-scn_Latn', 'eng_Latn-shn_Mymr', 'eng_Latn-sin_Sinh', + 'eng_Latn-smo_Latn', 'eng_Latn-sna_Latn', 'eng_Latn-snd_Arab', + 'eng_Latn-som_Latn', 'eng_Latn-sot_Latn', 'eng_Latn-srd_Latn', + 'eng_Latn-ssw_Latn', 'eng_Latn-sun_Latn', 'eng_Latn-swh_Latn', + 'eng_Latn-szl_Latn', 'eng_Latn-tam_Taml', 'eng_Latn-taq_Latn', + 'eng_Latn-tat_Cyrl', 'eng_Latn-tel_Telu', 'eng_Latn-tgk_Cyrl', + 'eng_Latn-tgl_Latn', 'eng_Latn-tir_Ethi', 'eng_Latn-tpi_Latn', + 'eng_Latn-tsn_Latn', 'eng_Latn-tso_Latn', 'eng_Latn-tuk_Latn', + 'eng_Latn-tum_Latn', 'eng_Latn-twi_Latn', 'eng_Latn-tzm_Tfng', + 'eng_Latn-uig_Arab', 'eng_Latn-umb_Latn', 'eng_Latn-urd_Arab', + 'eng_Latn-uzn_Latn', 'eng_Latn-vec_Latn', 'eng_Latn-war_Latn', + 'eng_Latn-wol_Latn', 'eng_Latn-xho_Latn', 'eng_Latn-ydd_Hebr', + 'eng_Latn-yor_Latn', 'eng_Latn-zho_Hant', 'eng_Latn-zsm_Latn', + 'eng_Latn-zul_Latn', 'epo_Latn-fra_Latn', 'ewe_Latn-fon_Latn', + 'ewe_Latn-fra_Latn', 'ewe_Latn-fuv_Latn', 'ewe_Latn-gaz_Latn', + 'ewe_Latn-hau_Latn', 'ewe_Latn-ibo_Latn', 'ewe_Latn-kam_Latn', + 'ewe_Latn-kik_Latn', 'ewe_Latn-kin_Latn', 'ewe_Latn-kmb_Latn', + 'ewe_Latn-knc_Arab', 'ewe_Latn-knc_Latn', 'ewe_Latn-kon_Latn', + 'ewe_Latn-lin_Latn', 'ewe_Latn-lua_Latn', 'ewe_Latn-lug_Latn', + 'ewe_Latn-luo_Latn', 'ewe_Latn-nso_Latn', 'ewe_Latn-nus_Latn', + 'ewe_Latn-nya_Latn', 'ewe_Latn-run_Latn', 'ewe_Latn-sna_Latn', + 'ewe_Latn-som_Latn', 'ewe_Latn-sot_Latn', 'ewe_Latn-ssw_Latn', + 'ewe_Latn-swh_Latn', 'ewe_Latn-tir_Ethi', 'ewe_Latn-tsn_Latn', + 'ewe_Latn-tso_Latn', 'ewe_Latn-tum_Latn', 'ewe_Latn-twi_Latn', + 'ewe_Latn-umb_Latn', 'ewe_Latn-wol_Latn', 'ewe_Latn-xho_Latn', + 'ewe_Latn-yor_Latn', 'ewe_Latn-zul_Latn', 'fij_Latn-hin_Deva', + 'fij_Latn-ilo_Latn', 'fij_Latn-jav_Latn', 'fij_Latn-min_Latn', + 'fij_Latn-mri_Latn', 'fij_Latn-pag_Latn', 'fij_Latn-plt_Latn', + 'fij_Latn-smo_Latn', 'fij_Latn-sun_Latn', 'fij_Latn-war_Latn', + 'fon_Latn-fra_Latn', 'fon_Latn-fuv_Latn', 'fon_Latn-gaz_Latn', + 'fon_Latn-hau_Latn', 'fon_Latn-ibo_Latn', 'fon_Latn-kam_Latn', + 'fon_Latn-kik_Latn', 'fon_Latn-kin_Latn', 'fon_Latn-kmb_Latn', + 'fon_Latn-knc_Arab', 'fon_Latn-knc_Latn', 'fon_Latn-kon_Latn', + 'fon_Latn-lin_Latn', 'fon_Latn-lua_Latn', 'fon_Latn-lug_Latn', + 'fon_Latn-luo_Latn', 'fon_Latn-nso_Latn', 'fon_Latn-nus_Latn', + 'fon_Latn-nya_Latn', 'fon_Latn-run_Latn', 'fon_Latn-sna_Latn', + 'fon_Latn-som_Latn', 'fon_Latn-sot_Latn', 'fon_Latn-ssw_Latn', + 'fon_Latn-swh_Latn', 'fon_Latn-tir_Ethi', 'fon_Latn-tsn_Latn', + 'fon_Latn-tso_Latn', 'fon_Latn-tum_Latn', 'fon_Latn-twi_Latn', + 'fon_Latn-umb_Latn', 'fon_Latn-wol_Latn', 'fon_Latn-xho_Latn', + 'fon_Latn-yor_Latn', 'fon_Latn-zul_Latn', 'fra_Latn-fuv_Latn', + 'fra_Latn-gaz_Latn', 'fra_Latn-glg_Latn', 'fra_Latn-hat_Latn', + 'fra_Latn-hau_Latn', 
'fra_Latn-ibo_Latn', 'fra_Latn-kab_Latn', + 'fra_Latn-kam_Latn', 'fra_Latn-kik_Latn', 'fra_Latn-kin_Latn', + 'fra_Latn-kmb_Latn', 'fra_Latn-knc_Arab', 'fra_Latn-knc_Latn', + 'fra_Latn-kon_Latn', 'fra_Latn-lin_Latn', 'fra_Latn-ltz_Latn', + 'fra_Latn-lua_Latn', 'fra_Latn-lug_Latn', 'fra_Latn-luo_Latn', + 'fra_Latn-nso_Latn', 'fra_Latn-nus_Latn', 'fra_Latn-nya_Latn', + 'fra_Latn-oci_Latn', 'fra_Latn-plt_Latn', 'fra_Latn-run_Latn', + 'fra_Latn-sag_Latn', 'fra_Latn-scn_Latn', 'fra_Latn-sna_Latn', + 'fra_Latn-som_Latn', 'fra_Latn-sot_Latn', 'fra_Latn-ssw_Latn', + 'fra_Latn-swh_Latn', 'fra_Latn-tir_Ethi', 'fra_Latn-tsn_Latn', + 'fra_Latn-tso_Latn', 'fra_Latn-tum_Latn', 'fra_Latn-twi_Latn', + 'fra_Latn-tzm_Tfng', 'fra_Latn-umb_Latn', 'fra_Latn-wol_Latn', + 'fra_Latn-xho_Latn', 'fra_Latn-yor_Latn', 'fra_Latn-zul_Latn', + 'fuv_Latn-gaz_Latn', 'fuv_Latn-hau_Latn', 'fuv_Latn-ibo_Latn', + 'fuv_Latn-kam_Latn', 'fuv_Latn-kik_Latn', 'fuv_Latn-kin_Latn', + 'fuv_Latn-kmb_Latn', 'fuv_Latn-knc_Arab', 'fuv_Latn-knc_Latn', + 'fuv_Latn-kon_Latn', 'fuv_Latn-lin_Latn', 'fuv_Latn-lua_Latn', + 'fuv_Latn-lug_Latn', 'fuv_Latn-luo_Latn', 'fuv_Latn-nso_Latn', + 'fuv_Latn-nus_Latn', 'fuv_Latn-nya_Latn', 'fuv_Latn-run_Latn', + 'fuv_Latn-sna_Latn', 'fuv_Latn-som_Latn', 'fuv_Latn-sot_Latn', + 'fuv_Latn-ssw_Latn', 'fuv_Latn-swh_Latn', 'fuv_Latn-tir_Ethi', + 'fuv_Latn-tsn_Latn', 'fuv_Latn-tso_Latn', 'fuv_Latn-tum_Latn', + 'fuv_Latn-twi_Latn', 'fuv_Latn-umb_Latn', 'fuv_Latn-wol_Latn', + 'fuv_Latn-xho_Latn', 'fuv_Latn-yor_Latn', 'fuv_Latn-zul_Latn', + 'gaz_Latn-run_Latn', 'gaz_Latn-sna_Latn', 'gaz_Latn-som_Latn', + 'gaz_Latn-sot_Latn', 'gaz_Latn-ssw_Latn', 'gaz_Latn-swh_Latn', + 'gaz_Latn-tir_Ethi', 'gaz_Latn-tsn_Latn', 'gaz_Latn-tso_Latn', + 'gaz_Latn-tum_Latn', 'gaz_Latn-twi_Latn', 'gaz_Latn-umb_Latn', + 'gaz_Latn-wol_Latn', 'gaz_Latn-xho_Latn', 'gaz_Latn-yor_Latn', + 'gaz_Latn-zul_Latn', 'glg_Latn-por_Latn', 'grn_Latn-por_Latn', + 'guj_Gujr-hin_Deva', 'guj_Gujr-hne_Deva', 'guj_Gujr-kan_Knda', + 'guj_Gujr-kas_Arab', 'guj_Gujr-kas_Deva', 'guj_Gujr-mag_Deva', + 'guj_Gujr-mai_Deva', 'guj_Gujr-mal_Mlym', 'guj_Gujr-mar_Deva', + 'guj_Gujr-npi_Deva', 'guj_Gujr-ory_Orya', 'guj_Gujr-pan_Guru', + 'guj_Gujr-san_Deva', 'guj_Gujr-sat_Beng', 'guj_Gujr-sin_Sinh', + 'guj_Gujr-snd_Arab', 'guj_Gujr-tam_Taml', 'guj_Gujr-tel_Telu', + 'guj_Gujr-urd_Arab', 'hau_Latn-gaz_Latn', 'hau_Latn-ibo_Latn', + 'hau_Latn-kam_Latn', 'hau_Latn-kik_Latn', 'hau_Latn-kin_Latn', + 'hau_Latn-kmb_Latn', 'hau_Latn-knc_Arab', 'hau_Latn-knc_Latn', + 'hau_Latn-kon_Latn', 'hau_Latn-lin_Latn', 'hau_Latn-lua_Latn', + 'hau_Latn-lug_Latn', 'hau_Latn-luo_Latn', 'hau_Latn-nso_Latn', + 'hau_Latn-nus_Latn', 'hau_Latn-nya_Latn', 'hau_Latn-run_Latn', + 'hau_Latn-sna_Latn', 'hau_Latn-som_Latn', 'hau_Latn-sot_Latn', + 'hau_Latn-ssw_Latn', 'hau_Latn-swh_Latn', 'hau_Latn-tir_Ethi', + 'hau_Latn-tsn_Latn', 'hau_Latn-tso_Latn', 'hau_Latn-tum_Latn', + 'hau_Latn-twi_Latn', 'hau_Latn-umb_Latn', 'hau_Latn-wol_Latn', + 'hau_Latn-xho_Latn', 'hau_Latn-yor_Latn', 'hau_Latn-zul_Latn', + 'hin_Deva-hne_Deva', 'hin_Deva-kan_Knda', 'hin_Deva-kas_Arab', + 'hin_Deva-kas_Deva', 'hin_Deva-mag_Deva', 'hin_Deva-mai_Deva', + 'hin_Deva-mal_Mlym', 'hin_Deva-mar_Deva', 'hin_Deva-npi_Deva', + 'hin_Deva-ory_Orya', 'hin_Deva-pan_Guru', 'hin_Deva-pbt_Arab', + 'hin_Deva-san_Deva', 'hin_Deva-sat_Beng', 'hin_Deva-sin_Sinh', + 'hin_Deva-snd_Arab', 'hin_Deva-tam_Taml', 'hin_Deva-tel_Telu', + 'hin_Deva-urd_Arab', 'hne_Deva-kan_Knda', 'hne_Deva-kas_Arab', + 'hne_Deva-kas_Deva', 'hne_Deva-mag_Deva', 'hne_Deva-mai_Deva', + 
'hne_Deva-mal_Mlym', 'hne_Deva-mar_Deva', 'hne_Deva-npi_Deva', + 'hne_Deva-ory_Orya', 'hne_Deva-pan_Guru', 'hne_Deva-san_Deva', + 'hne_Deva-sat_Beng', 'hne_Deva-sin_Sinh', 'hne_Deva-snd_Arab', + 'hne_Deva-tam_Taml', 'hne_Deva-tel_Telu', 'hne_Deva-urd_Arab', + 'hye_Armn-rus_Cyrl', 'ibo_Latn-gaz_Latn', 'ibo_Latn-kam_Latn', + 'ibo_Latn-kik_Latn', 'ibo_Latn-kin_Latn', 'ibo_Latn-kmb_Latn', + 'ibo_Latn-knc_Arab', 'ibo_Latn-knc_Latn', 'ibo_Latn-kon_Latn', + 'ibo_Latn-lin_Latn', 'ibo_Latn-lua_Latn', 'ibo_Latn-lug_Latn', + 'ibo_Latn-luo_Latn', 'ibo_Latn-nso_Latn', 'ibo_Latn-nus_Latn', + 'ibo_Latn-nya_Latn', 'ibo_Latn-run_Latn', 'ibo_Latn-sna_Latn', + 'ibo_Latn-som_Latn', 'ibo_Latn-sot_Latn', 'ibo_Latn-ssw_Latn', + 'ibo_Latn-swh_Latn', 'ibo_Latn-tir_Ethi', 'ibo_Latn-tsn_Latn', + 'ibo_Latn-tso_Latn', 'ibo_Latn-tum_Latn', 'ibo_Latn-twi_Latn', + 'ibo_Latn-umb_Latn', 'ibo_Latn-wol_Latn', 'ibo_Latn-xho_Latn', + 'ibo_Latn-yor_Latn', 'ibo_Latn-zul_Latn', 'ilo_Latn-jav_Latn', + 'ilo_Latn-min_Latn', 'ilo_Latn-mri_Latn', 'ilo_Latn-pag_Latn', + 'ilo_Latn-plt_Latn', 'ilo_Latn-smo_Latn', 'ilo_Latn-sun_Latn', + 'ilo_Latn-war_Latn', 'ind_Latn-ace_Latn', 'ind_Latn-ban_Latn', + 'ind_Latn-jav_Latn', 'ind_Latn-khm_Khmr', 'ind_Latn-lao_Laoo', + 'ind_Latn-min_Latn', 'ind_Latn-mya_Mymr', 'ind_Latn-shn_Mymr', + 'ind_Latn-sun_Latn', 'jav_Latn-min_Latn', 'jav_Latn-mri_Latn', + 'jav_Latn-pag_Latn', 'jav_Latn-plt_Latn', 'jav_Latn-smo_Latn', + 'jav_Latn-sun_Latn', 'jav_Latn-war_Latn', 'kam_Latn-gaz_Latn', + 'kam_Latn-kik_Latn', 'kam_Latn-kin_Latn', 'kam_Latn-kmb_Latn', + 'kam_Latn-knc_Arab', 'kam_Latn-knc_Latn', 'kam_Latn-kon_Latn', + 'kam_Latn-lin_Latn', 'kam_Latn-lua_Latn', 'kam_Latn-lug_Latn', + 'kam_Latn-luo_Latn', 'kam_Latn-nso_Latn', 'kam_Latn-nus_Latn', + 'kam_Latn-nya_Latn', 'kam_Latn-run_Latn', 'kam_Latn-sna_Latn', + 'kam_Latn-som_Latn', 'kam_Latn-sot_Latn', 'kam_Latn-ssw_Latn', + 'kam_Latn-swh_Latn', 'kam_Latn-tir_Ethi', 'kam_Latn-tsn_Latn', + 'kam_Latn-tso_Latn', 'kam_Latn-tum_Latn', 'kam_Latn-twi_Latn', + 'kam_Latn-umb_Latn', 'kam_Latn-wol_Latn', 'kam_Latn-xho_Latn', + 'kam_Latn-yor_Latn', 'kam_Latn-zul_Latn', 'kan_Knda-kas_Arab', + 'kan_Knda-kas_Deva', 'kan_Knda-mag_Deva', 'kan_Knda-mai_Deva', + 'kan_Knda-mal_Mlym', 'kan_Knda-mar_Deva', 'kan_Knda-npi_Deva', + 'kan_Knda-ory_Orya', 'kan_Knda-pan_Guru', 'kan_Knda-san_Deva', + 'kan_Knda-sat_Beng', 'kan_Knda-sin_Sinh', 'kan_Knda-snd_Arab', + 'kan_Knda-tam_Taml', 'kan_Knda-tel_Telu', 'kan_Knda-urd_Arab', + 'kas_Arab-kas_Deva', 'kas_Arab-mag_Deva', 'kas_Arab-mai_Deva', + 'kas_Arab-mal_Mlym', 'kas_Arab-mar_Deva', 'kas_Arab-npi_Deva', + 'kas_Arab-ory_Orya', 'kas_Arab-pan_Guru', 'kas_Arab-san_Deva', + 'kas_Arab-sat_Beng', 'kas_Arab-sin_Sinh', 'kas_Arab-snd_Arab', + 'kas_Arab-tam_Taml', 'kas_Arab-tel_Telu', 'kas_Arab-urd_Arab', + 'kas_Deva-mag_Deva', 'kas_Deva-mai_Deva', 'kas_Deva-mal_Mlym', + 'kas_Deva-mar_Deva', 'kas_Deva-npi_Deva', 'kas_Deva-ory_Orya', + 'kas_Deva-pan_Guru', 'kas_Deva-san_Deva', 'kas_Deva-sat_Beng', + 'kas_Deva-sin_Sinh', 'kas_Deva-snd_Arab', 'kas_Deva-tam_Taml', + 'kas_Deva-tel_Telu', 'kas_Deva-urd_Arab', 'kat_Geor-rus_Cyrl', + 'kea_Latn-por_Latn', 'kik_Latn-gaz_Latn', 'kik_Latn-kin_Latn', + 'kik_Latn-kmb_Latn', 'kik_Latn-kon_Latn', 'kik_Latn-lin_Latn', + 'kik_Latn-lua_Latn', 'kik_Latn-lug_Latn', 'kik_Latn-luo_Latn', + 'kik_Latn-nso_Latn', 'kik_Latn-nus_Latn', 'kik_Latn-nya_Latn', + 'kik_Latn-run_Latn', 'kik_Latn-sna_Latn', 'kik_Latn-som_Latn', + 'kik_Latn-sot_Latn', 'kik_Latn-ssw_Latn', 'kik_Latn-swh_Latn', + 'kik_Latn-tir_Ethi', 'kik_Latn-tsn_Latn', 
'kik_Latn-tso_Latn', + 'kik_Latn-tum_Latn', 'kik_Latn-twi_Latn', 'kik_Latn-umb_Latn', + 'kik_Latn-wol_Latn', 'kik_Latn-xho_Latn', 'kik_Latn-yor_Latn', + 'kik_Latn-zul_Latn', 'kin_Latn-gaz_Latn', 'kin_Latn-kmb_Latn', + 'kin_Latn-kon_Latn', 'kin_Latn-lin_Latn', 'kin_Latn-lua_Latn', + 'kin_Latn-lug_Latn', 'kin_Latn-luo_Latn', 'kin_Latn-nso_Latn', + 'kin_Latn-nus_Latn', 'kin_Latn-nya_Latn', 'kin_Latn-run_Latn', + 'kin_Latn-sna_Latn', 'kin_Latn-som_Latn', 'kin_Latn-sot_Latn', + 'kin_Latn-ssw_Latn', 'kin_Latn-swh_Latn', 'kin_Latn-tir_Ethi', + 'kin_Latn-tsn_Latn', 'kin_Latn-tso_Latn', 'kin_Latn-tum_Latn', + 'kin_Latn-twi_Latn', 'kin_Latn-umb_Latn', 'kin_Latn-wol_Latn', + 'kin_Latn-xho_Latn', 'kin_Latn-yor_Latn', 'kin_Latn-zul_Latn', + 'kir_Cyrl-rus_Cyrl', 'kir_Cyrl-tat_Cyrl', 'kir_Cyrl-tuk_Latn', + 'kir_Cyrl-uig_Arab', 'kir_Cyrl-uzn_Latn', 'kmb_Latn-gaz_Latn', + 'kmb_Latn-kon_Latn', 'kmb_Latn-lin_Latn', 'kmb_Latn-lua_Latn', + 'kmb_Latn-lug_Latn', 'kmb_Latn-luo_Latn', 'kmb_Latn-nso_Latn', + 'kmb_Latn-nus_Latn', 'kmb_Latn-nya_Latn', 'kmb_Latn-por_Latn', + 'kmb_Latn-run_Latn', 'kmb_Latn-sna_Latn', 'kmb_Latn-som_Latn', + 'kmb_Latn-sot_Latn', 'kmb_Latn-ssw_Latn', 'kmb_Latn-swh_Latn', + 'kmb_Latn-tir_Ethi', 'kmb_Latn-tsn_Latn', 'kmb_Latn-tso_Latn', + 'kmb_Latn-tum_Latn', 'kmb_Latn-twi_Latn', 'kmb_Latn-umb_Latn', + 'kmb_Latn-wol_Latn', 'kmb_Latn-xho_Latn', 'kmb_Latn-yor_Latn', + 'kmb_Latn-zul_Latn', 'kmr_Latn-pbt_Arab', 'kmr_Latn-prs_Arab', + 'kmr_Latn-tgk_Cyrl', 'knc_Arab-gaz_Latn', 'knc_Arab-kik_Latn', + 'knc_Arab-kin_Latn', 'knc_Arab-kmb_Latn', 'knc_Arab-knc_Latn', + 'knc_Arab-kon_Latn', 'knc_Arab-lin_Latn', 'knc_Arab-lua_Latn', + 'knc_Arab-lug_Latn', 'knc_Arab-luo_Latn', 'knc_Arab-nso_Latn', + 'knc_Arab-nus_Latn', 'knc_Arab-nya_Latn', 'knc_Arab-run_Latn', + 'knc_Arab-sna_Latn', 'knc_Arab-som_Latn', 'knc_Arab-sot_Latn', + 'knc_Arab-ssw_Latn', 'knc_Arab-swh_Latn', 'knc_Arab-tir_Ethi', + 'knc_Arab-tsn_Latn', 'knc_Arab-tso_Latn', 'knc_Arab-tum_Latn', + 'knc_Arab-twi_Latn', 'knc_Arab-umb_Latn', 'knc_Arab-wol_Latn', + 'knc_Arab-xho_Latn', 'knc_Arab-yor_Latn', 'knc_Arab-zul_Latn', + 'knc_Latn-gaz_Latn', 'knc_Latn-kik_Latn', 'knc_Latn-kin_Latn', + 'knc_Latn-kmb_Latn', 'knc_Latn-kon_Latn', 'knc_Latn-lin_Latn', + 'knc_Latn-lua_Latn', 'knc_Latn-lug_Latn', 'knc_Latn-luo_Latn', + 'knc_Latn-nso_Latn', 'knc_Latn-nus_Latn', 'knc_Latn-nya_Latn', + 'knc_Latn-run_Latn', 'knc_Latn-sna_Latn', 'knc_Latn-som_Latn', + 'knc_Latn-sot_Latn', 'knc_Latn-ssw_Latn', 'knc_Latn-swh_Latn', + 'knc_Latn-tir_Ethi', 'knc_Latn-tsn_Latn', 'knc_Latn-tso_Latn', + 'knc_Latn-tum_Latn', 'knc_Latn-twi_Latn', 'knc_Latn-umb_Latn', + 'knc_Latn-wol_Latn', 'knc_Latn-xho_Latn', 'knc_Latn-yor_Latn', + 'knc_Latn-zul_Latn', 'kon_Latn-gaz_Latn', 'kon_Latn-lin_Latn', + 'kon_Latn-lua_Latn', 'kon_Latn-lug_Latn', 'kon_Latn-luo_Latn', + 'kon_Latn-nso_Latn', 'kon_Latn-nus_Latn', 'kon_Latn-nya_Latn', + 'kon_Latn-run_Latn', 'kon_Latn-sna_Latn', 'kon_Latn-som_Latn', + 'kon_Latn-sot_Latn', 'kon_Latn-ssw_Latn', 'kon_Latn-swh_Latn', + 'kon_Latn-tir_Ethi', 'kon_Latn-tsn_Latn', 'kon_Latn-tso_Latn', + 'kon_Latn-tum_Latn', 'kon_Latn-twi_Latn', 'kon_Latn-umb_Latn', + 'kon_Latn-wol_Latn', 'kon_Latn-xho_Latn', 'kon_Latn-yor_Latn', + 'kon_Latn-zul_Latn', 'lao_Laoo-rus_Cyrl', 'lin_Latn-gaz_Latn', + 'lin_Latn-lua_Latn', 'lin_Latn-lug_Latn', 'lin_Latn-luo_Latn', + 'lin_Latn-nso_Latn', 'lin_Latn-nus_Latn', 'lin_Latn-nya_Latn', + 'lin_Latn-run_Latn', 'lin_Latn-sna_Latn', 'lin_Latn-som_Latn', + 'lin_Latn-sot_Latn', 'lin_Latn-ssw_Latn', 'lin_Latn-swh_Latn', + 'lin_Latn-tir_Ethi', 
'lin_Latn-tsn_Latn', 'lin_Latn-tso_Latn', + 'lin_Latn-tum_Latn', 'lin_Latn-twi_Latn', 'lin_Latn-umb_Latn', + 'lin_Latn-wol_Latn', 'lin_Latn-xho_Latn', 'lin_Latn-yor_Latn', + 'lin_Latn-zul_Latn', 'ltg_Latn-rus_Cyrl', 'lua_Latn-gaz_Latn', + 'lua_Latn-lug_Latn', 'lua_Latn-luo_Latn', 'lua_Latn-nso_Latn', + 'lua_Latn-nus_Latn', 'lua_Latn-nya_Latn', 'lua_Latn-run_Latn', + 'lua_Latn-sna_Latn', 'lua_Latn-som_Latn', 'lua_Latn-sot_Latn', + 'lua_Latn-ssw_Latn', 'lua_Latn-swh_Latn', 'lua_Latn-tir_Ethi', + 'lua_Latn-tsn_Latn', 'lua_Latn-tso_Latn', 'lua_Latn-tum_Latn', + 'lua_Latn-twi_Latn', 'lua_Latn-umb_Latn', 'lua_Latn-wol_Latn', + 'lua_Latn-xho_Latn', 'lua_Latn-yor_Latn', 'lua_Latn-zul_Latn', + 'lug_Latn-gaz_Latn', 'lug_Latn-luo_Latn', 'lug_Latn-nso_Latn', + 'lug_Latn-nus_Latn', 'lug_Latn-nya_Latn', 'lug_Latn-run_Latn', + 'lug_Latn-sna_Latn', 'lug_Latn-som_Latn', 'lug_Latn-sot_Latn', + 'lug_Latn-ssw_Latn', 'lug_Latn-swh_Latn', 'lug_Latn-tir_Ethi', + 'lug_Latn-tsn_Latn', 'lug_Latn-tso_Latn', 'lug_Latn-tum_Latn', + 'lug_Latn-twi_Latn', 'lug_Latn-umb_Latn', 'lug_Latn-wol_Latn', + 'lug_Latn-xho_Latn', 'lug_Latn-yor_Latn', 'lug_Latn-zul_Latn', + 'luo_Latn-gaz_Latn', 'luo_Latn-nso_Latn', 'luo_Latn-nus_Latn', + 'luo_Latn-nya_Latn', 'luo_Latn-run_Latn', 'luo_Latn-sna_Latn', + 'luo_Latn-som_Latn', 'luo_Latn-sot_Latn', 'luo_Latn-ssw_Latn', + 'luo_Latn-swh_Latn', 'luo_Latn-tir_Ethi', 'luo_Latn-tsn_Latn', + 'luo_Latn-tso_Latn', 'luo_Latn-tum_Latn', 'luo_Latn-twi_Latn', + 'luo_Latn-umb_Latn', 'luo_Latn-wol_Latn', 'luo_Latn-xho_Latn', + 'luo_Latn-yor_Latn', 'luo_Latn-zul_Latn', 'mag_Deva-mai_Deva', + 'mag_Deva-mal_Mlym', 'mag_Deva-mar_Deva', 'mag_Deva-npi_Deva', + 'mag_Deva-ory_Orya', 'mag_Deva-pan_Guru', 'mag_Deva-san_Deva', + 'mag_Deva-sat_Beng', 'mag_Deva-sin_Sinh', 'mag_Deva-snd_Arab', + 'mag_Deva-tam_Taml', 'mag_Deva-tel_Telu', 'mag_Deva-urd_Arab', + 'mai_Deva-mal_Mlym', 'mai_Deva-mar_Deva', 'mai_Deva-npi_Deva', + 'mai_Deva-ory_Orya', 'mai_Deva-pan_Guru', 'mai_Deva-san_Deva', + 'mai_Deva-sat_Beng', 'mai_Deva-sin_Sinh', 'mai_Deva-snd_Arab', + 'mai_Deva-tam_Taml', 'mai_Deva-tel_Telu', 'mai_Deva-urd_Arab', + 'mal_Mlym-mar_Deva', 'mal_Mlym-npi_Deva', 'mal_Mlym-ory_Orya', + 'mal_Mlym-pan_Guru', 'mal_Mlym-san_Deva', 'mal_Mlym-sat_Beng', + 'mal_Mlym-sin_Sinh', 'mal_Mlym-snd_Arab', 'mal_Mlym-tam_Taml', + 'mal_Mlym-tel_Telu', 'mal_Mlym-urd_Arab', 'mar_Deva-npi_Deva', + 'mar_Deva-ory_Orya', 'mar_Deva-pan_Guru', 'mar_Deva-san_Deva', + 'mar_Deva-sat_Beng', 'mar_Deva-sin_Sinh', 'mar_Deva-snd_Arab', + 'mar_Deva-tam_Taml', 'mar_Deva-tel_Telu', 'mar_Deva-urd_Arab', + 'min_Latn-mri_Latn', 'min_Latn-pag_Latn', 'min_Latn-plt_Latn', + 'min_Latn-smo_Latn', 'min_Latn-sun_Latn', 'min_Latn-war_Latn', + 'mri_Latn-pag_Latn', 'mri_Latn-smo_Latn', 'mri_Latn-sun_Latn', + 'mri_Latn-war_Latn', 'npi_Deva-ory_Orya', 'npi_Deva-pan_Guru', + 'npi_Deva-san_Deva', 'npi_Deva-sat_Beng', 'npi_Deva-sin_Sinh', + 'npi_Deva-snd_Arab', 'npi_Deva-tam_Taml', 'npi_Deva-tel_Telu', + 'npi_Deva-urd_Arab', 'nso_Latn-gaz_Latn', 'nso_Latn-nus_Latn', + 'nso_Latn-nya_Latn', 'nso_Latn-run_Latn', 'nso_Latn-sna_Latn', + 'nso_Latn-som_Latn', 'nso_Latn-sot_Latn', 'nso_Latn-ssw_Latn', + 'nso_Latn-swh_Latn', 'nso_Latn-tir_Ethi', 'nso_Latn-tsn_Latn', + 'nso_Latn-tso_Latn', 'nso_Latn-tum_Latn', 'nso_Latn-twi_Latn', + 'nso_Latn-umb_Latn', 'nso_Latn-wol_Latn', 'nso_Latn-xho_Latn', + 'nso_Latn-yor_Latn', 'nso_Latn-zul_Latn', 'nus_Latn-gaz_Latn', + 'nus_Latn-nya_Latn', 'nus_Latn-run_Latn', 'nus_Latn-sna_Latn', + 'nus_Latn-som_Latn', 'nus_Latn-sot_Latn', 'nus_Latn-ssw_Latn', + 
'nus_Latn-swh_Latn', 'nus_Latn-tir_Ethi', 'nus_Latn-tsn_Latn', + 'nus_Latn-tso_Latn', 'nus_Latn-tum_Latn', 'nus_Latn-twi_Latn', + 'nus_Latn-umb_Latn', 'nus_Latn-wol_Latn', 'nus_Latn-xho_Latn', + 'nus_Latn-yor_Latn', 'nus_Latn-zul_Latn', 'nya_Latn-gaz_Latn', + 'nya_Latn-run_Latn', 'nya_Latn-sna_Latn', 'nya_Latn-som_Latn', + 'nya_Latn-sot_Latn', 'nya_Latn-ssw_Latn', 'nya_Latn-swh_Latn', + 'nya_Latn-tir_Ethi', 'nya_Latn-tsn_Latn', 'nya_Latn-tso_Latn', + 'nya_Latn-tum_Latn', 'nya_Latn-twi_Latn', 'nya_Latn-umb_Latn', + 'nya_Latn-wol_Latn', 'nya_Latn-xho_Latn', 'nya_Latn-yor_Latn', + 'nya_Latn-zul_Latn', 'oci_Latn-por_Latn', 'ory_Orya-pan_Guru', + 'ory_Orya-san_Deva', 'ory_Orya-sat_Beng', 'ory_Orya-sin_Sinh', + 'ory_Orya-snd_Arab', 'ory_Orya-tam_Taml', 'ory_Orya-tel_Telu', + 'ory_Orya-urd_Arab', 'pag_Latn-smo_Latn', 'pag_Latn-sun_Latn', + 'pan_Guru-san_Deva', 'pan_Guru-sat_Beng', 'pan_Guru-sin_Sinh', + 'pan_Guru-snd_Arab', 'pan_Guru-tam_Taml', 'pan_Guru-tel_Telu', + 'pan_Guru-urd_Arab', 'pbt_Arab-tam_Taml', 'pbt_Arab-tgk_Cyrl', + 'plt_Latn-mri_Latn', 'plt_Latn-pag_Latn', 'plt_Latn-smo_Latn', + 'plt_Latn-sun_Latn', 'plt_Latn-war_Latn', 'por_Latn-ayr_Latn', + 'por_Latn-quy_Latn', 'prs_Arab-pbt_Arab', 'prs_Arab-tgk_Cyrl', + 'quy_Latn-spa_Latn', 'run_Latn-sna_Latn', 'run_Latn-som_Latn', + 'run_Latn-sot_Latn', 'run_Latn-ssw_Latn', 'run_Latn-swh_Latn', + 'run_Latn-tir_Ethi', 'run_Latn-tsn_Latn', 'run_Latn-tso_Latn', + 'run_Latn-tum_Latn', 'run_Latn-twi_Latn', 'run_Latn-umb_Latn', + 'run_Latn-wol_Latn', 'run_Latn-xho_Latn', 'run_Latn-yor_Latn', + 'run_Latn-zul_Latn', 'rus_Cyrl-tat_Cyrl', 'rus_Cyrl-tgk_Cyrl', + 'san_Deva-sat_Beng', 'san_Deva-sin_Sinh', 'san_Deva-snd_Arab', + 'san_Deva-tam_Taml', 'san_Deva-tel_Telu', 'san_Deva-urd_Arab', + 'sat_Beng-sin_Sinh', 'sat_Beng-snd_Arab', 'sat_Beng-tam_Taml', + 'sat_Beng-tel_Telu', 'sat_Beng-urd_Arab', 'sin_Sinh-snd_Arab', + 'sin_Sinh-tam_Taml', 'sin_Sinh-tel_Telu', 'sin_Sinh-urd_Arab', + 'smo_Latn-sun_Latn', 'smo_Latn-war_Latn', 'sna_Latn-som_Latn', + 'sna_Latn-sot_Latn', 'sna_Latn-ssw_Latn', 'sna_Latn-swh_Latn', + 'sna_Latn-tir_Ethi', 'sna_Latn-tsn_Latn', 'sna_Latn-tso_Latn', + 'sna_Latn-tum_Latn', 'sna_Latn-twi_Latn', 'sna_Latn-umb_Latn', + 'sna_Latn-wol_Latn', 'sna_Latn-xho_Latn', 'sna_Latn-yor_Latn', + 'sna_Latn-zul_Latn', 'snd_Arab-tam_Taml', 'snd_Arab-tel_Telu', + 'snd_Arab-urd_Arab', 'som_Latn-sot_Latn', 'som_Latn-ssw_Latn', + 'som_Latn-swh_Latn', 'som_Latn-tir_Ethi', 'som_Latn-tsn_Latn', + 'som_Latn-tso_Latn', 'som_Latn-tum_Latn', 'som_Latn-twi_Latn', + 'som_Latn-umb_Latn', 'som_Latn-wol_Latn', 'som_Latn-xho_Latn', + 'som_Latn-yor_Latn', 'som_Latn-zul_Latn', 'sot_Latn-ssw_Latn', + 'sot_Latn-swh_Latn', 'sot_Latn-tir_Ethi', 'sot_Latn-tsn_Latn', + 'sot_Latn-tso_Latn', 'sot_Latn-tum_Latn', 'sot_Latn-twi_Latn', + 'sot_Latn-umb_Latn', 'sot_Latn-wol_Latn', 'sot_Latn-xho_Latn', + 'sot_Latn-yor_Latn', 'sot_Latn-zul_Latn', 'ssw_Latn-swh_Latn', + 'ssw_Latn-tir_Ethi', 'ssw_Latn-tsn_Latn', 'ssw_Latn-tso_Latn', + 'ssw_Latn-tum_Latn', 'ssw_Latn-twi_Latn', 'ssw_Latn-umb_Latn', + 'ssw_Latn-wol_Latn', 'ssw_Latn-xho_Latn', 'ssw_Latn-yor_Latn', + 'ssw_Latn-zul_Latn', 'sun_Latn-war_Latn', 'swh_Latn-tir_Ethi', + 'swh_Latn-tsn_Latn', 'swh_Latn-tso_Latn', 'swh_Latn-tum_Latn', + 'swh_Latn-twi_Latn', 'swh_Latn-umb_Latn', 'swh_Latn-wol_Latn', + 'swh_Latn-xho_Latn', 'swh_Latn-yor_Latn', 'swh_Latn-zul_Latn', + 'tam_Taml-tel_Telu', 'tam_Taml-urd_Arab', 'tat_Cyrl-tuk_Latn', + 'tat_Cyrl-uig_Arab', 'tat_Cyrl-uzn_Latn', 'tel_Telu-urd_Arab', + 'tir_Ethi-tsn_Latn', 'tir_Ethi-tso_Latn', 
'tir_Ethi-tum_Latn', + 'tir_Ethi-twi_Latn', 'tir_Ethi-umb_Latn', 'tir_Ethi-wol_Latn', + 'tir_Ethi-xho_Latn', 'tir_Ethi-yor_Latn', 'tir_Ethi-zul_Latn', + 'tsn_Latn-tso_Latn', 'tsn_Latn-tum_Latn', 'tsn_Latn-twi_Latn', + 'tsn_Latn-umb_Latn', 'tsn_Latn-wol_Latn', 'tsn_Latn-xho_Latn', + 'tsn_Latn-yor_Latn', 'tsn_Latn-zul_Latn', 'tso_Latn-tum_Latn', + 'tso_Latn-twi_Latn', 'tso_Latn-umb_Latn', 'tso_Latn-wol_Latn', + 'tso_Latn-xho_Latn', 'tso_Latn-yor_Latn', 'tso_Latn-zul_Latn', + 'tuk_Latn-uig_Arab', 'tuk_Latn-uzn_Latn', 'tum_Latn-twi_Latn', + 'tum_Latn-umb_Latn', 'tum_Latn-wol_Latn', 'tum_Latn-xho_Latn', + 'tum_Latn-yor_Latn', 'tum_Latn-zul_Latn', 'twi_Latn-umb_Latn', + 'twi_Latn-wol_Latn', 'twi_Latn-xho_Latn', 'twi_Latn-yor_Latn', + 'twi_Latn-zul_Latn', 'uig_Arab-uzn_Latn', 'umb_Latn-wol_Latn', + 'umb_Latn-xho_Latn', 'umb_Latn-yor_Latn', 'umb_Latn-zul_Latn', + 'wol_Latn-xho_Latn', 'wol_Latn-yor_Latn', 'wol_Latn-zul_Latn', + 'xho_Latn-yor_Latn', 'xho_Latn-zul_Latn', 'yor_Latn-zul_Latn' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + self.download_subset('nllb', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_universal_dependencies(self): + subset = [ + 'af_afribooms', 'akk_pisandub', 'akk_riao', 'aqz_tudet', 'sq_tsa', + 'am_att', 'grc_perseus', 'grc_proiel', 'apu_ufpa', 'ar_nyuad', + 'ar_padt', 'ar_pud', 'hy_armtdp', 'aii_as', 'bm_crb', 'eu_bdt', + 'be_hse', 'bho_bhtb', 'br_keb', 'bg_btb', 'bxr_bdt', 'yue_hk', + 'ca_ancora', 'zh_cfl', 'zh_gsd', 'zh_gsdsimp', 'zh_hk', 'zh_pud', + 'ckt_hse', 'lzh_kyoto', 'cop_scriptorium', 'hr_set', 'cs_cac', + 'cs_cltt', 'cs_fictree', 'cs_pdt', 'cs_pud', 'da_ddt', 'nl_alpino', + 'nl_lassysmall', 'en_esl', 'en_ewt', 'en_gum', 'en_gumreddit', + 'en_lines', 'en_partut', 'en_pronouns', 'en_pud', 'myv_jr', + 'et_edt', 'et_ewt', 'fo_farpahc', 'fo_oft', 'fi_ftb', 'fi_ood', + 'fi_pud', 'fi_tdt', 'fr_fqb', 'fr_ftb', 'fr_gsd', 'fr_partut', + 'fr_pud', 'fr_sequoia', 'fr_spoken', 'gl_ctg', 'gl_treegal', + 'de_gsd', 'de_hdt', 'de_lit', 'de_pud', 'got_proiel', 'el_gdt', + 'he_htb', 'qhe_hiencs', 'hi_hdtb', 'hi_pud', 'hu_szeged', + 'is_icepahc', 'is_pud', 'id_csui', 'id_gsd', 'id_pud', 'ga_idt', + 'it_isdt', 'it_partut', 'it_postwita', 'it_pud', 'it_twittiro', + 'it_vit', 'ja_bccwj', 'ja_gsd', 'ja_modern', 'ja_pud', 'krl_kkpp', + 'kk_ktb', 'kfm_aha', 'koi_uh', 'kpv_ikdp', 'kpv_lattice', 'ko_gsd', + 'ko_kaist', 'ko_pud', 'kmr_mg', 'la_ittb', 'la_llct', 'la_perseus', + 'la_proiel', 'lv_lvtb', 'lt_alksnis', 'lt_hse', 'olo_kkpp', + 'mt_mudt', 'gv_cadhan', 'mr_ufal', 'gun_dooley', 'gun_thomas', + 'mdf_jr', 'myu_tudet', 'pcm_nsc', 'nyq_aha', 'sme_giella', + 'no_bokmaal', 'no_nynorsk', 'no_nynorsklia', 'cu_proiel', + 'fro_srcmf', 'orv_rnc', 'orv_torot', 'otk_tonqq', 'fa_perdt', + 'fa_seraji', 'pl_lfg', 'pl_pdb', 'pl_pud', 'pt_bosque', 'pt_gsd', + 'pt_pud', 'ro_nonstandard', 'ro_rrt', 'ro_simonero', 'ru_gsd', + 'ru_pud', 'ru_syntagrus', 'ru_taiga', 'sa_ufal', 'sa_vedic', + 'gd_arcosg', 'sr_set', 'sms_giellagas', 'sk_snk', 'sl_ssj', + 'sl_sst', 'soj_aha', 'ajp_madar', 'es_ancora', 'es_gsd', 'es_pud', + 'swl_sslc', 'sv_lines', 'sv_pud', 'sv_talbanken', 'gsw_uzh', + 'tl_trg', 'tl_ugnayan', 'ta_mwtt', 'ta_ttb', 'te_mtg', 'th_pud', + 'tpn_tudet', 'qtd_sagt', 'tr_boun', 'tr_gb', 'tr_imst', 'tr_pud', + 'uk_iu', 'hsb_ufal', 'ur_udtb', 'ug_udt', 'vi_vtb', 'wbp_ufal', + 'cy_ccg', 'wo_wtb', 'yo_ytb' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + 
self.download_subset('universal_dependencies', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_imdb(self): + dataset = MsDataset.load('imdb') + if isinstance(dataset, MsDataset): + lens = len(dataset) + print(f'dataset imdb len: {lens}') + self.assertTrue(lens > 0) + else: + assert isinstance(dataset, dict) + lens = {key: len(subset) for key, subset in dataset.items()} + print(f'dataset imdb len: {lens}') + self.assertTrue(all([_len > 0 for _len in lens.values()])) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_clue(self): + subset = [ + 'afqmc', 'tnews', 'iflytek', 'cmnli', 'cluewsc2020', 'csl', + 'cmrc2018', 'drcd', 'chid', 'c3', 'ocnli', 'diagnostics' + ] + for subset_name in subset: + self.download_subset('clue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_wikitext(self): + subset = [ + 'wikitext-103-v1', 'wikitext-2-v1', 'wikitext-103-raw-v1', + 'wikitext-2-raw-v1' + ] + for subset_name in subset: + self.download_subset('wikitext', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_xnli(self): + subset = [ + 'XNLI', 'tydiqa', 'SQuAD', 'PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', + 'PAN-X.bn', 'PAN-X.de', 'PAN-X.el', 'PAN-X.en', 'PAN-X.es', + 'PAN-X.et', 'PAN-X.eu', 'PAN-X.fa', 'PAN-X.fi', 'PAN-X.fr', + 'PAN-X.he', 'PAN-X.hi', 'PAN-X.hu', 'PAN-X.id', 'PAN-X.it', + 'PAN-X.ja', 'PAN-X.jv', 'PAN-X.ka', 'PAN-X.kk', 'PAN-X.ko', + 'PAN-X.ml', 'PAN-X.mr', 'PAN-X.ms', 'PAN-X.my', 'PAN-X.nl', + 'PAN-X.pt', 'PAN-X.ru', 'PAN-X.sw', 'PAN-X.ta', 'PAN-X.te', + 'PAN-X.th', 'PAN-X.tl', 'PAN-X.tr', 'PAN-X.ur', 'PAN-X.vi', + 'PAN-X.yo', 'PAN-X.zh', 'MLQA.ar.ar', 'MLQA.ar.de', 'MLQA.ar.vi', + 'MLQA.ar.zh', 'MLQA.ar.en', 'MLQA.ar.es', 'MLQA.ar.hi', + 'MLQA.de.ar', 'MLQA.de.de', 'MLQA.de.vi', 'MLQA.de.zh', + 'MLQA.de.en', 'MLQA.de.es', 'MLQA.de.hi', 'MLQA.vi.ar', + 'MLQA.vi.de', 'MLQA.vi.vi', 'MLQA.vi.zh', 'MLQA.vi.en', + 'MLQA.vi.es', 'MLQA.vi.hi', 'MLQA.zh.ar', 'MLQA.zh.de', + 'MLQA.zh.vi', 'MLQA.zh.zh', 'MLQA.zh.en', 'MLQA.zh.es', + 'MLQA.zh.hi', 'MLQA.en.ar', 'MLQA.en.de', 'MLQA.en.vi', + 'MLQA.en.zh', 'MLQA.en.en', 'MLQA.en.es', 'MLQA.en.hi', + 'MLQA.es.ar', 'MLQA.es.de', 'MLQA.es.vi', 'MLQA.es.zh', + 'MLQA.es.en', 'MLQA.es.es', 'MLQA.es.hi', 'MLQA.hi.ar', + 'MLQA.hi.de', 'MLQA.hi.vi', 'MLQA.hi.zh', 'MLQA.hi.en', + 'MLQA.hi.es', 'MLQA.hi.hi', 'XQuAD.ar', 'XQuAD.de', 'XQuAD.vi', + 'XQuAD.zh', 'XQuAD.en', 'XQuAD.es', 'XQuAD.hi', 'XQuAD.el', + 'XQuAD.ru', 'XQuAD.th', 'XQuAD.tr', 'bucc18.de', 'bucc18.fr', + 'bucc18.zh', 'bucc18.ru', 'PAWS-X.de', 'PAWS-X.en', 'PAWS-X.es', + 'PAWS-X.fr', 'PAWS-X.ja', 'PAWS-X.ko', 'PAWS-X.zh', 'tatoeba.afr', + 'tatoeba.ara', 'tatoeba.ben', 'tatoeba.bul', 'tatoeba.deu', + 'tatoeba.cmn', 'tatoeba.ell', 'tatoeba.est', 'tatoeba.eus', + 'tatoeba.fin', 'tatoeba.fra', 'tatoeba.heb', 'tatoeba.hin', + 'tatoeba.hun', 'tatoeba.ind', 'tatoeba.ita', 'tatoeba.jav', + 'tatoeba.jpn', 'tatoeba.kat', 'tatoeba.kaz', 'tatoeba.kor', + 'tatoeba.mal', 'tatoeba.mar', 'tatoeba.nld', 'tatoeba.pes', + 'tatoeba.por', 'tatoeba.rus', 'tatoeba.spa', 'tatoeba.swh', + 'tatoeba.tam', 'tatoeba.tel', 'tatoeba.tgl', 'tatoeba.tha', + 'tatoeba.tur', 'tatoeba.urd', 'tatoeba.vie', 'udpos.Afrikaans', + 'udpos.Arabic', 'udpos.Basque', 'udpos.Bulgarian', 'udpos.Dutch', + 'udpos.English', 'udpos.Estonian', 'udpos.Finnish', 'udpos.French', + 'udpos.German', 'udpos.Greek', 'udpos.Hebrew', 
'udpos.Hindi', + 'udpos.Hungarian', 'udpos.Indonesian', 'udpos.Italian', + 'udpos.Japanese', 'udpos.Kazakh', 'udpos.Korean', 'udpos.Chinese', + 'udpos.Marathi', 'udpos.Persian', 'udpos.Portuguese', + 'udpos.Russian', 'udpos.Spanish', 'udpos.Tagalog', 'udpos.Tamil', + 'udpos.Telugu', 'udpos.Thai', 'udpos.Turkish', 'udpos.Urdu', + 'udpos.Vietnamese', 'udpos.Yoruba' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + self.download_subset('xtreme', subset_name) diff --git a/tests/models/test_deberta_v2_backbone.py b/tests/models/test_deberta_v2_backbone.py new file mode 100644 index 00000000..706b18f8 --- /dev/null +++ b/tests/models/test_deberta_v2_backbone.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +from modelscope.models import Model +from modelscope.models.nlp.deberta_v2 import (DebertaV2ForMaskedLM, + DebertaV2Model) +from modelscope.utils.constant import Tasks + + +class DebertaV2BackboneTest(unittest.TestCase): + + def test_load_model(self): + model = Model.from_pretrained( + 'damo/nlp_debertav2_fill-mask_chinese-lite') + self.assertTrue(model.__class__ == DebertaV2ForMaskedLM) + model = Model.from_pretrained( + 'damo/nlp_debertav2_fill-mask_chinese-lite', task=Tasks.backbone) + self.assertTrue(model.__class__ == DebertaV2Model) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/outputs/__init__.py b/tests/outputs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/outputs/test_model_outputs.py b/tests/outputs/test_model_outputs.py new file mode 100644 index 00000000..31271869 --- /dev/null +++ b/tests/outputs/test_model_outputs.py @@ -0,0 +1,30 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import torch + +from modelscope.outputs import TextClassificationModelOutput +from modelscope.utils.test_utils import test_level + + +class TestModelOutput(unittest.TestCase): + + def setUp(self): + pass + + def tearDown(self): + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_model_outputs(self): + outputs = TextClassificationModelOutput(logits=torch.Tensor([1])) + self.assertEqual(outputs['logits'], torch.Tensor([1])) + self.assertEqual(outputs[0], torch.Tensor([1])) + self.assertEqual(outputs.logits, torch.Tensor([1])) + logits, loss = outputs + self.assertEqual(logits, torch.Tensor([1])) + self.assertTrue(loss is None) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/nlp/test_faq.py b/tests/pipelines/nlp/test_faq.py new file mode 100644 index 00000000..8bac55d4 --- /dev/null +++ b/tests/pipelines/nlp/test_faq.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SbertForFaqRanking, SbertForFaqRetrieval +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FaqPipeline +from modelscope.preprocessors import FaqPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class FaqTest(unittest.TestCase): + model_id = '/Users/tanfan/Desktop/Workdir/Gitlab/maas/MaaS-lib/.faq_test_model' + param = { + 'query_set': ['明天星期几', '今天星期六', '今天星期六'], + 'support_set': [{ + 'text': '今天星期六', + 'label': 'label0' + }, { + 'text': '明天星期几', + 'label': 'label1' + }] + } + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + # def test_run_with_direct_file_download(self): + # cache_path = self.model_id # snapshot_download(self.model_id) + # preprocessor = FaqPreprocessor(cache_path) + # model = SbertForFaq(cache_path) + # pipeline_ins = FaqPipeline(model, preprocessor=preprocessor) + # + # result = pipeline_ins(self.param) + # print(result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = FaqPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.faq, model=model, preprocessor=preprocessor) + result = pipeline_ins(self.param) + print(result) + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + # def test_run_with_model_name(self): + # pipeline_ins = pipeline(task=Tasks.faq, model=self.model_id) + # result = pipeline_ins(self.param) + # print(result) + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + # def test_run_with_default_model(self): + # pipeline_ins = pipeline(task=Tasks.faq) + # print(pipeline_ins(self.param)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_conversational_text_to_sql.py b/tests/pipelines/test_conversational_text_to_sql.py index 21a4e0ce..17fffcaf 100644 --- a/tests/pipelines/test_conversational_text_to_sql.py +++ b/tests/pipelines/test_conversational_text_to_sql.py @@ -9,7 +9,8 @@ from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline from modelscope.preprocessors import ConversationalTextToSqlPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.nlp.nlp_utils import text2sql_tracking_and_print_results +from modelscope.utils.nlp.space_T_en.utils import \ + text2sql_tracking_and_print_results from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_dialog_intent_prediction.py b/tests/pipelines/test_dialog_intent_prediction.py index 5894297f..2ee46388 100644 --- a/tests/pipelines/test_dialog_intent_prediction.py +++ b/tests/pipelines/test_dialog_intent_prediction.py @@ -25,7 +25,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download(self.model_id, revision='update') + cache_path = snapshot_download(self.model_id) preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path) model = SpaceForDialogIntent( model_dir=cache_path, @@ -46,7 +46,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): 
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id, revision='update') + model = Model.from_pretrained(self.model_id) preprocessor = DialogIntentPredictionPreprocessor( model_dir=model.model_dir) @@ -64,10 +64,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=self.task, model=self.model_id, model_revision='update') - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] for my_pipeline, item in list(zip(pipelines, self.test_case)): print(my_pipeline(item)) diff --git a/tests/pipelines/test_dialog_modeling.py b/tests/pipelines/test_dialog_modeling.py index 19d6ed2f..6b6259ce 100644 --- a/tests/pipelines/test_dialog_modeling.py +++ b/tests/pipelines/test_dialog_modeling.py @@ -115,8 +115,7 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download( - self.model_id, revision='task_oriented_conversation') + cache_path = snapshot_download(self.model_id) preprocessor = DialogModelingPreprocessor(model_dir=cache_path) model = SpaceForDialogModeling( @@ -130,8 +129,7 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained( - self.model_id, revision='task_oriented_conversation') + model = Model.from_pretrained(self.model_id) preprocessor = DialogModelingPreprocessor(model_dir=model.model_dir) pipelines = [ @@ -142,20 +140,12 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=self.task, - model=self.model_id, - model_revision='task_oriented_conversation') - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] self.generate_and_print_dialog_response(pipelines) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipelines = [ - pipeline( - task=self.task, model_revision='task_oriented_conversation') - ] + pipelines = [pipeline(task=self.task)] self.generate_and_print_dialog_response(pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') diff --git a/tests/pipelines/test_dialog_state_tracking.py b/tests/pipelines/test_dialog_state_tracking.py index 81bdd9be..6cdd5ee7 100644 --- a/tests/pipelines/test_dialog_state_tracking.py +++ b/tests/pipelines/test_dialog_state_tracking.py @@ -3,13 +3,14 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SpaceForDialogStateTracking +from modelscope.models.nlp import SpaceForDST from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import DialogStateTrackingPipeline from modelscope.preprocessors import DialogStateTrackingPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.nlp.nlp_utils import tracking_and_print_dialog_states +from modelscope.utils.nlp.space.utils_dst 
import \
+    tracking_and_print_dialog_states
 from modelscope.utils.test_utils import test_level
@@ -85,9 +86,9 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
-        cache_path = snapshot_download(self.model_id, revision='update')
+        cache_path = snapshot_download(self.model_id)
 
-        model = SpaceForDialogStateTracking(cache_path)
+        model = SpaceForDST.from_pretrained(cache_path)
         preprocessor = DialogStateTrackingPreprocessor(model_dir=cache_path)
         pipelines = [
             DialogStateTrackingPipeline(
@@ -101,7 +102,7 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id, revision='update')
+        model = Model.from_pretrained(self.model_id)
         preprocessor = DialogStateTrackingPreprocessor(
             model_dir=model.model_dir)
 
@@ -115,10 +116,7 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
-        pipelines = [
-            pipeline(
-                task=self.task, model=self.model_id, model_revision='update')
-        ]
+        pipelines = [pipeline(task=self.task, model=self.model_id)]
         tracking_and_print_dialog_states(self.test_case, pipelines)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py
index 7eea0ddf..2f66f516 100644
--- a/tests/pipelines/test_faq_question_answering.py
+++ b/tests/pipelines/test_faq_question_answering.py
@@ -47,9 +47,9 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
-        preprocessor = FaqQuestionAnsweringPreprocessor(cache_path)
-        model = SbertForFaqQuestionAnswering(cache_path)
-        model.load_checkpoint(cache_path)
+        preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained(
+            cache_path)
+        model = SbertForFaqQuestionAnswering.from_pretrained(cache_path)
         pipeline_ins = FaqQuestionAnsweringPipeline(
             model, preprocessor=preprocessor)
         result = pipeline_ins(self.param)
diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index 0e5e242b..568865c6 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -5,8 +5,7 @@ from regex import R
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM,
-                                   VecoForMaskedLM)
+from modelscope.models.nlp import SbertForMaskedLM, VecoForMaskedLM
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import FillMaskPipeline
 from modelscope.preprocessors import NLPPreprocessor
@@ -55,7 +54,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
             model_dir = snapshot_download(self.model_id_sbert[language])
             preprocessor = NLPPreprocessor(
                 model_dir, first_sequence='sentence', second_sequence=None)
-            model = StructBertForMaskedLM.from_pretrained(model_dir)
+            model = SbertForMaskedLM.from_pretrained(model_dir)
             pipeline1 = FillMaskPipeline(model, preprocessor)
             pipeline2 = pipeline(
                 Tasks.fill_mask, model=model, preprocessor=preprocessor)
@@ -130,18 +129,6 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
                 f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
                 f'{pipeline_ins(test_input)}\n')
 
-        # bert
-        language = 'zh'
-        model = Model.from_pretrained(self.model_id_bert, revision='beta')
-        preprocessor = NLPPreprocessor(
-            model.model_dir, first_sequence='sentence', second_sequence=None)
-        pipeline_ins = pipeline(
-            Tasks.fill_mask, model=model, preprocessor=preprocessor)
-        pipeline_ins.model, f'fill_mask_bert_{language}'
-        print(
-            f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
-            f'{pipeline_ins(self.test_inputs[language])}\n')
-
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         # veco
diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py
index db4b9912..5f2dcb25 100644
--- a/tests/pipelines/test_nli.py
+++ b/tests/pipelines/test_nli.py
@@ -5,7 +5,7 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.pipelines.nlp import TextClassificationPipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
@@ -27,9 +27,8 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = SequenceClassificationPreprocessor(cache_path)
-        model = SbertForSequenceClassification.from_pretrained(cache_path)
-        pipeline1 = SequenceClassificationPipeline(
-            model, preprocessor=tokenizer)
+        model = Model.from_pretrained(cache_path)
+        pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer)
         print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
               f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}')
diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py
index 61cdfe73..038a90f0 100644
--- a/tests/pipelines/test_part_of_speech.py
+++ b/tests/pipelines/test_part_of_speech.py
@@ -23,7 +23,7 @@ class PartOfSpeechTest(unittest.TestCase):
         model = TokenClassificationModel.from_pretrained(cache_path)
         pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
-            Tasks.token_classification, model=model, preprocessor=tokenizer)
+            Tasks.part_of_speech, model=model, preprocessor=tokenizer)
         print(f'sentence: {self.sentence}\n'
               f'pipeline1:{pipeline1(input=self.sentence)}')
         print()
diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py
index 739dd7ab..e96724a8 100644
--- a/tests/pipelines/test_sentence_embedding.py
+++ b/tests/pipelines/test_sentence_embedding.py
@@ -4,7 +4,7 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import SentenceEmbedding
+from modelscope.models.nlp import BertForSentenceEmbedding
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import SentenceEmbeddingPipeline
 from modelscope.preprocessors import SentenceEmbeddingPreprocessor
@@ -40,7 +40,7 @@ class SentenceEmbeddingTest(unittest.TestCase):
     def test_run_by_direct_model_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = SentenceEmbeddingPreprocessor(cache_path)
-        model = SentenceEmbedding.from_pretrained(cache_path)
+        model = BertForSentenceEmbedding.from_pretrained(cache_path)
         pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.sentence_embedding, model=model, preprocessor=tokenizer)
diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py
index 288d38c7..76db0a8f 100644
--- a/tests/pipelines/test_sentence_similarity.py
+++ b/tests/pipelines/test_sentence_similarity.py
@@ -5,7 +5,7 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.pipelines.nlp import TextClassificationPipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
@@ -28,8 +28,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck):
         cache_path = snapshot_download(self.model_id)
         tokenizer = SequenceClassificationPreprocessor(cache_path)
         model = SbertForSequenceClassification.from_pretrained(cache_path)
-        pipeline1 = SequenceClassificationPipeline(
-            model, preprocessor=tokenizer)
+        pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.sentence_similarity, model=model, preprocessor=tokenizer)
         print('test1')
diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py
index d0b1b40f..b3d9b9d6 100644
--- a/tests/pipelines/test_sentiment_classification.py
+++ b/tests/pipelines/test_sentiment_classification.py
@@ -6,7 +6,7 @@ from modelscope.models import Model
 from modelscope.models.nlp.task_models.sequence_classification import \
     SequenceClassificationModel
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.pipelines.nlp import TextClassificationPipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
@@ -28,8 +28,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase,
         tokenizer = SequenceClassificationPreprocessor(cache_path)
         model = SequenceClassificationModel.from_pretrained(
             self.model_id, num_labels=2, revision='beta')
-        pipeline1 = SequenceClassificationPipeline(
-            model, preprocessor=tokenizer)
+        pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.text_classification, model=model, preprocessor=tokenizer)
         print(f'sentence1: {self.sentence1}\n'
diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py
index 44f1531b..eece7f57 100644
--- a/tests/pipelines/test_table_question_answering.py
+++ b/tests/pipelines/test_table_question_answering.py
@@ -13,7 +13,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline
 from modelscope.preprocessors import TableQuestionAnsweringPreprocessor
-from modelscope.preprocessors.space_T_cn.fields.database import Database
+from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
 
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
new file mode 100644
index 00000000..5b38e116
--- /dev/null
+++ b/tests/pipelines/test_text_classification.py
@@ -0,0 +1,100 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import TextClassificationPipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+    sentence1 = 'i like this wonderful place'
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/bert-base-sst2'
+        self.task = Tasks.text_classification
+
+    def predict(self, pipeline_ins: TextClassificationPipeline):
+        from easynlp.appzoo import load_dataset
+
+        set = load_dataset('glue', 'sst2')
+        data = set['test']['sentence'][:3]
+
+        results = pipeline_ins(data[0])
+        print(results)
+        results = pipeline_ins(data[1])
+        print(results)
+
+        print(data)
+
+    def printDataset(self, dataset: MsDataset):
+        for i, r in enumerate(dataset):
+            if i > 10:
+                break
+            print(r)
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skip('nlp model does not support tensor input, skipped')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = SequenceClassificationPreprocessor(
+            model.model_dir, first_sequence='sentence', second_sequence=None)
+        pipeline_ins = pipeline(
+            task=Tasks.text_classification,
+            model=model,
+            preprocessor=preprocessor)
+        print(f'sentence1: {self.sentence1}\n'
+              f'pipeline1:{pipeline_ins(input=self.sentence1)}')
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skip('nlp model does not support tensor input, skipped')
+    def test_run_with_model_name(self):
+        text_classification = pipeline(
+            task=Tasks.text_classification, model=self.model_id)
+        result = text_classification(
+            MsDataset.load(
+                'xcopa',
+                subset_name='translation-et',
+                namespace='damotest',
+                split='test',
+                target='premise'))
+        self.printDataset(result)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('nlp model does not support tensor input, skipped')
+    def test_run_with_default_model(self):
+        text_classification = pipeline(task=Tasks.text_classification)
+        result = text_classification(
+            MsDataset.load(
+                'xcopa',
+                subset_name='translation-et',
+                namespace='damotest',
+                split='test',
+                target='premise'))
+        self.printDataset(result)
+
+    # @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skip('nlp model does not support tensor input, skipped')
+    def test_run_with_modelscope_dataset(self):
+        text_classification = pipeline(task=Tasks.text_classification)
+        # loaded from modelscope dataset
+        dataset = MsDataset.load(
+            'xcopa',
+            subset_name='translation-et',
+            namespace='damotest',
+            split='test',
+            target='premise')
+        result = text_classification(dataset)
+        self.printDataset(result)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py
index 57fa809c..0b43e8b4 100644
--- a/tests/pipelines/test_text_ranking.py
+++ b/tests/pipelines/test_text_ranking.py
@@ -4,7 +4,7 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import TextRanking
+from modelscope.models.nlp import BertForTextRanking
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextRankingPipeline
 from modelscope.preprocessors import TextRankingPreprocessor
@@ -33,7 +33,7 @@ class TextRankingTest(unittest.TestCase):
         for model_id in self.models:
             cache_path = snapshot_download(model_id)
             tokenizer = TextRankingPreprocessor(cache_path)
-            model = TextRanking.from_pretrained(cache_path)
+            model = BertForTextRanking.from_pretrained(cache_path)
             pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer)
             pipeline2 = pipeline(
                 Tasks.text_ranking, model=model, preprocessor=tokenizer)
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
index aa8aba5c..ae780793 100644
--- a/tests/trainers/test_finetune_sequence_classification.py
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -8,7 +8,7 @@ from modelscope.metainfo import Preprocessors, Trainers
 from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.trainers import build_trainer
+from modelscope.trainers import NlpTrainerArguments, build_trainer
 from modelscope.trainers.hooks import Hook
 from modelscope.trainers.nlp_trainer import (EpochBasedTrainer,
                                              NlpEpochBasedTrainer)
@@ -38,6 +38,52 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_cfg_class(self):
+        dataset = MsDataset.load('clue', subset_name='tnews')
+        train_dataset = dataset['train']
+        validation_dataset = dataset['validation']
+        cfg_modify_fn = NlpTrainerArguments(
+            task=Tasks.text_classification,
+            preprocessor_type=Preprocessors.sen_cls_tokenizer,
+            train_first_sequence='sentence',
+            train_label='label',
+            labels=[
+                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+                '12', '13', '14'
+            ],
+            max_epochs=5,
+            optimizer_args={
+                'lr': 3e-5,
+            },
+            lr_scheduler_args={
+                'total_iters': int(len(train_dataset) / 32) * 5,
+            },
+            checkpoint_saving_type='BestCkptSaverHook',
+            metric_key='accuracy',
+            train_batch_size_per_gpu=32,
+            checkpoint_interval=1,
+            train_workers_per_gpu=0,
+            checkpoint_by_epoch=False,
+            evaluation_interval=1,
+            evaluation_by_epoch=False,
+            eval_workers_per_gpu=0,
+            metrics=['seq-cls-metric'],
+        )
+
+        kwargs = dict(
+            model='damo/nlp_structbert_backbone_base_std',
+            train_dataset=train_dataset,
+            eval_dataset=validation_dataset,
+            work_dir=self.tmp_dir,
+            seed=42,
+            cfg_modify_fn=cfg_modify_fn)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+
     @unittest.skip(
         'Skip testing trainer repeatable, because it\'s unstable in daily UT')
     def test_trainer_repeatable(self):
@@ -330,7 +376,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
                 2,
                 'dataloader': {
                     'batch_size_per_gpu': 16,
-                    'workers_per_gpu': 1
+                    'workers_per_gpu': 0
                 },
                 'optimizer': {
                     'type': 'AdamW',
@@ -351,7 +397,6 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
                 'hooks': [{
                     'type': 'CheckpointHook',
                     'interval': 1,
-                    'save_dir': '/root'
                 }, {
                     'type': 'TextLoggerHook',
                     'interval': 1
@@ -366,7 +411,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             cfg['evaluation'] = {
                 'dataloader': {
                     'batch_size_per_gpu': 128,
-                    'workers_per_gpu': 1,
+                    'workers_per_gpu': 0,
                     'shuffle': False
                 }
             }
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index 5b0c9982..9380ad0f 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -7,8 +7,7 @@ import unittest
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Metrics
 from modelscope.models.base import Model
-from modelscope.models.nlp.sequence_classification import \
-    SbertForSequenceClassification
+from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
 from modelscope.trainers import EpochBasedTrainer, build_trainer