diff --git a/configs/nlp/sbert_sentence_similarity.json b/configs/nlp/sbert_sentence_similarity.json index 1e2bdef5..9320e0d7 100644 --- a/configs/nlp/sbert_sentence_similarity.json +++ b/configs/nlp/sbert_sentence_similarity.json @@ -2,7 +2,7 @@ "framework": "pytorch", "task": "sentence-similarity", "preprocessor": { - "type": "bert-seq-cls-tokenizer-finetune", + "type": "sen-sim-tokenizer", "first_sequence": "sentence1", "second_sequence": "sentence2" }, diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 8f6e7483..fff88cca 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -4,7 +4,7 @@ from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_URL_SCHEME) -from modelscope.utils.utils import get_default_cache_dir +from modelscope.utils.file_utils import get_default_cache_dir def model_id_to_group_owner_name(model_id): diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 215233fe..e0326baa 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -53,6 +53,10 @@ class TaskModels(object): class Heads(object): # nlp heads text_classification = 'text-classification' + # mlm + bert_mlm = 'bert-mlm' + # roberta mlm + roberta_mlm = 'roberta-mlm' class Pipelines(object): @@ -137,7 +141,7 @@ class Trainers(object): Holds the standard trainer name to use for identifying different trainer. This should be used to register trainers. - For a general Trainer, you can use easynlp-trainer/ofa-trainer/sofa-trainer. + For a general Trainer, you can use easynlp-trainer/ofa-trainer. For a model specific Trainer, you can use ${ModelName}-${Task}-trainer. """ @@ -179,6 +183,8 @@ class Preprocessors(object): sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer' zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer' text_error_correction = 'text-error-correction' + word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' + fill_mask = 'fill-mask' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' @@ -204,7 +210,7 @@ class Metrics(object): # metric for image instance segmentation task image_ins_seg_coco_metric = 'image-ins-seg-coco-metric' # metrics for sequence classification task - seq_cls_metric = 'seq_cls_metric' + seq_cls_metric = 'seq-cls-metric' # metrics for token-classification task token_cls_metric = 'token-cls-metric' # metrics for text-generation task diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index c632a9bd..37f9bfec 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from .image_portrait_enhancement_metric import ImagePortraitEnhancementMetric from .sequence_classification_metric import SequenceClassificationMetric from .text_generation_metric import TextGenerationMetric + from .token_classification_metric import TokenClassificationMetric else: _import_structure = { @@ -26,6 +27,7 @@ else: ['ImagePortraitEnhancementMetric'], 'sequence_classification_metric': ['SequenceClassificationMetric'], 'text_generation_metric': ['TextGenerationMetric'], + 'token_classification_metric': ['TokenClassificationMetric'], } import sys diff --git a/modelscope/metrics/base.py b/modelscope/metrics/base.py index 1b9db825..3a9d810f 100644 --- a/modelscope/metrics/base.py +++ b/modelscope/metrics/base.py @@ -10,6 +10,9 @@ class Metric(ABC): complex metrics for a specific task with or without other Metric subclasses. 
""" + def __init__(self, trainer=None, *args, **kwargs): + self.trainer = trainer + @abstractmethod def add(self, outputs: Dict, inputs: Dict): """ Append logits and labels within an eval loop. diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 4df856f2..bd20d37b 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -20,7 +20,9 @@ class MetricKeys(object): task_default_metrics = { Tasks.image_segmentation: [Metrics.image_ins_seg_coco_metric], Tasks.sentence_similarity: [Metrics.seq_cls_metric], + Tasks.nli: [Metrics.seq_cls_metric], Tasks.sentiment_classification: [Metrics.seq_cls_metric], + Tasks.token_classification: [Metrics.token_cls_metric], Tasks.text_generation: [Metrics.text_gen_metric], Tasks.image_denoising: [Metrics.image_denoise_metric], Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric], diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index dabdb725..04b0ee81 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -17,14 +17,14 @@ class SequenceClassificationMetric(Metric): """The metric computation class for sequence classification classes. """ - label_name = 'labels' - - def __init__(self): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) self.preds = [] self.labels = [] def add(self, outputs: Dict, inputs: Dict): - ground_truths = inputs[self.label_name] + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] eval_results = outputs[OutputKeys.LOGITS] self.preds.append( torch_nested_numpify(torch_nested_detach(eval_results))) diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py new file mode 100644 index 00000000..8606148e --- /dev/null +++ b/modelscope/metrics/token_classification_metric.py @@ -0,0 +1,123 @@ +import importlib +from typing import Dict, List, Optional, Union + +import numpy as np + +from modelscope.outputs import OutputKeys +from ..metainfo import Metrics +from ..utils.registry import default_group +from ..utils.tensor_utils import torch_nested_detach, torch_nested_numpify +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.token_cls_metric) +class TokenClassificationMetric(Metric): + """ + The metric computation class for token-classification task. + Args: + return_entity_level_metrics (bool, *optional*): + Whether to return every label's detail metrics, default False. 
+ """ + + def add(self, outputs: Dict, inputs: Dict): + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] + eval_results = outputs[OutputKeys.LOGITS] + self.preds.append( + torch_nested_numpify(torch_nested_detach(eval_results))) + self.labels.append( + torch_nested_numpify(torch_nested_detach(ground_truths))) + + def __init__(self, return_entity_level_metrics=False, *args, **kwargs): + super().__init__(*args, **kwargs) + self.return_entity_level_metrics = return_entity_level_metrics + self.preds = [] + self.labels = [] + + def evaluate(self): + self.id2label = { + id: label + for label, id in self.trainer.label2id.items() + } + self.preds = np.concatenate(self.preds, axis=0) + self.labels = np.concatenate(self.labels, axis=0) + predictions = np.argmax(self.preds, axis=-1) + + true_predictions = [[ + self.id2label[p] for (p, lb) in zip(prediction, label) + if lb != -100 + ] for prediction, label in zip(predictions, self.labels)] + true_labels = [[ + self.id2label[lb] for (p, lb) in zip(prediction, label) + if lb != -100 + ] for prediction, label in zip(predictions, self.labels)] + + results = self._compute( + predictions=true_predictions, references=true_labels) + if self.return_entity_level_metrics: + final_results = {} + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f'{key}_{n}'] = v + else: + final_results[key] = value + return final_results + else: + return { + MetricKeys.PRECISION: results[MetricKeys.PRECISION], + MetricKeys.RECALL: results[MetricKeys.RECALL], + MetricKeys.F1: results[MetricKeys.F1], + MetricKeys.ACCURACY: results[MetricKeys.ACCURACY], + } + + @staticmethod + def _compute( + predictions, + references, + suffix: bool = False, + scheme: Optional[str] = None, + mode: Optional[str] = None, + sample_weight: Optional[List[int]] = None, + zero_division: Union[str, int] = 'warn', + ): + from seqeval.metrics import accuracy_score, classification_report + if scheme is not None: + try: + scheme_module = importlib.import_module('seqeval.scheme') + scheme = getattr(scheme_module, scheme) + except AttributeError: + raise ValueError( + f'Scheme should be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU], got {scheme}' + ) + report = classification_report( + y_true=references, + y_pred=predictions, + suffix=suffix, + output_dict=True, + scheme=scheme, + mode=mode, + sample_weight=sample_weight, + zero_division=zero_division, + ) + report.pop('macro avg') + report.pop('weighted avg') + overall_score = report.pop('micro avg') + + scores = { + type_name: { + MetricKeys.PRECISION: score['precision'], + MetricKeys.RECALL: score['recall'], + MetricKeys.F1: score['f1-score'], + 'number': score['support'], + } + for type_name, score in report.items() + } + scores[MetricKeys.PRECISION] = overall_score['precision'] + scores[MetricKeys.RECALL] = overall_score['recall'] + scores[MetricKeys.F1] = overall_score['f1-score'] + scores[MetricKeys.ACCURACY] = accuracy_score( + y_true=references, y_pred=predictions) + return scores diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index fd556dd4..3b596769 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -10,6 +10,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile 
+from modelscope.utils.file_utils import func_receive_dict_inputs +from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger logger = get_logger() @@ -69,6 +71,7 @@ class Model(ABC): def from_pretrained(cls, model_name_or_path: str, revision: Optional[str] = DEFAULT_MODEL_REVISION, + cfg_dict: Config = None, *model_args, **kwargs): """ Instantiate a model from local directory or remote model repo. Note @@ -87,25 +90,25 @@ class Model(ABC): ) local_model_dir = snapshot_download(model_name_or_path, revision) logger.info(f'initialize model from {local_model_dir}') - cfg = Config.from_file( - osp.join(local_model_dir, ModelFile.CONFIGURATION)) + if cfg_dict is not None: + cfg = cfg_dict + else: + cfg = Config.from_file( + osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task model_cfg = cfg.model - assert hasattr( - cfg, 'pipeline'), 'pipeline config is missing from config file.' - pipeline_cfg = cfg.pipeline # TODO @wenmeng.zwm may should manually initialize model after model building if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type model_cfg.model_dir = local_model_dir - for k, v in kwargs.items(): model_cfg[k] = v model = build_model( model_cfg, task_name=task_name, default_args=kwargs) # dynamically add pipeline info to model for pipeline inference - model.pipeline = pipeline_cfg + if hasattr(cfg, 'pipeline'): + model.pipeline = cfg.pipeline return model diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py index 52d4460c..cfc88721 100644 --- a/modelscope/models/base/base_torch_model.py +++ b/modelscope/models/base/base_torch_model.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Union import torch from torch import nn +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from .base_model import Model @@ -20,6 +21,13 @@ class TorchModel(Model, torch.nn.Module): super().__init__(model_dir, *args, **kwargs) torch.nn.Module.__init__(self) + def __call__(self, input: Dict[str, + torch.Tensor]) -> Dict[str, torch.Tensor]: + if func_receive_dict_inputs(self.forward): + return self.postprocess(self.forward(input)) + else: + return self.postprocess(self.forward(**input)) + def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -50,6 +58,3 @@ class TorchModel(Model, torch.nn.Module): elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - - def compute_loss(self, outputs: Dict[str, Any], labels): - raise NotImplementedError() diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index f2219b0e..24e65ef1 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -4,32 +4,26 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .backbones import (SbertModel, SpaceGenerator, SpaceModelBase, - GPT3Model) + from .backbones import SbertModel from .heads import SequenceClassificationHead from .bert_for_sequence_classification import BertForSequenceClassification from .csanmt_for_translation import CsanmtForTranslation from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, BertForMaskedLM) from .nncrf_for_named_entity_recognition import TransformerCRFForNamedEntityRecognition - from .palm_for_text_generation import PalmForTextGeneration - from 
.sbert_for_nli import SbertForNLI - from .sbert_for_sentence_similarity import SbertForSentenceSimilarity - from .sbert_for_sentiment_classification import SbertForSentimentClassification - from .sbert_for_token_classification import SbertForTokenClassification - from .sbert_for_zero_shot_classification import SbertForZeroShotClassification - from .sequence_classification import SequenceClassificationModel - from .space_for_dialog_intent_prediction import SpaceForDialogIntent - from .space_for_dialog_modeling import SpaceForDialogModeling - from .space_for_dialog_state_tracking import SpaceForDialogStateTracking - from .task_model import SingleBackboneTaskModelBase + from .palm_v2 import PalmForTextGeneration + from .token_classification import SbertForTokenClassification + from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification + from .space import SpaceForDialogIntent + from .space import SpaceForDialogModeling + from .space import SpaceForDialogStateTracking + from .task_models.task_model import SingleBackboneTaskModelBase from .bart_for_text_error_correction import BartForTextErrorCorrection - from .gpt3_for_text_generation import GPT3ForTextGeneration + from .gpt3 import GPT3ForTextGeneration else: _import_structure = { - 'backbones': - ['SbertModel', 'SpaceGenerator', 'SpaceModelBase', 'GPT3Model'], + 'backbones': ['SbertModel'], 'heads': ['SequenceClassificationHead'], 'csanmt_for_translation': ['CsanmtForTranslation'], 'bert_for_sequence_classification': ['BertForSequenceClassification'], @@ -37,21 +31,17 @@ else: ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'], 'nncrf_for_named_entity_recognition': ['TransformerCRFForNamedEntityRecognition'], - 'palm_for_text_generation': ['PalmForTextGeneration'], - 'sbert_for_nli': ['SbertForNLI'], - 'sbert_for_sentence_similarity': ['SbertForSentenceSimilarity'], - 'sbert_for_sentiment_classification': - ['SbertForSentimentClassification'], - 'sbert_for_token_classification': ['SbertForTokenClassification'], - 'sbert_for_zero_shot_classification': - ['SbertForZeroShotClassification'], - 'sequence_classification': ['SequenceClassificationModel'], - 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], - 'space_for_dialog_modeling': ['SpaceForDialogModeling'], - 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + 'palm_v2': ['PalmForTextGeneration'], + 'token_classification': ['SbertForTokenClassification'], + 'sequence_classification': + ['VecoForSequenceClassification', 'SbertForSequenceClassification'], + 'space': [ + 'SpaceForDialogIntent', 'SpaceForDialogModeling', + 'SpaceForDialogStateTracking' + ], 'task_model': ['SingleBackboneTaskModelBase'], 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], + 'gpt3': ['GPT3ForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/backbones/__init__.py b/modelscope/models/nlp/backbones/__init__.py index ffe8ac05..749cf995 100644 --- a/modelscope/models/nlp/backbones/__init__.py +++ b/modelscope/models/nlp/backbones/__init__.py @@ -4,14 +4,10 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .space import SpaceGenerator, SpaceModelBase from .structbert import SbertModel - from .gpt3 import GPT3Model else: _import_structure = { - 'space': ['SpaceGenerator', 'SpaceModelBase'], 'structbert': ['SbertModel'], - 'gpt3': ['GPT3Model'] } import sys diff --git 
a/modelscope/models/nlp/backbones/space/__init__.py b/modelscope/models/nlp/backbones/space/__init__.py deleted file mode 100644 index a2be83ef..00000000 --- a/modelscope/models/nlp/backbones/space/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .model.generator import Generator as SpaceGenerator -from .model.model_base import SpaceModelBase diff --git a/modelscope/models/nlp/backbones/space/model/__init__.py b/modelscope/models/nlp/backbones/space/model/__init__.py deleted file mode 100644 index 7e1b5264..00000000 --- a/modelscope/models/nlp/backbones/space/model/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .gen_unified_transformer import GenUnifiedTransformer -from .intent_unified_transformer import IntentUnifiedTransformer -from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py new file mode 100644 index 00000000..125db040 --- /dev/null +++ b/modelscope/models/nlp/backbones/structbert.py @@ -0,0 +1,54 @@ +from transformers import PreTrainedModel + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import BACKBONES +from modelscope.models.nlp.structbert import SbertConfig +from modelscope.models.nlp.structbert import SbertModel as SbertModelTransform +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + + +@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) +class SbertModel(TorchModel, SbertModelTransform): + + def __init__(self, model_dir=None, add_pooling_layer=True, **config): + """ + Args: + model_dir (str, optional): The model checkpoint directory. Defaults to None. + add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. + """ + config = SbertConfig(**config) + super().__init__(model_dir) + self.config = config + SbertModelTransform.__init__(self, config, add_pooling_layer) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return SbertModelTransform.forward( + self, input_ids, attention_mask, token_type_ids, position_ids, + head_mask, inputs_embeds, encoder_hidden_states, + encoder_attention_mask, past_key_values, use_cache, + output_attentions, output_hidden_states, return_dict) diff --git a/modelscope/models/nlp/backbones/structbert/__init__.py b/modelscope/models/nlp/backbones/structbert/__init__.py deleted file mode 100644 index 1d147730..00000000 --- a/modelscope/models/nlp/backbones/structbert/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import TYPE_CHECKING - -from modelscope.utils.import_utils import LazyImportModule - -if TYPE_CHECKING: - from .modeling_sbert import SbertModel -else: - _import_structure = {'modeling_sbert': ['SbertModel']} - - import sys - - sys.modules[__name__] = LazyImportModule( - __name__, - globals()['__file__'], - _import_structure, - module_spec=__spec__, - extra_objects={}, - ) diff --git a/modelscope/models/nlp/backbones/structbert/modeling_sbert.py b/modelscope/models/nlp/backbones/structbert/modeling_sbert.py deleted file mode 100644 index 2e67a652..00000000 --- a/modelscope/models/nlp/backbones/structbert/modeling_sbert.py +++ /dev/null @@ -1,815 +0,0 @@ -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from transformers import PreTrainedModel -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput) -from transformers.modeling_utils import (apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger -from .configuration_sbert import SbertConfig - -logger = get_logger(__name__) - - -@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) -class SbertModel(TorchModel, PreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in `Attention is - all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, - Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration - set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` - argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an - input to the forward pass. - """ - - def __init__(self, model_dir=None, add_pooling_layer=True, **config): - """ - Args: - model_dir (str, optional): The model checkpoint directory. Defaults to None. - add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. - """ - config = SbertConfig(**config) - super().__init__(model_dir) - self.config = config - - self.embeddings = SbertEmbeddings(config) - self.encoder = SbertEncoder(config) - - self.pooler = SbertPooler(config) if add_pooling_layer else None - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)` - , `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, - sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output, orignal_embeds = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - return_inputs_embeds=True, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) - - return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - -class SbertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - self.register_buffer( - 'position_ids', - 
torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros( - self.position_ids.size(), - dtype=torch.long, - device=self.position_ids.device), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, - return_inputs_embeds=False): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids - # issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - if not return_inputs_embeds: - return embeddings - else: - return embeddings, inputs_embeds - - -class SbertSelfAttention(nn.Module): - - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - mixed_query_layer = 
self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. 
- attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class SbertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertAttention(nn.Module): - - def __init__(self, config): - super().__init__() - self.self = SbertSelfAttention(config) - self.output = SbertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class SbertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class SbertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, 
eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = SbertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = SbertAttention(config) - self.intermediate = SbertIntermediate(config) - self.output = SbertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated' - f'with cross-attention layers by setting `config.add_cross_attention=True`' - ) - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class SbertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [SbertLayer(config) for _ in 
range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class SbertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -@dataclass -class SbertForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~structbert.utils.BertForPreTraining`. - - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when - ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - BaseModelOutputWithPoolingAndCrossAttentions): - embedding_output: torch.FloatTensor = None - logits: Optional[Union[tuple, torch.FloatTensor]] = None - kwargs: dict = None diff --git a/modelscope/models/nlp/backbones/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py similarity index 76% rename from modelscope/models/nlp/backbones/gpt3/__init__.py rename to modelscope/models/nlp/gpt3/__init__.py index b0739c22..076a0c6b 100644 --- a/modelscope/models/nlp/backbones/gpt3/__init__.py +++ b/modelscope/models/nlp/gpt3/__init__.py @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .configuration_gpt3 import GPT3Config from .modeling_gpt3 import GPT3Model + from .gpt3_for_text_generation import GPT3ForTextGeneration else: _import_structure = { 'configuration_gpt3': ['GPT3Config'], - 'modeling_gpt3': ['GPT3Model'] + 'modeling_gpt3': ['GPT3Model'], + 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/backbones/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration_gpt3.py similarity index 100% rename from modelscope/models/nlp/backbones/gpt3/configuration_gpt3.py rename to modelscope/models/nlp/gpt3/configuration_gpt3.py diff --git a/modelscope/models/nlp/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py similarity index 97% rename from modelscope/models/nlp/gpt3_for_text_generation.py rename to modelscope/models/nlp/gpt3/gpt3_for_text_generation.py index 22a6458d..6bdcb431 100644 --- a/modelscope/models/nlp/gpt3_for_text_generation.py +++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py @@ -20,7 +20,7 @@ class GPT3ForTextGeneration(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - from modelscope.models.nlp import GPT3Model + from modelscope.models.nlp.gpt3 import GPT3Model from transformers import BertTokenizer self.model = GPT3Model.from_pretrained(model_dir) diff --git a/modelscope/models/nlp/backbones/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py similarity index 100% rename from 
modelscope/models/nlp/backbones/gpt3/modeling_gpt3.py rename to modelscope/models/nlp/gpt3/modeling_gpt3.py diff --git a/modelscope/models/nlp/heads/__init__.py b/modelscope/models/nlp/heads/__init__.py index 6ae43f6d..19194d3a 100644 --- a/modelscope/models/nlp/heads/__init__.py +++ b/modelscope/models/nlp/heads/__init__.py @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .sequence_classification_head import SequenceClassificationHead + from .torch_pretrain_head import BertMLMHead, RobertaMLMHead else: _import_structure = { - 'sequence_classification_head': ['SequenceClassificationHead'] + 'sequence_classification_head': ['SequenceClassificationHead'], + 'torch_pretrain_head': ['BertMLMHead', 'RobertaMLMHead'], } import sys diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py index 8c6e2188..92f3a4ec 100644 --- a/modelscope/models/nlp/heads/sequence_classification_head.py +++ b/modelscope/models/nlp/heads/sequence_classification_head.py @@ -1,5 +1,4 @@ -import importlib -from typing import Dict, List, Optional, Union +from typing import Dict import torch import torch.nn.functional as F diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py new file mode 100644 index 00000000..6ff6c96f --- /dev/null +++ b/modelscope/models/nlp/heads/torch_pretrain_head.py @@ -0,0 +1,26 @@ +from typing import Dict + +import torch +from transformers.models.bert.modeling_bert import BertOnlyMLMHead +from transformers.models.roberta.modeling_roberta import RobertaLMHead + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.utils.constant import Tasks + + +@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) +class BertMLMHead(BertOnlyMLMHead, TorchHead): + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + raise NotImplementedError() + + +@HEADS.register_module(Tasks.fill_mask, module_name=Heads.roberta_mlm) +class RobertaMLMHead(RobertaLMHead, TorchHead): + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + raise NotImplementedError() diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index ffe9631d..ff16335f 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -1,72 +1,115 @@ -from typing import Dict +from typing import Any, Dict, Optional, Union import numpy as np +from transformers import BertForMaskedLM as BertForMaskedLMTransformer from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import SbertForMaskedLM +from modelscope.models.nlp.veco import \ + VecoForMaskedLM as VecoForMaskedLMTransformer +from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks __all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM'] -class MaskedLanguageModelBase(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - self.model = self.build_model() - - def build_model(self): - raise NotImplementedError() - - def train(self): - return 
self.model.train() - - def eval(self): - return self.model.eval() - - @property - def config(self): - if hasattr(self.model, 'config'): - return self.model.config - return None - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - """ - rst = self.model( - input_ids=input['input_ids'], - attention_mask=input['attention_mask'], - token_type_ids=input['token_type_ids']) - return {'logits': rst['logits'], 'input_ids': input['input_ids']} - - @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) -class StructBertForMaskedLM(MaskedLanguageModelBase): - - def build_model(self): - from sofa import SbertForMaskedLM - return SbertForMaskedLM.from_pretrained(self.model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) -class VecoForMaskedLM(MaskedLanguageModelBase): - - def build_model(self): - from sofa import VecoForMaskedLM - return VecoForMaskedLM.from_pretrained(self.model_dir) +class StructBertForMaskedLM(TorchModel, SbertForMaskedLM): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + SbertForMaskedLM.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = SbertForMaskedLM.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(SbertForMaskedLM, StructBertForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, model_dir=model_dir) @MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) -class BertForMaskedLM(MaskedLanguageModelBase): +class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + BertForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = BertForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(BertForMaskedLMTransformer, + BertForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) - def build_model(self): - from transformers import BertForMaskedLM - return BertForMaskedLM.from_pretrained(self.model_dir) + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) +class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + VecoForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = VecoForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + 
token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(VecoForMaskedLMTransformer, + VecoForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) diff --git a/modelscope/models/nlp/palm_v2/__init__.py b/modelscope/models/nlp/palm_v2/__init__.py new file mode 100644 index 00000000..3a9960ec --- /dev/null +++ b/modelscope/models/nlp/palm_v2/__init__.py @@ -0,0 +1,43 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_palm import PalmConfig + from .modeling_palm import ( + AbsSummarizer, + PalmForConditionalGeneration, + Translator, + ) + from .palm_for_text_generation import PalmForTextGeneration +else: + _import_structure = { + 'configuration_palm': ['PalmConfig'], + 'modeling_palm': + ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'], + 'palm_for_text_generation': ['PalmForTextGeneration'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/palm_v2/configuration_palm.py b/modelscope/models/nlp/palm_v2/configuration_palm.py new file mode 100644 index 00000000..3b9e51fb --- /dev/null +++ b/modelscope/models/nlp/palm_v2/configuration_palm.py @@ -0,0 +1,116 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PALM model configuration """ + +from transformers.configuration_utils import PretrainedConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class PalmConfig(PretrainedConfig): + r""" + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the BERT model. 
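# Sketch (not part of the patch): how the masked-LM wrappers defined in masked_language.py
# above are meant to be instantiated. `_instantiate` forwards a local checkpoint directory
# to the underlying transformers-style `from_pretrained`; the path below is hypothetical,
# and in normal use these classes are built indirectly through the MODELS registry, where
# they are registered for Tasks.fill_mask.
import torch

from modelscope.models.nlp.masked_language import BertForMaskedLM

model = BertForMaskedLM._instantiate(model_dir='/path/to/local/bert-checkpoint')
model.eval()

input_ids = torch.tensor([[101, 2023, 2003, 103, 102]])   # toy sequence containing one [MASK]
with torch.no_grad():
    outputs = model(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        token_type_ids=torch.zeros_like(input_ids))
# The wrapper's forward above also copies the input ids into the output under
# OutputKeys.INPUT_IDS, presumably so downstream postprocessing can locate the masked
# positions.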
Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layernorm_epsilon (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + dec_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer decoder. + attn_separate (:obj:`bool`, `optional`, defaults to false): + Whether or not to separate the q, k, v of attention. 
+ + Examples:: + + >>> from modelscope.models.nlp.palm_v2 import PalmForConditionalGeneration, PalmConfig + >>> configuration = PalmConfig() + + >>> # Initializing a model from the configuration + >>> model = PalmForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = 'palm' + + def __init__(self, + encoder='roberta', + encoder_pth='roberta-base', + max_pos=512, + share_emb=False, + dec_layers=12, + dec_hidden_size=768, + dec_heads=8, + dec_ff_size=3072, + dec_dropout=0.2, + use_bert_emb=True, + label_smoothing=0.1, + alpha=0.95, + beam_size=5, + min_length=40, + max_length=130, + sample_topk=False, + block_trigram=False, + **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + self.encoder_pth = encoder_pth + self.max_pos = max_pos + self.share_emb = share_emb + self.dec_layers = dec_layers + self.dec_hidden_size = dec_hidden_size + self.dec_heads = dec_heads + self.dec_ff_size = dec_ff_size + self.dec_dropout = dec_dropout + self.use_bert_emb = use_bert_emb + self.label_smoothing = label_smoothing + # Translator + self.alpha = alpha + self.beam_size = beam_size + self.min_length = min_length + self.max_length = max_length + self.sample_topk = sample_topk + self.block_trigram = block_trigram diff --git a/modelscope/models/nlp/palm_v2/dureader_eval.py b/modelscope/models/nlp/palm_v2/dureader_eval.py new file mode 100644 index 00000000..db54f21d --- /dev/null +++ b/modelscope/models/nlp/palm_v2/dureader_eval.py @@ -0,0 +1,872 @@ +# ============================================================================== +# Copyright 2017 Baidu.com, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +This module computes evaluation metrics for DuReader dataset. 
+""" + +import argparse +import copy +import math +import re +import sys +import zipfile +from collections import Counter, defaultdict + +import json +import numpy as np +from rouge import Rouge + +EMPTY = '' +YESNO_LABELS = set(['Yes', 'No', 'Depends']) + + +def my_lcs(string, sub): + """ + Calculates longest common subsequence for a pair of tokenized strings + :param string : list of str : tokens from a string split using whitespace + :param sub : list of str : shorter string, also split using whitespace + :returns: length (list of int): length of the longest common subsequence between the two strings + + Note: my_lcs only gives length of the longest common subsequence, not the actual LCS + """ + if (len(string) < len(sub)): + sub, string = string, sub + + lengths = [[0 for i in range(0, + len(sub) + 1)] + for j in range(0, + len(string) + 1)] + + for j in range(1, len(sub) + 1): + for i in range(1, len(string) + 1): + if (string[i - 1] == sub[j - 1]): + lengths[i][j] = lengths[i - 1][j - 1] + 1 + else: + lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) + + return lengths[len(string)][len(sub)] + + +class Bleu: + + def __init__(self, n=4): + # default compute Blue score up to 4 + self._n = n + self._hypo_for_image = {} + self.ref_for_image = {} + + def compute_score(self, gts, res): + assert (list(gts.keys()) == list(res.keys())) + imgIds = list(gts.keys()) + + bleu_scorer = BleuScorer(n=self._n) + for id in imgIds: + hypo = res[id] + ref = gts[id] + + # Sanity check. + assert (type(hypo) is list) + assert (len(hypo) == 1) + assert (type(ref) is list) + assert (len(ref) >= 1) + + bleu_scorer += (hypo[0], ref) + + score, scores = bleu_scorer.compute_score(option='closest', verbose=1) + return score, scores + + def method(self): + return 'Bleu' + + +def precook(s, n=4, out=False): + """Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. This is optional: cook_refs and cook_test + can take string arguments as well.""" + words = s.split() + counts = defaultdict(int) + for k in range(1, n + 1): + for i in range(len(words) - k + 1): + ngram = tuple(words[i:i + k]) + counts[ngram] += 1 + return (len(words), counts) + + +def cook_refs(refs, eff=None, n=4): # lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them.''' + + reflen = [] + maxcounts = {} + for ref in refs: + rl, counts = precook(ref, n) + reflen.append(rl) + for (ngram, count) in counts.items(): + maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) + + # Calculate effective reference sentence length. + if eff == 'shortest': + reflen = min(reflen) + elif eff == 'average': + reflen = float(sum(reflen)) / len(reflen) + + # lhuang: N.B.: leave reflen computaiton to the very end!! + + # lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) + + return reflen, maxcounts + + +def cook_test(test, xxx_todo_changeme, eff=None, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it.''' + (reflen, refmaxcounts) = xxx_todo_changeme + testlen, counts = precook(test, n, True) + + result = {} + + # Calculate effective reference sentence length. 
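# Sketch (not part of the patch): minimal use of the Bleu wrapper defined above. Both
# dicts are keyed by the same ids; each prediction list must hold exactly one string,
# while a reference list may hold several.
from modelscope.models.nlp.palm_v2.dureader_eval import Bleu

predictions = {'q1': ['the cat sat on the mat']}
references = {'q1': ['the cat is sitting on the mat', 'there is a cat on the mat']}
bleu_scores, per_id_scores = Bleu(n=4).compute_score(references, predictions)
# bleu_scores is [Bleu-1, Bleu-2, Bleu-3, Bleu-4]; per_id_scores holds the per-question values.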
+ + if eff == 'closest': + result['reflen'] = min((abs(ref - testlen), ref) for ref in reflen)[1] + else: # i.e., "average" or "shortest" or None + result['reflen'] = reflen + + result['testlen'] = testlen + + result['guess'] = [max(0, testlen - k + 1) for k in range(1, n + 1)] + + result['correct'] = [0] * n + for (ngram, count) in counts.items(): + result['correct'][len(ngram) - 1] += min( + refmaxcounts.get(ngram, 0), count) + + return result + + +class BleuScorer(object): + """Bleu scorer. + """ + + __slots__ = 'n', 'crefs', 'ctest', '_score', '_ratio', '_testlen', '_reflen', 'special_reflen' + + # special_reflen is used in oracle (proportional effective ref len for a node). + + def copy(self): + ''' copy the refs.''' + new = BleuScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + new._score = None + return new + + def __init__(self, test=None, refs=None, n=4, special_reflen=None): + ''' singular instance ''' + + self.n = n + self.crefs = [] + self.ctest = [] + self.cook_append(test, refs) + self.special_reflen = special_reflen + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + cooked_test = cook_test(test, self.crefs[-1]) + self.ctest.append(cooked_test) # N.B.: -1 + else: + self.ctest.append( + None) # lens of crefs and ctest have to match + + self._score = None # need to recompute + + def ratio(self, option=None): + self.compute_score(option=option) + return self._ratio + + def score_ratio(self, option=None): + '''return (bleu, len_ratio) pair''' + return (self.fscore(option=option), self.ratio(option=option)) + + def score_ratio_str(self, option=None): + return '%.4f (%.2f)' % self.score_ratio(option) + + def reflen(self, option=None): + self.compute_score(option=option) + return self._reflen + + def testlen(self, option=None): + self.compute_score(option=option) + return self._testlen + + def retest(self, new_test): + if type(new_test) is str: + new_test = [new_test] + assert len(new_test) == len(self.crefs), new_test + self.ctest = [] + for t, rs in zip(new_test, self.crefs): + self.ctest.append(cook_test(t, rs)) + self._score = None + + return self + + def rescore(self, new_test): + ''' replace test(s) with new test(s), and returns the new score.''' + + return self.retest(new_test).compute_score() + + def size(self): + assert len(self.crefs) == len( + self.ctest), 'refs/test mismatch! %d<>%d' % (len( + self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + # avoid creating new BleuScorer instances + self.cook_append(other[0], other[1]) + else: + assert self.compatible(other), 'incompatible BLEUs.' 
+ self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + self._score = None # need to recompute + + return self + + def compatible(self, other): + return isinstance(other, BleuScorer) and self.n == other.n + + def single_reflen(self, option='average'): + return self._single_reflen(self.crefs[0][0], option) + + def _single_reflen(self, reflens, option=None, testlen=None): + + if option == 'shortest': + reflen = min(reflens) + elif option == 'average': + reflen = float(sum(reflens)) / len(reflens) + elif option == 'closest': + reflen = min((abs(ref - testlen), ref) for ref in reflens)[1] + else: + assert False, 'unsupported reflen option %s' % option + + return reflen + + def recompute_score(self, option=None, verbose=0): + self._score = None + return self.compute_score(option, verbose) + + def compute_score(self, option=None, verbose=0): + n = self.n + small = 1e-9 + tiny = 1e-15 # so that if guess is 0 still return 0 + bleu_list = [[] for _ in range(n)] + + if self._score is not None: + return self._score + + if option is None: + option = 'average' if len(self.crefs) == 1 else 'closest' + + self._testlen = 0 + self._reflen = 0 + totalcomps = { + 'testlen': 0, + 'reflen': 0, + 'guess': [0] * n, + 'correct': [0] * n + } + + # for each sentence + for comps in self.ctest: + testlen = comps['testlen'] + self._testlen += testlen + + if self.special_reflen is None: # need computation + reflen = self._single_reflen(comps['reflen'], option, testlen) + else: + reflen = self.special_reflen + + self._reflen += reflen + + for key in ['guess', 'correct']: + for k in range(n): + totalcomps[key][k] += comps[key][k] + + # append per image bleu score + bleu = 1. + for k in range(n): + bleu *= (float(comps['correct'][k]) + tiny) / ( + float(comps['guess'][k]) + small) + bleu_list[k].append(bleu**(1. / (k + 1))) + ratio = (testlen + tiny) / (reflen + small + ) # N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleu_list[k][-1] *= math.exp(1 - 1 / ratio) + + if verbose > 1: + print(comps, reflen) + + totalcomps['reflen'] = self._reflen + totalcomps['testlen'] = self._testlen + + bleus = [] + bleu = 1. + for k in range(n): + bleu *= float(totalcomps['correct'][k] + tiny) / ( + totalcomps['guess'][k] + small) + bleus.append(bleu**(1. / (k + 1))) + ratio = (self._testlen + tiny) / (self._reflen + small + ) # N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleus[k] *= math.exp(1 - 1 / ratio) + + if verbose > 0: + print(totalcomps) + print('ratio:', ratio) + + self._score = bleus + return self._score, bleu_list + + +def normalize(s): + """ + Normalize strings to space joined chars. + + Args: + s: a list of strings. + + Returns: + A list of normalized strings. + """ + if not s: + return s + normalized = [] + for ss in s: + tokens = [c for c in list(ss) if len(c.strip()) != 0] + normalized.append(' '.join(tokens)) + return normalized + + +def data_check(obj, task): + """ + Check data. + + Raises: + Raises AssertionError when data is not legal. + """ + assert 'question_id' in obj, "Missing 'question_id' field." + assert 'question_type' in obj, \ + "Missing 'question_type' field. question_id: {}".format(obj['question_type']) + + assert 'yesno_answers' in obj, \ + "Missing 'yesno_answers' field. question_id: {}".format(obj['question_id']) + assert isinstance(obj['yesno_answers'], list), \ + r"""'yesno_answers' field must be a list, if the 'question_type' is not + 'YES_NO', then this field should be an empty list. 
+ question_id: {}""".format(obj['question_id']) + + assert 'entity_answers' in obj, \ + "Missing 'entity_answers' field. question_id: {}".format(obj['question_id']) + assert isinstance( + obj['entity_answers'], + list) and len(obj['entity_answers']) > 0, r"""'entity_answers' field + must be a list, and has at least one element, which can be a empty list. + question_id: {}""".format(obj['question_id']) + + +def read_file(file_name, task, is_ref=False): + """ + Read predict answers or reference answers from file. + + Args: + file_name: the name of the file containing predict result or reference + result. + + Returns: + A dictionary mapping question_id to the result information. The result + information itself is also a dictionary with has four keys: + - question_type: type of the query. + - yesno_answers: A list of yesno answers corresponding to 'answers'. + - answers: A list of predicted answers. + - entity_answers: A list, each element is also a list containing the entities + tagged out from the corresponding answer string. + """ + + def _open(file_name, mode, zip_obj=None): + if zip_obj is not None: + return zip_obj.open(file_name, mode) + return open(file_name, mode) + + results = {} + keys = ['answers', 'yesno_answers', 'entity_answers', 'question_type'] + if is_ref: + keys += ['source'] + + zf = zipfile.ZipFile(file_name, + 'r') if file_name.endswith('.zip') else None + file_list = [file_name] if zf is None else zf.namelist() + + for fn in file_list: + for line in _open(fn, 'r', zip_obj=zf): + try: + obj = json.loads(line.strip()) + except ValueError: + raise ValueError('Every line of data should be legal json') + data_check(obj, task) + qid = obj['question_id'] + assert qid not in results, 'Duplicate question_id: {}'.format(qid) + results[qid] = {} + for k in keys: + results[qid][k] = obj[k] + return results + + +def compute_bleu_rouge(pred_dict, ref_dict, bleu_order=4): + """ + Compute bleu and rouge scores. + """ + assert set(pred_dict.keys()) == set(ref_dict.keys()), \ + 'missing keys: {}'.format(set(ref_dict.keys()) - set(pred_dict.keys())) + scores = {} + bleu_scores, _ = Bleu(bleu_order).compute_score(ref_dict, pred_dict) + for i, bleu_score in enumerate(bleu_scores): + scores['Bleu-%d' % (i + 1)] = bleu_score + # rouge_score, _ = Rouge().compute_score(ref_dict, pred_dict) + rouge_score = Rouge().get_scores( + list(map(lambda x: x[0], pred_dict.values())), + list(map(lambda x: x[0], ref_dict.values()))) + rouge_score = sum([d['rouge-l']['f'] + for d in rouge_score]) / len(rouge_score) + scores['Rouge-L'] = rouge_score + return scores + + +def local_prf(pred_list, ref_list): + """ + Compute local precision recall and f1-score, + given only one prediction list and one reference list + """ + common = Counter(pred_list) & Counter(ref_list) + num_same = sum(common.values()) + if num_same == 0: + return 0, 0, 0 + p = 1.0 * num_same / len(pred_list) + r = 1.0 * num_same / len(ref_list) + f1 = (2 * p * r) / (p + r) + return p, r, f1 + + +def compute_prf(pred_dict, ref_dict): + """ + Compute precision recall and f1-score. 
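# Sketch (not part of the patch): computing the aggregate scores with the helpers above.
# Both dicts map question_id to a single-element answer list; for Chinese answers,
# normalize() is applied first so BLEU/ROUGE operate on space-joined characters.
from modelscope.models.nlp.palm_v2.dureader_eval import compute_bleu_rouge, normalize

pred = {'1': ['the cat sat on the mat']}
ref = {'1': ['the cat is sitting on the mat']}
scores = compute_bleu_rouge(pred, ref)
# -> {'Bleu-1': ..., 'Bleu-2': ..., 'Bleu-3': ..., 'Bleu-4': ..., 'Rouge-L': ...}

assert normalize(['参考答案']) == ['参 考 答 案']   # character-level normalization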
+ """ + # pred_question_ids = set(pred_dict.keys()) + ref_question_ids = set(ref_dict.keys()) + correct_preds, total_correct, total_preds = 0, 0, 0 + for question_id in ref_question_ids: + pred_entity_list = pred_dict.get(question_id, [[]]) + assert len(pred_entity_list) == 1, \ + 'the number of entity list for question_id {} is not 1.'.format(question_id) + pred_entity_list = pred_entity_list[0] + all_ref_entity_lists = ref_dict[question_id] + best_local_f1 = 0 + best_ref_entity_list = None + for ref_entity_list in all_ref_entity_lists: + local_f1 = local_prf(pred_entity_list, ref_entity_list)[2] + if local_f1 > best_local_f1: + best_ref_entity_list = ref_entity_list + best_local_f1 = local_f1 + if best_ref_entity_list is None: + if len(all_ref_entity_lists) > 0: + best_ref_entity_list = sorted( + all_ref_entity_lists, key=lambda x: len(x))[0] + else: + best_ref_entity_list = [] + gold_entities = set(best_ref_entity_list) + pred_entities = set(pred_entity_list) + correct_preds += len(gold_entities & pred_entities) + total_preds += len(pred_entities) + total_correct += len(gold_entities) + p = float(correct_preds) / total_preds if correct_preds > 0 else 0 + r = float(correct_preds) / total_correct if correct_preds > 0 else 0 + f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 + return {'Precision': p, 'Recall': r, 'F1': f1} + + +def prepare_prf(pred_dict, ref_dict): + """ + Prepares data for calculation of prf scores. + """ + preds = {k: v['entity_answers'] for k, v in pred_dict.items()} + refs = {k: v['entity_answers'] for k, v in ref_dict.items()} + return preds, refs + + +def filter_dict(result_dict, key_tag): + """ + Filter a subset of the result_dict, where keys ends with 'key_tag'. + """ + filtered = {} + for k, v in result_dict.items(): + if k.endswith(key_tag): + filtered[k] = v + return filtered + + +def get_metrics(pred_result, ref_result, task, source): + """ + Computes metrics. + """ + metrics = {} + + ref_result_filtered = {} + pred_result_filtered = {} + if source == 'both': + ref_result_filtered = ref_result + pred_result_filtered = pred_result + else: + for question_id, info in ref_result.items(): + if info['source'] == source: + ref_result_filtered[question_id] = info + if question_id in pred_result: + pred_result_filtered[question_id] = pred_result[ + question_id] + + if task == 'main' or task == 'all' \ + or task == 'description': + pred_dict, ref_dict = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + metrics = compute_bleu_rouge(pred_dict, ref_dict) + elif task == 'yesno': + pred_dict, ref_dict = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + keys = ['Yes', 'No', 'Depends'] + preds = [filter_dict(pred_dict, k) for k in keys] + refs = [filter_dict(ref_dict, k) for k in keys] + + metrics = compute_bleu_rouge(pred_dict, ref_dict) + + for k, pred, ref in zip(keys, preds, refs): + m = compute_bleu_rouge(pred, ref) + k_metric = [(k + '|' + key, v) for key, v in m.items()] + metrics.update(k_metric) + + elif task == 'entity': + pred_dict, ref_dict = prepare_prf(pred_result_filtered, + ref_result_filtered) + pred_dict_bleu, ref_dict_bleu = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + metrics = compute_prf(pred_dict, ref_dict) + metrics.update(compute_bleu_rouge(pred_dict_bleu, ref_dict_bleu)) + else: + raise ValueError('Illegal task name: {}'.format(task)) + + return metrics + + +def prepare_bleu(pred_result, ref_result, task): + """ + Prepares data for calculation of bleu and rouge scores. 
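# Worked example (not part of the patch) for the entity precision/recall helpers above:
from modelscope.models.nlp.palm_v2.dureader_eval import compute_prf, local_prf

p, r, f1 = local_prf(['北京', '上海'], ['北京'])
# one shared entity -> p = 1/2, r = 1/1, f1 = 2 * 0.5 * 1.0 / 1.5 ≈ 0.667

metrics = compute_prf(
    pred_dict={'42': [['北京', '上海']]},            # exactly one predicted entity list per qid
    ref_dict={'42': [['北京'], ['北京', '广州']]})    # one or more reference entity lists per qid
# best-matching reference is ['北京'] -> {'Precision': 0.5, 'Recall': 1.0, 'F1': ~0.667}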
+ """ + pred_list, ref_list = [], [] + qids = ref_result.keys() + for qid in qids: + if task == 'main': + pred, ref = get_main_result(qid, pred_result, ref_result) + elif task == 'yesno': + pred, ref = get_yesno_result(qid, pred_result, ref_result) + elif task == 'all': + pred, ref = get_all_result(qid, pred_result, ref_result) + elif task == 'entity': + pred, ref = get_entity_result(qid, pred_result, ref_result) + elif task == 'description': + pred, ref = get_desc_result(qid, pred_result, ref_result) + else: + raise ValueError('Illegal task name: {}'.format(task)) + if pred and ref: + pred_list += pred + ref_list += ref + pred_dict = dict(pred_list) + ref_dict = dict(ref_list) + for qid, ans in ref_dict.items(): + ref_dict[qid] = normalize(ref_dict[qid]) + pred_dict[qid] = normalize(pred_dict.get(qid, [EMPTY])) + if not ans or ans == [EMPTY]: + del ref_dict[qid] + del pred_dict[qid] + + for k, v in pred_dict.items(): + assert len(v) == 1, \ + 'There should be only one predict answer. question_id: {}'.format(k) + return pred_dict, ref_dict + + +def get_main_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'main'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + ref_ans = ref_result[qid]['answers'] + if not ref_ans: + ref_ans = [EMPTY] + pred_ans = pred_result.get(qid, {}).get('answers', [])[:1] + if not pred_ans: + pred_ans = [EMPTY] + + return [(qid, pred_ans)], [(qid, ref_ans)] + + +def get_entity_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'entity'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] != 'ENTITY': + return None, None + return get_main_result(qid, pred_result, ref_result) + + +def get_desc_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'description'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] != 'DESCRIPTION': + return None, None + return get_main_result(qid, pred_result, ref_result) + + +def get_yesno_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'yesno'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. 
+ Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + + def _uniq(li, is_ref): + uniq_li = [] + left = [] + keys = set() + for k, v in li: + if k not in keys: + uniq_li.append((k, v)) + keys.add(k) + else: + left.append((k, v)) + + if is_ref: + dict_li = dict(uniq_li) + for k, v in left: + dict_li[k] += v + uniq_li = [(k, v) for k, v in dict_li.items()] + return uniq_li + + def _expand_result(uniq_li): + expanded = uniq_li[:] + keys = set([x[0] for x in uniq_li]) + for k in YESNO_LABELS - keys: + expanded.append((k, [EMPTY])) + return expanded + + def _get_yesno_ans(qid, result_dict, is_ref=False): + if qid not in result_dict: + return [(str(qid) + '_' + k, v) for k, v in _expand_result([])] + yesno_answers = result_dict[qid]['yesno_answers'] + answers = result_dict[qid]['answers'] + lbl_ans = _uniq([(k, [v]) for k, v in zip(yesno_answers, answers)], + is_ref) + ret = [(str(qid) + '_' + k, v) for k, v in _expand_result(lbl_ans)] + return ret + + if ref_result[qid]['question_type'] != 'YES_NO': + return None, None + + ref_ans = _get_yesno_ans(qid, ref_result, is_ref=True) + pred_ans = _get_yesno_ans(qid, pred_result) + return pred_ans, ref_ans + + +def get_all_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'all'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] == 'YES_NO': + return get_yesno_result(qid, pred_result, ref_result) + return get_main_result(qid, pred_result, ref_result) + + +def format_metrics(metrics, task, err_msg): + """ + Format metrics. 'err' field returns any error occured during evaluation. + + Args: + metrics: A dict object contains metrics for different tasks. + task: Task name. + err_msg: Exception raised during evaluation. + Returns: + Formatted result. 
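# Sketch (not part of the patch): the evaluation entry point defined at the bottom of this
# file can also be driven programmatically. The file paths are hypothetical; both files are
# JSON-lines with the fields checked by data_check()/read_file() above.
import argparse

from modelscope.models.nlp.palm_v2.dureader_eval import main as dureader_eval_main

args = argparse.Namespace(pred_file='pred.json', ref_file='ref.json', task='main')
dureader_eval_main(args)   # prints the JSON summary assembled by format_metrics()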
+ """ + result = {} + sources = ['both', 'search', 'zhidao'] + if err_msg is not None: + return {'errorMsg': str(err_msg), 'errorCode': 1, 'data': []} + data = [] + if task != 'all' and task != 'main': + sources = ['both'] + + if task == 'entity': + metric_names = ['Bleu-4', 'Rouge-L'] + metric_names_prf = ['F1', 'Precision', 'Recall'] + for name in metric_names + metric_names_prf: + for src in sources: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': src, + } + data.append(obj) + elif task == 'yesno': + metric_names = ['Bleu-4', 'Rouge-L'] + details = ['Yes', 'No', 'Depends'] + src = sources[0] + for name in metric_names: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': 'All', + } + data.append(obj) + for d in details: + obj = { + 'name': name, + 'value': round(metrics[src].get(d + '|' + name, 0) * 100, + 2), + 'type': d + } + data.append(obj) + else: + metric_names = ['Bleu-4', 'Rouge-L'] + for name in metric_names: + for src in sources: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': src + } + data.append(obj) + + result['data'] = data + result['errorCode'] = 0 + result['errorMsg'] = 'success' + + return result + + +def main(args): + """ + Do evaluation. + """ + err = None + metrics = {} + try: + pred_result = read_file(args.pred_file, args.task) + ref_result = read_file(args.ref_file, args.task, is_ref=True) + sources = ['both', 'search', 'zhidao'] + if args.task not in set(['main', 'all']): + sources = sources[:1] + for source in sources: + metrics[source] = get_metrics(pred_result, ref_result, args.task, + source) + except ValueError as ve: + err = ve + except AssertionError as ae: + err = ae + + print( + json.dumps( + format_metrics(metrics, args.task, err), + ensure_ascii=False).encode('utf8')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('pred_file', help='predict file') + parser.add_argument('ref_file', help='reference file') + parser.add_argument( + 'task', help='task name: Main|Yes_No|All|Entity|Description') + + args = parser.parse_args() + args.task = args.task.lower().replace('_', '') + main(args) diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py new file mode 100644 index 00000000..c2121cfd --- /dev/null +++ b/modelscope/models/nlp/palm_v2/modeling_palm.py @@ -0,0 +1,1332 @@ +import codecs +import copy +import math +import os +import subprocess +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import json +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn.init import xavier_uniform_ +from transformers import (BertConfig, BertModel, BertTokenizer, RobertaConfig, + RobertaModel, RobertaTokenizer) +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel + +from modelscope.outputs import OutputKeys +from modelscope.utils import logger as logging +from .configuration_palm import PalmConfig +from .dureader_eval import compute_bleu_rouge, normalize + +CONFIG_NAME = 'config.json' +WEIGHTS_NAME = 'pytorch_model.bin' + + +class MultiHeadedAttention(nn.Module): # SelfAttention + """ + Multi-Head Attention module from + "Attention is All You Need" + :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. + + Similar to standard `dot` attention but uses + multiple attention distributions simulataneously + to select relevant items. + + .. 
mermaid:: + + graph BT + A[key] + B[value] + C[query] + O[output] + subgraph Attn + D[Attn 1] + E[Attn 2] + F[Attn N] + end + A --> D + C --> D + A --> E + C --> E + A --> F + C --> F + D --> O + E --> O + F --> O + B --> O + + Also includes several additional tricks. + + Args: + head_count (int): number of parallel heads + model_dim (int): the dimension of keys/values/queries, + must be divisible by head_count + dropout (float): dropout parameter + """ + + def __init__(self, + head_count, + model_dim, + dropout=0.1, + use_final_linear=True): + assert model_dim % head_count == 0 + self.dim_per_head = model_dim // head_count + self.model_dim = model_dim + + super().__init__() + self.head_count = head_count + + self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) + self.linear_values = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.linear_query = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.softmax = nn.Softmax(dim=-1) + self.dropout = nn.Dropout(dropout) + self.use_final_linear = use_final_linear + if (self.use_final_linear): + self.final_linear = nn.Linear(model_dim, model_dim) + + def forward(self, + key, + value, + query, + mask=None, + layer_cache=None, + type=None, + predefined_graph_1=None, + return_attn=False): + """ + Compute the context vector and the attention vectors. + + Args: + key (`FloatTensor`): set of `key_len` + key vectors `[batch, key_len, dim]` + value (`FloatTensor`): set of `key_len` + value vectors `[batch, key_len, dim]` + query (`FloatTensor`): set of `query_len` + query vectors `[batch, query_len, dim]` + mask: binary mask indicating which keys have + non-zero attention `[batch, query_len, key_len]` + Returns: + (`FloatTensor`, `FloatTensor`) : + + * output context vectors `[batch, query_len, dim]` + * one of the attention vectors `[batch, query_len, key_len]` + """ + + batch_size = key.size(0) + dim_per_head = self.dim_per_head + head_count = self.head_count + + def shape(x): + """ projection """ + return x.view(batch_size, -1, head_count, dim_per_head) \ + .transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous() \ + .view(batch_size, -1, head_count * dim_per_head) + + # 1) Project key, value, and query. + if layer_cache is not None: + if type == 'self': + query, key, value = self.linear_query(query), self.linear_keys( + query), self.linear_values(query) + + key = shape(key) + value = shape(value) + + if layer_cache is not None: + device = key.device + if layer_cache['self_keys'] is not None: + key = torch.cat( + (layer_cache['self_keys'].to(device), key), dim=2) + if layer_cache['self_values'] is not None: + value = torch.cat( + (layer_cache['self_values'].to(device), value), + dim=2) + layer_cache['self_keys'] = key + layer_cache['self_values'] = value + elif type == 'context': + query = self.linear_query(query) + if layer_cache is not None: + if layer_cache['memory_keys'] is None: + key, value = self.linear_keys(key), self.linear_values( + value) + key = shape(key) + value = shape(value) + else: + key, value = layer_cache['memory_keys'], layer_cache[ + 'memory_values'] + layer_cache['memory_keys'] = key + layer_cache['memory_values'] = value + else: + key, value = self.linear_keys(key), self.linear_values( + value) + key = shape(key) + value = shape(value) + else: + key = self.linear_keys(key) + value = self.linear_values(value) + query = self.linear_query(query) + key = shape(key) + value = shape(value) + + query = shape(query) + + # 2) Calculate and scale scores. 
+ query = query / math.sqrt(dim_per_head) + scores = torch.matmul(query, key.transpose(2, 3)) + + if mask is not None: + mask = mask.unsqueeze(1).expand_as(scores) + scores = scores.masked_fill(mask, -1e18) + + # 3) Apply attention dropout and compute context vectors. + + attn = self.softmax(scores) + + if predefined_graph_1 is not None: + attn_masked = attn[:, -1] * predefined_graph_1 + attn_masked = attn_masked / ( + torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9) + + attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1) + + drop_attn = self.dropout(attn) + if self.use_final_linear: + context = unshape(torch.matmul(drop_attn, value)) + output = self.final_linear(context) + if return_attn: + return output, attn + else: + return output + else: + context = torch.matmul(drop_attn, value) + if return_attn: + return context, attn + else: + return context + + +class PositionwiseFeedForward(nn.Module): # Output + """ A two-layer Feed-Forward-Network with residual layer norm. + + Args: + d_model (int): the size of input for the first-layer of the FFN. + d_ff (int): the hidden layer size of the second-layer + of the FNN. + dropout (float): dropout probability in :math:`[0, 1)`. + """ + + def __init__(self, d_model, d_ff, dropout=0.1): + super().__init__() + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.w_1 = nn.Linear(d_model, d_ff) + self.actv = ACT2FN['gelu_new'] + self.dropout_1 = nn.Dropout(dropout) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x): + inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x)))) + output = self.dropout_2(self.w_2(inter)) + return output + x + + +class TransformerDecoderLayer(nn.Module): # Layer + """ + Args: + d_model (int): the dimension of keys/values/queries in + MultiHeadedAttention, also the input size of + the first-layer of the PositionwiseFeedForward. + heads (int): the number of heads for MultiHeadedAttention. + d_ff (int): the second-layer of the PositionwiseFeedForward. + dropout (float): dropout probability(0-1.0). + self_attn_type (string): type of self-attention scaled-dot, average + """ + MAX_SIZE = 5000 + + def __init__(self, d_model, heads, d_ff, dropout): + super().__init__() + + self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) + + self.context_attn = MultiHeadedAttention( + heads, d_model, dropout=dropout) + self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) + self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) + self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) + self.drop = nn.Dropout(dropout) + mask = self._get_attn_subsequent_mask(self.MAX_SIZE) + # Register self.mask as a buffer in TransformerDecoderLayer, so + # it gets TransformerDecoderLayer's cuda behavior automatically. 
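# Shape sketch (not part of the patch) for the MultiHeadedAttention module defined above,
# used here as plain self-attention with no mask and no decoding cache.
import torch

from modelscope.models.nlp.palm_v2.modeling_palm import MultiHeadedAttention

attn = MultiHeadedAttention(head_count=8, model_dim=512, dropout=0.1)
x = torch.rand(2, 7, 512)                                # [batch, seq_len, model_dim]
context = attn(x, x, x)                                  # -> [2, 7, 512]
context, weights = attn(x, x, x, return_attn=True)       # weights: [2, 8, 7, 7]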
+ self.register_buffer('mask', mask) + + def forward(self, + inputs, + memory_bank, + src_pad_mask, + tgt_pad_mask, + previous_input=None, + layer_cache=None, + step=None): + """ + Args: + inputs (`FloatTensor`): `[batch_size x 1 x model_dim]` + memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]` + src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]` + tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]` + + Returns: + (`FloatTensor`, `FloatTensor`, `FloatTensor`): + + * output `[batch_size x 1 x model_dim]` + * attn `[batch_size x 1 x src_len]` + * all_input `[batch_size x current_step x model_dim]` + + """ + dec_mask = torch.gt( + tgt_pad_mask.type(torch.uint8) + + self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)].type( + torch.uint8), 0) + input_norm = self.layer_norm_1(inputs) + all_input = input_norm + if previous_input is not None: + all_input = torch.cat((previous_input, input_norm), dim=1) + dec_mask = None + + query = self.self_attn( + all_input, + all_input, + input_norm, + mask=dec_mask, + layer_cache=layer_cache, + type='self') + + query = self.drop(query) + inputs + + query_norm = self.layer_norm_2(query) + mid, attn = self.context_attn( + memory_bank, + memory_bank, + query_norm, + mask=src_pad_mask, + layer_cache=layer_cache, + type='context', + return_attn=True) + output = self.feed_forward(self.drop(mid) + query) + + return output, attn, all_input + + def _get_attn_subsequent_mask(self, size): + """ + Get an attention mask to avoid using the subsequent info. + + Args: + size: int + + Returns: + (`LongTensor`): + + * subsequent_mask `[1 x size x size]` + """ + attn_shape = (1, size, size) + subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') + subsequent_mask = torch.from_numpy(subsequent_mask) + return subsequent_mask + + +class PositionalEncoding(nn.Module): + + def __init__(self, dropout, dim, max_len=5000): + super().__init__() + pe = torch.zeros(max_len, dim) + position = torch.arange(0, max_len).unsqueeze(1) + div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) + * -(math.log(10000.0) / dim))) + pe[:, 0::2] = torch.sin(position.float() * div_term) + pe[:, 1::2] = torch.cos(position.float() * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + self.dropout = nn.Dropout(dropout) + self.dim = dim + + def forward(self, emb, step=None): + emb = emb * math.sqrt(self.dim) + if (step): + emb = emb + self.pe[:, step][:, None, :] + + else: + emb = emb + self.pe[:, :emb.size(1)] + emb = self.dropout(emb) + return emb + + def get_emb(self, emb): + return self.pe[:, :emb.size(1)] + + +class TransformerDecoder(nn.Module): # Decoder + """ + The Transformer decoder from "Attention is All You Need". + + + .. mermaid:: + + graph BT + A[input] + B[multi-head self-attn] + BB[multi-head src-attn] + C[feed forward] + O[output] + A --> B + B --> BB + BB --> C + C --> O + + + Args: + num_layers (int): number of encoder layers. 
+ d_model (int): size of the model + heads (int): number of heads + d_ff (int): size of the inner FF layer + dropout (float): dropout parameters + embeddings (:obj:`onmt.modules.Embeddings`): + embeddings to use, should have positional encodings + attn_type (str): if using a seperate copy attention + """ + decoder_type = 'transformer' + + class TransformerDecoderState: + + def __init__(self, src): + self.src = src + self.previous_input = None + self.previous_layer_inputs = None + self.cache = None + + def update_state(self, new_input, previous_layer_inputs): + self.previous_input = new_input + self.previous_layer_inputs = previous_layer_inputs + self.cache = None + + def _init_cache(self, num_layers): + self.cache = {} + for num in range(num_layers): + layer_cache = { + 'memory_keys': None, + 'memory_values': None, + 'self_keys': None, + 'self_values': None + } + self.cache['layer_{}'.format(num)] = layer_cache + + def map_batch_fn(self, fn): + + def _recursive_map(struct, batch_dim=0): + for k, v in struct.items(): + if v is not None: + if isinstance(v, dict): + _recursive_map(v) + else: + struct[k] = fn(v, batch_dim) + + self.src = fn(self.src, 0) + if self.cache is not None: + _recursive_map(self.cache) + + def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings): + super().__init__() + + # Basic attributes. + self.num_layers = num_layers + self.embeddings = embeddings + self.pos_emb = PositionalEncoding(dropout, + self.embeddings.embedding_dim) + + # Build TransformerDecoder. + self.transformer_layers = nn.ModuleList([ + TransformerDecoderLayer(d_model, heads, d_ff, dropout) + for _ in range(num_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.state = None + + def init_state(self, src, with_cache=False): + self.state = self.TransformerDecoderState(src) + if with_cache: + self.state._init_cache(self.num_layers) + + def forward(self, tgt, memory_bank, step=None, memory_masks=None): + src_words = self.state.src + tgt_words = tgt + src_batch, src_len = src_words.size() + tgt_batch, tgt_len = tgt_words.size() + + # Run the forward pass of the TransformerDecoder. + # emb = self.embeddings(tgt, step=step) + emb = self.embeddings(tgt) + assert emb.dim() == 3 # len x batch x embedding_dim + output = self.pos_emb(emb, step) + + src_memory_bank = memory_bank + padding_idx = self.embeddings.padding_idx + tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(tgt_batch, tgt_len, tgt_len) + + if memory_masks is not None: + src_len = memory_masks.size(-1) + src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len) + else: + src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(src_batch, tgt_len, src_len) + + if self.state.cache is None: + saved_inputs = [] + attns = [] + for i in range(self.num_layers): + prev_layer_input = None + if self.state.cache is None: + if self.state.previous_input is not None: + prev_layer_input = self.state.previous_layer_inputs[i] + output, attn, all_input \ + = self.transformer_layers[i](output, src_memory_bank, src_pad_mask, tgt_pad_mask, + previous_input=prev_layer_input, + layer_cache=self.state.cache['layer_{}'.format(i)] + if self.state.cache is not None else None, step=step) + if self.state.cache is None: + saved_inputs.append(all_input) + attns.append(attn) + + if self.state.cache is None: + saved_inputs = torch.stack(saved_inputs) + + output = self.layer_norm(output) + + # Process the result and update the attentions. 
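# Shape sketch (not part of the patch) for the TransformerDecoder above: it consumes target
# token ids plus an encoder memory bank, and init_state() must be called with the source ids
# (used only to build the source padding mask) before the forward pass.
import torch
from torch import nn

from modelscope.models.nlp.palm_v2.modeling_palm import TransformerDecoder

emb = nn.Embedding(100, 64, padding_idx=0)
decoder = TransformerDecoder(
    num_layers=2, d_model=64, heads=4, d_ff=256, dropout=0.1, embeddings=emb)

src = torch.randint(1, 100, (2, 9))          # source token ids, [batch, src_len]
memory_bank = torch.rand(2, 9, 64)           # encoder output, [batch, src_len, d_model]
tgt = torch.randint(1, 100, (2, 5))          # target token ids, [batch, tgt_len]

decoder.init_state(src)
output, attns = decoder(tgt, memory_bank)    # output: [2, 5, 64]; attns: one tensor per layer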
+ if self.state.cache is None: + self.state.update_state(tgt, saved_inputs) + + return output, attns + + +class PalmPointerGenerator(nn.Module): + + def __init__(self, hidden_size, vocab_size): + super().__init__() + self.dense = nn.Linear(hidden_size, vocab_size) + self.gen_func = nn.LogSoftmax(-1) + + def forward(self, x): + x = self.dense(x) + x = self.gen_func(x) + return x + + +class PalmPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = PalmConfig + base_model_prefix = 'palm' + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Optional[Union[str, + os.PathLike]], + **kwargs): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + config = PalmConfig.from_json_file(config_file) if os.path.isfile( + config_file) else PalmConfig() + config.encoder_pth = os.path.join(pretrained_model_name_or_path, + config.encoder_pth) + checkpoint_file = os.path.join(pretrained_model_name_or_path, + WEIGHTS_NAME) + checkpoint = torch.load(checkpoint_file) if os.path.isfile( + checkpoint_file) else None + return cls(config, checkpoint, **kwargs) + + +class AbsSummarizer(PalmPreTrainedModel): # Model + + def __init__(self, config, checkpoint=None): + super().__init__(config) + self.config = config + if config.encoder == 'bert' or config.encoder == 'zh_bert': + self.bert = BertModel( + BertConfig.from_pretrained(config.encoder_pth)) + elif config.encoder == 'roberta': + self.bert = RobertaModel( + RobertaConfig.from_pretrained(config.encoder_pth)) + + if (config.max_pos > 512): + my_pos_embeddings = nn.Embedding( + config.max_pos, self.bert.model.config.hidden_size) + my_pos_embeddings.weight.data[:512] = \ + self.bert.embeddings.position_embeddings.weight.data + my_pos_embeddings.weight.data[512:] = \ + self.bert.embeddings.position_embeddings.weight.data[-1][None, :].repeat(config.max_pos - 512, 1) + self.bert.model.embeddings.position_embeddings = my_pos_embeddings + self.vocab_size = self.bert.config.vocab_size + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=1 if config.encoder == 'roberta' else 0) + + if config.share_emb: + tgt_embeddings.weight = copy.deepcopy( + self.bert.model.embeddings.word_embeddings.weight) + self.decoder = TransformerDecoder( + config.dec_layers, + config.dec_hidden_size, + heads=config.dec_heads, + d_ff=config.dec_ff_size, + dropout=config.dec_dropout, + embeddings=tgt_embeddings) + self.generator = PalmPointerGenerator(config.dec_hidden_size, + self.vocab_size) + self.generator.dense.weight = self.decoder.embeddings.weight + + if checkpoint is not None: + for key in list(checkpoint['model'].keys()): + checkpoint['model'][key.replace('module.', + '')] = checkpoint['model'][key] + msg = self.load_state_dict(checkpoint['model'], strict=False) + print(msg) + else: + for module in self.decoder.modules(): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + for p in self.generator.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + p.data.zero_() + if config.use_bert_emb: + if config.encoder == 'roberta': + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + 
padding_idx=1) + else: + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=0) + tgt_embeddings.weight = copy.deepcopy( + self.bert.embeddings.word_embeddings.weight) + self.decoder.embeddings = tgt_embeddings + self.generator.dense.weight = self.decoder.embeddings.weight + + def forward(self, src, tgt, mask_src): + top_vec, _ = self.bert(src, mask_src, return_dict=False) + self.decoder.init_state(src) + decoder_outputs, attns = self.decoder(tgt[:, :-1], top_vec) + return decoder_outputs, attns[-1], top_vec + + +class LabelSmoothingLoss(nn.Module): + """ + With label smoothing, + KL-divergence between q_{smoothed ground truth prob.}(w) + and p_{prob. computed by model}(w) is minimized. + """ + + def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100): + assert 0.0 < label_smoothing <= 1.0 + self.padding_idx = ignore_index + super(LabelSmoothingLoss, self).__init__() + + smoothing_value = label_smoothing / (tgt_vocab_size - 2) + one_hot = torch.full((tgt_vocab_size, ), smoothing_value) + one_hot[self.padding_idx] = 0 + self.register_buffer('one_hot', one_hot.unsqueeze(0)) + self.confidence = 1.0 - label_smoothing + + def forward(self, output, target): + """ + output (FloatTensor): batch_size x n_classes + target (LongTensor): batch_size + """ + model_prob = self.one_hot.repeat(target.size(0), 1) + model_prob.scatter_(1, target.unsqueeze(1), self.confidence) + model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0) + + return F.kl_div(output, model_prob, reduction='sum') + + +class NMTLossCompute(nn.Module): + """ + Standard NMT Loss Computation. + """ + + def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0): + super().__init__() + self.generator = generator + self.padding_idx = symbols['PAD'] + if label_smoothing > 0: + self.criterion = LabelSmoothingLoss( + label_smoothing, vocab_size, ignore_index=self.padding_idx) + else: + self.criterion = nn.NLLLoss( + ignore_index=self.padding_idx, reduction='sum') + + def _bottle(self, _v): + return _v.view(-1, _v.size(2)) + + def _unbottle(self, _v, batch_size): + return _v.view(-1, batch_size, _v.size(1)) + + def forward(self, tgt, output): + target = tgt[:, 1:] + normalization = target.ne(self.padding_idx).sum() + bottled_output = self._bottle(output) + scores = self.generator(bottled_output) + gtruth = target.contiguous().view(-1) + loss = self.criterion(scores, gtruth) + loss.div(float(normalization)) + return loss + + +class PalmForConditionalGeneration(PalmPreTrainedModel): + + def __init__(self, config, checkpoint=None): + super().__init__(config) + self.config = config + if config.encoder == 'roberta': + tokenizer = RobertaTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=False) + symbols = { + 'BOS': tokenizer.cls_token_id, + 'EOS': tokenizer.sep_token_id, + 'PAD': tokenizer.pad_token_id, + 'EOQ': tokenizer.unk_token_id + } + elif config.encoder == 'bert' or config.encoder == 'zh_bert': + tokenizer = BertTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=True) + symbols = { + 'BOS': tokenizer.vocab['[CLS]'], + 'EOS': tokenizer.vocab['[SEP]'], + 'PAD': tokenizer.vocab['[PAD]'], + 'EOQ': tokenizer.vocab['[unused2]'] + } + self.tokenizer = tokenizer + self.symbols = symbols + self.palm = AbsSummarizer(config, checkpoint) + self.loss = NMTLossCompute(self.palm.generator, symbols, + self.palm.vocab_size, + config.label_smoothing) + + def forward(self, src, tgt, mask_src): + output = self.palm(src, tgt, mask_src)[0] + loss = 
self.loss(tgt, output) + return loss + + +class Translator(nn.Module): + """ + Uses a model to translate a batch of sentences. + """ + + @dataclass + class Batch: + batch_size: int + src: torch.Tensor + tgt: torch.Tensor + mask_src: torch.Tensor + query_id: List[None] = None + src_str: List[List[str]] = None + tgt_str: List[str] = None + + def __init__(self, + model: PalmForConditionalGeneration, + dataset: str = 'cnn'): + super().__init__() + self.logger = logging.get_logger(__name__) + self.args = model.config + self.args.dataset = dataset + self.model = model.palm + self.generator = self.model.generator + self.vocab = model.tokenizer + self.symbols = model.symbols + self.start_token = self.symbols['BOS'] + self.end_token = self.symbols['EOS'] + self.alpha = self.args.alpha + self.beam_size = self.args.beam_size + self.min_length = self.args.min_length + self.max_length = self.args.max_length + + def from_batch(self, translation_batch): + batch = translation_batch['batch'] + assert (len(translation_batch['gold_score']) == len( + translation_batch['predictions'])) + batch_size = batch.batch_size + + preds, pred_score, _, tgt_str, src, src_str = \ + translation_batch['predictions'], translation_batch['scores'], translation_batch['gold_score'], \ + batch.tgt_str, batch.src, batch.src_str + query_id = batch.query_id + ''' + try: + query_id = batch.query_id + except: + query_id = None + ''' + translations = [] + for b in range(batch_size): + if self.args.dataset == 'qg_ranking_test': + if self.args.encoder == 'bert' or self.args.encoder == 'zh_bert': + pred_sents = [ + ' '.join( + self.vocab.convert_ids_to_tokens( + [int(n) for n in each])).replace(' ##', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = [ + self.vocab.decode([int(n) for n in each + ]).replace('', + '').replace('', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = self.vocab.decode([int(n) + for n in preds[b][0]]).replace( + '', + '').replace('', '') + elif self.args.encoder == 'bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ' '.join(pred_sents).replace(' ##', '') + elif self.args.encoder == 'zh_bert' and self.args.dataset == 'paraphrase': + pred_sents = [ + self.vocab.convert_ids_to_tokens([int(n) for n in pred]) + for pred in preds[b] + ] + pred_sents = [ + ''.join(pred).replace(' ##', '') for pred in pred_sents + ] + elif self.args.encoder == 'zh_bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ''.join(pred_sents).replace('##', '') + gold_sent = tgt_str[b] + + if self.args.encoder == 'roberta': + raw_src = self.vocab.decode([int(t) for t in src[b]]) + raw_src = ' '.join(src_str[b]) + else: + raw_src = [self.vocab.ids_to_tokens[int(t)] + for t in src[b]][:500] + raw_src = ' '.join(raw_src) + if self.args.dataset == 'faq': + translation = (pred_sents, gold_sent, src_str[b], query_id[b], + pred_score[b]) + else: + translation = (pred_sents, gold_sent, raw_src, query_id[b], + pred_score[b]) + # translation = (pred_sents[0], gold_sent) + translations.append(translation) + + return translations + + def translate(self, data_iter, step): + gold_path = self.args.result_path + '.%d.gold' % step + can_path = self.args.result_path + '.%d.candidate' % step + self.gold_out_file = codecs.open(gold_path, 'w', 'utf-8') + self.can_out_file = codecs.open(can_path, 'w', 'utf-8') + self.pred_json_score_out_file = codecs.open(can_path + '.sample', 'w', + 'utf-8') + 
if self.args.dataset == 'paraphrase' and self.args.encoder == 'roberta': + out = '\t'.join([ + 'query_id', 'source_query', 'target_query', 'predict_query' + ]) + '\n' + self.pred_json_score_out_file.write(out) + + raw_src_path = self.args.result_path + '.%d.raw_src' % step + self.src_out_file = codecs.open(raw_src_path, 'w', 'utf-8') + + pred_results, gold_results = [], [] + cnt = 0 + pred_dict, ref_dict = {}, {} + for i, batch in enumerate(data_iter): + self.logger.info(f'data: {i + 1} / {len(data_iter)}') + batch_data = self.translate_batch(batch) + translations = self.from_batch(batch_data) + + for trans in translations: + pred, gold, src, query_id, pred_score = trans + src = src.replace('', '').replace('##', '').strip() + if self.args.dataset == 'qg_ranking_test': + pred_str = '\t'.join([ + each.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', + '').replace(r' +', ' ').replace( + '', + '').replace('', '').replace( + '', + '').replace('', '').replace( + '', ' ').strip() + for each in pred + ]) + else: + pred_str = pred.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', '').replace('[CLS]', '').replace( + '[SEP]', '').replace('[UNK]', '').strip() + pred_str = pred_str.replace(r' +', ' ').replace( + '', + '').replace('', '').replace('', '').replace( + '', '').replace('', ' ').strip() + gold_str = gold.replace('', '').strip().replace( + '[UNK]', '').replace('[unused1]', '').replace( + '[unused2]', + '').replace('##', '').replace('[CLS]', '').replace( + '[SEP]', '').strip().replace('', '').replace( + '', '').replace('', ' ').strip() + if (self.args.recall_eval): + _pred_str = '' + # gap = 1e3 + for sent in pred_str.split(''): + can_pred_str = _pred_str + '' + sent.strip() + # can_gap = math.fabs(len(_pred_str.split()) - len(gold_str.split())) + # if(can_gap>=gap): + if len(can_pred_str.split()) >= len( + gold_str.split()) + 10: + pred_str = _pred_str + break + else: + # gap = can_gap + _pred_str = can_pred_str + + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + pred_str = pred_str.replace('', ' ') + if query_id is not None: + pred_json = { + 'query_id': query_id, + 'answers': [pred_str] + } + gold_json = { + 'query_id': query_id, + 'answers': [gold_str] + } + pred_json_score = { + 'query_id': query_id, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + else: + pred_json = {'query_id': cnt, 'answers': [pred_str]} + gold_json = {'query_id': cnt, 'answers': [gold_str]} + pred_json_score = { + 'query_id': cnt, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + json.dump(pred_json, self.can_out_file) + self.can_out_file.write('\n') + json.dump(gold_json, self.gold_out_file) + self.gold_out_file.write('\n') + json.dump(pred_json_score, self.pred_json_score_out_file) + self.pred_json_score_out_file.write('\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'cnn': + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write(gold_str + '\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'dureader': + if query_id is None: + query_id = str(cnt) + pred_results.extend(normalize([pred_str])) + gold_results.extend(normalize([gold_str])) + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write('\t'.join([src[0], gold_str]) + + '\n') + + 
elif self.args.dataset == 'paraphrase': + if query_id is None: + query_id = str(cnt) + if self.args.encoder == 'roberta': + pred_str = [pred_str] + pred_dict[query_id] = normalize([pred_str[0]]) + ref_dict[query_id] = normalize([gold_str]) + # pred_str_list = [src] + pred_str + # self.can_out_file.write("\t".join(pred_str_list)+"\n") + # self.can_out_file.write("\t".join(pred_str_list)+"\n") + # self.gold_out_file.write("\t".join([src, pred_str[0], gold_str])+"\n") + self.pred_json_score_out_file.write( + '\t'.join([str(query_id), src, gold_str, pred_str[0]]) + + '\n') + elif self.args.dataset == 'faq': + if pred_score[0].cpu().numpy().tolist() < -3.5: + continue + self.can_out_file.write( + '\t'.join([str(query_id), src, pred_str]) + '\n') + self.gold_out_file.write( + '\t'.join([str(query_id), src, gold_str]) + '\n') + # passage, answer, question, score + self.pred_json_score_out_file.write('\t'.join([ + str(query_id), gold_str, src, pred_str, + str(pred_score[0].cpu().numpy().tolist()) + ]) + '\n') + elif self.args.dataset == 'qg_ranking_test': + self.can_out_file.write( + str(query_id) + '\t' + pred_str + '\n') + + cnt += 1 + self.can_out_file.flush() + self.gold_out_file.flush() + self.src_out_file.flush() + self.logger.info('cnt: %s' % cnt) + self.can_out_file.close() + self.gold_out_file.close() + self.src_out_file.close() + + if (step != -1): + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + cnn_results = subprocess.getoutput( + './run.sh %s %s' % (gold_path, can_path)) # run.sh ... + self.logger.info(cnn_results) + elif self.args.dataset == 'cnn': + self.logger.info('Calculating Rouge') + from rouge import Rouge + candidates = [ + line.strip() for line in open(can_path, encoding='utf-8') + ] + references = [ + line.strip() for line in open(gold_path, encoding='utf-8') + ] + rouge_score = Rouge().get_scores( + candidates, references, avg=True) + # self.logger.info('Rouges at step %d \n%s' % (step, rouge_results_to_str(rouges))) + print(rouge_score) + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + + def postprocess_text(preds, labels): + preds = [pred.strip().replace('.', '') for pred in preds] + labels = [label.strip() for label in labels] + while '' in preds: + idx = preds.index('') + preds[idx] = '。' + return preds, labels + + # bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) + # self.logger.info('Dev eval result: {}'.format(bleu_rouge)) + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + pred_dict = {str(i): tmp for i, tmp in enumerate(pred_results)} + gold_dict = {str(i): tmp for i, tmp in enumerate(gold_results)} + bleu_rouge = compute_bleu_rouge(pred_dict, gold_dict) + print(bleu_rouge) + # unreachable + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + # bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) + # self.logger.info('Dev eval result: {}'.format(bleu_rouge)) + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + bleu_score = cal_bleu(pred_results, gold_results) + from rouge import Rouge + rouge = Rouge() + rouge_score = rouge.get_scores( + pred_results, gold_results, avg=True) + print("'Dev eval result: Bleu-4={}, {}".format( + bleu_score, rouge_score)) + + def translate_batch(self, batch: 'Batch', fast: bool = False): + """ + Translate a batch of sentences. + + Mostly a wrapper around :obj:`Beam`. 
+ + Args: + batch (:obj:`Batch`): a batch from a dataset object + data (:obj:`Dataset`): the dataset object + fast (bool): enables fast beam search (may not support all features) + + Todo: + Shouldn't need the original dataset. + """ + self.model.eval() + with torch.no_grad(): + return self._fast_translate_batch( + batch, self.max_length, min_length=self.min_length) + + def _tile(self, x, count, dim=0): + perm = list(range(len(x.size()))) + if dim != 0: + perm[0], perm[dim] = perm[dim], perm[0] + x = x.permute(perm).contiguous() + out_size = list(x.size()) + out_size[0] *= count + batch = x.size(0) + x = x.view(batch, -1) \ + .transpose(0, 1) \ + .repeat(count, 1) \ + .transpose(0, 1) \ + .contiguous() \ + .view(*out_size) + if dim != 0: + x = x.permute(perm).contiguous() + return x + + def _top_k_top_p_filtering(self, + logits, + top_k=10, + top_p=1.0, + filter_value=-float('Inf'), + min_tokens_to_keep=1): + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), + logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + def _fast_translate_batch(self, + batch: 'Batch', + max_length: int, + min_length: int = 0): + # TODO: faster code path for beam_size == 1. + # TODO: support these blacklisted features. + + beam_size = self.beam_size + batch_size = batch.batch_size + src = batch.src + mask_src = batch.mask_src + + src_features, _ = self.model.bert(src, mask_src, return_dict=False) + self.model.decoder.init_state(src, with_cache=True) + device = src_features.device + + # Tile states and memory beam_size times. + self.model.decoder.state.map_batch_fn( + lambda state, dim: self._tile(state, beam_size, dim=dim)) + src_features = self._tile(src_features, beam_size, dim=0) + batch_offset = torch.arange( + batch_size, dtype=torch.long, device=device) + beam_offset = torch.arange( + 0, + batch_size * beam_size, + step=beam_size, + dtype=torch.long, + device=device) + alive_seq = torch.full([batch_size * beam_size, 1], + self.start_token, + dtype=torch.long, + device=device) + + # Give full probability to the first beam on the first step. + topk_log_probs = ( + torch.tensor( + [0.0] + [float('-inf')] * (beam_size - 1), + device=device).repeat(batch_size)) + + # Structure that holds finished hypotheses. 
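Editor's note: `_top_k_top_p_filtering` above applies the standard top-k / nucleus (top-p) trick, pushing logits outside the kept set to `-inf` before sampling. A small self-contained sketch of the top-k part on toy logits (values are made up):

```python
import torch
import torch.nn.functional as F

# Toy logits for a 5-token vocabulary; numbers are invented.
logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])
top_k = 3

# Keep only the top_k largest logits, mirroring the indices_to_remove logic above.
kth_best = torch.topk(logits, top_k)[0][..., -1, None]
filtered = logits.masked_fill(logits < kth_best, float('-inf'))

probs = F.softmax(filtered, dim=-1)              # probability mass only on the 3 best tokens
next_token = torch.multinomial(probs, num_samples=1)
```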
+ hypotheses = [[] for _ in range(batch_size)] # noqa: F812 + + results = {} + results['predictions'] = [[] for _ in range(batch_size)] # noqa: F812 + results['scores'] = [[] for _ in range(batch_size)] # noqa: F812 + results['gold_score'] = [0] * batch_size + results['batch'] = batch + + for step in range(max_length): + self.logger.info(f'step: {step + 1} / {max_length}') + decoder_input = alive_seq[:, -1].view(1, -1) + + # Decoder forward. + decoder_input = decoder_input.transpose(0, 1) + dec_out, attns = self.model.decoder( + decoder_input, src_features, step=step) + + # Generator forward. + log_probs = self.generator.forward( + dec_out.transpose(0, 1).squeeze(0)) + vocab_size = log_probs.size(-1) + + if step < min_length: + log_probs[:, self.end_token] = -1e20 + + # Multiply probs by the beam probability. + + length_penalty = ((5.0 + (step + 1)) / 6.0)**self.alpha + # ''' + if self.args.sample_topk: + temperature = self.args.temperature + _scores = log_probs / temperature + _scores = self._top_k_top_p_filtering( + _scores, + top_k=self.args.top_k, + top_p=self.args.top_p, + min_tokens_to_keep=1 + ) # (batch_size * num_beams, vocab_size) + # Sample 2 next words for each beam (so we have some spare tokens + # and match output of greedy beam search) + topk_ids = torch.multinomial( + F.softmax(_scores, dim=-1), + num_samples=1) # (batch_size * num_beams, 2) + # Compute next scores + _scores = F.log_softmax( + _scores, dim=1) # (batch_size * num_beams, vocab_size) + + _scores += topk_log_probs.view(-1).unsqueeze(1) + _scores = _scores / length_penalty + topk_scores = torch.gather( + _scores, -1, topk_ids) # (batch_size * num_beams, 2) + # log_probs += # (batch_size * num_beams, 2) + # Match shape of greedy beam search + topk_ids = topk_ids.view( + -1, beam_size) # (batch_size, 2 * num_beams) + topk_scores = topk_scores.view( + -1, beam_size) # (batch_size, 2 * num_beams) + # ''' + else: + log_probs += topk_log_probs.view(-1).unsqueeze(1) + curr_scores = log_probs / length_penalty + + curr_scores = curr_scores.reshape(-1, beam_size * vocab_size) + topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1) + if self.args.block_trigram: + cur_len = alive_seq.size(1) + if cur_len > 3: + for i in range(alive_seq.size(0)): + fail = False + words = [int(w) for w in alive_seq[i]] + if self.args.encoder == 'roberta': + # words = [self.vocab.convert_ids_to_tokens[w] for w in words] + words = self.vocab.decode(words).strip().split() + else: + words = [ + self.vocab.ids_to_tokens[w] for w in words + ] + words = ' '.join(words).replace(' ##', '').split() + if len(words) <= 3: + continue + trigrams = [(words[i - 1], words[i], words[i + 1]) + for i in range(1, + len(words) - 1)] + trigram = tuple(trigrams[-1]) + if trigram in trigrams[:-1]: + fail = True + if fail: + curr_scores[i] = -10e20 + # Recover log probs. + topk_log_probs = topk_scores * length_penalty + + # Resolve beam origin and true word ids. + # topk_beam_index = topk_ids.div(vocab_size) + topk_beam_index = topk_ids // vocab_size + topk_ids = topk_ids.fmod(vocab_size) + + # Map beam_index to batch_index in the flat representation. + batch_index = ( + topk_beam_index + + beam_offset[:topk_beam_index.size(0)].unsqueeze(1)) + select_indices = batch_index.view(-1) + + # Append last prediction. + alive_seq = torch.cat([ + alive_seq.index_select(0, select_indices), + topk_ids.view(-1, 1) + ], -1) + + is_finished = topk_ids.eq(self.end_token) + if step + 1 == max_length: + is_finished.fill_(self.end_token) + # End condition is top beam is finished. 
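Editor's note: the beam scores above are accumulated log-probabilities divided by the GNMT-style length penalty `lp(step) = ((5 + step + 1) / 6) ** alpha`. A short worked example (the `alpha` value here is hypothetical; the real one comes from the model config):

```python
# Worked example of the length penalty used in _fast_translate_batch above.
alpha = 0.6  # hypothetical; read from model.config.alpha in the real code

for step in (0, 4, 9):
    lp = ((5.0 + (step + 1)) / 6.0) ** alpha
    print(f'step={step + 1:2d}  length_penalty={lp:.3f}')

# The penalty grows with hypothesis length, so dividing the summed
# log-probabilities by it keeps longer candidates competitive with short ones.
```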
+ end_condition = is_finished[:, 0].eq(1) + # Save finished hypotheses. + if is_finished.any(): + predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1)) + for i in range(is_finished.size(0)): + b = batch_offset[i] + if end_condition[i]: + is_finished[i].fill_(self.end_token) + finished_hyp = is_finished[i].nonzero().view(-1) + # Store finished hypotheses for this batch. + for j in finished_hyp: + hypotheses[b].append( + (topk_scores[i, j], predictions[i, j, 1:])) + # If the batch reached the end, save the n_best hypotheses. + if end_condition[i]: + best_hyp = sorted( + hypotheses[b], key=lambda x: x[0], reverse=True) + if self.args.dataset == 'qg_ranking_test' or ( + self.args.dataset == 'paraphrase' + and not self.args.sample_topk): + for each in best_hyp[:beam_size]: + score, pred = each + results['scores'][b].append(score) + results['predictions'][b].append(pred) + else: + score, pred = best_hyp[0] + results['scores'][b].append(score) + results['predictions'][b].append(pred) + non_finished = end_condition.eq(0).nonzero().view(-1) + # If all sentences are translated, no need to go further. + if len(non_finished) == 0: + break + # Remove finished batches for the next step. + topk_log_probs = topk_log_probs.index_select(0, non_finished) + batch_index = batch_index.index_select(0, non_finished) + batch_offset = batch_offset.index_select(0, non_finished) + alive_seq = predictions.index_select(0, non_finished) \ + .view(-1, alive_seq.size(-1)) + # Reorder states. + select_indices = batch_index.view(-1) + src_features = src_features.index_select(0, select_indices) + self.model.decoder.state.map_batch_fn( + lambda state, dim: state.index_select(dim, select_indices)) + + return results + + def forward(self, input_ids: torch.Tensor, + attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: + batch = self.Batch( + batch_size=input_ids.size()[0], + src=input_ids, + tgt=None, + mask_src=attention_mask) + translation_batch = self.translate_batch(batch) + + preds = translation_batch['predictions'] + return {'predictions': preds} diff --git a/modelscope/models/nlp/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py similarity index 96% rename from modelscope/models/nlp/palm_for_text_generation.py rename to modelscope/models/nlp/palm_v2/palm_for_text_generation.py index 23d60663..7f8e918b 100644 --- a/modelscope/models/nlp/palm_for_text_generation.py +++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py @@ -22,8 +22,8 @@ class PalmForTextGeneration(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - from sofa.models.palm_v2 import (PalmForConditionalGeneration, - Translator) + from modelscope.models.nlp.palm_v2 import ( + PalmForConditionalGeneration, Translator) self.model = PalmForConditionalGeneration.from_pretrained(model_dir) self.tokenizer = self.model.tokenizer self.generator = Translator(self.model) diff --git a/modelscope/models/nlp/sbert_for_nli.py b/modelscope/models/nlp/sbert_for_nli.py deleted file mode 100644 index ea62a8bd..00000000 --- a/modelscope/models/nlp/sbert_for_nli.py +++ /dev/null @@ -1,23 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForNLI'] - - -@MODELS.register_module(Tasks.nli, module_name=Models.structbert) -class SbertForNLI(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, 
**kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__( - model_dir, *args, model_args={'num_labels': 3}, **kwargs) - assert self.model.config.num_labels == 3 diff --git a/modelscope/models/nlp/sbert_for_sentence_similarity.py b/modelscope/models/nlp/sbert_for_sentence_similarity.py deleted file mode 100644 index 00b612ea..00000000 --- a/modelscope/models/nlp/sbert_for_sentence_similarity.py +++ /dev/null @@ -1,25 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForSentenceSimilarity'] - - -@MODELS.register_module( - Tasks.sentence_similarity, module_name=Models.structbert) -class SbertForSentenceSimilarity(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the sentence similarity model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__( - model_dir, *args, model_args={'num_labels': 2}, **kwargs) - self.model_dir = model_dir - assert self.model.config.num_labels == 2 diff --git a/modelscope/models/nlp/sbert_for_sentiment_classification.py b/modelscope/models/nlp/sbert_for_sentiment_classification.py deleted file mode 100644 index 83ac93c5..00000000 --- a/modelscope/models/nlp/sbert_for_sentiment_classification.py +++ /dev/null @@ -1,22 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForSentimentClassification'] - - -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.structbert) -class SbertForSentimentClassification(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - super().__init__( - model_dir, *args, model_args={'num_labels': 2}, **kwargs) - assert self.model.config.num_labels == 2 diff --git a/modelscope/models/nlp/sbert_for_sequence_classification.py b/modelscope/models/nlp/sbert_for_sequence_classification.py deleted file mode 100644 index 59fcf6fa..00000000 --- a/modelscope/models/nlp/sbert_for_sequence_classification.py +++ /dev/null @@ -1,82 +0,0 @@ -import os -from typing import Any, Dict - -import json -import numpy as np -import torch -from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel -from torch import nn - -from modelscope.models import TorchModel - - -class SbertTextClassfier(SbertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - self.encoder = SbertModel(config, add_pooling_layer=True) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - def forward(self, - input_ids=None, - token_type_ids=None, - labels=None, - **kwargs): - outputs = self.encoder( - input_ids, - token_type_ids=token_type_ids, - return_dict=None, - ) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return {'logits': logits, 'loss': loss} - return {'logits': logits} - - def build(**kwags): - return SbertTextClassfier.from_pretrained(model_dir, **model_args) - - -class SbertForSequenceClassificationBase(TorchModel): - - def __init__(self, model_dir: str, model_args=None, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - if model_args is None: - model_args = {} - self.model = SbertTextClassfier.from_pretrained( - model_dir, **model_args) - self.id2label = {} - self.label_path = os.path.join(self.model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - input_ids = torch.tensor(input['input_ids'], dtype=torch.long) - token_type_ids = torch.tensor( - input['token_type_ids'], dtype=torch.long) - return self.model.forward(input_ids, token_type_ids) - - def postprocess(self, input, **kwargs): - logits = input['logits'] - probs = logits.softmax(-1).cpu().numpy() - pred = logits.argmax(-1).cpu().numpy() - logits = logits.cpu().numpy() - res = {'predictions': pred, 'probabilities': probs, 'logits': logits} - return res diff --git a/modelscope/models/nlp/sbert_for_token_classification.py b/modelscope/models/nlp/sbert_for_token_classification.py deleted file mode 100644 index 748c4107..00000000 --- a/modelscope/models/nlp/sbert_for_token_classification.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SbertForTokenClassification'] - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) -class 
SbertForTokenClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the word segmentation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__(model_dir, *args, **kwargs) - self.model_dir = model_dir - import sofa - self.model = sofa.SbertForTokenClassification.from_pretrained( - self.model_dir) - self.config = sofa.SbertConfig.from_pretrained(self.model_dir) - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, - Any]) -> Dict[str, Union[str, np.ndarray]]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, Union[str,np.ndarray]]: results - Example: - { - 'predictions': array([1,4]), # lable 0-negative 1-positive - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - 'text': str(今天), - } - """ - input_ids = torch.tensor(input['input_ids']).unsqueeze(0) - return {**self.model(input_ids), 'text': input['text']} - - def postprocess(self, input: Dict[str, Tensor], - **kwargs) -> Dict[str, Tensor]: - logits = input['logits'] - pred = torch.argmax(logits[0], dim=-1) - pred = pred.cpu().numpy() - rst = {'predictions': pred, 'logits': logits, 'text': input['text']} - return rst diff --git a/modelscope/models/nlp/sbert_for_zero_shot_classification.py b/modelscope/models/nlp/sbert_for_zero_shot_classification.py deleted file mode 100644 index b772cf45..00000000 --- a/modelscope/models/nlp/sbert_for_zero_shot_classification.py +++ /dev/null @@ -1,50 +0,0 @@ -from typing import Any, Dict - -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SbertForZeroShotClassification'] - - -@MODELS.register_module( - Tasks.zero_shot_classification, module_name=Models.structbert) -class SbertForZeroShotClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the zero shot classification model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - - super().__init__(model_dir, *args, **kwargs) - from sofa import SbertForSequenceClassification - self.model = SbertForSequenceClassification.from_pretrained(model_dir) - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - outputs = self.model(**input) - logits = outputs['logits'].cpu().numpy() - res = {'logits': logits} - return res diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index 4920c6ff..5550d749 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -1,85 +1,174 @@ -import os -from typing import Any, Dict +from abc import abstractmethod -import json -import numpy as np +from torch import nn -from modelscope.metainfo import TaskModels +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import SbertPreTrainedModel +from modelscope.models.nlp.veco import \ + VecoForSequenceClassification as VecoForSequenceClassificationTransform from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks -from .task_model import SingleBackboneTaskModelBase +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) -__all__ = ['SequenceClassificationModel'] +__all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification'] -@MODELS.register_module( - Tasks.sentiment_classification, module_name=TaskModels.text_classification) -@MODELS.register_module( - Tasks.text_classification, module_name=TaskModels.text_classification) -class SequenceClassificationModel(SingleBackboneTaskModelBase): +class SequenceClassificationBase(TorchModel): + base_model_prefix: str = 'bert' + + def __init__(self, config, model_dir): + super().__init__(model_dir) + self.num_labels = config.num_labels + self.config = config + setattr(self, self.base_model_prefix, self.build_base_model()) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the sequence classification model from the `model_dir` path. + @abstractmethod + def build_base_model(self): + """Build the backbone model. - Args: - model_dir (str): the model path. + Returns: the backbone instance. 
""" - super().__init__(model_dir, *args, **kwargs) - if 'base_model_prefix' in kwargs: - self._base_model_prefix = kwargs['base_model_prefix'] - - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - - # get the num_labels from label_mapping.json - self.id2label = {} - self.label_path = os.path.join(model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - head_cfg['num_labels'] = len(self.label_mapping) - - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - outputs = self.head.forward(pooled_output) - if 'labels' in input: - loss = self.compute_loss(outputs, input['labels']) - outputs.update(loss) - return outputs - - def extract_logits(self, outputs): - return outputs[OutputKeys.LOGITS].cpu().detach() - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - if hasattr(self.backbone, 'extract_pooled_outputs'): - pooled_output = self.backbone.extract_pooled_outputs(outputs) - return sequence_output, pooled_output - - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss + pass + + @property + def base_model(self): + return getattr(self, self.base_model_prefix) + + def forward(self, **kwargs): + labels = None + if OutputKeys.LABEL in kwargs: + labels = kwargs.pop(OutputKeys.LABEL) + elif OutputKeys.LABELS in kwargs: + labels = kwargs.pop(OutputKeys.LABELS) + + outputs = self.base_model.forward(**kwargs) + + # backbone model should return pooled_output as its second output + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} + return {OutputKeys.LOGITS: logits} def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - probs = logits.softmax(-1).numpy() - pred = logits.argmax(-1).numpy() - logits = logits.numpy() + logits = input[OutputKeys.LOGITS] + probs = torch_nested_numpify(torch_nested_detach(logits.softmax(-1))) + pred = torch_nested_numpify(torch_nested_detach(logits.argmax(-1))) + logits = torch_nested_numpify(torch_nested_detach(logits)) res = { OutputKeys.PREDICTIONS: pred, OutputKeys.PROBABILITIES: probs, OutputKeys.LOGITS: logits } return res + + +@MODELS.register_module( + Tasks.sentence_similarity, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.nli, module_name=Models.structbert) +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.structbert) +class SbertForSequenceClassification(SequenceClassificationBase, + SbertPreTrainedModel): + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + SbertForSequenceClassification.base_model_prefix = 
config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .structbert import SbertModel + return SbertModel(self.config, add_pooling_layer=True) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + labels=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(SbertPreTrainedModel, + SbertForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) + + +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.veco) +@MODELS.register_module(Tasks.nli, module_name=Models.veco) +class VecoForSequenceClassification(TorchModel, + VecoForSequenceClassificationTransform): + + def __init__(self, config, model_dir): + super().__init__(model_dir) + VecoForSequenceClassificationTransform.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + **kwargs): + return VecoForSequenceClassificationTransform.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + labels=labels) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(VecoForSequenceClassificationTransform, + VecoForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/space/__init__.py b/modelscope/models/nlp/space/__init__.py new file mode 100644 index 00000000..45f856c1 --- /dev/null +++ b/modelscope/models/nlp/space/__init__.py @@ -0,0 +1,28 @@ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .model import SpaceGenerator + from .model import SpaceModelBase, SpaceTokenizer, SpaceConfig + from .space_for_dialog_intent_prediction import SpaceForDialogIntent + from .space_for_dialog_modeling import SpaceForDialogModeling + from .space_for_dialog_state_tracking import SpaceForDialogStateTracking +else: + _import_structure = { + 'model': + ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer', 'SpaceConfig'], + 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], + 'space_for_dialog_modeling': ['SpaceForDialogModeling'], + 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + } + + import sys + + 
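Editor's note: the `_instantiate` methods above infer `num_labels` from the label mapping shipped with the model when the caller does not pass it explicitly. A hedged sketch of that logic with a made-up mapping (the real mapping is what `parse_label_mapping` reads from the model directory):

```python
# Hypothetical label mapping, e.g. the content of a label_mapping.json.
label2id = {'negative': 0, 'neutral': 1, 'positive': 2}

num_labels = None  # caller did not specify it
if num_labels is None and label2id is not None and len(label2id) > 0:
    num_labels = len(label2id)

model_args = {} if num_labels is None else {'num_labels': num_labels}
print(model_args)  # -> {'num_labels': 3}
```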
sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py new file mode 100644 index 00000000..24641f06 --- /dev/null +++ b/modelscope/models/nlp/space/model/__init__.py @@ -0,0 +1,10 @@ +from .configuration_space import SpaceConfig +from .gen_unified_transformer import GenUnifiedTransformer +from .generator import Generator as SpaceGenerator +from .intent_unified_transformer import IntentUnifiedTransformer +from .model_base import SpaceModelBase +from .modeling_space import (SpaceForDST, SpaceForMaskedLM, + SpaceForPreTraining, SpaceModel) +from .tokenization_space import (BasicTokenizer, SpaceTokenizer, + WordpieceTokenizer) +from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/space/model/configuration_space.py b/modelscope/models/nlp/space/model/configuration_space.py new file mode 100644 index 00000000..0da2d629 --- /dev/null +++ b/modelscope/models/nlp/space/model/configuration_space.py @@ -0,0 +1,32 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors. +# Copyright 2020 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Space configuration, mainly copied from :class:`~transformers.configuration_xlm_roberta` """ + +from modelscope.models.nlp.structbert import SbertConfig +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class SpaceConfig(SbertConfig): + """ + This class overrides [`SbertConfig`]. Please check the superclass for the appropriate + documentation alongside usage examples. 
+ """ + + model_type = 'space' diff --git a/modelscope/models/nlp/backbones/space/model/gen_unified_transformer.py b/modelscope/models/nlp/space/model/gen_unified_transformer.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/gen_unified_transformer.py rename to modelscope/models/nlp/space/model/gen_unified_transformer.py diff --git a/modelscope/models/nlp/backbones/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/generator.py rename to modelscope/models/nlp/space/model/generator.py diff --git a/modelscope/models/nlp/backbones/space/model/intent_unified_transformer.py b/modelscope/models/nlp/space/model/intent_unified_transformer.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/intent_unified_transformer.py rename to modelscope/models/nlp/space/model/intent_unified_transformer.py diff --git a/modelscope/models/nlp/backbones/space/model/model_base.py b/modelscope/models/nlp/space/model/model_base.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/model_base.py rename to modelscope/models/nlp/space/model/model_base.py diff --git a/modelscope/models/nlp/space/model/modeling_space.py b/modelscope/models/nlp/space/model/modeling_space.py new file mode 100644 index 00000000..f093cbc5 --- /dev/null +++ b/modelscope/models/nlp/space/model/modeling_space.py @@ -0,0 +1,268 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Space model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.file_utils import add_start_docstrings + +from modelscope.models.nlp.structbert.modeling_sbert import ( + SbertForMaskedLM, SbertModel, SbertPreTrainedModel) +from .configuration_space import SpaceConfig + +SPACE_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`SpaceConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. 
+""" + + +@add_start_docstrings( + 'The bare Space Model transformer outputting raw hidden-states without any specific head on top. ' + 'It is identical with the Bert Model from Transformers', + SPACE_START_DOCSTRING, +) +class SpaceModel(SbertModel): + """ + This class overrides [`SbertModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = SpaceConfig + + +@add_start_docstrings( + """ + Space Model transformer with Dialog state tracking heads on top (a inform projection + layer with a dialog state layer and a set of slots including history infromation from + previous dialog) e.g. for multiwoz2.2 tasks. + """, + SPACE_START_DOCSTRING, +) +class SpaceForDST(SbertPreTrainedModel): + + def __init__(self, config): + super(SpaceForDST, self).__init__(config) + self.slot_list = config.dst_slot_list + self.class_types = config.dst_class_types + self.class_labels = config.dst_class_labels + self.token_loss_for_nonpointable = config.dst_token_loss_for_nonpointable + self.refer_loss_for_nonpointable = config.dst_refer_loss_for_nonpointable + self.class_aux_feats_inform = config.dst_class_aux_feats_inform + self.class_aux_feats_ds = config.dst_class_aux_feats_ds + self.class_loss_ratio = config.dst_class_loss_ratio + + # Only use refer loss if refer class is present in dataset. + if 'refer' in self.class_types: + self.refer_index = self.class_types.index('refer') + else: + self.refer_index = -1 + + self.bert = SpaceModel(config) + self.dropout = nn.Dropout(config.dst_dropout_rate) + self.dropout_heads = nn.Dropout(config.dst_heads_dropout_rate) + + if self.class_aux_feats_inform: + self.add_module( + 'inform_projection', + nn.Linear(len(self.slot_list), len(self.slot_list))) + if self.class_aux_feats_ds: + self.add_module( + 'ds_projection', + nn.Linear(len(self.slot_list), len(self.slot_list))) + + aux_dims = len(self.slot_list) * ( + self.class_aux_feats_inform + self.class_aux_feats_ds + ) # second term is 0, 1 or 2 + + for slot in self.slot_list: + self.add_module( + 'class_' + slot, + nn.Linear(config.hidden_size + aux_dims, self.class_labels)) + self.add_module('token_' + slot, nn.Linear(config.hidden_size, 2)) + self.add_module( + 'refer_' + slot, + nn.Linear(config.hidden_size + aux_dims, + len(self.slot_list) + 1)) + + self.init_weights() + + def forward(self, + input_ids, + input_mask=None, + segment_ids=None, + position_ids=None, + head_mask=None, + start_pos=None, + end_pos=None, + inform_slot_id=None, + refer_id=None, + class_label_id=None, + diag_state=None): + outputs = self.bert( + input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + pooled_output = outputs[1] + + sequence_output = self.dropout(sequence_output) + pooled_output = self.dropout(pooled_output) + + # TODO: establish proper format in labels already? 
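Editor's note: a shape sketch of the per-slot heads built in `SpaceForDST.__init__` above. The slot names and sizes are illustrative; only the `hidden_size + aux_dims` wiring follows the code:

```python
import torch
from torch import nn

# Illustrative setup: 4 slots, hidden_size 8, both aux feature types enabled.
slot_list = ['hotel-area', 'hotel-name', 'train-day', 'train-leaveat']
hidden_size, class_labels = 8, 3
aux_dims = len(slot_list) * (1 + 1)                 # inform + dialog-state features

class_head = nn.Linear(hidden_size + aux_dims, class_labels)
token_head = nn.Linear(hidden_size, 2)              # start / end logits per token
refer_head = nn.Linear(hidden_size + aux_dims, len(slot_list) + 1)

pooled = torch.randn(2, hidden_size)                # batch of 2 dialog turns
aux = torch.randn(2, aux_dims)
class_logits = class_head(torch.cat((pooled, aux), dim=1))  # -> (2, class_labels)
```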
+ if inform_slot_id is not None: + inform_labels = torch.stack(list(inform_slot_id.values()), + 1).float() + if diag_state is not None: + diag_state_labels = torch.clamp( + torch.stack(list(diag_state.values()), 1).float(), 0.0, 1.0) + + total_loss = 0 + per_slot_per_example_loss = {} + per_slot_class_logits = {} + per_slot_start_logits = {} + per_slot_end_logits = {} + per_slot_refer_logits = {} + for slot in self.slot_list: + if self.class_aux_feats_inform and self.class_aux_feats_ds: + pooled_output_aux = torch.cat( + (pooled_output, self.inform_projection(inform_labels), + self.ds_projection(diag_state_labels)), 1) + elif self.class_aux_feats_inform: + pooled_output_aux = torch.cat( + (pooled_output, self.inform_projection(inform_labels)), 1) + elif self.class_aux_feats_ds: + pooled_output_aux = torch.cat( + (pooled_output, self.ds_projection(diag_state_labels)), 1) + else: + pooled_output_aux = pooled_output + class_logits = self.dropout_heads( + getattr(self, 'class_' + slot)(pooled_output_aux)) + + token_logits = self.dropout_heads( + getattr(self, 'token_' + slot)(sequence_output)) + start_logits, end_logits = token_logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + refer_logits = self.dropout_heads( + getattr(self, 'refer_' + slot)(pooled_output_aux)) + + per_slot_class_logits[slot] = class_logits + per_slot_start_logits[slot] = start_logits + per_slot_end_logits[slot] = end_logits + per_slot_refer_logits[slot] = refer_logits + + # If there are no labels, don't compute loss + if class_label_id is not None and start_pos is not None and end_pos is not None and refer_id is not None: + # If we are on multi-GPU, split add a dimension + if len(start_pos[slot].size()) > 1: + start_pos[slot] = start_pos[slot].squeeze(-1) + if len(end_pos[slot].size()) > 1: + end_pos[slot] = end_pos[slot].squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) # This is a single index + start_pos[slot].clamp_(0, ignored_index) + end_pos[slot].clamp_(0, ignored_index) + + class_loss_fct = CrossEntropyLoss(reduction='none') + token_loss_fct = CrossEntropyLoss( + reduction='none', ignore_index=ignored_index) + refer_loss_fct = CrossEntropyLoss(reduction='none') + + start_loss = token_loss_fct(start_logits, start_pos[slot]) + end_loss = token_loss_fct(end_logits, end_pos[slot]) + token_loss = (start_loss + end_loss) / 2.0 + + token_is_pointable = (start_pos[slot] > 0).float() + if not self.token_loss_for_nonpointable: + token_loss *= token_is_pointable + + refer_loss = refer_loss_fct(refer_logits, refer_id[slot]) + token_is_referrable = torch.eq(class_label_id[slot], + self.refer_index).float() + if not self.refer_loss_for_nonpointable: + refer_loss *= token_is_referrable + + class_loss = class_loss_fct(class_logits, class_label_id[slot]) + + if self.refer_index > -1: + per_example_loss = (self.class_loss_ratio) * class_loss + ( + (1 - self.class_loss_ratio) / 2) * token_loss + ( + (1 - self.class_loss_ratio) / 2) * refer_loss + else: + per_example_loss = self.class_loss_ratio * class_loss + ( + 1 - self.class_loss_ratio) * token_loss + + total_loss += per_example_loss.sum() + per_slot_per_example_loss[slot] = per_example_loss + + # add hidden states and attention if they are here + outputs = (total_loss, ) + ( + per_slot_per_example_loss, + per_slot_class_logits, + per_slot_start_logits, + per_slot_end_logits, + per_slot_refer_logits, + ) + outputs[2:] + + return 
outputs
+
+
+@add_start_docstrings(
+    'The Space Model with a `language modeling` head on top',
+    SPACE_START_DOCSTRING,
+)
+class SpaceForMaskedLM(SbertForMaskedLM):
+    """
+    This class overrides [`SbertForMaskedLM`]. Please check the superclass for the
+    appropriate documentation alongside usage examples.
+    """
+
+    config_class = SpaceConfig
+
+
+@add_start_docstrings(
+    """
+    Space Model with only one head on top as done during the pretraining: a `masked language modeling` head.
+    """,
+    SPACE_START_DOCSTRING,
+)
+class SpaceForPreTraining(SbertPreTrainedModel):
+
+    def __init__(self, model_name_or_path: str):
+        super(SpaceForPreTraining, self).__init__()
+        self.bert_model = SpaceForMaskedLM.from_pretrained(model_name_or_path)
+
+    def forward(self, input_ids: torch.Tensor, mlm_labels: torch.Tensor):
+        outputs = self.bert_model(input_ids, masked_lm_labels=mlm_labels)
+        return outputs[0]
diff --git a/modelscope/models/nlp/space/model/tokenization_space.py b/modelscope/models/nlp/space/model/tokenization_space.py
new file mode 100644
index 00000000..84712b7b
--- /dev/null
+++ b/modelscope/models/nlp/space/model/tokenization_space.py
@@ -0,0 +1,29 @@
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for Space. mainly copied from :module:`~transformers.tokenization_xlm_roberta`"""
+
+from modelscope.models.nlp.structbert import (BasicTokenizer, SbertTokenizer,
+                                              WordpieceTokenizer)
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class SpaceTokenizer(SbertTokenizer):
+    """
+    This class overrides [`SbertTokenizer`]. Please check the superclass for the appropriate
+    documentation alongside usage examples.
+ """ diff --git a/modelscope/models/nlp/backbones/space/model/unified_transformer.py b/modelscope/models/nlp/space/model/unified_transformer.py similarity index 97% rename from modelscope/models/nlp/backbones/space/model/unified_transformer.py rename to modelscope/models/nlp/space/model/unified_transformer.py index f5df954d..b0775541 100644 --- a/modelscope/models/nlp/backbones/space/model/unified_transformer.py +++ b/modelscope/models/nlp/space/model/unified_transformer.py @@ -5,10 +5,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from modelscope.models.nlp.backbones.space.model.model_base import \ - SpaceModelBase -from modelscope.models.nlp.backbones.space.modules.embedder import Embedder -from modelscope.models.nlp.backbones.space.modules.transformer_block import \ +from modelscope.models.nlp.space.model.model_base import SpaceModelBase +from modelscope.models.nlp.space.modules.embedder import Embedder +from modelscope.models.nlp.space.modules.transformer_block import \ TransformerBlock diff --git a/modelscope/models/nlp/backbones/space/modules/__init__.py b/modelscope/models/nlp/space/modules/__init__.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/__init__.py rename to modelscope/models/nlp/space/modules/__init__.py diff --git a/modelscope/models/nlp/backbones/space/modules/embedder.py b/modelscope/models/nlp/space/modules/embedder.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/embedder.py rename to modelscope/models/nlp/space/modules/embedder.py diff --git a/modelscope/models/nlp/backbones/space/modules/feedforward.py b/modelscope/models/nlp/space/modules/feedforward.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/feedforward.py rename to modelscope/models/nlp/space/modules/feedforward.py diff --git a/modelscope/models/nlp/backbones/space/modules/functions.py b/modelscope/models/nlp/space/modules/functions.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/functions.py rename to modelscope/models/nlp/space/modules/functions.py diff --git a/modelscope/models/nlp/backbones/space/modules/multihead_attention.py b/modelscope/models/nlp/space/modules/multihead_attention.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/multihead_attention.py rename to modelscope/models/nlp/space/modules/multihead_attention.py diff --git a/modelscope/models/nlp/backbones/space/modules/transformer_block.py b/modelscope/models/nlp/space/modules/transformer_block.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/transformer_block.py rename to modelscope/models/nlp/space/modules/transformer_block.py diff --git a/modelscope/models/nlp/space_for_dialog_intent_prediction.py b/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_intent_prediction.py rename to modelscope/models/nlp/space/space_for_dialog_intent_prediction.py index bd0eb63b..c862fbef 100644 --- a/modelscope/models/nlp/space_for_dialog_intent_prediction.py +++ b/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py @@ -7,7 +7,7 @@ from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.models.nlp.backbones import SpaceGenerator, SpaceModelBase +from modelscope.models.nlp.space import SpaceGenerator, 
SpaceModelBase from modelscope.preprocessors.space import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks diff --git a/modelscope/models/nlp/space_for_dialog_modeling.py b/modelscope/models/nlp/space/space_for_dialog_modeling.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_modeling.py rename to modelscope/models/nlp/space/space_for_dialog_modeling.py index 60713c3d..8b9ed8b3 100644 --- a/modelscope/models/nlp/space_for_dialog_modeling.py +++ b/modelscope/models/nlp/space/space_for_dialog_modeling.py @@ -7,7 +7,7 @@ from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.models.nlp.backbones import SpaceGenerator, SpaceModelBase +from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase from modelscope.preprocessors.space import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks diff --git a/modelscope/models/nlp/space_for_dialog_state_tracking.py b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_state_tracking.py rename to modelscope/models/nlp/space/space_for_dialog_state_tracking.py index de5f95ce..ee7356b1 100644 --- a/modelscope/models/nlp/space_for_dialog_state_tracking.py +++ b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py @@ -21,7 +21,7 @@ class SpaceForDialogStateTracking(TorchModel): super().__init__(model_dir, *args, **kwargs) - from sofa.models.space import SpaceConfig, SpaceForDST + from modelscope.models.nlp.space.model import SpaceForDST, SpaceConfig self.model_dir = model_dir self.config = SpaceConfig.from_pretrained(self.model_dir) diff --git a/modelscope/models/nlp/structbert/__init__.py b/modelscope/models/nlp/structbert/__init__.py new file mode 100644 index 00000000..d42db83c --- /dev/null +++ b/modelscope/models/nlp/structbert/__init__.py @@ -0,0 +1,45 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
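Editor's note: the new `structbert/__init__.py` below uses the same `LazyImportModule` pattern as the `space` package above, so names listed in `_import_structure` only trigger their real imports on first access. A hedged consumer-side sketch (the config argument is illustrative; library defaults are assumed for everything else):

```python
# Importing a registered symbol resolves the lazy module transparently.
from modelscope.models.nlp.structbert import SbertConfig, SbertModel

config = SbertConfig(num_hidden_layers=2)  # toy-sized encoder for illustration
model = SbertModel(config, add_pooling_layer=True)
```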
+ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_sbert import SbertConfig + from .modeling_sbert import (SbertForMaskedLM, SbertModel, + SbertPreTrainedModel) + from .tokenization_sbert import (BasicTokenizer, SbertTokenizer, + WordpieceTokenizer) + from .tokenization_sbert_fast import SbertTokenizerFast +else: + _import_structure = { + 'configuration_sbert': ['SbertConfig'], + 'modeling_sbert': + ['SbertForMaskedLM', 'SbertModel', 'SbertPreTrainedModel'], + 'tokenization_sbert': + ['BasicTokenizer', 'SbertTokenizer', 'WordpieceTokenizer'], + 'tokenization_sbert_fast': ['SbertTokenizerFast'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/backbones/structbert/adv_utils.py b/modelscope/models/nlp/structbert/adv_utils.py similarity index 96% rename from modelscope/models/nlp/backbones/structbert/adv_utils.py rename to modelscope/models/nlp/structbert/adv_utils.py index 9864148f..44aae85c 100644 --- a/modelscope/models/nlp/backbones/structbert/adv_utils.py +++ b/modelscope/models/nlp/structbert/adv_utils.py @@ -59,7 +59,8 @@ def compute_adv_loss(embedding, """ Calculate the adv loss of the model. :param embedding: Original sentense embedding - :param model: The model or the forward function(including decoder/classifier), accept kwargs as input, output logits + :param model: The model, or the forward function(including decoder/classifier), + accept kwargs as input, output logits :param ori_logits: The original logits outputed from the model function :param ori_loss: The original loss :param adv_grad_factor: This factor will be multipled by the KL loss grad and then the result will be added to @@ -119,7 +120,8 @@ def compute_adv_loss_pair(embedding, """ Calculate the adv loss of the model. This function is used in the pair logits scenerio. :param embedding: Original sentense embedding - :param model: The model or the forward function(including decoder/classifier), accept kwargs as input, output logits + :param model: The model, or the forward function(including decoder/classifier), + accept kwargs as input, output logits :param start_logits: The original start logits outputed from the model function :param end_logits: The original end logits outputed from the model function :param ori_loss: The original loss diff --git a/modelscope/models/nlp/backbones/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration_sbert.py similarity index 94% rename from modelscope/models/nlp/backbones/structbert/configuration_sbert.py rename to modelscope/models/nlp/structbert/configuration_sbert.py index 878b2216..374d4b62 100644 --- a/modelscope/models/nlp/backbones/structbert/configuration_sbert.py +++ b/modelscope/models/nlp/structbert/configuration_sbert.py @@ -24,11 +24,12 @@ logger = logging.get_logger(__name__) class SbertConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~sofa.models.SbertModel`. + This is the configuration class to store the configuration + of a :class:`~modelscope.models.nlp.structbert.SbertModel`. It is used to instantiate a SBERT model according to the specified arguments. - Configuration objects inherit from :class:`~sofa.utils.PretrainedConfig` and can be used to control the model - outputs. 
Read the documentation from :class:`~sofa.utils.PretrainedConfig` for more information. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: @@ -99,11 +100,13 @@ class SbertConfig(PretrainedConfig): type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, + pad_token_id=0, position_embedding_type='absolute', use_cache=True, classifier_dropout=None, **kwargs): - super().__init__(**kwargs) + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers diff --git a/modelscope/models/nlp/structbert/modeling_sbert.py b/modelscope/models/nlp/structbert/modeling_sbert.py new file mode 100755 index 00000000..bbac3c95 --- /dev/null +++ b/modelscope/models/nlp/structbert/modeling_sbert.py @@ -0,0 +1,1964 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch SBERT model. 
mainly copied from :module:`~transformers.modeling_bert`""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger +from .adv_utils import compute_adv_loss, compute_adv_loss_pair +from .configuration_sbert import SbertConfig + +logger = get_logger(__name__) + +_CHECKPOINT_FOR_DOC = 'chinese_sbert-large-std-512' +_CONFIG_FOR_DOC = 'SbertConfig' +_TOKENIZER_FOR_DOC = 'SbertTokenizer' + + +class SbertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + return_inputs_embeds=False): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users + # when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + 
buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + if not return_inputs_embeds: + return embeddings + else: + return embeddings, inputs_embeds + + +class SbertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
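+        # Four cases are handled below: cached cross-attention, fresh
+        # cross-attention, cached (incremental decoding) self-attention,
+        # and plain self-attention.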
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class SbertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = SbertSelfAttention(config) + self.output = SbertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class SbertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class SbertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + 
hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = SbertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = SbertAttention(config) + self.intermediate = SbertIntermediate(config) + self.output = SbertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' + f'layers by setting `config.add_cross_attention=True`') + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class SbertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [SbertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + 
past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class SbertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
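+        # For BERT-style inputs this first position is the [CLS] token.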
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SbertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class SbertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = SbertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class SbertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class SbertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class SbertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class SbertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SbertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, SbertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class SbertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.BertForPreTraining`. 
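+    It is used here as the output type of :class:`SbertForPreTraining`.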
+ + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` + is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` + is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +SBERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +SBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. +""" + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( + BaseModelOutputWithPoolingAndCrossAttentions): + embedding_output: torch.FloatTensor = None + logits: Optional[Union[tuple, torch.FloatTensor]] = None + kwargs: dict = None + + +@add_start_docstrings( + 'The Sbert Model transformer outputting raw hidden-states without any specific head on top.', + SBERT_START_DOCSTRING, +) +class SbertModel(SbertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config: SbertConfig, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = SbertEmbeddings(config) + self.encoder = SbertEncoder(config) + + self.pooler = SbertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple + having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
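+
+        Example (a minimal sketch; the checkpoint path is a placeholder and is
+        expected to contain the SBERT config, weights and vocabulary)::
+
+            >>> from modelscope.models.nlp.structbert import SbertModel, SbertTokenizer
+            >>> tokenizer = SbertTokenizer.from_pretrained('/path/to/sbert-checkpoint')
+            >>> model = SbertModel.from_pretrained('/path/to/sbert-checkpoint')
+            >>> inputs = tokenizer('The weather is nice today.', return_tensors='pt')
+            >>> outputs = model(**inputs)
+            >>> outputs.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)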
+ """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
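+        # get_extended_attention_mask() (inherited from the transformers
+        # PreTrainedModel base) turns the 2D [batch_size, seq_length] mask into
+        # an additive mask broadcastable to
+        # [batch_size, num_heads, seq_length, seq_length], with a large
+        # negative value at the padded positions.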
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output, orignal_embeds = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + return_inputs_embeds=True, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, + pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + + return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + embedding_output=orignal_embeds) + + +@add_start_docstrings( + """ + Sbert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
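+    Both heads share the :class:`SbertModel` backbone; when ``labels`` and
+    ``next_sentence_label`` are both provided, the returned loss is the sum of
+    the masked-LM loss and the next-sentence-prediction loss.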
+ """, + SBERT_START_DOCSTRING, +) +class SbertForPreTraining(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + + self.bert = SbertModel(config) + self.cls = SbertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=SbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, + seq_relationship_score) + outputs[2:-1] + return ((total_loss, ) + + output) if total_loss is not None else output + + return SbertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Sbert Model with a `language modeling` head on top for CLM fine-tuning. 
""", + SBERT_START_DOCSTRING) +class SbertLMHeadModel(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig): + super().__init__(config) + + if not config.is_decoder: + logger.warning( + 'If you want to use `SbertLMHeadModel` as a standalone, add `is_decoder=True.`' + ) + + self.bert = SbertModel(config, add_pooling_layer=False) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` + with each tuple having 4 tensors of + shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'past_key_values': past + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@add_start_docstrings( + """Sbert Model with a `language modeling` head on top. 
""", + SBERT_START_DOCSTRING) +class SbertForMaskedLM(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = SbertModel(config) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' + attention_mask_zero = attention_mask.new_zeros( + (attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, attention_mask_zero], + dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +@add_start_docstrings( + """Sbert Model 
with a `next sentence prediction (classification)` head on top. """, + SBERT_START_DOCSTRING, +) +class SbertForNextSentencePrediction(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + + self.bert = SbertModel(config) + self.cls = SbertOnlyNSPHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + """ + + if 'next_sentence_label' in kwargs: + warnings.warn( + 'The `next_sentence_label` argument is deprecated and will be removed ' + 'in a future version, use `labels` instead.', + FutureWarning, + ) + labels = kwargs.pop('next_sentence_label') + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct( + seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores, ) + outputs[2:-1] + return ((next_sentence_loss, ) + + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
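+    When ``config.adv_grad_factor`` is set, an adversarial loss (see
+    :func:`~modelscope.models.nlp.structbert.adv_utils.compute_adv_loss`) is
+    added on top of the cross-entropy loss during training.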
+ """, + SBERT_START_DOCSTRING, +) +class SbertForSequenceClassification(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
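+
+        Example (a minimal sketch; the checkpoint path and ``num_labels`` are
+        placeholders)::
+
+            >>> import torch
+            >>> from modelscope.models.nlp.structbert import SbertTokenizer
+            >>> from modelscope.models.nlp.structbert.modeling_sbert import SbertForSequenceClassification
+            >>> tokenizer = SbertTokenizer.from_pretrained('/path/to/sbert-checkpoint')
+            >>> model = SbertForSequenceClassification.from_pretrained('/path/to/sbert-checkpoint', num_labels=2)
+            >>> inputs = tokenizer('This movie is great.', return_tensors='pt')
+            >>> outputs = model(**inputs, labels=torch.tensor([1]))
+            >>> outputs.loss, outputs.logits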
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, labels, **outputs.kwargs) + + def compute_loss(self, outputs, labels, **kwargs): + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + SBERT_START_DOCSTRING, +) +class SbertForMultipleChoice(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + def _forward_call(self, num_choices, **kwargs): + outputs = self.bert(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits.view(-1, num_choices) + kwargs['num_choices'] = num_choices + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format( + 'batch_size, num_choices, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + num_choices=num_choices) + + reshaped_logits = outputs.logits + kwargs = outputs.kwargs + embedding_output = outputs.embedding_output + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=reshaped_logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + + return MultipleChoiceModelOutput( + loss=loss, + 
logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + SBERT_START_DOCSTRING, +) +class SbertForTokenClassification(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
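+
+        Example (illustrative sketch, assuming ``model`` is an already-loaded
+        :class:`SbertForTokenClassification`; tensor values are placeholders)::
+
+            >>> import torch
+            >>> # one label id per token; positions where attention_mask == 0 are ignored in the loss
+            >>> input_ids = torch.tensor([[101, 2769, 102]])
+            >>> labels = torch.tensor([[0, 1, 0]])
+            >>> out = model(input_ids=input_ids,
+            ...             attention_mask=torch.ones_like(input_ids),
+            ...             labels=labels)
+            >>> out.loss, out.logits.shape  # scalar loss, logits of shape (1, 3, config.num_labels)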
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + with_attention_mask=attention_mask is not None, + **outputs.kwargs) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + SBERT_START_DOCSTRING, +) +class SbertForQuestionAnswering(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + outputs['logits'] = (start_logits, end_logits) + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. 
+ Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, start_positions, end_positions, + **outputs.kwargs) + + def compute_loss(self, + outputs, + start_positions=None, + end_positions=None, + **kwargs): + start_logits, end_logits = outputs.logits + embedding_output = outputs.embedding_output + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if self.config.adv_grad_factor is not None and self.training: + total_loss = compute_adv_loss_pair( + embedding=embedding_output, + model=self._forward_call, + start_logits=start_logits, + end_logits=end_logits, + ori_loss=total_loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py b/modelscope/models/nlp/structbert/tokenization_sbert.py new file mode 100644 index 00000000..6db69509 --- /dev/null +++ b/modelscope/models/nlp/structbert/tokenization_sbert.py @@ -0,0 +1,516 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Sbert. mainly copied from :module:`~transformers.tokenization_bert`""" + +import collections +import os +import unicodedata +from typing import List, Optional, Tuple + +from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control, + _is_punctuation, _is_whitespace) + +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'chinese_sbert-large-std-512': 512, + 'english_sbert-large-std-512': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'english_sbert-large-std-512': { + 'do_lower_case': True + }, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, 'r', encoding='utf-8') as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip('\n') + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class SbertTokenizer(PreTrainedTokenizer): + r""" + Construct a SBERT tokenizer. Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to do basic tokenization before WordPiece. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. 
+ + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained " + 'model use `tokenizer = SbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = ' '.join(tokens).replace(' ##', '').strip() + return out_string + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. 
A SBERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence + pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, + (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + else: + vocab_file = (filename_prefix + + '-' if filename_prefix else '') + save_directory + with open(vocab_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted( + self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f'Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive.' 
+ ' Please check that the vocabulary is not corrupted!') + index = token_index + writer.write(token + '\n') + index += 1 + return (vocab_file, ) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + def __init__(self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + :func:`PreTrainedTokenizer.tokenize`) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union( + set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
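+        # Note (illustrative): _tokenize_chinese_chars only inserts spaces around CJK
+        # characters, e.g. "playing中文" -> "playing 中  文 ", so each CJK character ends up
+        # as its own token after whitespace_tokenize below.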
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(' '.join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(' ') + output.append(char) + output.append(' ') + else: + output.append(char) + return ''.join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) + or (0x20000 <= cp <= 0x2A6DF) or (0x2A700 <= cp <= 0x2B73F) + or (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF) + or (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F)): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(' ') + else: + output.append(char) + return ''.join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
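+
+        Note (clarification of the loop below): if a token cannot be fully decomposed into
+        in-vocabulary pieces, the whole token is mapped to :obj:`unk_token` rather than being
+        partially split.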
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = ''.join(chars[start:end]) + if start > 0: + substr = '##' + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py new file mode 100644 index 00000000..b02039c6 --- /dev/null +++ b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py @@ -0,0 +1,200 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization classes for Sbert. mainly copied from :module:`~transformers.tokenization_bert_fast`""" + +from typing import List, Optional, Tuple + +import json +import transformers +from tokenizers import normalizers +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + +from modelscope.utils.logger import get_logger +from .tokenization_sbert import SbertTokenizer + +logger = get_logger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.txt', + 'tokenizer_file': 'tokenizer.json' +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': {}, + 'tokenizer_file': {}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'chinese_sbert-large-std-512': 512, + 'english_sbert-large-std-512': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'english_sbert-large-std-512': { + 'do_lower_case': True + }, +} + +transformers.SLOW_TO_FAST_CONVERTERS[ + 'SbertTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS['BertTokenizer'] + + +class SbertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" SBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this + issue `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = SbertTokenizer + + def __init__(self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + pre_tok_state = json.loads( + self.backend_tokenizer.normalizer.__getstate__()) + if (pre_tok_state.get('lowercase', do_lower_case) != do_lower_case + or pre_tok_state.get('strip_accents', + strip_accents) != strip_accents): + pre_tok_class = getattr(normalizers, pre_tok_state.pop('type')) + pre_tok_state['lowercase'] = do_lower_case + pre_tok_state['strip_accents'] = strip_accents + self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A SBERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. 
+ token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1: + output += token_ids_1 + [self.sep_token_id] + + return output + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence + pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save( + save_directory, name=filename_prefix) + return tuple(files) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py new file mode 100644 index 00000000..988f2917 --- /dev/null +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -0,0 +1,86 @@ +import os +from typing import Any, Dict + +import json +import numpy as np + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.task_models.task_model import \ + SingleBackboneTaskModelBase +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + +__all__ = ['SequenceClassificationModel'] + + +@MODELS.register_module( + Tasks.sentiment_classification, module_name=TaskModels.text_classification) +@MODELS.register_module( + Tasks.text_classification, module_name=TaskModels.text_classification) +class SequenceClassificationModel(SingleBackboneTaskModelBase): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the sequence classification model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
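+
+            A minimal, illustrative ``model_dir`` layout (file names other than
+            ``label_mapping.json`` are examples only)::
+
+                model_dir/
+                    configuration.json    # typically holds the `backbone` and `head` cfg read below
+                    label_mapping.json    # e.g. {"negative": 0, "positive": 1} -> num_labels = 2
+                    pytorch_model.bin     # backbone weights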
+ """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + backbone_cfg = self.cfg.backbone + head_cfg = self.cfg.head + + # get the num_labels from label_mapping.json + self.id2label = {} + self.label_path = os.path.join(model_dir, 'label_mapping.json') + if os.path.exists(self.label_path): + with open(self.label_path) as f: + self.label_mapping = json.load(f) + self.id2label = { + idx: name + for name, idx in self.label_mapping.items() + } + head_cfg['num_labels'] = len(self.label_mapping) + + self.build_backbone(backbone_cfg) + self.build_head(head_cfg) + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + outputs = super().forward(input) + sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + outputs = self.head.forward(pooled_output) + if 'labels' in input: + loss = self.compute_loss(outputs, input['labels']) + outputs.update(loss) + return outputs + + def extract_logits(self, outputs): + return outputs[OutputKeys.LOGITS].cpu().detach() + + def extract_backbone_outputs(self, outputs): + sequence_output = None + pooled_output = None + if hasattr(self.backbone, 'extract_sequence_outputs'): + sequence_output = self.backbone.extract_sequence_outputs(outputs) + if hasattr(self.backbone, 'extract_pooled_outputs'): + pooled_output = self.backbone.extract_pooled_outputs(outputs) + return sequence_output, pooled_output + + def compute_loss(self, outputs, labels): + loss = self.head.compute_loss(outputs, labels) + return loss + + def postprocess(self, input, **kwargs): + logits = self.extract_logits(input) + probs = logits.softmax(-1).numpy() + pred = logits.argmax(-1).numpy() + logits = logits.numpy() + res = { + OutputKeys.PREDICTIONS: pred, + OutputKeys.PROBABILITIES: probs, + OutputKeys.LOGITS: logits + } + return res diff --git a/modelscope/models/nlp/task_model.py b/modelscope/models/nlp/task_models/task_model.py similarity index 98% rename from modelscope/models/nlp/task_model.py rename to modelscope/models/nlp/task_models/task_model.py index e83c6604..104b4c32 100644 --- a/modelscope/models/nlp/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -11,8 +11,8 @@ from modelscope.models.base import TorchModel from modelscope.models.builder import build_backbone, build_head from modelscope.utils.config import ConfigDict from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger -from modelscope.utils.utils import if_func_receive_dict_inputs logger = get_logger(__name__) @@ -424,12 +424,15 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: """default forward method is the backbone-only forward""" - if if_func_receive_dict_inputs(self.backbone.forward): + if func_receive_dict_inputs(self.backbone.forward): outputs = self.backbone.forward(input) else: outputs = self.backbone.forward(**input) return outputs + def compute_loss(self, outputs: Dict[str, Any], labels): + raise NotImplementedError() + class EncoderDecoderTaskModelBase(BaseTaskModel): """ @@ -472,13 +475,13 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): return getattr(self, self._decoder_prefix) def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - if if_func_receive_dict_inputs(self.encoder_.forward): + if func_receive_dict_inputs(self.encoder_.forward): encoder_outputs = self.encoder_.forward(input) else: 
encoder_outputs = self.encoder_.forward(**input) decoder_inputs = self.project_decoder_inputs_and_mediate( input, encoder_outputs) - if if_func_receive_dict_inputs(self.decoder_.forward): + if func_receive_dict_inputs(self.decoder_.forward): outputs = self.decoder_.forward(decoder_inputs) else: outputs = self.decoder_.forward(**decoder_inputs) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py new file mode 100644 index 00000000..ebb1eda2 --- /dev/null +++ b/modelscope/models/nlp/token_classification.py @@ -0,0 +1,147 @@ +from abc import abstractmethod +from typing import Dict + +import numpy as np +import torch +from torch import nn + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) +from .structbert import SbertPreTrainedModel + +__all__ = ['SbertForTokenClassification'] + + +class TokenClassification(TorchModel): + + base_model_prefix: str = 'bert' + + def __init__(self, config, model_dir): + super().__init__(model_dir) + self.num_labels = config.num_labels + self.config = config + setattr(self, self.base_model_prefix, self.build_base_model()) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + @abstractmethod + def build_base_model(self): + """Build the backbone model. + + Returns: the backbone instance. + """ + pass + + @property + def base_model(self): + return getattr(self, self.base_model_prefix) + + def compute_loss(self, logits, labels, **kwargs): + """Compute loss. + + For example, if backbone is pretrained model, there will be a 'attention_mask' parameter to skip + useless tokens. + + Args: + logits: The logits from the classifier + labels: The labels + **kwargs: Other input params. + + Returns: Loss. 
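+
+        For example, a subclass working on padded batches may mask out padding positions via
+        ``attention_mask`` before applying the loss, as :meth:`SbertForTokenClassification.compute_loss`
+        below does.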
+ + """ + pass + + def forward(self, **kwargs): + labels = None + if OutputKeys.LABEL in kwargs: + labels = kwargs.pop(OutputKeys.LABEL) + elif OutputKeys.LABELS in kwargs: + labels = kwargs.pop(OutputKeys.LABELS) + + outputs = self.base_model(**kwargs) + # base model should return the sequence_output as its first output + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + if labels is not None: + loss = self.compute_loss(logits, labels, **kwargs) + return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} + return {OutputKeys.LOGITS: logits} + + def postprocess(self, input: Dict[str, np.ndarray], + **kwargs) -> Dict[str, np.ndarray]: + logits = input[OutputKeys.LOGITS] + pred = torch.argmax(logits[0], dim=-1) + pred = torch_nested_numpify(torch_nested_detach(pred)) + logits = torch_nested_numpify(torch_nested_detach(logits)) + rst = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} + return rst + + +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) +@MODELS.register_module( + Tasks.token_classification, module_name=Models.structbert) +class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): + + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + SbertForTokenClassification.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .structbert import SbertModel + return SbertModel(self.config, add_pooling_layer=False) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + labels=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels) + + def compute_loss(self, logits, labels, attention_mask=None, **kwargs): + loss_fct = nn.CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + return loss_fct(active_logits, active_labels) + else: + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(SbertPreTrainedModel, + SbertForTokenClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/veco/__init__.py b/modelscope/models/nlp/veco/__init__.py new file mode 100644 index 00000000..0fe786fd --- /dev/null +++ b/modelscope/models/nlp/veco/__init__.py @@ -0,0 +1,43 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_veco import VecoConfig + from .modeling_veco import (VecoForMaskedLM, VecoForSequenceClassification, + VecoModel) + from .tokenization_veco import VecoTokenizer + from .tokenization_veco_fast import VecoTokenizerFast +else: + _import_structure = { + 'configuration_veco': ['VecoConfig'], + 'modeling_veco': + ['VecoForMaskedLM', 'VecoForSequenceClassification', 'VecoModel'], + 'tokenization_veco': ['VecoTokenizer'], + 'tokenization_veco_fast': ['VecoTokenizerFast'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/veco/configuration_veco.py b/modelscope/models/nlp/veco/configuration_veco.py new file mode 100644 index 00000000..396755dc --- /dev/null +++ b/modelscope/models/nlp/veco/configuration_veco.py @@ -0,0 +1,33 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors. +# Copyright 2020 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Veco configuration, mainly copied from :class:`~transformers.configuration_xlm_roberta` """ + +from transformers import RobertaConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class VecoConfig(RobertaConfig): + """ + This class overrides [`RobertaConfig`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + model_type = 'veco' diff --git a/modelscope/models/nlp/veco/modeling_veco.py b/modelscope/models/nlp/veco/modeling_veco.py new file mode 100644 index 00000000..b519c236 --- /dev/null +++ b/modelscope/models/nlp/veco/modeling_veco.py @@ -0,0 +1,143 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Veco model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +from transformers import (RobertaForMaskedLM, RobertaForMultipleChoice, + RobertaForQuestionAnswering, + RobertaForSequenceClassification, + RobertaForTokenClassification, RobertaModel) +from transformers.file_utils import add_start_docstrings + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils import logger as logging +from modelscope.utils.constant import Fields +from .configuration_veco import VecoConfig + +logger = logging.get_logger(__name__) + +VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +VECO_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + + +@add_start_docstrings( + 'The bare Veco Model transformer outputting raw hidden-states without any specific head on top.', + VECO_START_DOCSTRING, +) +class VecoModel(RobertaModel): + """ + This class overrides [`RobertaModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + VECO_START_DOCSTRING, +) +class VecoForSequenceClassification(RobertaForSequenceClassification): + """ + This class overrides [`RobertaForSequenceClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model transformer with a masked language model head on top (a linear layer on top of the + pooled output). + """, + VECO_START_DOCSTRING, +) +class VecoForMaskedLM(RobertaForMaskedLM): + """ + This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for RocStories/SWAG tasks. + """, + VECO_START_DOCSTRING, +) +class VecoForMultipleChoice(RobertaForMultipleChoice): + """ + This class overrides [`RobertaForMultipleChoice`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. 
+ """, + VECO_START_DOCSTRING, +) +class VecoForTokenClassification(RobertaForTokenClassification): + """ + This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + VECO_START_DOCSTRING, +) +class VecoForQuestionAnswering(RobertaForQuestionAnswering): + """ + This class overrides [`RobertaForQuestionAnswering`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig diff --git a/modelscope/models/nlp/veco/tokenization_veco.py b/modelscope/models/nlp/veco/tokenization_veco.py new file mode 100644 index 00000000..21711456 --- /dev/null +++ b/modelscope/models/nlp/veco/tokenization_veco.py @@ -0,0 +1,321 @@ +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for Veco. mainly copied from :module:`~transformers.tokenization_xlm_roberta`""" + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = '▁' + +VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + + +class VecoTokenizer(PreTrainedTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. 
two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. + The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) + can be used, among other things, to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ['input_ids', 'attention_mask'] + + def __init__(self, + vocab_file, + bos_token='', + eos_token='', + sep_token='', + cls_token='', + unk_token='', + pad_token='', + mask_token='', + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken( + mask_token, lstrip=True, rstrip=False) if isinstance( + mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # Original fairseq vocab and spm vocab must be "aligned": + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- + # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' + # spm | '' | '' | '' | ',' | '.' 
| '▁' | 's' | '▁de' | '-' | '▁a' + + # Mimic fairseq token-to-id alignment for the first 4 token + self.fairseq_tokens_to_ids = { + '': 0, + '': 1, + '': 2, + '': 3 + } + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.fairseq_offset = 1 + + self.fairseq_tokens_to_ids[''] = len( + self.sp_model) + self.fairseq_offset + self.fairseq_ids_to_tokens = { + v: k + for k, v in self.fairseq_tokens_to_ids.items() + } + + def __getstate__(self): + state = self.__dict__.copy() + state['sp_model'] = None + state['sp_model_proto'] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, 'sp_model_kwargs'): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An Veco sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ( + [0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
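+
+        Example (an illustrative sketch, the ids below are placeholders rather than
+        real sentencepiece vocabulary ids): `create_token_type_ids_from_sequences([5, 6], [7])`
+        returns `[0, 0, 0, 0, 0, 0, 0]`, one zero for every position of the pair built
+        as cls + [5, 6] + sep + sep + [7] + sep.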
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len( + self.sp_model) + self.fairseq_offset + 1 # Add the token + + def get_vocab(self): + vocab = { + self.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + return out_string + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error( + f'Vocabulary path ({save_directory}) should be a directory') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file, ) diff --git a/modelscope/models/nlp/veco/tokenization_veco_fast.py b/modelscope/models/nlp/veco/tokenization_veco_fast.py new file mode 100644 index 00000000..3edae0e7 --- /dev/null +++ b/modelscope/models/nlp/veco/tokenization_veco_fast.py @@ -0,0 +1,213 @@ +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Fast Tokenization classes for Veco. 
mainly copied from :module:`~transformers.tokenization_xlm_roberta_fast`""" + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +import transformers +from transformers.file_utils import is_sentencepiece_available +from transformers.tokenization_utils import AddedToken +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + +from modelscope.utils import logger as logging + +if is_sentencepiece_available(): + from .tokenization_veco import VecoTokenizer +else: + VecoTokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'sentencepiece.bpe.model', + 'tokenizer_file': 'tokenizer.json' +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': {}, + 'tokenizer_file': {}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + +transformers.SLOW_TO_FAST_CONVERTERS[ + 'VecoTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS[ + 'XLMRobertaTokenizer'] + + +class VecoTokenizerFast(PreTrainedTokenizerFast): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. + Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. 
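+
+    Example (an illustrative sketch, the directory below is a placeholder for a local
+    Veco model directory that contains the tokenizer files):
+
+        tokenizer = VecoTokenizerFast.from_pretrained('/path/to/veco')
+        inputs = tokenizer('you are so handsome.', 'you are so beautiful.', return_tensors='pt')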
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ['input_ids', 'attention_mask'] + slow_tokenizer_class = VecoTokenizer + + def __init__(self, + vocab_file=None, + tokenizer_file=None, + bos_token='', + eos_token='', + sep_token='', + cls_token='', + unk_token='', + pad_token='', + mask_token='', + **kwargs): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken( + mask_token, lstrip=True, rstrip=False) if isinstance( + mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + self.can_save_slow_tokenizer = False if not self.vocab_file else True + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An Veco sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
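+
+        Example (illustrative, with placeholder ids): for a single sequence,
+        `create_token_type_ids_from_sequences([5, 6])` returns `[0, 0, 0, 0]`,
+        one zero per position of cls + [5, 6] + sep.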
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow ' + 'tokenizer.') + + if not os.path.isdir(save_directory): + logger.error( + f'Vocabulary path ({save_directory}) should be a directory.') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file, ) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 8174d054..f6896e4a 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -517,3 +517,10 @@ class MsDataset: def to_hf_dataset(self) -> Dataset: self._hf_ds.reset_format() return self._hf_ds + + @staticmethod + def interleave_datasets(datasets: List[Any], + probabilities: Optional[List[float]] = None, + seed: Optional[int] = None): + from datasets import interleave_datasets + return interleave_datasets(datasets, probabilities, seed) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 0937e441..a82f6ed5 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -9,6 +9,7 @@ class OutputKeys(object): SCORES = 'scores' LABEL = 'label' LABELS = 'labels' + INPUT_IDS = 'input_ids' LABEL_POS = 'label_pos' POSES = 'poses' CAPTION = 'caption' diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index e6a35efc..1111f0d3 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -9,9 +9,8 @@ if TYPE_CHECKING: from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .fill_mask_pipeline import FillMaskPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline - from .nli_pipeline import NLIPipeline - from .sentence_similarity_pipeline import SentenceSimilarityPipeline - from .sentiment_classification_pipeline import SentimentClassificationPipeline + from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline + from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline from .text_generation_pipeline import TextGenerationPipeline from .translation_pipeline import TranslationPipeline @@ -28,10 +27,10 @@ else: 'dialog_modeling_pipeline': ['DialogModelingPipeline'], 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], - 'nli_pipeline': ['NLIPipeline'], - 'sentence_similarity_pipeline': ['SentenceSimilarityPipeline'], - 'sentiment_classification_pipeline': - ['SentimentClassificationPipeline'], + 'single_sentence_classification_pipeline': + ['SingleSentenceClassificationPipeline'], + 'pair_sentence_classification_pipeline': + ['PairSentenceClassificationPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], 'text_generation_pipeline': ['TextGenerationPipeline'], 'word_segmentation_pipeline': ['WordSegmentationPipeline'], diff 
--git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 27c34817..e4affe40 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -5,11 +5,10 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp.masked_language import MaskedLanguageModelBase from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import FillMaskPreprocessor, Preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -21,18 +20,18 @@ _type_map = {'veco': 'roberta', 'sbert': 'bert'} class FillMaskPipeline(Pipeline): def __init__(self, - model: Union[MaskedLanguageModelBase, str], - preprocessor: Optional[FillMaskPreprocessor] = None, - first_sequence='sentense', + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + first_sequence='sentence', **kwargs): """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction Args: - model (MaskedLanguageModelBase): a model instance - preprocessor (FillMaskPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ fill_mask_model = model if isinstance( - model, MaskedLanguageModelBase) else Model.from_pretrained(model) + model, Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = FillMaskPreprocessor( @@ -73,7 +72,7 @@ class FillMaskPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(inputs, **forward_params) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: """process the prediction results @@ -85,8 +84,8 @@ class FillMaskPipeline(Pipeline): Dict[str, str]: the prediction results """ import numpy as np - logits = inputs['logits'].detach().cpu().numpy() - input_ids = inputs['input_ids'].detach().cpu().numpy() + logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() + input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) model_type = self.model.config.model_type process_type = model_type if model_type in self.mask_id else _type_map[ diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 65334144..29c439fc 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -4,11 +4,10 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import TransformerCRFForNamedEntityRecognition from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NERPreprocessor +from modelscope.preprocessors import NERPreprocessor, Preprocessor from modelscope.utils.constant import Tasks __all__ = ['NamedEntityRecognitionPipeline'] @@ -20,13 +19,12 @@ __all__ = ['NamedEntityRecognitionPipeline'] class NamedEntityRecognitionPipeline(Pipeline): 
def __init__(self, - model: Union[TransformerCRFForNamedEntityRecognition, str], - preprocessor: Optional[NERPreprocessor] = None, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, **kwargs): model = model if isinstance(model, - TransformerCRFForNamedEntityRecognition - ) else Model.from_pretrained(model) + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = NERPreprocessor(model.model_dir) model.eval() diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py deleted file mode 100644 index 200f44e4..00000000 --- a/modelscope/pipelines/nlp/nli_pipeline.py +++ /dev/null @@ -1,73 +0,0 @@ -import uuid -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SbertForNLI -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NLIPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['NLIPipeline'] - - -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -class NLIPipeline(Pipeline): - - def __init__(self, - model: Union[SbertForNLI, str], - preprocessor: NLIPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction - - Args: - model (SbertForNLI): a model instance - preprocessor (NLIPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SbertForNLI), \ - 'model must be a single str or SbertForNLI' - model = model if isinstance( - model, SbertForNLI) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = NLIPreprocessor( - model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - assert len(model.id2label) > 0 - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.model.id2label[cid] for cid in cls_ids] - - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py new file mode 100644 index 00000000..0804ec8c --- /dev/null +++ b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py @@ -0,0 +1,37 @@ +from typing import Union + +from modelscope.models.base import Model +from ...metainfo import Pipelines +from ...preprocessors import (PairSentenceClassificationPreprocessor, + Preprocessor) +from ...utils.constant import Tasks +from ..builder import PIPELINES +from .sequence_classification_pipeline_base import \ + SequenceClassificationPipelineBase + 
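+# Illustrative usage sketch (comments only, nothing below is executed; the model id is a
+# placeholder for any sentence-similarity or NLI model whose directory provides the label
+# mapping read by the preprocessor):
+#
+#     from modelscope.pipelines import pipeline
+#     from modelscope.utils.constant import Tasks
+#
+#     pipe = pipeline(Tasks.sentence_similarity, model='<your-model-id>')
+#     print(pipe(('you are so handsome.', 'you are so beautiful.')))
+#     # -> {'scores': [...], 'labels': [...]}
+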
+__all__ = ['PairSentenceClassificationPipeline'] + + +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +@PIPELINES.register_module( + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) +class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase): + + def __init__(self, + model: Union[Model, str], + preprocessor: Preprocessor = None, + first_sequence='first_sequence', + second_sequence='second_sequence', + **kwargs): + """use `model` and `preprocessor` to create a nlp pair sentence classification pipeline for prediction + + Args: + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + if preprocessor is None: + preprocessor = PairSentenceClassificationPreprocessor( + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence, + second_sequence=second_sequence) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py deleted file mode 100644 index c09e2115..00000000 --- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SbertForSentenceSimilarity -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SentenceSimilarityPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['SentenceSimilarityPipeline'] - - -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -class SentenceSimilarityPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: SentenceSimilarityPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction - - Args: - model (SbertForSentenceSimilarity): a model instance - preprocessor (SentenceSimilarityPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \ - 'model must be a single str or SbertForSentenceSimilarity' - sc_model = model if isinstance( - model, - SbertForSentenceSimilarity) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = SentenceSimilarityPreprocessor( - sc_model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - sc_model.eval() - super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) - - assert hasattr(self.model, 'id2label'), \ - 'id2label map should be initalizaed in init function.' 
- - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - top_indices = np.argpartition(probs, -num_classes)[-num_classes:] - cls_ids = top_indices[np.argsort(-probs[top_indices], axis=-1)] - probs = probs[cls_ids].tolist() - cls_names = [self.model.id2label[cid] for cid in cls_ids] - b = 0 - return {OutputKeys.SCORES: probs[b], OutputKeys.LABELS: cls_names[b]} diff --git a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py deleted file mode 100644 index 8e57d77b..00000000 --- a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py +++ /dev/null @@ -1,74 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SequenceClassificationModel -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SentimentClassificationPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['SentimentClassificationPipeline'] - - -@PIPELINES.register_module( - Tasks.sentiment_classification, - module_name=Pipelines.sentiment_classification) -class SentimentClassificationPipeline(Pipeline): - - def __init__(self, - model: Union[SequenceClassificationModel, str], - preprocessor: SentimentClassificationPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction - - Args: - model (SequenceClassificationModel): a model instance - preprocessor (SentimentClassificationPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SequenceClassificationModel), \ - 'model must be a single str or SentimentClassification' - model = model if isinstance( - model, - SequenceClassificationModel) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = SentimentClassificationPreprocessor( - model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - assert len(model.id2label) > 0 - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.model.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git 
a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py new file mode 100644 index 00000000..ad31bfbd --- /dev/null +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py @@ -0,0 +1,60 @@ +from typing import Any, Dict, Union + +import numpy as np +import torch + +from modelscope.models.base import Model +from modelscope.outputs import OutputKeys +from ...preprocessors import Preprocessor +from ..base import Pipeline + + +class SequenceClassificationPipelineBase(Pipeline): + + def __init__(self, model: Union[Model, str], preprocessor: Preprocessor, + **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model (str or Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + assert preprocessor is not None + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = 5) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + topk (int): The topk probs to take + Returns: + Dict[str, str]: the prediction results + """ + + probs = inputs[OutputKeys.PROBABILITIES][0] + num_classes = probs.shape[0] + topk = min(topk, num_classes) + top_indices = np.argpartition(probs, -topk)[-topk:] + cls_ids = top_indices[np.argsort(probs[top_indices])] + probs = probs[cls_ids].tolist() + + cls_names = [self.id2label[cid] for cid in cls_ids] + return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py new file mode 100644 index 00000000..8e0b4fe0 --- /dev/null +++ b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py @@ -0,0 +1,35 @@ +from typing import Union + +from ...metainfo import Pipelines +from ...models import Model +from ...preprocessors import (Preprocessor, + SingleSentenceClassificationPreprocessor) +from ...utils.constant import Tasks +from ..builder import PIPELINES +from .sequence_classification_pipeline_base import \ + SequenceClassificationPipelineBase + +__all__ = ['SingleSentenceClassificationPipeline'] + + +@PIPELINES.register_module( + Tasks.sentiment_classification, + module_name=Pipelines.sentiment_classification) +class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase): + + def __init__(self, + model: Union[Model, str], + preprocessor: Preprocessor = None, + first_sequence='first_sequence', + **kwargs): + """use `model` and `preprocessor` to create a nlp single sentence classification pipeline for prediction + + Args: + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + 
if preprocessor is None: + preprocessor = SingleSentenceClassificationPreprocessor( + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 85a81eba..287c98ff 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -3,7 +3,7 @@ from typing import Any, Dict, Optional, Union import torch from modelscope.metainfo import Pipelines -from modelscope.models.base import TorchModel +from modelscope.models.base import Model from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import TextGenerationPreprocessor @@ -17,7 +17,7 @@ __all__ = ['TextGenerationPipeline'] class TextGenerationPipeline(Pipeline): def __init__(self, - model: Union[TorchModel, str], + model: Union[Model, str], preprocessor: Optional[TextGenerationPreprocessor] = None, **kwargs): """use `model` and `preprocessor` to create a nlp text generation pipeline for prediction @@ -26,8 +26,8 @@ class TextGenerationPipeline(Pipeline): model (PalmForTextGeneration): a model instance preprocessor (TextGenerationPreprocessor): a preprocessor instance """ - model = model if isinstance( - model, TorchModel) else TorchModel.from_pretrained(model) + model = model if isinstance(model, + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TextGenerationPreprocessor( model.model_dir, diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index fdf9be64..dba3fe9f 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -4,11 +4,9 @@ from typing import Any, Dict import numpy as np import tensorflow as tf -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Pipelines -from modelscope.models.nlp import CsanmtForTranslation from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 73d0c278..06e6a31c 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -4,11 +4,11 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForTokenClassification from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TokenClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['WordSegmentationPipeline'] @@ -18,33 +18,35 @@ __all__ = ['WordSegmentationPipeline'] Tasks.word_segmentation, module_name=Pipelines.word_segmentation) class WordSegmentationPipeline(Pipeline): - def __init__( - self, - model: Union[SbertForTokenClassification, str], - 
preprocessor: Optional[TokenClassificationPreprocessor] = None, - **kwargs): + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction Args: - model (StructBertForTokenClassification): a model instance - preprocessor (TokenClassificationPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ - model = model if isinstance( - model, - SbertForTokenClassification) else Model.from_pretrained(model) + model = model if isinstance(model, + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TokenClassificationPreprocessor(model.model_dir) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.tokenizer = preprocessor.tokenizer - self.config = model.config - assert len(self.config.id2label) > 0 - self.id2label = self.config.id2label + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: + text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): - return super().forward(inputs, **forward_params) + return { + **self.model(inputs, **forward_params), OutputKeys.TEXT: text + } def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 642d4870..d0dd2336 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -5,11 +5,11 @@ from scipy.special import softmax from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForZeroShotClassification from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import ZeroShotClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + ZeroShotClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['ZeroShotClassificationPipeline'] @@ -21,19 +21,18 @@ __all__ = ['ZeroShotClassificationPipeline'] class ZeroShotClassificationPipeline(Pipeline): def __init__(self, - model: Union[SbertForZeroShotClassification, str], - preprocessor: ZeroShotClassificationPreprocessor = None, + model: Union[Model, str], + preprocessor: Preprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + """use `model` and `preprocessor` to create a nlp zero-shot text classification pipeline for prediction Args: - model (SbertForZeroShotClassification): a model instance - preprocessor (SentimentClassificationPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ - assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \ - 'model must be a single str or SbertForZeroShotClassification' - model = model if isinstance( - model, - 
SbertForZeroShotClassification) else Model.from_pretrained(model) + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) self.entailment_id = 0 self.contradiction_id = 2 if preprocessor is None: @@ -58,7 +57,7 @@ class ZeroShotClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -70,7 +69,7 @@ class ZeroShotClassificationPipeline(Pipeline): Returns: Dict[str, Any]: the prediction results """ - logits = inputs['logits'] + logits = inputs[OutputKeys.LOGITS] if multi_label or len(candidate_labels) == 1: logits = logits[..., [self.contradiction_id, self.entailment_id]] scores = softmax(logits, axis=-1)[..., 1] diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 9d991146..c73a6c4f 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -18,11 +18,11 @@ if TYPE_CHECKING: MPlugVisualQuestionAnsweringPreprocessor) from .nlp import (Tokenize, SequenceClassificationPreprocessor, TextGenerationPreprocessor, - TokenClassificationPreprocessor, NLIPreprocessor, - SentimentClassificationPreprocessor, - SentenceSimilarityPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - TextErrorCorrectionPreprocessor) + TokenClassificationPreprocessor, + SingleSentenceClassificationPreprocessor, + PairSentenceClassificationPreprocessor, + FillMaskPreprocessor, ZeroShotClassificationPreprocessor, + NERPreprocessor, TextErrorCorrectionPreprocessor) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor) @@ -46,8 +46,8 @@ else: 'nlp': [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'NLIPreprocessor', 'SentimentClassificationPreprocessor', - 'SentenceSimilarityPreprocessor', 'FillMaskPreprocessor', + 'SingleSentenceClassificationPreprocessor', + 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', 'TextErrorCorrectionPreprocessor' ], diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index d0142693..6360a907 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
- +import os from abc import ABC, abstractmethod from typing import Any, Dict @@ -10,6 +10,8 @@ class Preprocessor(ABC): def __init__(self, *args, **kwargs): self._mode = ModeKeys.INFERENCE + self.device = int( + os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None pass @abstractmethod diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index a0a7a5b5..f0951f38 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -2,14 +2,14 @@ import os.path as osp import uuid -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Iterable, Optional, Tuple, Union from transformers import AutoTokenizer -from modelscope.metainfo import Preprocessors -from modelscope.models import Model +from modelscope.metainfo import Models, Preprocessors +from modelscope.outputs import OutputKeys from modelscope.utils.constant import Fields, InputFields, ModeKeys -from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.type_assert import type_assert from .base import Preprocessor from .builder import PREPROCESSORS @@ -17,8 +17,8 @@ from .builder import PREPROCESSORS __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'NLIPreprocessor', 'SentimentClassificationPreprocessor', - 'FillMaskPreprocessor', 'SentenceSimilarityPreprocessor', + 'PairSentenceClassificationPreprocessor', + 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', 'TextErrorCorrectionPreprocessor' ] @@ -38,99 +38,6 @@ class Tokenize(Preprocessor): return data -class NLPPreprocessorBase(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.tokenize_kwargs = kwargs - self.tokenizer = self.build_tokenizer(model_dir) - self.label2id = parse_label_mapping(self.model_dir) - - def build_tokenizer(self, model_dir): - from sofa import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir) - - @type_assert(object, object) - def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' 
- Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b = None, None - if isinstance(data, str): - text_a = data - elif isinstance(data, tuple): - assert len(data) == 2 - text_a, text_b = data - elif isinstance(data, dict): - text_a = data.get(self.first_sequence) - text_b = data.get(self.second_sequence, None) - - rst = self.tokenizer(text_a, text_b, **self.tokenize_kwargs) - if self._mode == ModeKeys.TRAIN: - rst = {k: v.squeeze() for k, v in rst.items()} - if self.label2id is not None and 'label' in data: - rst['label'] = self.label2id[str(data['label'])] - return rst - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -class NLIPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = False - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SentimentClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = 'max_length' - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -class SentenceSimilarityPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = False if 'padding' not in kwargs else kwargs[ - 'padding'] - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) class SequenceClassificationPreprocessor(Preprocessor): @@ -197,32 +104,193 @@ class SequenceClassificationPreprocessor(Preprocessor): return rst +class NLPTokenizerPreprocessorBase(Preprocessor): + + def __init__(self, model_dir: str, pair: bool, mode: str, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + + super().__init__(**kwargs) + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') + self.pair = pair + self._mode = mode + self.label = kwargs.pop('label', OutputKeys.LABEL) + self.label2id = None + if 'label2id' in kwargs: + self.label2id = kwargs.pop('label2id') + if self.label2id is None: + self.label2id = parse_label_mapping(self.model_dir) + + self.tokenize_kwargs = kwargs + self.tokenizer = self.build_tokenizer(model_dir) + + @property + def id2label(self): + if self.label2id is not None: + return {id: label for label, id in self.label2id.items()} + return None + + def build_tokenizer(self, model_dir): + model_type = get_model_type(model_dir) + if model_type in (Models.structbert, Models.gpt3, Models.palm): + from modelscope.models.nlp.structbert import SbertTokenizerFast + return SbertTokenizerFast.from_pretrained(model_dir) + elif model_type == Models.veco: + from modelscope.models.nlp.veco import VecoTokenizerFast + return VecoTokenizerFast.from_pretrained(model_dir) + else: + return AutoTokenizer.from_pretrained(model_dir) + + def 
__call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = self.parse_text_and_label(data) + output = self.tokenizer( + text_a, + text_b, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + self.labels_to_id(labels, output) + return output + + def parse_text_and_label(self, data): + text_a, text_b, labels = None, None, None + if isinstance(data, str): + text_a = data + elif isinstance(data, tuple) or isinstance(data, list): + if len(data) == 3: + text_a, text_b, labels = data + elif len(data) == 2: + if self.pair: + text_a, text_b = data + else: + text_a, labels = data + elif isinstance(data, dict): + text_a = data.get(self.first_sequence) + text_b = data.get(self.second_sequence) + labels = data.get(self.label) + + return text_a, text_b, labels + + def labels_to_id(self, labels, output): + + def label_can_be_mapped(label): + return isinstance(label, str) or isinstance(label, int) + + if labels is not None: + if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ + and self.label2id is not None: + output[OutputKeys.LABEL] = [ + self.label2id[str(label)] for label in labels + ] + elif label_can_be_mapped(labels) and self.label2id is not None: + output[OutputKeys.LABEL] = self.label2id[str(labels)] + else: + output[OutputKeys.LABEL] = labels + + @PREPROCESSORS.register_module( - Fields.nlp, module_name='bert-seq-cls-tokenizer-finetune') -class SentenceSimilarityFinetunePreprocessor(SentenceSimilarityPreprocessor): - """Sentence similarity preprocessor in the finetune scenario + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == 'inference' else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=True, mode=mode, **kwargs) - Mainly added the label mapping procedure. 
- """ - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['padding'] = 'max_length' - super().__init__(model_dir, *args, **kwargs) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == 'inference' else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + self.sequence_length = kwargs.pop('sequence_length', 512) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict], hypothesis_template: str, + candidate_labels: list) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str or dict): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + if isinstance(data, dict): + data = data.get(self.first_sequence) + + pairs = [[data, hypothesis_template.format(label)] + for label in candidate_labels] + + features = self.tokenizer( + pairs, + padding=True, + truncation=True, + max_length=self.sequence_length, + truncation_strategy='only_first', + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None) + return features @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) -class TextGenerationPreprocessor(NLPPreprocessorBase): +class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - def __init__(self, model_dir: str, tokenizer=None, *args, **kwargs): + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): self.tokenizer = self.build_tokenizer( model_dir) if tokenizer is None else tokenizer - kwargs['truncation'] = True - kwargs['padding'] = True - kwargs['return_tensors'] = 'pt' - kwargs['return_token_type_ids'] = False + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', True) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) @staticmethod def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: @@ -240,19 +308,13 @@ class TextGenerationPreprocessor(NLPPreprocessorBase): roberta_tokenizer_dir, do_lower_case=False) return super().build_tokenizer(model_dir) - -@PREPROCESSORS.register_module( - Fields.nlp, module_name='palm-text-gen-tokenizer-finetune') -class TextGenerationFinetunePreprocessor(TextGenerationPreprocessor): - - @type_assert(object, dict) - def __call__(self, data: dict) -> Dict[str, Any]: + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + if self._mode == 'inference': + return super().__call__(data) src_txt = data['src_txt'] tgt_txt = data['tgt_txt'] src_rst = super().__call__(src_txt) tgt_rst = super().__call__(tgt_txt) - src_rst = {k: v.squeeze() 
for k, v in src_rst.items()} - tgt_rst = {k: v.squeeze() for k, v in tgt_rst.items()} return { 'src': src_rst['input_ids'], @@ -261,87 +323,69 @@ class TextGenerationFinetunePreprocessor(TextGenerationPreprocessor): } -@PREPROCESSORS.register_module(Fields.nlp) -class FillMaskPreprocessor(NLPPreprocessorBase): +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +class FillMaskPreprocessor(NLPTokenizerPreprocessorBase): - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = 'max_length' - kwargs['return_tensors'] = 'pt' + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = True - super().__init__(model_dir, *args, **kwargs) - - def build_tokenizer(self, model_dir): - from modelscope.utils.hub import get_model_type - model_type = get_model_type(model_dir) - if model_type in ['sbert', 'structbert', 'bert']: - from sofa import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir, use_fast=False) - elif model_type == 'veco': - from sofa import VecoTokenizer - return VecoTokenizer.from_pretrained(model_dir, use_fast=False) - else: - # TODO Only support veco & sbert - raise RuntimeError(f'Unsupported model type: {model_type}') + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) -class TokenClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - - @type_assert(object, str) - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: - """process the raw input data + Fields.nlp, + module_name=Preprocessors.word_segment_text_to_label_preprocessor) +class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - if isinstance(data, dict): - data = data[self.first_sequence] - text = data.replace(' ', '').strip() - tokens = [] - for token in text: - token = self.tokenizer.tokenize(token) - tokens.extend(token) - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) - input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids) - attention_mask = [1] * len(input_ids) - token_type_ids = [0] * len(input_ids) + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.label = kwargs.pop('label', OutputKeys.LABELS) + + def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: + data = data.split(' ') + data = list(filter(lambda x: len(x) > 0, data)) + + def produce_train_sample(words): + chars = [] + labels = [] + for word in words: + chars.extend(list(word)) + if len(word) == 1: + labels.append('S-CWS') + else: + labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) + + ['E-CWS']) + assert len(chars) == len(labels) + return chars, labels + + chars, labels = produce_train_sample(data) return { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids + self.first_sequence: chars, + self.label: labels, } @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - Args: - model_dir (str): model path - """ - self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, *args, **kwargs) + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['is_split_into_words'] = kwargs.pop( + 'is_split_into_words', + False if mode == ModeKeys.INFERENCE else True) + self.label_all_tokens = kwargs.pop('label_all_tokens', False) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) - @type_assert(object, str) - def __call__(self, data, hypothesis_template: str, - candidate_labels: list) -> Dict[str, Any]: + def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: """process the raw input data Args: @@ -352,20 +396,74 @@ class ZeroShotClassificationPreprocessor(NLPPreprocessorBase): Returns: Dict[str, Any]: the preprocessed data """ - if isinstance(data, dict): - data = data.get(self.first_sequence) - pairs = [[data, hypothesis_template.format(label)] - for label in candidate_labels] - - features = self.tokenizer( - pairs, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_tensors='pt', - truncation_strategy='only_first') - return features + # preprocess the data for the model input + # if isinstance(data, dict): + # data = data[self.first_sequence] + # text = data.replace(' ', '').strip() + # tokens = [] + # for token in text: + # token = self.tokenizer.tokenize(token) + # tokens.extend(token) + # input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + # input_ids = 
self.tokenizer.build_inputs_with_special_tokens(input_ids) + # attention_mask = [1] * len(input_ids) + # token_type_ids = [0] * len(input_ids) + + # new code to deal with labels + # tokenized_inputs = self.tokenizer(data, truncation=True, is_split_into_words=True) + + text_a = None + labels_list = None + if isinstance(data, str): + text_a = data + elif isinstance(data, dict): + text_a = data.get(self.first_sequence) + labels_list = data.get(self.label) + tokenized_inputs = self.tokenizer( + text_a, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) + else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + word_ids = tokenized_inputs.word_ids() + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + tokenized_inputs['labels'] = labels + # new code end + + if self._mode == ModeKeys.INFERENCE: + tokenized_inputs[OutputKeys.TEXT] = text_a + return tokenized_inputs @PREPROCESSORS.register_module( diff --git a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py index 80036ed1..038ab09b 100644 --- a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py @@ -24,7 +24,7 @@ class DialogStateTrackingPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) - from sofa.models.space import SpaceConfig, SpaceTokenizer + from modelscope.models.nlp.space import SpaceConfig, SpaceTokenizer self.model_dir: str = model_dir self.config = SpaceConfig.from_pretrained(self.model_dir) self.tokenizer = SpaceTokenizer.from_pretrained(self.model_dir) diff --git a/modelscope/task_datasets/__init__.py b/modelscope/task_datasets/__init__.py index 5f0d9b1e..93e01cb5 100644 --- a/modelscope/task_datasets/__init__.py +++ b/modelscope/task_datasets/__init__.py @@ -7,12 +7,14 @@ if TYPE_CHECKING: from .base import TaskDataset from .builder import TASK_DATASETS, build_task_dataset from .torch_base_dataset import TorchTaskDataset + from .veco_dataset import VecoDataset else: _import_structure = { 'base': ['TaskDataset'], 'builder': ['TASK_DATASETS', 'build_task_dataset'], 'torch_base_dataset': ['TorchTaskDataset'], + 'veco_dataset': ['VecoDataset'], } import sys diff --git a/modelscope/task_datasets/base.py b/modelscope/task_datasets/base.py index a4104ced..39b791b1 100644 --- a/modelscope/task_datasets/base.py +++ b/modelscope/task_datasets/base.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
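To make the new word-segmentation preprocessing concrete, here is a minimal sketch of how WordSegmentationBlankSetToLabelPreprocessor (added above) is expected to behave. The input sentence is a made-up example, the import path follows the finetune tests later in this change, and the 'labels' key assumes OutputKeys.LABELS == 'labels'; the produced labels are then mapped through label2id by TokenClassificationPreprocessor, which assigns -100 to special tokens and non-first sub-tokens as shown in the hunk above.

    from modelscope.preprocessors.nlp import WordSegmentationBlankSetToLabelPreprocessor

    preprocessor = WordSegmentationBlankSetToLabelPreprocessor()
    # A blank-separated sentence (made-up example): two 2-char words and two 1-char words.
    sample = preprocessor('今天 天气 不 错')
    # Expected output, following the B/I/E/S-CWS scheme implemented above:
    # {'first_sequence': ['今', '天', '天', '气', '不', '错'],
    #  'labels': ['B-CWS', 'E-CWS', 'B-CWS', 'E-CWS', 'S-CWS', 'S-CWS']}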
from abc import ABC, abstractmethod -from typing import Any, List, Tuple +from typing import Any, List, Tuple, Union class TaskDataset(ABC): @@ -8,7 +8,7 @@ class TaskDataset(ABC): """ def __init__(self, - datasets: Tuple[Any, List[Any]], + datasets: Union[Any, List[Any]], mode, preprocessor=None, **kwargs): @@ -18,7 +18,7 @@ class TaskDataset(ABC): self._inner_dataset = self.prepare_dataset(datasets) @abstractmethod - def prepare_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: """Prepare a dataset. User can process the input datasets in a whole dataset perspective. diff --git a/modelscope/task_datasets/torch_base_dataset.py b/modelscope/task_datasets/torch_base_dataset.py index 5ec9209e..014e4faa 100644 --- a/modelscope/task_datasets/torch_base_dataset.py +++ b/modelscope/task_datasets/torch_base_dataset.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, List, Tuple +from typing import Any, List, Tuple, Union from torch.utils.data import ConcatDataset, Dataset @@ -14,7 +14,7 @@ class TorchTaskDataset(TaskDataset, Dataset): """ def __init__(self, - datasets: Tuple[Any, List[Any]], + datasets: Union[Any, List[Any]], mode, preprocessor=None, **kwargs): @@ -26,7 +26,7 @@ class TorchTaskDataset(TaskDataset, Dataset): def __len__(self): return len(self._inner_dataset) - def prepare_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: """Prepare a dataset. User can process the input datasets in a whole dataset perspective. diff --git a/modelscope/task_datasets/veco_dataset.py b/modelscope/task_datasets/veco_dataset.py new file mode 100644 index 00000000..df7c6483 --- /dev/null +++ b/modelscope/task_datasets/veco_dataset.py @@ -0,0 +1,76 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, List, Union + +import numpy as np +from datasets import Dataset, IterableDataset, concatenate_datasets + +from modelscope.metainfo import Models +from modelscope.utils.constant import Tasks +from .builder import TASK_DATASETS +from .torch_base_dataset import TorchTaskDataset + + +@TASK_DATASETS.register_module(module_name=Models.veco, group_key=Tasks.nli) +class VecoDataset(TorchTaskDataset): + + def __init__(self, + datasets: Union[Any, List[Any]], + mode, + preprocessor=None, + **kwargs): + self.seed = kwargs.get('seed', 42) + self.permutation = None + self.datasets = None + super().__init__(datasets, mode, preprocessor, **kwargs) + + def switch_dataset(self, idx): + """Switch dataset in evaluation. + + Veco evaluates dataset one by one. + + Args: + idx: The index of the dataset + """ + if self.mode == 'train': + raise ValueError( + 'Only support switch dataset in the evaluation loop') + if idx >= len(self.datasets): + raise ValueError( + 'Index is bigger than the number of the datasets.') + self._inner_dataset = self.datasets[idx] + + def __getitem__(self, item): + if self.permutation is not None: + item = self.permutation[item] + return super().__getitem__(item) + + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: + """Compose all the datasets. + + If the mode is 'train', all datasets will be mixed together, if the mode is 'eval', + the datasets will be kept and returns the first one. + + Args: + datasets: The datasets to be composed. + + Returns: The final dataset. 
+ """ + if not isinstance(datasets, (list, tuple)): + datasets = [datasets] + if self.mode == 'train': + if len(datasets) == 1: + return datasets[0] + elif all([ + isinstance(dataset, (Dataset, IterableDataset)) + for dataset in datasets + ]): + dataset = concatenate_datasets(list(datasets)) + return dataset.shuffle(seed=self.seed) + else: + generator = np.random.default_rng(self.seed) + _len = sum([len(dataset) for dataset in datasets]) + self.permutation = generator.permutation(_len) + return super().prepare_dataset(datasets) + else: + self.datasets = datasets + return self.datasets[0] diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index 350bab61..d802fd8b 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -4,4 +4,5 @@ from .cv import (ImageInstanceSegmentationTrainer, ImagePortraitEnhancementTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer +from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer from .trainer import EpochBasedTrainer diff --git a/modelscope/trainers/hooks/evaluation_hook.py b/modelscope/trainers/hooks/evaluation_hook.py index aea27f2f..80d8c03c 100644 --- a/modelscope/trainers/hooks/evaluation_hook.py +++ b/modelscope/trainers/hooks/evaluation_hook.py @@ -32,6 +32,7 @@ class EvaluationHook(Hook): def do_evaluate(self, trainer): """Evaluate the results.""" eval_res = trainer.evaluate() + trainer.data_loader = trainer.train_dataloader for name, val in eval_res.items(): trainer.log_buffer.output[name] = val diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py index cf3a16e7..9a5de392 100644 --- a/modelscope/trainers/hooks/lr_scheduler_hook.py +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -21,9 +21,6 @@ class LrSchedulerHook(Hook): def __init__(self, by_epoch=True, warmup=None) -> None: super().__init__() self.by_epoch = by_epoch - if not self.by_epoch: - raise ValueError('We only support ``by_epoch=True`` now!') - self.warmup = warmup self.warmup_lr_scheduler = None @@ -49,6 +46,11 @@ class LrSchedulerHook(Hook): return lr def before_train_iter(self, trainer): + if not self.by_epoch: + if self.warmup_lr_scheduler is not None: + self.warmup_lr_scheduler.step() + else: + trainer.lr_scheduler.step() trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) def before_train_epoch(self, trainer): diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py new file mode 100644 index 00000000..c8121db6 --- /dev/null +++ b/modelscope/trainers/nlp_trainer.py @@ -0,0 +1,192 @@ +import os +from typing import Callable, Dict, Optional, Tuple, Union + +import torch +from torch import nn +from torch.utils.data import Dataset + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metrics.builder import build_metric +from modelscope.models.base import Model, TorchModel +from modelscope.msdatasets import MsDataset +from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys, + ModelFile, Tasks) +from .base import TRAINERS +from .trainer import EpochBasedTrainer + + +@TRAINERS.register_module(module_name='NlpEpochBasedTrainer') +class NlpEpochBasedTrainer(EpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + cfg_modify_fn: 
Optional[Callable] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Callable] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Preprocessor] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + **kwargs): + """Add code to adapt to NLP models. + + Args: + cfg_modify_fn: An input function used to modify the cfg read from the file. + """ + + if isinstance(model, str): + if os.path.exists(model): + model_dir = model if os.path.isdir(model) else os.path.dirname( + model) + else: + model_dir = snapshot_download(model, revision=model_revision) + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' + model_dir = os.path.dirname(cfg_file) + + self.cfg_modify_fn = cfg_modify_fn + self.cfg = self.rebuild_config(Config.from_file(cfg_file)) + try: + labels = self.cfg.dataset.train.labels + except AttributeError: + labels = None + + self.label2id = None + self.num_labels = None + if labels is not None and len(labels) > 0: + self.label2id = {label: idx for idx, label in enumerate(labels)} + self.id2label = {idx: label for idx, label in enumerate(labels)} + self.num_labels = len(labels) + + def build_dataset_keys(cfg): + if cfg is not None: + input_keys = { + 'first_sequence': getattr(cfg, 'first_sequence', None), + 'second_sequence': getattr(cfg, 'second_sequence', None), + 'label': getattr(cfg, 'label', None), + } + else: + input_keys = {} + + return {k: v for k, v in input_keys.items() if v is not None} + + self.train_keys = build_dataset_keys( + self.cfg.dataset.train if hasattr(self.cfg, 'dataset') + and hasattr(self.cfg.dataset, 'train') else None) + # TODO eval may have special keys, which are not supported yet, + # because there is only one preprocessor in the trainer, and it only supports one group of keys. + self.eval_keys = self.train_keys + + super().__init__( + model=model_dir, + cfg_file=cfg_file, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + preprocessor=preprocessor, + optimizers=optimizers, + model_revision=model_revision, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + **kwargs) + + def rebuild_config(self, cfg: Config): + if self.cfg_modify_fn is not None: + return self.cfg_modify_fn(cfg) + return cfg + + def build_model(self) -> Union[nn.Module, TorchModel]: + """ Instantiate a pytorch model and return. + + By default, we will create a model using config from configuration file. You can + override this method in a subclass. + + """ + model_args = {} if self.num_labels is None else { + 'num_labels': self.num_labels + } + model = Model.from_pretrained( + self.model_dir, cfg_dict=self.cfg, **model_args) + if not isinstance(model, nn.Module) and hasattr(model, 'model'): + return model.model + elif isinstance(model, nn.Module): + return model + + def build_preprocessor(self) -> Preprocessor: + """Build the preprocessor. + + Users can override this method to implement custom logic. + + Returns: The preprocessor instance.
+ + """ + model_args = {} if self.label2id is None else { + 'label2id': self.label2id + } + cfg = ConfigDict({ + **getattr(self.cfg, 'preprocessor'), + 'model_dir': + self.model_dir, + **model_args, + 'mode': + ModeKeys.TRAIN, + **self.train_keys, + }) + return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + + +@TRAINERS.register_module(module_name='VecoTrainer') +class VecoTrainer(NlpEpochBasedTrainer): + + def evaluate(self, checkpoint_path=None): + """Veco evaluates the datasets one by one. + + """ + from modelscope.task_datasets import VecoDataset + self.model.eval() + self._mode = ModeKeys.EVAL + metric_values = {} + + if self.eval_dataset is None: + val_data = self.cfg.dataset.val + self.eval_dataset = self.build_dataset( + val_data, mode=ModeKeys.EVAL) + + idx = 0 + dataset_cnt = 1 + if isinstance(self.eval_dataset, VecoDataset): + self.eval_dataset.switch_dataset(idx) + dataset_cnt = len(self.eval_dataset.datasets) + + while True: + self.eval_dataloader = self._build_dataloader_with_dataset( + self.eval_dataset, **self.cfg.evaluation.get('dataloader', {})) + self.data_loader = self.eval_dataloader + + metric_classes = [ + build_metric(metric, default_args={'trainer': self}) + for metric in self.metrics + ] + self.evaluation_loop(self.eval_dataloader, checkpoint_path, + metric_classes) + + for m_idx, metric_cls in enumerate(metric_classes): + if f'eval_dataset[{idx}]' not in metric_values: + metric_values[f'eval_dataset[{idx}]'] = {} + metric_values[f'eval_dataset[{idx}]'][ + self.metrics[m_idx]] = metric_cls.evaluate() + + idx += 1 + if idx < dataset_cnt: + self.eval_dataset.switch_dataset(idx) + else: + break + + return metric_values diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index e83654a2..c5574f32 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -22,7 +22,8 @@ from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.preprocessors import build_preprocessor from modelscope.preprocessors.base import Preprocessor -from modelscope.task_datasets import TorchTaskDataset, build_task_dataset +from modelscope.task_datasets.builder import build_task_dataset +from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler @@ -30,12 +31,12 @@ from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, ModelFile, Tasks, TrainerStages) +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.tensor_utils import torch_default_data_collator from modelscope.utils.torch_utils import (broadcast, create_device, get_dist_info, init_dist) -from modelscope.utils.utils import if_func_receive_dict_inputs from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -87,6 +88,7 @@ class EpochBasedTrainer(BaseTrainer): None), model_revision: Optional[str] = DEFAULT_MODEL_REVISION, **kwargs): + if isinstance(model, str): if os.path.exists(model): self.model_dir = model if os.path.isdir( @@ -108,9 +110,9 @@ class EpochBasedTrainer(BaseTrainer): 
self.model = model super().__init__(cfg_file, arg_parse_fn) - # add default config self.cfg.merge_from_dict(self._get_default_config(), force=False) + self.cfg = self.rebuild_config(self.cfg) if 'work_dir' in kwargs: self.work_dir = kwargs['work_dir'] @@ -130,9 +132,9 @@ class EpochBasedTrainer(BaseTrainer): self.device = create_device(device_name == 'cpu') self.train_dataset = self.to_task_dataset( - train_dataset, mode='train', preprocessor=self.preprocessor) + train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) self.eval_dataset = self.to_task_dataset( - eval_dataset, mode='eval', preprocessor=self.preprocessor) + eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) self.data_collator = data_collator if data_collator is not None else torch_default_data_collator self.metrics = self.get_metrics() @@ -168,6 +170,14 @@ class EpochBasedTrainer(BaseTrainer): if not is_parallel(self.model) and self._dist: self.model = self.to_parallel(self.model) + def rebuild_config(self, cfg: Config): + """A method used to rebuild the config, any subclass can override this method. + + Returns: The rebuilt config + + """ + return cfg + @property def mode(self): return self._mode @@ -203,7 +213,7 @@ class EpochBasedTrainer(BaseTrainer): return self._max_epochs * len(self.data_loader) def to_task_dataset(self, - datasets: Tuple[Dataset, List[Dataset]], + datasets: Union[Dataset, List[Dataset]], mode: str, preprocessor: Optional[Preprocessor] = None): """Build the task specific dataset processor for this trainer. @@ -229,17 +239,13 @@ class EpochBasedTrainer(BaseTrainer): cfg = ConfigDict( type=self.cfg.task, mode=mode, datasets=datasets) return build_task_dataset(cfg, self.cfg.task) - elif isinstance(datasets, - Dataset) or (isinstance(datasets, List) - and isinstance(datasets[0], Dataset)): + else: cfg = ConfigDict( - type=self.cfg.model.type, mode=mode, datasets=datasets) + type=self.cfg.model.type, + mode=mode, + datasets=datasets, + preprocessor=preprocessor) return build_task_dataset(cfg, self.cfg.task) - else: - raise ValueError( - f'invalid datasets type: {type(datasets)}, ' - f'expected `MsDataset`, `torch.utils.data.Dataset` or list of them.' - ) except Exception: if isinstance(datasets, (List, Tuple)) or preprocessor is not None: return TorchTaskDataset( @@ -262,8 +268,11 @@ class EpochBasedTrainer(BaseTrainer): # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor # when they are different ones in training and evaluation cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), 'model_dir': - self.model_dir + **getattr(self.cfg, 'preprocessor'), + 'model_dir': + self.model_dir, + 'mode': + ModeKeys.TRAIN, }) return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) @@ -324,6 +333,8 @@ class EpochBasedTrainer(BaseTrainer): **self.cfg.evaluation.get('dataloader', {})) self.data_loader = self.eval_dataloader metric_classes = [build_metric(metric) for metric in self.metrics] + for m in metric_classes: + m.trainer = self metric_values = self.evaluation_loop(self.eval_dataloader, checkpoint_path, metric_classes) @@ -338,10 +349,9 @@ class EpochBasedTrainer(BaseTrainer): """ Instantiate a pytorch model and return. By default, we will create a model using config from configuration file. You can - subclass and override this method in a subclass. + override this method in a subclass. 
""" - # TODO temp implementation, waiting for @zhangzhicheng model = Model.from_pretrained(self.model_dir) if not isinstance(model, nn.Module) and hasattr(model, 'model'): return model.model @@ -412,9 +422,8 @@ class EpochBasedTrainer(BaseTrainer): self._mode = ModeKeys.TRAIN inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess - if isinstance( - inputs, - Mapping) and not if_func_receive_dict_inputs(model.forward): + if isinstance(inputs, + Mapping) and not func_receive_dict_inputs(model.forward): train_outputs = model.forward(**inputs) else: train_outputs = model.forward(inputs) @@ -495,7 +504,7 @@ class EpochBasedTrainer(BaseTrainer): if self.eval_dataset is None: val_data = self.cfg.dataset.val self.eval_dataset = self.build_dataset( - val_data, mode=ModeKeys.TRAIN) + val_data, mode=ModeKeys.EVAL) batch_size = self.cfg.evaluation.batch_size workers = self.cfg.evaluation.workers @@ -523,7 +532,8 @@ class EpochBasedTrainer(BaseTrainer): ) torch_dataset = dataset.to_torch_dataset( preprocessors=self.preprocessor, ) - return torch_dataset + dataset = self.to_task_dataset(torch_dataset, mode) + return dataset def create_optimizer_and_scheduler(self): """ Create optimizer and lr scheduler diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index c30d1d15..a90a58b6 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -10,9 +10,9 @@ import torch from torch import distributed as dist from tqdm import tqdm +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, make_tmp_dir) -from modelscope.utils.utils import if_func_receive_dict_inputs def single_gpu_test(model, @@ -37,18 +37,19 @@ def single_gpu_test(model, if data_collate_fn is not None: data = data_collate_fn(data) with torch.no_grad(): - if isinstance(data, - Mapping) and not if_func_receive_dict_inputs( - model.forward): - - result = model(**data) + if isinstance(data, Mapping) and not func_receive_dict_inputs( + model.forward): + result = model.forward(**data) else: - result = model(data) + result = model.forward(data) if metric_classes is not None: for metric_cls in metric_classes: metric_cls.add(result, data) - batch_size = len(result) + if isinstance(data, dict): + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) for _ in range(batch_size): pbar.update() @@ -101,16 +102,18 @@ def multi_gpu_test(model, data = data_collate_fn(data) data_list.append(data) with torch.no_grad(): - if isinstance(data, - Mapping) and not if_func_receive_dict_inputs( - model.forward): - result = model(**data) + if isinstance(data, Mapping) and not func_receive_dict_inputs( + model.forward): + result = model.forward(**data) else: - result = model(data) + result = model.forward(data) results.append(result) if rank == 0: - batch_size = len(result) + if isinstance(data, dict): + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) batch_size_all = batch_size * world_size count += batch_size_all if count > len(dataset): diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index b7b32c81..b8ee1258 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -16,9 +16,9 @@ from modelscope.fileio.file import LocalStorage from modelscope.metainfo import (Heads, Metrics, Models, Pipelines, Preprocessors, TaskModels, Trainers) from modelscope.utils.constant import 
Fields, Tasks +from modelscope.utils.file_utils import get_default_cache_dir from modelscope.utils.logger import get_logger from modelscope.utils.registry import default_group -from modelscope.utils.utils import get_default_cache_dir logger = get_logger() storage = LocalStorage() diff --git a/modelscope/utils/utils.py b/modelscope/utils/file_utils.py similarity index 96% rename from modelscope/utils/utils.py rename to modelscope/utils/file_utils.py index c2c47092..a04d890f 100644 --- a/modelscope/utils/utils.py +++ b/modelscope/utils/file_utils.py @@ -5,7 +5,7 @@ import os # TODO: remove this api, unify to flattened args -def if_func_receive_dict_inputs(func): +def func_receive_dict_inputs(func): """to decide if a func could recieve dict inputs or not Args: diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 5af67944..6e5326f4 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -98,4 +98,14 @@ def parse_label_mapping(model_dir): label_mapping = json.load(f) label2id = {name: idx for name, idx in label_mapping.items()} + if label2id is None: + config_path = os.path.join(model_dir, ModelFile.CONFIGURATION) + config = Config.from_file(config_path) + if hasattr(config, 'model') and hasattr(config.model, 'label2id'): + label2id = config.model.label2id + if label2id is None: + config_path = os.path.join(model_dir, 'config.json') + config = Config.from_file(config_path) + if hasattr(config, 'label2id'): + label2id = config.label2id return label2id diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index 01b68f78..aca103d2 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -68,7 +68,7 @@ def torch_default_data_collator(features): ) and v is not None and not isinstance(v, str): if isinstance(v, torch.Tensor): batch[k] = torch.stack([f[k] for f in features]) - elif isinstance(v, list): + elif isinstance(v, list) and isinstance(v[0], torch.Tensor): batch[k] = torch.stack([d for f in features for d in f[k]]) else: batch[k] = torch.tensor(np.array([f[k] for f in features])) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index deb6a5bd..c69174fe 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -4,5 +4,5 @@ pai-easynlp # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 -sofa>=1.0.5 +seqeval spacy>=2.3.5 diff --git a/requirements/runtime.txt b/requirements/runtime.txt index fbf33854..5675f031 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -15,5 +15,5 @@ setuptools tensorboard tokenizers tqdm>=4.64.0 -transformers>=4.10.3 +transformers>=4.12.0 yapf diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/metrics/test_token_classification_metrics.py b/tests/metrics/test_token_classification_metrics.py new file mode 100644 index 00000000..b249b227 --- /dev/null +++ b/tests/metrics/test_token_classification_metrics.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + +import numpy as np + +from modelscope.metrics.token_classification_metric import \ + TokenClassificationMetric +from modelscope.utils.test_utils import test_level + + +class TestTokenClsMetrics(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_value(self): + metric = TokenClassificationMetric() + + class Trainer: + pass + + metric.trainer = Trainer() + metric.trainer.label2id = { + 'B-obj': 0, + 'I-obj': 1, + 'O': 2, + } + + outputs = { + 'logits': + np.array([[[2.0, 1.0, 0.5], [1.0, 1.5, 1.0], [2.0, 1.0, 3.0], + [2.4, 1.5, 4.0], [2.0, 1.0, 3.0], [2.4, 1.5, 1.7], + [2.0, 1.0, 0.5], [2.4, 1.5, 0.5]]]) + } + inputs = {'labels': np.array([[0, 1, 2, 2, 0, 1, 2, 2]])} + metric.add(outputs, inputs) + ret = metric.evaluate() + self.assertTrue(np.isclose(ret['precision'], 0.25)) + self.assertTrue(np.isclose(ret['recall'], 0.5)) + self.assertTrue(np.isclose(ret['accuracy'], 0.5)) + print(ret) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/models/test_base_torch.py b/tests/models/test_base_torch.py index dcdf79be..c147259b 100644 --- a/tests/models/test_base_torch.py +++ b/tests/models/test_base_torch.py @@ -21,8 +21,8 @@ class TorchBaseTest(unittest.TestCase): self.conv1 = nn.Conv2d(1, 20, 5) self.conv2 = nn.Conv2d(20, 20, 5) - def forward(self, x): - x = F.relu(self.conv1(x)) + def forward(self, input): + x = F.relu(self.conv1(input)) return F.relu(self.conv2(x)) model = MyTorchModel() @@ -41,8 +41,8 @@ class TorchBaseTest(unittest.TestCase): self.conv1 = nn.Conv2d(1, 20, 5) self.conv2 = nn.Conv2d(20, 20, 5) - def forward(self, x): - x = F.relu(self.conv1(x)) + def forward(self, input): + x = F.relu(self.conv1(input)) return F.relu(self.conv2(x)) def postprocess(self, x): diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py index 449b0cb7..a5c29f16 100644 --- a/tests/pipelines/test_csanmt_translation.py +++ b/tests/pipelines/test_csanmt_translation.py @@ -12,7 +12,7 @@ class TranslationTest(unittest.TestCase): model_id = 'damo/nlp_csanmt_translation' inputs = 'Gut@@ ach : Incre@@ ased safety for pedestri@@ ans' - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline(task=Tasks.translation, model=self.model_id) print(pipeline_ins(input=self.inputs)) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index b028cfbe..2f57b2d8 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -45,7 +45,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_sbert[language]) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = StructBertForMaskedLM(model_dir) + model = StructBertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) @@ -60,7 +60,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_veco) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = VecoForMaskedLM(model_dir) + model = VecoForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) 
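The pipeline test updates in this change all follow the same migration: NLP models are now built via from_pretrained instead of the old constructor-with-model_dir style. Below is a minimal fill-mask sketch of that pattern; the import paths mirror the surrounding tests and the model id is a placeholder for illustration, not one taken from this diff.

    from modelscope.hub.snapshot_download import snapshot_download
    from modelscope.models.nlp import StructBertForMaskedLM  # import path assumed from the surrounding tests
    from modelscope.pipelines import pipeline
    from modelscope.preprocessors import FillMaskPreprocessor
    from modelscope.utils.constant import Tasks

    model_id = 'damo/nlp_structbert_fill-mask_chinese-large'  # placeholder id, for illustration only
    model_dir = snapshot_download(model_id)
    preprocessor = FillMaskPreprocessor(
        model_dir, first_sequence='sentence', second_sequence=None)
    model = StructBertForMaskedLM.from_pretrained(model_dir)  # was: StructBertForMaskedLM(model_dir)
    fill_mask = pipeline(Tasks.fill_mask, model=model, preprocessor=preprocessor)

As the other updated tests show, pipeline() also accepts a model id directly; the explicit from_pretrained call above just mirrors what these tests exercise.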
@@ -77,7 +77,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_bert) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = BertForMaskedLM(model_dir) + model = BertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 8d5d3dfa..f477fb37 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -3,10 +3,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForNLI +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import NLIPipeline -from modelscope.preprocessors import NLIPreprocessor +from modelscope.pipelines.nlp import PairSentenceClassificationPipeline +from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,9 +19,10 @@ class NLITest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = NLIPreprocessor(cache_path) - model = SbertForNLI(cache_path, tokenizer=tokenizer) - pipeline1 = NLIPipeline(model, preprocessor=tokenizer) + tokenizer = PairSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained(cache_path) + pipeline1 = PairSentenceClassificationPipeline( + model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}') @@ -33,7 +34,7 @@ class NLITest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = NLIPreprocessor(model.model_dir) + tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.nli, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 8cfb2c20..7a30d779 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -4,10 +4,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForSentenceSimilarity +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SentenceSimilarityPipeline -from modelscope.preprocessors import SentenceSimilarityPreprocessor +from modelscope.pipelines.nlp import PairSentenceClassificationPipeline +from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -20,9 +20,10 @@ class SentenceSimilarityTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test 
level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = SentenceSimilarityPreprocessor(cache_path) - model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer) - pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer) + tokenizer = PairSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained(cache_path) + pipeline1 = PairSentenceClassificationPipeline( + model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_similarity, model=model, preprocessor=tokenizer) print('test1') @@ -36,7 +37,7 @@ class SentenceSimilarityTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SentenceSimilarityPreprocessor(model.model_dir) + tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 53031e9d..82c068be 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -3,11 +3,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import (SbertForSentimentClassification, - SequenceClassificationModel) +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SentimentClassificationPipeline -from modelscope.preprocessors import SentimentClassificationPreprocessor +from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline +from modelscope.preprocessors import SingleSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,46 +18,52 @@ class SentimentClassificationTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = SentimentClassificationPreprocessor(cache_path) - model = SequenceClassificationModel.from_pretrained( + tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained( self.model_id, num_labels=2) - pipeline1 = SentimentClassificationPipeline( + pipeline1 = SingleSentenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentiment_classification, model=model, - preprocessor=tokenizer, - model_revision='beta') + preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' f'pipeline1:{pipeline1(input=self.sentence1)}') print() print(f'sentence1: {self.sentence1}\n' f'pipeline1: {pipeline2(input=self.sentence1)}') + self.assertTrue( + isinstance(pipeline1.model, SbertForSequenceClassification)) + self.assertTrue( + isinstance(pipeline2.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SentimentClassificationPreprocessor(model.model_dir) + tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentiment_classification, model=model, - preprocessor=tokenizer, - 
model_revision='beta') + preprocessor=tokenizer) print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.sentiment_classification, - model=self.model_id, - model_revision='beta') + task=Tasks.sentiment_classification, model=self.model_id) print(pipeline_ins(input=self.sentence1)) + print(pipeline_ins.model.__class__) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline( - task=Tasks.sentiment_classification, model_revision='beta') + pipeline_ins = pipeline(task=Tasks.sentiment_classification) print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) if __name__ == '__main__': diff --git a/tests/pipelines/test_sentiment_classification_task_model.py b/tests/pipelines/test_sentiment_classification_task_model.py new file mode 100644 index 00000000..2808ec84 --- /dev/null +++ b/tests/pipelines/test_sentiment_classification_task_model.py @@ -0,0 +1,70 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp.task_models.sequence_classification import \ + SequenceClassificationModel +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline +from modelscope.preprocessors import SingleSentenceClassificationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class SentimentClassificationTaskModelTest(unittest.TestCase): + model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' + sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' + + @unittest.skip + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + model = SequenceClassificationModel.from_pretrained( + self.model_id, num_labels=2) + pipeline1 = SingleSentenceClassificationPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.sentiment_classification, + model=model, + preprocessor=tokenizer, + model_revision='beta') + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline1(input=self.sentence1)}') + print() + print(f'sentence1: {self.sentence1}\n' + f'pipeline1: {pipeline2(input=self.sentence1)}') + + @unittest.skip + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id, revision='beta') + tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.sentiment_classification, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SequenceClassificationModel)) + + @unittest.skip + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.sentiment_classification, + model=self.model_id, + model_revision='beta') + print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SequenceClassificationModel)) + + @unittest.skip + def test_run_with_default_model(self): + 
pipeline_ins = pipeline( + task=Tasks.sentiment_classification, model_revision='beta') + print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SequenceClassificationModel)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index fd397de3..c391e0a1 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -39,7 +39,7 @@ class TextGenerationTest(unittest.TestCase): for model_id, input in ((self.palm_model_id_zh, self.palm_input_zh), (self.palm_model_id_en, self.palm_input_en)): cache_path = snapshot_download(model_id) - model = PalmForTextGeneration(cache_path) + model = PalmForTextGeneration.from_pretrained(cache_path) preprocessor = TextGenerationPreprocessor( cache_path, model.tokenizer, diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py index 5e3571f7..98fab808 100644 --- a/tests/pipelines/test_word_segmentation.py +++ b/tests/pipelines/test_word_segmentation.py @@ -20,7 +20,7 @@ class WordSegmentationTest(unittest.TestCase): def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) tokenizer = TokenClassificationPreprocessor(cache_path) - model = SbertForTokenClassification(cache_path, tokenizer=tokenizer) + model = SbertForTokenClassification.from_pretrained(cache_path) pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.word_segmentation, model=model, preprocessor=tokenizer) diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py index df0098f0..ee0b5bae 100644 --- a/tests/pipelines/test_zero_shot_classification.py +++ b/tests/pipelines/test_zero_shot_classification.py @@ -3,7 +3,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForZeroShotClassification +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import ZeroShotClassificationPipeline from modelscope.preprocessors import ZeroShotClassificationPreprocessor @@ -21,7 +21,7 @@ class ZeroShotClassificationTest(unittest.TestCase): def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) tokenizer = ZeroShotClassificationPreprocessor(cache_path) - model = SbertForZeroShotClassification(cache_path, tokenizer=tokenizer) + model = SbertForSequenceClassification.from_pretrained(cache_path) pipeline1 = ZeroShotClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( diff --git a/tests/taskdataset/test_veco_dataset.py b/tests/taskdataset/test_veco_dataset.py new file mode 100644 index 00000000..fc59750d --- /dev/null +++ b/tests/taskdataset/test_veco_dataset.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + +from modelscope.task_datasets.veco_dataset import VecoDataset +from modelscope.utils.test_utils import test_level + + +class TestVecoDataset(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_veco_dataset_train(self): + from datasets import Dataset + d0 = Dataset.from_dict({'a': [0, 1, 2]}) + d1 = Dataset.from_dict({'a': [10, 11, 12, 13, 14]}) + d2 = Dataset.from_dict({'a': [21, 22, 23, 24, 25, 26, 27]}) + dataset = VecoDataset([d0, d1, d2], mode='train') + self.assertEqual(len(dataset), 15) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_veco_dataset_eval(self): + from datasets import Dataset + d0 = Dataset.from_dict({'a': [0, 1, 2]}) + d1 = Dataset.from_dict({'a': [10, 11, 12, 13, 14]}) + d2 = Dataset.from_dict({'a': [21, 22, 23, 24, 25, 26, 27]}) + dataset = VecoDataset([d0, d1, d2], mode='eval') + self.assertEqual(len(dataset), 3) + dataset.switch_dataset(1) + self.assertEqual(len(dataset), 5) + dataset.switch_dataset(2) + self.assertEqual(len(dataset), 7) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py index afb887a4..7e057ff0 100644 --- a/tests/trainers/hooks/test_lr_scheduler_hook.py +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -270,6 +270,7 @@ class PlateauLrSchedulerHookTest(unittest.TestCase): trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.train_dataloader = train_dataloader trainer.data_loader = train_dataloader trainer.register_optimizers_hook() trainer.register_hook_from_cfg(trainer.cfg.train.hooks) diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py new file mode 100644 index 00000000..8e147f92 --- /dev/null +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -0,0 +1,244 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest + +from modelscope.trainers import build_trainer + + +class TestFinetuneSequenceClassification(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def finetune(self, + model_id, + train_dataset, + eval_dataset, + name='NlpEpochBasedTrainer', + cfg_modify_fn=None, + **kwargs): + kwargs = dict( + model=model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn, + **kwargs) + + os.environ['LOCAL_RANK'] = '0' + trainer = build_trainer(name=name, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(10): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skip + def test_finetune_afqmc(self): + + def cfg_modify_fn(cfg): + cfg.task = 'sentence-similarity' + cfg['preprocessor'] = {'type': 'sen-sim-tokenizer'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'labels': ['0', '1'], + 'first_sequence': 'sentence1', + 'second_sequence': 'sentence2', + 'label': 'label', + } + } + cfg.train.max_epochs = 10 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(dataset['train']) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + return cfg + + from datasets import load_dataset + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + dataset = load_dataset('clue', 'afqmc', download_config=dc) + self.finetune( + model_id='damo/nlp_structbert_backbone_tiny_std', + train_dataset=dataset['train'], + eval_dataset=dataset['validation'], + cfg_modify_fn=cfg_modify_fn) + + @unittest.skip + def test_finetune_tnews(self): + + def cfg_modify_fn(cfg): + # TODO no proper task for tnews + cfg.task = 'nli' + cfg['preprocessor'] = {'type': 'nli-tokenizer'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'labels': [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14' + ], + 'first_sequence': + 'sentence', + 'label': + 'label', + } + } + cfg.train.max_epochs = 5 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(dataset['train']) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + return cfg + + from datasets import load_dataset + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + dataset = load_dataset('clue', 'tnews', download_config=dc) + + self.finetune( + model_id='damo/nlp_structbert_backbone_tiny_std', + train_dataset=dataset['train'], + eval_dataset=dataset['validation'], + cfg_modify_fn=cfg_modify_fn) + + @unittest.skip + def 
test_veco_xnli(self): + from datasets import load_dataset + langs = ['en'] + langs_eval = ['en'] + train_datasets = [] + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + for lang in langs: + train_datasets.append( + load_dataset('xnli', lang, split='train', download_config=dc)) + eval_datasets = [] + for lang in langs_eval: + eval_datasets.append( + load_dataset( + 'xnli', lang, split='validation', download_config=dc)) + train_len = sum([len(dataset) for dataset in train_datasets]) + labels = ['0', '1', '2'] + + def cfg_modify_fn(cfg): + cfg.task = 'nli' + cfg['preprocessor'] = {'type': 'nli-tokenizer'} + cfg['dataset'] = { + 'train': { + 'first_sequence': 'premise', + 'second_sequence': 'hypothesis', + 'labels': labels, + 'label': 'label', + } + } + cfg['train'] = { + 'work_dir': + '/tmp', + 'max_epochs': + 2, + 'dataloader': { + 'batch_size_per_gpu': 16, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'AdamW', + 'lr': 2e-5, + 'options': { + 'cumulative_iters': 8, + } + }, + 'lr_scheduler': { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': int(train_len / 16) * 2, + 'options': { + 'by_epoch': False + } + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'interval': 1, + 'save_dir': '/root' + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 500 + }] + } + cfg['evaluation'] = { + 'dataloader': { + 'batch_size_per_gpu': 128, + 'workers_per_gpu': 1, + 'shuffle': False + } + } + return cfg + + self.finetune( + 'damo/nlp_veco_fill-mask-large', + train_datasets, + eval_datasets, + name='VecoTrainer', + cfg_modify_fn=cfg_modify_fn) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py new file mode 100644 index 00000000..7449bc69 --- /dev/null +++ b/tests/trainers/test_finetune_token_classificatin.py @@ -0,0 +1,200 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os
+import shutil
+import tempfile
+import unittest
+from functools import reduce
+
+from modelscope.trainers import build_trainer
+from modelscope.utils.test_utils import test_level
+
+
+class TestFinetuneTokenClassification(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    def finetune(self,
+                 model_id,
+                 train_dataset,
+                 eval_dataset,
+                 name='NlpEpochBasedTrainer',
+                 cfg_modify_fn=None,
+                 **kwargs):
+        kwargs = dict(
+            model=model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            **kwargs)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer = build_trainer(name=name, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(10):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skip
+    def test_token_classification(self):
+        # WS task
+        os.system(
+            f'curl http://dingkun.oss-cn-hangzhou-zmf.aliyuncs.com/atemp/train.txt > {self.tmp_dir}/train.txt'
+        )
+        os.system(
+            f'curl http://dingkun.oss-cn-hangzhou-zmf.aliyuncs.com/atemp/dev.txt > {self.tmp_dir}/dev.txt'
+        )
+        from datasets import load_dataset
+        dataset = load_dataset(
+            'text',
+            data_files={
+                'train': f'{self.tmp_dir}/train.txt',
+                'test': f'{self.tmp_dir}/dev.txt'
+            })
+
+        def split_to_dict(examples):
+            text, label = examples['text'].split('\t')
+            return {
+                'first_sequence': text.split(' '),
+                'labels': label.split(' ')
+            }
+
+        dataset = dataset.map(split_to_dict, batched=False)
+
+        def reducer(x, y):
+            x = x.split(' ') if isinstance(x, str) else x
+            y = y.split(' ') if isinstance(y, str) else y
+            return x + y
+
+        label_enumerate_values = list(
+            set(reduce(reducer, dataset['train'][:1000]['labels'])))
+        label_enumerate_values.sort()
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'token-classification'
+            cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
+            cfg['dataset'] = {
+                'train': {
+                    'labels': label_enumerate_values,
+                    'first_sequence': 'first_sequence',
+                    'label': 'labels',
+                }
+            }
+            cfg.train.max_epochs = 3
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'total_iters':
+                int(len(dataset['train']) / 32) * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 300
+            }]
+            return cfg
+
+        self.finetune(
+            'damo/nlp_structbert_backbone_tiny_std',
+            dataset['train'],
+            dataset['test'],
+            cfg_modify_fn=cfg_modify_fn)
+
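+    # The test below downloads the SIGHAN bakeoff 2005 PKU corpus, converts its
+    # whitespace-segmented lines into label sequences with
+    # WordSegmentationBlankSetToLabelPreprocessor, and holds out the last 30%
+    # of the training split for evaluation.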
+    @unittest.skip
+    def test_word_segmentation(self):
+        os.system(
+            f'curl http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip > {self.tmp_dir}/icwb2-data.zip'
+        )
+        shutil.unpack_archive(f'{self.tmp_dir}/icwb2-data.zip', self.tmp_dir)
+        from datasets import load_dataset
+        from modelscope.preprocessors.nlp import WordSegmentationBlankSetToLabelPreprocessor
+        preprocessor = WordSegmentationBlankSetToLabelPreprocessor()
+        dataset = load_dataset(
+            'text',
+            data_files=f'{self.tmp_dir}/icwb2-data/training/pku_training.utf8')
+
+        def split_to_dict(examples):
+            return preprocessor(examples['text'])
+
+        dataset = dataset.map(split_to_dict, batched=False)
+
+        def reducer(x, y):
+            x = x.split(' ') if isinstance(x, str) else x
+            y = y.split(' ') if isinstance(y, str) else y
+            return x + y
+
+        label_enumerate_values = list(
+            set(reduce(reducer, dataset['train'][:1000]['labels'])))
+        label_enumerate_values.sort()
+
+        train_len = int(len(dataset['train']) * 0.7)
+        train_dataset = dataset['train'].select(range(train_len))
+        dev_dataset = dataset['train'].select(
+            range(train_len, len(dataset['train'])))
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'token-classification'
+            cfg['dataset'] = {
+                'train': {
+                    'labels': label_enumerate_values,
+                    'first_sequence': 'first_sequence',
+                    'label': 'labels',
+                }
+            }
+            cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
+            cfg.train.max_epochs = 3
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'total_iters':
+                int(len(train_dataset) / 32) * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 50
+            }]
+            return cfg
+
+        self.finetune(
+            'damo/nlp_structbert_backbone_tiny_std',
+            train_dataset,
+            dev_dataset,
+            cfg_modify_fn=cfg_modify_fn)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_text_generation_trainer.py b/tests/trainers/test_text_generation_trainer.py
index 7c24bc0a..9c79f2f5 100644
--- a/tests/trainers/test_text_generation_trainer.py
+++ b/tests/trainers/test_text_generation_trainer.py
@@ -5,8 +5,7 @@ import tempfile
 import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
-from modelscope.models.nlp.palm_for_text_generation import \
-    PalmForTextGeneration
+from modelscope.models.nlp.palm_v2 import PalmForTextGeneration
 from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile
@@ -50,13 +49,21 @@ class TestTextGenerationTrainer(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer(self):
+
+        def cfg_modify_fn(cfg):
+            cfg.preprocessor.type = 'text-gen-tokenizer'
+            return cfg
+
         kwargs = dict(
             model=self.model_id,
             train_dataset=self.dataset,
             eval_dataset=self.dataset,
-            work_dir=self.tmp_dir)
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            model_revision='beta')
 
-        trainer = build_trainer(default_args=kwargs)
+        trainer = build_trainer(
+            name='NlpEpochBasedTrainer', default_args=kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@@ -69,7 +76,7 @@ class TestTextGenerationTrainer(unittest.TestCase):
         if not os.path.exists(tmp_dir):
             os.makedirs(tmp_dir)
 
-        cache_path = snapshot_download(self.model_id)
+        cache_path = snapshot_download(self.model_id, revision='beta')
         model = PalmForTextGeneration.from_pretrained(cache_path)
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
@@ -86,6 +93,44 @@ class TestTextGenerationTrainer(unittest.TestCase):
         for i in range(2):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
+    @unittest.skip
+    def test_finetune_cnndm(self):
+        from datasets import load_dataset
+        dataset_dict = load_dataset('ccdv/cnn_dailymail', '3.0.0')
+        train_dataset = dataset_dict['train'] \
+            .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \
+            .remove_columns('id')
+        eval_dataset = dataset_dict['validation'] \
+            .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \
+            .remove_columns('id')
+        num_warmup_steps = 2000
+
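+        # Noam-style schedule: the lambda below warms the learning rate up for
+        # `num_warmup_steps` optimizer steps and then decays it with the
+        # inverse square root of the step count, applied per step
+        # (by_epoch=False).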
+        def noam_lambda(current_step: int):
+            current_step += 1
+            return min(current_step**(-0.5),
+                       current_step * num_warmup_steps**(-1.5))
+
+        def cfg_modify_fn(cfg):
+            cfg.train.lr_scheduler = {
+                'type': 'LambdaLR',
+                'lr_lambda': noam_lambda,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            return cfg
+
+        kwargs = dict(
+            model=self.model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            model_revision='beta')
+        trainer = build_trainer(
+            name='NlpEpochBasedTrainer', default_args=kwargs)
+        trainer.train()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index 603d6e5b..a2d899ba 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -6,8 +6,8 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Metrics
-from modelscope.models.nlp.sbert_for_sequence_classification import \
-    SbertTextClassfier
+from modelscope.models.nlp.sequence_classification import \
+    SbertForSequenceClassification
 from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile
@@ -102,7 +102,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
 
         cache_path = snapshot_download(model_id)
-        model = SbertTextClassfier.from_pretrained(cache_path)
+        model = SbertForSequenceClassification.from_pretrained(cache_path)
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
             model=model,