From 303ae2ff36d1cfa23abdadb59dfaa4d25b9bfb82 Mon Sep 17 00:00:00 2001 From: pangda Date: Fri, 28 Oct 2022 15:26:17 +0800 Subject: [PATCH 01/46] [to #42322933] fix bug for text logger Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10560149 --- modelscope/trainers/hooks/logger/text_logger_hook.py | 2 +- modelscope/trainers/trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/trainers/hooks/logger/text_logger_hook.py b/modelscope/trainers/hooks/logger/text_logger_hook.py index 8552ab4e..95644783 100644 --- a/modelscope/trainers/hooks/logger/text_logger_hook.py +++ b/modelscope/trainers/hooks/logger/text_logger_hook.py @@ -61,7 +61,7 @@ class TextLoggerHook(LoggerHook): self.json_log_path = osp.join(self.out_dir, '{}.log.json'.format(trainer.timestamp)) if hasattr(trainer, 'meta') and trainer.meta is not None: - self._dump_log(trainer.meta, trainer) + self._dump_log(trainer.meta) def _get_max_memory(self, trainer): device = getattr(trainer.model, 'output_device', None) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index e1fd7522..aaf24cfa 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -183,7 +183,7 @@ class EpochBasedTrainer(BaseTrainer): preprocessor=self.eval_preprocessor, **kwargs) - self.train_data_collator, self.eval_default_collate = None, None + self.train_data_collator, self.eval_data_collator = None, None if isinstance(data_collator, Mapping): if not (ConfigKeys.train in data_collator or ConfigKeys.val in data_collator): From 84ed59d8578aa0a1b041822dc267c4289a4c1e13 Mon Sep 17 00:00:00 2001 From: "lingcai.wl" Date: Fri, 28 Oct 2022 16:10:50 +0800 Subject: [PATCH 02/46] [to #44834022] add service utils for model deploy Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10529621 --- modelscope/utils/demo_utils.py | 17 +-- modelscope/utils/regress_test_utils.py | 15 +-- modelscope/utils/service_utils.py | 179 +++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 29 deletions(-) create mode 100644 modelscope/utils/service_utils.py diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py index 363ae950..e57b3348 100644 --- a/modelscope/utils/demo_utils.py +++ b/modelscope/utils/demo_utils.py @@ -4,11 +4,11 @@ import io import cv2 import json -import numpy as np from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks, TasksIODescriptions +from modelscope.utils.service_utils import NumpyEncoder TASKS_INPUT_TEMPLATES = { # vision tasks @@ -234,21 +234,6 @@ class DemoCompatibilityCheck(object): return True -class NumpyEncoder(json.JSONEncoder): - - def default(self, obj): - if isinstance(obj, np.ndarray): - return obj.tolist() - - if isinstance(obj, np.floating): - return float(obj) - - if isinstance(obj, np.integer): - return int(obj) - - return json.JSONEncoder.default(self, obj) - - def preprocess(req): in_urls = req.get('urlPaths').get('inUrls') if len(req['inputs']) == 1: diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index 8045d3e9..be983c6c 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -19,6 +19,8 @@ import torch import torch.optim from torch import nn +from modelscope.utils.service_utils import NumpyEncoder + class RegressTool: """This class is used to stop inference/training results from changing by some unaware affections by unittests. 
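# Editor's note (hedged sketch, not part of the patch): this commit removes the
# NumpyEncoder copies that demo_utils.py and regress_test_utils.py each defined
# locally and imports the shared class from the new modelscope/utils/service_utils.py.
# The snippet below only illustrates why that encoder is needed when dumping
# numpy-bearing pipeline outputs to JSON; the sample dict is hypothetical and the
# import assumes the patched modelscope package is on the path.
import json
import numpy as np
from modelscope.utils.service_utils import NumpyEncoder

outputs = {'scores': np.array([0.1, 0.9]), 'label_id': np.int64(1)}
# json.dumps(outputs) would raise a TypeError (ndarray is not JSON serializable);
# the custom encoder converts numpy arrays/scalars to plain Python types first.
print(json.dumps(outputs, cls=NumpyEncoder))  # {"scores": [0.1, 0.9], "label_id": 1}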
@@ -117,19 +119,6 @@ class RegressTool: with open(baseline, 'rb') as f: base = pickle.load(f) - class NumpyEncoder(json.JSONEncoder): - """Special json encoder for numpy types - """ - - def default(self, obj): - if isinstance(obj, np.integer): - return int(obj) - elif isinstance(obj, np.floating): - return float(obj) - elif isinstance(obj, np.ndarray): - return obj.tolist() - return json.JSONEncoder.default(self, obj) - print(f'baseline: {json.dumps(base, cls=NumpyEncoder)}') print(f'latest : {json.dumps(io_json, cls=NumpyEncoder)}') if not compare_io_and_print(base, io_json, compare_fn, **kwargs): diff --git a/modelscope/utils/service_utils.py b/modelscope/utils/service_utils.py new file mode 100644 index 00000000..29c111f8 --- /dev/null +++ b/modelscope/utils/service_utils.py @@ -0,0 +1,179 @@ +import base64 +import mimetypes +from io import BytesIO + +import json +import numpy as np +import requests +from PIL import Image + +from modelscope.outputs import TASK_OUTPUTS, OutputKeys +from modelscope.pipeline_inputs import TASK_INPUTS, InputType +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks, TasksIODescriptions + + +# service data decoder func decodes data from network and convert it to pipeline's input +# for example +def ExampleDecoder(data): + # Assuming the pipeline inputs is a dict contains an image and a text, + # to decode the data from network we decode the image as base64 + data_json = json.loads(data) + # data: {"image": "xxxxxxxx=="(base64 str), "text": "a question"} + # pipeline(inputs) as follows: + # pipeline({'image': image, 'text': text}) + inputs = { + 'image': decode_base64_to_image(data_json.get('image')), + 'text': data_json.get('text') + } + return inputs + + +# service data encoder func encodes data from pipeline outputs and convert to network response (such as json) +# for example +def ExampleEncoder(data): + # Assuming the pipeline outputs is a dict contains an image and a text, + # and transmit it through network, this func encode image to base64 and dumps into json + # data (for e.g. 
python dict): + # {"image": a numpy array represents a image, "text": "output"} + image = data['image'] + text = data['text'] + data = {'image': encode_array_to_img_base64(image), 'text': text} + return json.dumps(data, cls=NumpyEncoder) + + +CustomEncoder = { + # Tasks.visual_question_answering: ExampleEncoder +} + +CustomDecoder = { + # Tasks.visual_question_answering: ExampleDecoder +} + + +class NumpyEncoder(json.JSONEncoder): + + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + + if isinstance(obj, np.floating): + return float(obj) + + if isinstance(obj, np.integer): + return int(obj) + + return json.JSONEncoder.default(self, obj) + + +def get_extension(encoding): + encoding = encoding.replace('audio/wav', 'audio/x-wav') + tp = mimetypes.guess_type(encoding)[0] + if tp == 'audio/flac': # flac is not supported by mimetypes + return 'flac' + extension = mimetypes.guess_extension(tp) + if extension is not None and extension.startswith('.'): + extension = extension[1:] + return extension + + +def get_mimetype(filename): + mimetype = mimetypes.guess_type(filename)[0] + if mimetype is not None: + mimetype = mimetype.replace('x-wav', 'wav').replace('x-flac', 'flac') + return mimetype + + +def decode_base64_to_binary(encoding): + extension = get_extension(encoding) + data = encoding.split(',')[1] + return base64.b64decode(data), extension + + +def decode_base64_to_image(encoding): + content = encoding.split(';')[1] + image_encoded = content.split(',')[1] + return Image.open(BytesIO(base64.b64decode(image_encoded))) + + +def encode_array_to_img_base64(image_array): + with BytesIO() as output_bytes: + pil_image = Image.fromarray(image_array.astype(np.uint8)) + pil_image.save(output_bytes, 'PNG') + bytes_data = output_bytes.getvalue() + base64_str = str(base64.b64encode(bytes_data), 'utf-8') + return 'data:image/png;base64,' + base64_str + + +def encode_pcm_to_base64(bytes_data): + from scipy.io.wavfile import write + with BytesIO() as out_mem_file: + write(out_mem_file, 16000, bytes_data) + base64_str = str(base64.b64encode(out_mem_file.getvalue()), 'utf-8') + return 'data:audio/pcm;base64,' + base64_str + + +def encode_url_to_base64(url): + encoded_string = base64.b64encode(requests.get(url).content) + base64_str = str(encoded_string, 'utf-8') + mimetype = get_mimetype(url) + return ('data:' + (mimetype if mimetype is not None else '') + ';base64,' + + base64_str) + + +def encode_file_to_base64(f): + with open(f, 'rb') as file: + encoded_string = base64.b64encode(file.read()) + base64_str = str(encoded_string, 'utf-8') + mimetype = get_mimetype(f) + return ('data:' + (mimetype if mimetype is not None else '') + + ';base64,' + base64_str) + + +def encode_url_or_file_to_base64(path): + try: + requests.get(path) + return encode_url_to_base64(path) + except (requests.exceptions.MissingSchema, + requests.exceptions.InvalidSchema): + return encode_file_to_base64(path) + + +def service_data_decoder(task, data): + if CustomDecoder.get(task) is not None: + return CustomDecoder[task](data) + input_type = TASK_INPUTS[task] + input_data = data.decode('utf-8') + if input_type == InputType.IMAGE: + return decode_base64_to_image(input_data) + elif input_type == InputType.AUDIO: + return decode_base64_to_binary(input_data)[0] + elif input_type == InputType.TEXT: + return input_data + elif isinstance(input_type, dict): + input_data = {} + for key, val in input_type.items(): + if val == InputType.IMAGE: + input_data[key] = decode_base64_to_image(data[key]) + elif val == 
InputType.AUDIO: + input_data[key] = decode_base64_to_binary(data[key])[0] + elif val == InputType.TEXT: + input_data[key] = data[key] + + return input_data + + +def service_data_encoder(task, data): + if CustomEncoder.get(task) is not None: + return CustomEncoder[task](data) + output_keys = TASK_OUTPUTS[task] + result = data + for output_key in output_keys: + if output_key == OutputKeys.OUTPUT_IMG: + result[OutputKeys.OUTPUT_IMG] = encode_array_to_img_base64( + data[OutputKeys.OUTPUT_IMG][..., ::-1]) + elif output_key == OutputKeys.OUTPUT_PCM: + result[OutputKeys.OUTPUT_PCM] = encode_pcm_to_base64( + data[OutputKeys.OUTPUT_PCM]) + result = bytes(json.dumps(result, cls=NumpyEncoder), encoding='utf8') + return result From 261c04b8b59527e3b10ae7bb8b37ea42a7d6510b Mon Sep 17 00:00:00 2001 From: Yufeng <47727949+shuaigezhu@users.noreply.github.com> Date: Fri, 28 Oct 2022 17:09:27 +0800 Subject: [PATCH 03/46] add Mglm (#5) * mglm init * add mglm requirements Co-authored-by: Yufeng Co-authored-by: wenmeng.zwm --- modelscope/metainfo.py | 3 + modelscope/models/nlp/__init__.py | 2 + modelscope/models/nlp/mglm/__init__.py | 22 + modelscope/models/nlp/mglm/arguments.py | 793 +++++++++ modelscope/models/nlp/mglm/blocklm_utils.py | 625 +++++++ modelscope/models/nlp/mglm/configure_data.py | 513 ++++++ .../models/nlp/mglm/data_utils/__init__.py | 341 ++++ .../models/nlp/mglm/data_utils/corpora.py | 583 ++++++ .../models/nlp/mglm/data_utils/datasets.py | 1244 +++++++++++++ .../models/nlp/mglm/data_utils/extraction.py | 71 + .../models/nlp/mglm/data_utils/file_utils.py | 256 +++ .../models/nlp/mglm/data_utils/lazy_loader.py | 286 +++ .../models/nlp/mglm/data_utils/samplers.py | 190 ++ .../nlp/mglm/data_utils/sp_tokenizer.py | 158 ++ .../nlp/mglm/data_utils/tokenization.py | 1396 +++++++++++++++ .../nlp/mglm/data_utils/tokenization_gpt2.py | 359 ++++ .../models/nlp/mglm/data_utils/wordpiece.py | 408 +++++ modelscope/models/nlp/mglm/fp16/__init__.py | 20 + modelscope/models/nlp/mglm/fp16/fp16.py | 660 +++++++ modelscope/models/nlp/mglm/fp16/fp16util.py | 220 +++ .../models/nlp/mglm/fp16/loss_scaler.py | 245 +++ .../models/nlp/mglm/generation_utils.py | 483 +++++ .../nlp/mglm/mglm_for_text_summarization.py | 469 +++++ modelscope/models/nlp/mglm/model/__init__.py | 20 + .../models/nlp/mglm/model/distributed.py | 127 ++ .../models/nlp/mglm/model/downstream.py | 242 +++ .../models/nlp/mglm/model/modeling_bert.py | 1576 +++++++++++++++++ .../models/nlp/mglm/model/modeling_glm.py | 245 +++ modelscope/models/nlp/mglm/model/prompt.py | 59 + modelscope/models/nlp/mglm/mpu/__init__.py | 37 + .../models/nlp/mglm/mpu/cross_entropy.py | 110 ++ modelscope/models/nlp/mglm/mpu/data.py | 117 ++ modelscope/models/nlp/mglm/mpu/grads.py | 72 + modelscope/models/nlp/mglm/mpu/initialize.py | 130 ++ modelscope/models/nlp/mglm/mpu/layers.py | 357 ++++ modelscope/models/nlp/mglm/mpu/mappings.py | 144 ++ modelscope/models/nlp/mglm/mpu/random.py | 408 +++++ .../models/nlp/mglm/mpu/tests/__init__.py | 0 .../models/nlp/mglm/mpu/tests/commons.py | 86 + .../nlp/mglm/mpu/tests/test_cross_entropy.py | 106 ++ .../models/nlp/mglm/mpu/tests/test_data.py | 91 + .../nlp/mglm/mpu/tests/test_initialize.py | 95 + .../models/nlp/mglm/mpu/tests/test_layers.py | 533 ++++++ .../models/nlp/mglm/mpu/tests/test_random.py | 206 +++ modelscope/models/nlp/mglm/mpu/transformer.py | 1200 +++++++++++++ modelscope/models/nlp/mglm/mpu/utils.py | 70 + modelscope/models/nlp/mglm/process_grid.py | 61 + modelscope/models/nlp/mglm/requirements.txt | 22 + 
modelscope/models/nlp/mglm/run_test.py | 10 + .../models/nlp/mglm/tasks/data_utils.py | 389 ++++ .../models/nlp/mglm/tasks/eval_utils.py | 249 +++ .../nlp/mglm/tasks/language_model/dataset.py | 249 +++ .../mglm/tasks/language_model/detokenizer.py | 63 + .../nlp/mglm/tasks/language_model/finetune.py | 254 +++ .../models/nlp/mglm/tasks/seq2seq/dataset.py | 667 +++++++ .../models/nlp/mglm/tasks/seq2seq/evaluate.py | 538 ++++++ .../models/nlp/mglm/tasks/seq2seq/finetune.py | 151 ++ .../models/nlp/mglm/tasks/superglue/README.md | 137 ++ .../nlp/mglm/tasks/superglue/__init__.py | 0 .../nlp/mglm/tasks/superglue/dataset.py | 1475 +++++++++++++++ .../nlp/mglm/tasks/superglue/evaluate.py | 101 ++ .../nlp/mglm/tasks/superglue/finetune.py | 138 ++ .../models/nlp/mglm/tasks/superglue/pvp.py | 1541 ++++++++++++++++ modelscope/models/nlp/mglm/test/__init__.py | 0 modelscope/models/nlp/mglm/test/test_block.py | 36 + .../models/nlp/mglm/test/test_rel_shift.py | 27 + modelscope/models/nlp/mglm/train_utils.py | 472 +++++ modelscope/models/nlp/mglm/utils.py | 529 ++++++ modelscope/outputs/outputs.py | 6 + modelscope/pipelines/nlp/__init__.py | 2 + .../nlp/mglm_text_summarization_pipeline.py | 43 + modelscope/preprocessors/__init__.py | 19 +- modelscope/preprocessors/nlp/__init__.py | 2 + .../nlp/mglm_summarization_preprocessor.py | 32 + requirements/nlp.txt | 15 +- .../pipelines/test_mglm_text_summarization.py | 47 + 76 files changed, 22640 insertions(+), 13 deletions(-) create mode 100644 modelscope/models/nlp/mglm/__init__.py create mode 100755 modelscope/models/nlp/mglm/arguments.py create mode 100644 modelscope/models/nlp/mglm/blocklm_utils.py create mode 100644 modelscope/models/nlp/mglm/configure_data.py create mode 100644 modelscope/models/nlp/mglm/data_utils/__init__.py create mode 100755 modelscope/models/nlp/mglm/data_utils/corpora.py create mode 100644 modelscope/models/nlp/mglm/data_utils/datasets.py create mode 100644 modelscope/models/nlp/mglm/data_utils/extraction.py create mode 100755 modelscope/models/nlp/mglm/data_utils/file_utils.py create mode 100644 modelscope/models/nlp/mglm/data_utils/lazy_loader.py create mode 100644 modelscope/models/nlp/mglm/data_utils/samplers.py create mode 100644 modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py create mode 100755 modelscope/models/nlp/mglm/data_utils/tokenization.py create mode 100644 modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py create mode 100755 modelscope/models/nlp/mglm/data_utils/wordpiece.py create mode 100644 modelscope/models/nlp/mglm/fp16/__init__.py create mode 100755 modelscope/models/nlp/mglm/fp16/fp16.py create mode 100644 modelscope/models/nlp/mglm/fp16/fp16util.py create mode 100755 modelscope/models/nlp/mglm/fp16/loss_scaler.py create mode 100644 modelscope/models/nlp/mglm/generation_utils.py create mode 100644 modelscope/models/nlp/mglm/mglm_for_text_summarization.py create mode 100755 modelscope/models/nlp/mglm/model/__init__.py create mode 100755 modelscope/models/nlp/mglm/model/distributed.py create mode 100644 modelscope/models/nlp/mglm/model/downstream.py create mode 100644 modelscope/models/nlp/mglm/model/modeling_bert.py create mode 100644 modelscope/models/nlp/mglm/model/modeling_glm.py create mode 100644 modelscope/models/nlp/mglm/model/prompt.py create mode 100755 modelscope/models/nlp/mglm/mpu/__init__.py create mode 100644 modelscope/models/nlp/mglm/mpu/cross_entropy.py create mode 100644 modelscope/models/nlp/mglm/mpu/data.py create mode 100644 modelscope/models/nlp/mglm/mpu/grads.py create mode 
100644 modelscope/models/nlp/mglm/mpu/initialize.py create mode 100644 modelscope/models/nlp/mglm/mpu/layers.py create mode 100644 modelscope/models/nlp/mglm/mpu/mappings.py create mode 100755 modelscope/models/nlp/mglm/mpu/random.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/__init__.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/commons.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_data.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_initialize.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_layers.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_random.py create mode 100755 modelscope/models/nlp/mglm/mpu/transformer.py create mode 100644 modelscope/models/nlp/mglm/mpu/utils.py create mode 100644 modelscope/models/nlp/mglm/process_grid.py create mode 100644 modelscope/models/nlp/mglm/requirements.txt create mode 100644 modelscope/models/nlp/mglm/run_test.py create mode 100644 modelscope/models/nlp/mglm/tasks/data_utils.py create mode 100644 modelscope/models/nlp/mglm/tasks/eval_utils.py create mode 100644 modelscope/models/nlp/mglm/tasks/language_model/dataset.py create mode 100755 modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py create mode 100644 modelscope/models/nlp/mglm/tasks/language_model/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/README.md create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/__init__.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/dataset.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/evaluate.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/pvp.py create mode 100644 modelscope/models/nlp/mglm/test/__init__.py create mode 100644 modelscope/models/nlp/mglm/test/test_block.py create mode 100644 modelscope/models/nlp/mglm/test/test_rel_shift.py create mode 100644 modelscope/models/nlp/mglm/train_utils.py create mode 100644 modelscope/models/nlp/mglm/utils.py create mode 100644 modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py create mode 100644 modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py create mode 100644 tests/pipelines/test_mglm_text_summarization.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index a671ded5..3951541c 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -82,6 +82,7 @@ class Models(object): bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' T5 = 'T5' + mglm = 'mglm' bloom = 'bloom' # audio models @@ -251,6 +252,7 @@ class Pipelines(object): relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + mglm_text_summarization = 'mglm-text-summarization' translation_en_to_de = 'translation_en_to_de' # keep it underscore translation_en_to_ro = 'translation_en_to_ro' # keep it underscore translation_en_to_fr = 'translation_en_to_fr' # keep it underscore @@ -376,6 +378,7 @@ class Preprocessors(object): re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + mglm_summarization = 
'mglm-summarization' sentence_piece = 'sentence-piece' # audio preprocessor diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index ccb2d382..1d71469a 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: SbertTokenizerFast, ) from .T5 import T5ForConditionalGeneration + from .mglm import MGLMForTextSummarization from .task_models import ( FeatureExtractionModel, InformationExtractionModel, @@ -106,6 +107,7 @@ else: ], 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], + 'mglm': ['MGLMForTextSummarization'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/mglm/__init__.py b/modelscope/models/nlp/mglm/__init__.py new file mode 100644 index 00000000..26d1101b --- /dev/null +++ b/modelscope/models/nlp/mglm/__init__.py @@ -0,0 +1,22 @@ +# Modified by Zhipu.AI +# Original Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .mglm_for_text_summarization import mGlmForSummarization +else: + _import_structure = { + 'mglm_for_text_summarization': ['MGLMForTextSummarization'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/mglm/arguments.py b/modelscope/models/nlp/mglm/arguments.py new file mode 100755 index 00000000..13b3aeab --- /dev/null +++ b/modelscope/models/nlp/mglm/arguments.py @@ -0,0 +1,793 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""argparser configuration""" + +import argparse +import os + +import deepspeed +import json +import torch + +from .utils import get_hostname + + +def add_model_config_args(parser): + """Model arguments""" + + group = parser.add_argument_group('model', 'model configuration') + + group.add_argument( + '--transformer-xl', + action='store_true', + help='use transformer-xl for training') + group.add_argument( + '--pretrained-bert', + action='store_true', + help='use a pretrained bert-large-uncased model instead' + 'of initializing from scratch. 
See ' + '--tokenizer-model-type to specify which pretrained ' + 'BERT model to use') + group.add_argument( + '--encoder-decoder', + action='store_true', + help='use the encoder-decoder architecture for blocklm') + group.add_argument( + '--attention-dropout', + type=float, + default=0.1, + help='dropout probability for attention weights') + group.add_argument( + '--num-attention-heads', + type=int, + default=16, + help='num of transformer attention heads') + group.add_argument( + '--hidden-size', type=int, default=1024, help='tansformer hidden size') + group.add_argument( + '--intermediate-size', + type=int, + default=None, + help='transformer embedding dimension for FFN' + 'set to 4*`--hidden-size` if it is None') + group.add_argument( + '--num-layers', type=int, default=24, help='num decoder layers') + group.add_argument( + '--layernorm-epsilon', + type=float, + default=1e-5, + help='layer norm epsilon') + group.add_argument( + '--hidden-dropout', + type=float, + default=0.1, + help='dropout probability for hidden state transformer') + group.add_argument( + '--output-dropout', + type=float, + default=0.1, + help='dropout probability for pooled output') + group.add_argument( + '--max-position-embeddings', + type=int, + default=512, + help='maximum number of position embeddings to use') + group.add_argument( + '--vocab-size', + type=int, + default=250112, + help='vocab size to use for non-character-level ' + 'tokenization. This value will only be used when ' + 'creating a tokenizer') + group.add_argument( + '--deep-init', + action='store_true', + help='initialize bert model similar to gpt2 model.' + 'scales initialization of projection layers by a ' + 'factor of 1/sqrt(2N). Necessary to train bert ' + 'models larger than BERT-Large.') + group.add_argument( + '--make-vocab-size-divisible-by', + type=int, + default=128, + help='Pad the vocab size to be divisible by this value.' + 'This is added for computational efficieny reasons.') + group.add_argument( + '--cpu-optimizer', action='store_true', help='Run optimizer on CPU') + group.add_argument( + '--cpu_torch_adam', + action='store_true', + help='Use Torch Adam as optimizer on CPU.') + + return parser + + +def add_fp16_config_args(parser): + """Mixed precision arguments.""" + + group = parser.add_argument_group('fp16', 'fp16 configurations') + + group.add_argument( + '--fp16', action='store_true', help='Run model in fp16 mode') + group.add_argument( + '--fp32-embedding', action='store_true', help='embedding in fp32') + group.add_argument( + '--fp32-layernorm', action='store_true', help='layer norm in fp32') + group.add_argument( + '--fp32-tokentypes', + action='store_true', + help='embedding token types in fp32') + group.add_argument( + '--fp32-allreduce', action='store_true', help='all-reduce in fp32') + group.add_argument( + '--hysteresis', + type=int, + default=2, + help='hysteresis for dynamic loss scaling') + group.add_argument( + '--loss-scale', + type=float, + default=None, + help='Static loss scaling, positive power of 2 ' + 'values can improve fp16 convergence. 
If None, dynamic' + 'loss scaling is used.') + group.add_argument( + '--loss-scale-window', + type=float, + default=1000, + help='Window over which to raise/lower dynamic scale') + group.add_argument( + '--min-scale', + type=float, + default=1, + help='Minimum loss scale for dynamic loss scale') + group.add_argument('--attention-scale', type=float, default=1.0) + return parser + + +def add_training_args(parser): + """Training arguments.""" + + group = parser.add_argument_group('train', 'training configurations') + + group.add_argument( + '--experiment-name', + type=str, + default='gpt-345M', + help='The experiment name for summary and checkpoint') + group.add_argument( + '--batch-size', type=int, default=4, help='Data Loader batch size') + group.add_argument( + '--gradient-accumulation-steps', + type=int, + default=1, + help='Data Loader batch size') + group.add_argument( + '--weight-decay', + type=float, + default=0.01, + help='weight decay coefficient for L2 regularization') + group.add_argument( + '--checkpoint-activations', + action='store_true', + help='checkpoint activation to allow for training ' + 'with larger models and sequences') + group.add_argument( + '--checkpoint-num-layers', + type=int, + default=1, + help='chunk size (number of layers) for checkpointing') + group.add_argument( + '--deepspeed-activation-checkpointing', + action='store_true', + help='uses activation checkpointing from deepspeed') + group.add_argument( + '--epochs', + type=int, + default=None, + help='Number of finetunning epochs. Zero results in evaluation only.') + group.add_argument( + '--clip-grad', type=float, default=1.0, help='gradient clipping') + group.add_argument( + '--train-iters', + type=int, + default=0, + help='total number of iterations to train over all training runs') + group.add_argument('--label-smoothing', type=float, default=0.0) + group.add_argument( + '--log-interval', type=int, default=100, help='report interval') + group.add_argument( + '--summary-dir', + type=str, + default='', + help='The directory to store the summary') + group.add_argument('--seed', type=int, default=1234, help='random seed') + # Batch producer arguments + group.add_argument( + '--reset-position-ids', + action='store_true', + help='Reset posistion ids after end-of-document token.') + group.add_argument( + '--reset-attention-mask', + action='store_true', + help='Reset self attention maske after ' + 'end-of-document token.') + + # Learning rate. + group.add_argument( + '--lr-decay-iters', + type=int, + default=None, + help='number of iterations to decay LR over,' + ' If None defaults to `--train-iters`*`--epochs`') + group.add_argument( + '--lr-decay-style', + type=str, + default='linear', + choices=['constant', 'linear', 'cosine', 'exponential'], + help='learning rate decay function') + group.add_argument('--lr-decay-ratio', type=float, default=0.1) + group.add_argument( + '--lr', type=float, default=1.0e-4, help='initial learning rate') + group.add_argument( + '--warmup', + type=float, + default=0.01, + help='percentage of data to warmup on (.01 = 1% of all ' + 'training iters). 
Default 0.01') + group.add_argument( + '--switch-linear', + action='store_true', + help='Switch to linear decay for cosine decay') + # model checkpointing + group.add_argument( + '--save', + type=str, + default=None, + help='Output directory to save checkpoints to.') + group.add_argument('--new-save-directory', action='store_true') + group.add_argument( + '--save-epoch', + type=int, + default=1, + help='number of epochs between saves') + group.add_argument( + '--save-interval', + type=int, + default=5000, + help='number of iterations between saves') + group.add_argument( + '--no-save-optim', + action='store_true', + help='Do not save current optimizer.') + group.add_argument( + '--no-save-rng', + action='store_true', + help='Do not save current rng state.') + group.add_argument( + '--load', + type=str, + default=None, + help='Path to a directory containing a model checkpoint.') + group.add_argument( + '--no-load-optim', + action='store_true', + help='Do not load optimizer when loading checkpoint.') + group.add_argument( + '--no-load-rng', + action='store_true', + help='Do not load rng state when loading checkpoint.') + group.add_argument( + '--no-load-lr-scheduler', + action='store_true', + help='Do not load lr scheduler when loading checkpoint.') + group.add_argument( + '--no-deepspeed-load', + action='store_true', + help='Not use deepspeed when loading checkpoint') + group.add_argument( + '--finetune', + action='store_true', + help='Load model for finetuning. Do not load optimizer ' + 'or rng state from checkpoint and set iteration to 0. ' + 'Assumed when loading a release checkpoint.') + group.add_argument( + '--resume-dataloader', + action='store_true', + help='Resume the dataloader when resuming training. ' + 'Does not apply to tfrecords dataloader, try resuming' + 'with a different seed in this case.') + # distributed training args + group.add_argument( + '--distributed-backend', + default='nccl', + help= + 'which backend to use for distributed training. 
One of [gloo, nccl]', + choices=['nccl', 'gloo']) + group.add_argument( + '--DDP-impl', + default='torch', + choices=['local', 'torch', 'none'], + help='which DistributedDataParallel implementation to use.') + + group.add_argument( + '--local_rank', + type=int, + default=None, + help='local rank passed from distributed launcher') + # BlockLM training args + group.add_argument( + '--block-lm', + action='store_true', + help='whether use the BlockLM pre-training') + group.add_argument( + '--masked-lm', + action='store_true', + help='whether to use the mlm objective') + group.add_argument('--bert-prob', type=float, default=0.5) + group.add_argument('--gpt-infill-prob', type=float, default=0.5) + group.add_argument('--gpt-min-ratio', type=float, default=0.5) + group.add_argument('--gap-sentence-prob', type=float, default=0.0) + group.add_argument('--gap-sentence-ratio', type=float, default=0.15) + group.add_argument('--avg-block-length', type=int, default=3) + group.add_argument('--short-seq-prob', type=float, default=0.0) + group.add_argument('--single-span-prob', type=float, default=0.0) + group.add_argument( + '--task-mask', + action='store_true', + help='Use different mask for generation and blank filling') + group.add_argument( + '--no-shuffle-block', + action='store_true', + help='not shuffle the blocks when filling the blank') + group.add_argument( + '--no-block-position', + action='store_true', + help='Use (rough) absolute positions instead of block positions') + group.add_argument( + '--sentinel-token', + action='store_true', + help='Use sentinel (mask) tokens to replace 2d position encoding') + group.add_argument('--block-mask-prob', type=float, default=0.0) + group.add_argument('--context-mask-ratio', type=float, default=0.0) + group.add_argument( + '--random-position', + action='store_true', + help='Use random start position to cover all the position embeddings') + return parser + + +def add_evaluation_args(parser): + """Evaluation arguments.""" + + group = parser.add_argument_group('validation', + 'validation configurations') + + group.add_argument( + '--eval-batch-size', + type=int, + default=None, + help='Data Loader batch size for evaluation datasets.' + 'Defaults to `--batch-size`') + group.add_argument( + '--eval-iters', + type=int, + default=100, + help='number of iterations to run for evaluation' + 'validation/test for') + group.add_argument( + '--eval-interval', + type=int, + default=1000, + help='interval between running evaluation on validation set') + group.add_argument( + '--eval-epoch', + type=int, + default=1, + help='epoch between running evaluation on validation set') + group.add_argument( + '--eval-seq-length', + type=int, + default=None, + help='Maximum sequence length to process for ' + 'evaluation. Defaults to `--seq-length`') + group.add_argument( + '--eval-max-preds-per-seq', + type=int, + default=None, + help='Maximum number of predictions to use for ' + 'evaluation. 
Defaults to ' + 'math.ceil(`--eval-seq-length`*.15/10)*10') + group.add_argument('--overlapping-eval', type=int, default=32) + + return parser + + +def add_text_generate_args(parser): + """Text generate arguments.""" + + group = parser.add_argument_group('Text generation', 'configurations') + group.add_argument('--temperature', type=float, default=1.0) + group.add_argument('--top_p', type=float, default=0.0) + group.add_argument('--top_k', type=int, default=0) + group.add_argument('--out-seq-length', type=int, default=256) + group.add_argument('--num-beams', type=int, default=1) + group.add_argument('--length-penalty', type=float, default=0.0) + group.add_argument('--no-repeat-ngram-size', type=int, default=0) + group.add_argument('--min-tgt-length', type=int, default=0) + group.add_argument('--select-topk', action='store_true') + group.add_argument('--blank-maskratio', type=float, default=0.1) + return parser + + +def add_data_args(parser): + """Train/valid/test data arguments.""" + + group = parser.add_argument_group('data', 'data configurations') + + group.add_argument( + '--model-parallel-size', + type=int, + default=1, + help='size of the model parallel.') + group.add_argument( + '--shuffle', + action='store_true', + help='Shuffle data. Shuffling is deterministic ' + 'based on seed and current epoch.') + group.add_argument('--filter-english', action='store_true') + group.add_argument( + '--train-data', + nargs='+', + default=None, + help='Whitespace separated filenames or corpora names ' + 'for training.') + group.add_argument( + '--valid-data', + nargs='*', + default=None, + help="""Filename for validation data.""") + group.add_argument( + '--test-data', + nargs='*', + default=None, + help="""Filename for testing""") + group.add_argument( + '--data-dir', + type=str, + default=None, + help='The data path to all the data files') + group.add_argument( + '--input-data-sizes-file', + type=str, + default='sizes.txt', + help='the filename containing all the shards sizes') + + group.add_argument( + '--delim', default=',', help='delimiter used to parse csv data files') + group.add_argument( + '--text-key', + default='sentence', + help='key to use to extract text from json/csv') + group.add_argument( + '--eval-text-key', + default=None, + help='key to use to extract text from ' + 'json/csv evaluation datasets') + group.add_argument( + '--split', + default='1000,1,1', + help='comma-separated list of proportions for training,' + ' validation, and test split') + + group.add_argument( + '--no-lazy-loader', + action='store_true', + help='whether to lazy read the data set') + group.add_argument('--half-lazy-loader', action='store_true') + group.add_argument( + '--loader-scatter', + type=int, + default=None, + help='Number of scatters to use for dataloaders') + group.add_argument( + '--loose-json', + action='store_true', + help='Use loose json (one json-formatted string per ' + 'newline), instead of tight json (data file is one ' + 'json string)') + group.add_argument( + '--presplit-sentences', + action='store_true', + help='Dataset content consists of documents where ' + 'each document consists of newline separated sentences') + group.add_argument( + '--num-workers', + type=int, + default=2, + help="""Number of workers to use for dataloading""") + group.add_argument( + '--tokenizer-model-type', + type=str, + default=None, + help="Model type to use for sentencepiece tokenization \ + (one of ['bpe', 'char', 'unigram', 'word']) or \ + bert vocab to use for BertWordPieceTokenizer (one of \ + 
['bert-large-uncased', 'bert-large-cased', etc.])") + group.add_argument( + '--tokenizer-path', + type=str, + default='tokenizer.model', + help='path used to save/load sentencepiece tokenization ' + 'models') + group.add_argument( + '--tokenizer-type', + type=str, + default='BertWordPieceTokenizer', + choices=[ + 'CharacterLevelTokenizer', 'SentencePieceTokenizer', + 'BertWordPieceTokenizer', 'GPT2BPETokenizer', 'ChineseSPTokenizer' + ], + help='what type of tokenizer to use') + group.add_argument('--no-pre-tokenize', action='store_true') + group.add_argument( + '--cache-dir', + default=None, + type=str, + help='Where to store pre-trained BERT downloads') + group.add_argument( + '--use-tfrecords', + action='store_true', + help='load `--train-data`, `--valid-data`, ' + '`--test-data` from BERT tf records instead of ' + 'normal data pipeline') + group.add_argument( + '--seq-length', + type=int, + default=512, + help='Maximum sequence length to process') + group.add_argument( + '--mem-length', + type=int, + default=0, + help='The memory length to preserve') + group.add_argument( + '--max-preds-per-seq', + type=int, + default=None, + help='Maximum number of predictions to use per sequence.' + 'Defaults to math.ceil(`--seq-length`*.15/10)*10.' + 'MUST BE SPECIFIED IF `--use-tfrecords` is True.') + group.add_argument('--non-sentence-start', type=float, default=0.0) + group.add_argument( + '--sample-one-document', + action='store_true', + help='only sample one document in one sample') + group.add_argument( + '--load-splits', + type=str, + default=None, + help='The path to load split indices from') + group.add_argument( + '--save-splits', + type=str, + default=None, + help='The path to save split indices to') + group.add_argument( + '--save-test-data', + type=str, + default=None, + help='The path to save the test data') + group.add_argument( + '--multi-task-data', + nargs='*', + default=None, + help='Downsteam task names for multi-task pre-training') + group.add_argument( + '--multi-task-ratio', + type=float, + default=0.0, + help='Ratio for multi-task pre-training') + group.add_argument('--multi-seq-length', type=int, default=None) + group.add_argument('--multi-batch-size', type=int, default=None) + return parser + + +def add_finetune_config_args(parser): + group = parser.add_argument_group('finetune', 'finetune configurations') + group.add_argument('--task', type=str, help='Task name.') + group.add_argument( + '--load-pretrained', + type=str, + help='Load pretrained model', + default=None) + group.add_argument( + '--pool-token', + type=str, + choices=['start', 'pad', 'cls'], + help='The token to pool the sequence representation', + default='cls') + group.add_argument( + '--cloze-eval', + action='store_true', + help='Evaluation dataset with cloze task') + group.add_argument( + '--multi-token', + action='store_true', + help='Use multi token for cloze evaluation') + group.add_argument( + '--segment-length', + type=int, + default=0, + help='The maximum segment length for cloze evaluation') + group.add_argument( + '--loss-func', + type=str, + choices=['cross_entropy', 'hinge', 'generative', 'mix'], + default='cross_entropy') + group.add_argument('--block-lm-ratio', type=float, default=0.0) + group.add_argument( + '--adapet', + action='store_true', + help='Use the decoupled cross entropy loss in AdaPET') + group.add_argument('--pattern-id', type=int, default=0) + group.add_argument( + '--fast-decode', + action='store_true', + help= + 'Fast decode for multi-token cloze. 
Can only be used without checkpoint activation.' + ) + group.add_argument('--few-superglue', action='store_true') + group.add_argument( + '--eval-valid', + action='store_true', + help='Whether evaluate on the valid set') + group.add_argument('--validation-metric', type=str, default=None) + group.add_argument( + '--unidirectional', + action='store_true', + help='Use the left to right language model') + group.add_argument('--src-seq-length', type=int, default=None) + group.add_argument('--tgt-seq-length', type=int, default=None) + group.add_argument('--adam-beta1', type=float, default=0.9) + group.add_argument('--adam-beta2', type=float, default=0.999) + group.add_argument('--adam-eps', type=float, default=1e-8) + group.add_argument( + '--optimizer', type=str, choices=['adam', 'adafactor'], default='adam') + group.add_argument('--wsc-negative', action='store_true') + group.add_argument('--overwrite', action='store_true') + group.add_argument('--no-validation', action='store_true') + # Continuous prompt arguments + group.add_argument( + '--continuous-prompt', + action='store_true', + help='Use continuous prompt for PET') + group.add_argument('--num-prompt-tokens', type=int, default=0) + group.add_argument( + '--prompt-func', default='lstm', choices=['lstm', 'mlp', 'none']) + group.add_argument( + '--freeze-transformer', action='store_true', default=False) + group.add_argument('--tune-prefix-layers', type=int, default=None) + group.add_argument('--prefix-prompt', type=int, default=0) + group.add_argument('--prompt-init', action='store_true', default=False) + return parser + + +def get_args(): + """Parse all the args.""" + + parser = argparse.ArgumentParser(description='PyTorch BERT Model') + parser = add_model_config_args(parser) + parser = add_fp16_config_args(parser) + parser = add_training_args(parser) + parser = add_evaluation_args(parser) + parser = add_text_generate_args(parser) + parser = add_data_args(parser) + parser = add_finetune_config_args(parser) + + # Include DeepSpeed configuration arguments + parser = deepspeed.add_config_arguments(parser) + + args = parser.parse_args(args=[]) + if not args.train_data and not args.data_dir: + print('WARNING: No training data specified') + + args.cuda = torch.cuda.is_available() + + args.rank = int(os.getenv('RANK', '0')) + args.world_size = int(os.getenv('WORLD_SIZE', '1')) + if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi: + mpi_define_env(args) + elif os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'): + # We are using (OpenMPI) mpirun for launching distributed data parallel processes + local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK')) + local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) + + # Possibly running with Slurm + num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1')) + nodeid = int(os.getenv('SLURM_NODEID', '0')) + + args.local_rank = local_rank + args.rank = nodeid * local_size + local_rank + args.world_size = num_nodes * local_size + + args.model_parallel_size = min(args.model_parallel_size, args.world_size) + if args.rank == 0: + print('using world size: {} and model-parallel size: {} '.format( + args.world_size, args.model_parallel_size)) + + args.dynamic_loss_scale = False + if args.loss_scale is None: + args.dynamic_loss_scale = True + if args.rank == 0: + print(' > using dynamic loss scaling') + + # The args fp32_* or fp16_* meant to be active when the + # args fp16 is set. So the default behaviour should all + # be false. 
+ if not args.fp16: + args.fp32_embedding = False + args.fp32_tokentypes = False + args.fp32_layernorm = False + + if hasattr(args, 'deepspeed' + ) and args.deepspeed and args.deepspeed_config is not None: + with open(args.deepspeed_config) as file: + deepspeed_config = json.load(file) + if 'train_micro_batch_size_per_gpu' in deepspeed_config: + args.batch_size = deepspeed_config[ + 'train_micro_batch_size_per_gpu'] + if 'gradient_accumulation_steps' in deepspeed_config: + args.gradient_accumulation_steps = deepspeed_config[ + 'gradient_accumulation_steps'] + else: + args.gradient_accumulation_steps = 1 + if 'optimizer' in deepspeed_config: + optimizer_params_config = deepspeed_config['optimizer'].get( + 'params', {}) + args.lr = optimizer_params_config.get('lr', args.lr) + args.weight_decay = optimizer_params_config.get( + 'weight_decay', args.weight_decay) + return args + + +def mpi_define_env(args): + from mpi4py import MPI + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + world_size = comm.Get_size() + + master_addr = None + if rank == 0: + master_addr = get_hostname() + master_addr = comm.bcast(master_addr, root=0) + + # Determine local rank by assuming hostnames are unique + proc_name = MPI.Get_processor_name() + all_procs = comm.allgather(proc_name) + local_rank = sum([i == proc_name for i in all_procs[:rank]]) + + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = str(world_size) + args.local_rank = local_rank + args.world_size = world_size + args.rank = rank + os.environ['MASTER_ADDR'] = master_addr + os.environ[ + 'MASTER_PORT'] = '29500' # TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 + + print( + 'Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}' + .format(os.environ['RANK'], args.local_rank, os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])) diff --git a/modelscope/models/nlp/mglm/blocklm_utils.py b/modelscope/models/nlp/mglm/blocklm_utils.py new file mode 100644 index 00000000..9af83f67 --- /dev/null +++ b/modelscope/models/nlp/mglm/blocklm_utils.py @@ -0,0 +1,625 @@ +# Copyright (c) 2022 Zhipu.AI + +import copy +import math +import random + +import numpy as np +import torch +import torch.utils.data +from scipy.stats import poisson + +from . 
import mpu +from .utils import print_rank_0 + + +def rindex(lst, val, start=None): + if start is None: + start = len(lst) - 1 + for i in range(start, -1, -1): + if lst[i] == val: + return i + return -1 + + +def index_in_list(lst, val, start=None): + if start is None: + start = 0 + for i in range(start, len(lst)): + if lst[i] == val: + return i + return -1 + + +class ConstructBlockStrategy: + + def __init__(self, + args, + tokenizer, + max_seq_length, + bert_prob=1.0, + gap_sentence_prob=0.0, + gpt_infill_prob=0.5, + gpt_min_ratio=0.5, + bert_ratio=0.15, + gap_sentence_ratio=0.15, + average_block_length=3, + max_block_length=40, + block_mask_prob=0.0, + context_mask_ratio=0.0, + context_mask_range=3, + short_seq_prob=0.0, + single_span_prob=0.0, + block_position_encoding=True, + encoder_decoder=False, + shuffle_blocks=True, + sentinel_token=False, + task_mask=False, + random_position=False, + masked_lm=False): + self.eod_token = args.eod_token + self.tokenizer = tokenizer + self.count = 0 + self.max_seq_length = max_seq_length + self.rank = mpu.get_data_parallel_rank() + self.world_size = mpu.get_data_parallel_world_size() + # self.rank = 0 + # self.world_size = 1 + assert 0.0 <= bert_prob <= 1.0 + self.bert_prob = bert_prob + self.gap_sentence_prob = gap_sentence_prob + self.gpt_prob = 1 - bert_prob - gap_sentence_prob + assert self.gpt_prob >= -1e-10 + self.infill_prob = gpt_infill_prob + self.gpt_min_ratio = gpt_min_ratio + self.bert_ratio = bert_ratio + self.gap_sentence_ratio = gap_sentence_ratio + self.block_length_distribution = [ + poisson.pmf(i, average_block_length) + for i in range(1, max_block_length) + ] + self.block_mask_prob = block_mask_prob + self.context_mask_ratio = context_mask_ratio + self.context_mask_range = context_mask_range + self.short_seq_prob = short_seq_prob + self.single_span_prob = single_span_prob + self.block_position_encoding = block_position_encoding + self.encoder_decoder = encoder_decoder + self.shuffle_blocks = shuffle_blocks + self.sentinel_token = sentinel_token + self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = self.tokenizer.get_command( + self.generation_mask).Id + self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = self.tokenizer.get_command( + self.gap_sentence_mask).Id + self.random_position = random_position + self.masked_lm = masked_lm + print_rank_0( + f'BERT prob {self.bert_prob}, gap sent prob {self.gap_sentence_prob}, GPT prob {self.gpt_prob}, infill prob {self.infill_prob}' # noqa + ) + print_rank_0( + f'generation min ratio {self.gpt_min_ratio}, block ratio {self.bert_ratio}, gap sent ratio {self.gap_sentence_ratio}' # noqa + ) + print_rank_0( + f'block length distribution {self.block_length_distribution}') + print_rank_0( + f'block mask prob {self.block_mask_prob}, context mask ratio {self.context_mask_ratio}' + ) + + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + if ';' in tok: + return True + if ':' in tok: + return True + if '。' in tok: + return True + if '?' in tok: + return True + if '!' 
in tok: + return True + if ';' in tok: + return True + if '…' in tok: + return True + if '\n' in tok: + return True + return False + + @staticmethod + def sample_spans(span_lengths, total_length, rng, offset=0): + blank_length = total_length - sum(span_lengths) + m = blank_length - len(span_lengths) + 1 + places = [rng.randrange(m + 1) for _ in range(len(span_lengths))] + places.sort() + spans = [] + for place, span_length in zip(places, span_lengths): + start = offset + place + end = offset + place + span_length + spans.append((start, end)) + offset += span_length + 1 + return spans + + def sample_span_in_document(self, tokens, masked_lengths, rng): + rng.shuffle(masked_lengths) + mask_spans = [] + mask_index = 0 + indices = [-1] + np.where(tokens == self.eod_token)[0].tolist() + last_index = len(tokens) + documents = [] + for index in reversed(indices): + start_index = index + if start_index + 1 < len(tokens) and tokens[ + start_index + 1] == self.tokenizer.get_command('ENC').Id: + start_index += 1 + length = last_index - start_index - 1 + if last_index == len(tokens) and length > 0: + length -= 1 + documents.append((start_index + 1, length)) + last_index = index + documents.sort(key=lambda x: x[1]) + for i, (offset, length) in enumerate(documents): + if i == len(documents) - 1: + current_masked_length, current_count = 0, 0 + while mask_index + current_count < len( + masked_lengths + ) and masked_lengths[ + mask_index + # noqa + current_count] + current_masked_length + current_count <= length: + current_masked_length += masked_lengths[mask_index + + current_count] + current_count += 1 + if current_count > 0: + spans = self.sample_spans( + masked_lengths[mask_index:mask_index + current_count], + length, + rng, + offset=offset) + mask_spans += spans + if mask_index + current_count < len(masked_lengths) - 1: + print(length, masked_lengths[mask_index:], + masked_lengths[:mask_index], indices) + else: + current_masked_total = int(length * self.bert_ratio) + current_masked_length, current_count = 0, 0 + while mask_index + current_count < len( + masked_lengths + ) and masked_lengths[ + mask_index + # noqa + current_count] + current_masked_length <= current_masked_total: + current_masked_length += masked_lengths[mask_index + + current_count] + current_count += 1 + if current_count > 0: + spans = self.sample_spans( + masked_lengths[mask_index:mask_index + current_count], + length, + rng, + offset=offset) + mask_spans += spans + mask_index += current_count + return mask_spans + + def make_masked_data(self, + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task='bert'): + position_ids = np.arange(len(tokens), dtype=np.long) + targets = copy.deepcopy(tokens) + mask_id = self.tokenizer.get_command('MASK').Id + mlm_masks = np.zeros(len(tokens), dtype=np.long) + for start, end in block_spans: + for idx in range(start, end): + tokens[idx] = mask_id + mlm_masks[start:end] = 1 + loss_masks = loss_masks * mlm_masks + return tokens, targets, loss_masks, position_ids + + def make_block_data(self, + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task='bert'): + text_length = len(tokens) + position_ids = np.ones(len(tokens), dtype=np.long) + for start, end in block_spans: + position_ids[start + 1:end] = 0 + position_ids = np.cumsum(position_ids) - 1 + if self.random_position and position_ids[-1] < self.max_seq_length - 1: + position_bias = self.max_seq_length - position_ids[-1] + position_bias = rng.randrange(0, position_bias) + position_ids = position_ids + position_bias + if 
self.encoder_decoder or not self.shuffle_blocks: + block_spans.sort(key=lambda x: x[0]) + else: + rng.shuffle(block_spans) + if self.sentinel_token: + block_spans = [(start, end, idx) + for idx, (start, end) in enumerate(block_spans)] + else: + block_spans = [(start, end, 0) for start, end in block_spans] + target_tokens, target_position_ids, target_block_position_ids, targets = [], [], [], [] + for start, end, idx in block_spans: + sop_token = 'sop' if idx == 0 else f'sop{idx}' + target_tokens.append([self.tokenizer.get_command(sop_token).Id]) + span_tokens = copy.deepcopy(tokens[start:end]) + if self.block_mask_prob > 0.0 and task == 'bert': + for sub_idx in range(len(span_tokens)): + if random.random() < self.block_mask_prob: + span_tokens[sub_idx] = self.tokenizer.get_command( + 'dBLOCK').Id + target_tokens.append(span_tokens) + targets.append(tokens[start:end]) + targets.append([self.tokenizer.get_command('eop').Id]) + if not self.sentinel_token: + target_position_id = position_ids[start:end] + target_position_ids.append(target_position_id) + target_position_ids.append([target_position_id[0]]) + else: + target_position_ids.append([self.max_seq_length] * # noqa + (end - start + 1)) + if self.block_position_encoding: + target_block_position_ids.append( + np.arange(1, end - start + 2, dtype=np.long)) + else: + target_block_position_ids.append([1] * (end - start + 1)) + block_spans.sort(key=lambda x: x[0]) + source_tokens, source_position_ids, local_spans = [], [], [] + last, current_length = 0, 0 + for start, end, idx in block_spans: + if task == 'generation': + mask_id = self.generation_mask + elif task == 'gap_sentence': + mask_id = self.gap_sentence_mask + else: + mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_id = self.tokenizer.get_command(mask_token).Id + local_spans.append((current_length, current_length + start - last)) + source_tokens.append(tokens[last:start]) + source_tokens.append([mask_id]) + source_position_ids.append(position_ids[last:start]) + source_position_ids.append([position_ids[start]]) + current_length += start - last + 1 + last = end + if last < len(tokens): + local_spans.append( + (current_length, current_length + len(tokens) - last)) + source_tokens.append(tokens[last:]) + source_position_ids.append(position_ids[last:]) + source_length = sum(map(len, source_tokens)) + if attention_mask is not None: + assert source_length == attention_mask + if target_tokens and self.eod_token in np.concatenate( + target_tokens).tolist(): + print('Found EOS in target', self.tokenizer.DecodeIds(tokens)) + raise RuntimeError + if self.encoder_decoder: + target_tokens = target_tokens + [ + self.tokenizer.get_command('eop').Id + ] + loss_masks = np.ones(len(target_tokens), dtype=np.long) + return source_tokens, target_tokens, loss_masks + else: + tokens = np.concatenate(source_tokens + target_tokens) + if task == 'bert' and self.context_mask_ratio > 0: + mask_candidates = set() + for start, end in local_spans: + if start != 0: + local_end = min(end, start + self.context_mask_range) + mask_candidates.update(range(start, local_end)) + if end != 0: + local_start = max(start, end - self.context_mask_range) + mask_candidates.update(range(local_start, end)) + mask_pos = rng.sample( + mask_candidates, + int(self.context_mask_ratio * text_length)) + for pos in mask_pos: + tokens[pos] = self.tokenizer.get_command('dBLOCK').Id + targets = np.concatenate(source_tokens + targets) + loss_masks = np.ones(len(tokens), dtype=np.long) + loss_masks[:source_length] = 0 + position_ids = 
np.concatenate(source_position_ids + + target_position_ids) + block_position_ids = np.concatenate( + [np.zeros(source_length, dtype=np.long)] + + target_block_position_ids) + position_ids = np.stack([position_ids, block_position_ids], axis=0) + if attention_mask is not None: + return tokens, targets, loss_masks, position_ids + else: + return tokens, targets, loss_masks, position_ids, source_length + + def generate_blank_data(self, + sample, + masked_lengths, + attention_mask, + rng, + task='bert'): + rng.shuffle(masked_lengths) + tokens, loss_masks = sample['text'], sample['loss_mask'] + assert tokens[0] == self.tokenizer.get_command('ENC').Id + block_spans = self.sample_span_in_document(tokens, masked_lengths, rng) + if len(block_spans) < len(masked_lengths): + return None + if self.masked_lm: + data = self.make_masked_data(tokens, loss_masks, attention_mask, + block_spans, rng) + else: + data = self.make_block_data( + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task=task) + return data + + def split_samples(self, samples, rng): + target_length = rng.randrange(32, self.max_seq_length - 1) + num_splits = (self.max_seq_length - 1) // target_length + new_samples = [] + cls_id = self.tokenizer.get_command('ENC').Id + eos_id = self.tokenizer.get_command('eos').Id + for sample in samples: + tokens, loss_masks = sample['text'][1:], sample['loss_mask'][1:] + for _ in range(num_splits): + if target_length >= len(tokens): + new_tokens, new_loss_masks = tokens, loss_masks + else: + random_start = rng.randrange(0, + len(tokens) - target_length) + while random_start > 0 and ( + tokens[random_start] == eos_id or # noqa + not (self.contains_sentence_end( # noqa + tokens[random_start - 1]) or # noqa + tokens[random_start - 1] == eos_id)): # noqa + random_start -= 1 + random_end = random_start + target_length + while random_end > random_start and not ( + self.contains_sentence_end(tokens[random_end - 1]) + or tokens[random_end - 1] == eos_id): + random_end -= 1 + if random_end - random_start < target_length // 2: + random_end = random_start + target_length + new_tokens, new_loss_masks = tokens[ + random_start:random_end], loss_masks[ + random_start:random_end] + new_tokens = np.concatenate(([cls_id], new_tokens)) + new_loss_masks = np.concatenate(([0], new_loss_masks)) + new_samples.append({ + 'text': new_tokens, + 'loss_mask': new_loss_masks + }) + return new_samples + + def construct_blocks(self, samples): + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None: + worker_id, num_workers = worker_info.id, worker_info.num_workers + else: + worker_id, num_workers = 0, 1 + rng = random.Random((self.count * num_workers + worker_id) + * self.world_size + self.rank) + self.count += 1 + token_batch, target_batch, loss_mask_batch, position_id_batch = [], [], [], [] + source_batch, target_batch = [], [] + if rng.random() < self.short_seq_prob: + samples = self.split_samples(samples, rng) + rand = rng.random() + single_span = rand < self.single_span_prob + rand = 0.0 if single_span else rng.random() + attention_mask = [] + if rand < self.bert_prob: + mode = 'bert' + for sample in samples: + if single_span: + masked_lengths = [ + rng.choices( + range(1, + len(self.block_length_distribution) + 1), + weights=self.block_length_distribution)[0] + ] + masked_count = masked_lengths[0] + else: + masked_lengths, masked_count = [], 0 + while masked_count < int( + self.bert_ratio * len(sample['text'])): + block_length = rng.choices( + range(1, + len(self.block_length_distribution) 
+ 1), + weights=self.block_length_distribution)[0] + masked_lengths.append(block_length) + masked_count += block_length + if self.masked_lm: + sep = len(sample['text']) + else: + sep = len( + sample['text']) - masked_count + len(masked_lengths) + data = self.generate_blank_data( + sample, masked_lengths, sep, rng, task='bert') + if data is not None: + if self.encoder_decoder: + source_tokens, target_tokens, loss_masks = data + source_batch.append(source_tokens) + target_batch.append(target_tokens) + loss_mask_batch.append(loss_masks) + else: + tokens, targets, loss_masks, position_ids = data + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + attention_mask.append(sep) + + elif rand < self.bert_prob + self.gap_sentence_prob: + mode = 'sentence' + for sample in samples: + tokens, loss_masks = sample['text'], sample['loss_mask'] + sentence_spans = [] + last_index = 1 if tokens[0] == self.tokenizer.get_command( + 'ENC').Id else 0 + for i in range(len(tokens)): + if self.contains_sentence_end(tokens[i]): + if last_index < i + 1: + sentence_spans.append((last_index, i + 1)) + last_index = i + 1 + elif tokens[i] == self.tokenizer.get_command('eos').Id: + last_index = i + 1 + if last_index < len(tokens): + sentence_spans.append((last_index, len(tokens))) + if not sentence_spans and torch.distributed.get_rank() == 0: + try: + print(self.tokenizer.DecodeIds(tokens[1:])) + except IndexError: + print(tokens[1:]) + rng.shuffle(sentence_spans) + block_spans, block_length = [], 0 + for start, end in sentence_spans: + block_spans.append((start, end)) + block_length += end - start + if block_length >= int( + self.gap_sentence_ratio * len(tokens)): + break + data = self.make_block_data( + tokens, + loss_masks, + None, + block_spans, + rng, + task='gap_sentence') + tokens, targets, loss_masks, position_ids, sep = data + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + attention_mask.append(sep) + else: + # start_indices = [index_in_list(sample['loss_mask'], 1) for sample in samples] + # end_indices = [rindex(sample['loss_mask'], 1) for sample in samples] + # start_index, end_index = max(start_indices), min(end_indices) - self.min_generation_length + # if end_index < start_index + 1: + # end_index = start_index + 1 + # division = rng.randrange(start_index, end_index) + mode = 'gpt' + max_generation_length = rng.randint( + int(self.gpt_min_ratio + * min(map(lambda x: len(x['text']), samples))), + max(map(lambda x: len(x['text']), samples)) - 2) + for sample in samples: + generation_length = min(max_generation_length, + len(sample['text']) - 2) + attention_mask.append( + len(sample['text']) - generation_length + 1) + multiple_doc = index_in_list( + sample['text'], + self.tokenizer.get_command('eos').Id) not in [ + -1, len(sample['text']) - 1 + ] # noqa + if multiple_doc or rng.random() < self.infill_prob: + division = len(sample['text']) - generation_length + tokens, loss_masks = sample['text'], sample['loss_mask'] + source_tokens, target_tokens = tokens[:division], tokens[ + division:] + target_masks = loss_masks[division:] + tokens = np.concatenate((source_tokens, [ + self.generation_mask, + self.tokenizer.get_command('sop').Id + ], target_tokens[:-1])) + targets = np.concatenate( + (source_tokens, [self.generation_mask], target_tokens)) + loss_masks = np.concatenate( + (np.zeros(len(source_tokens) + 1, + dtype=np.long), 
target_masks)) + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_ids = np.arange( + len(source_tokens) + len(target_tokens) + 1, + dtype=np.long) + position_ids[len(source_tokens) + 1:] = len(source_tokens) + if self.block_position_encoding: + block_position_ids = np.concatenate( + (np.zeros(len(source_tokens), dtype=np.long), + np.arange(len(target_tokens) + 1, dtype=np.long))) + else: + block_position_ids = np.concatenate( + (np.zeros(len(source_tokens) + 1, dtype=np.long), + np.ones(len(target_tokens) + 1, dtype=np.long))) + position_id_batch.append( + np.stack([position_ids, block_position_ids], axis=0)) + else: + tokens, targets, loss_masks, position_ids = self.generate_blank_data( + sample, [generation_length], + attention_mask[-1], + rng, + task='generation') + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + if tokens is None: + print(sample, generation_length, multiple_doc) + if self.encoder_decoder: + return { + 'text': torch.tensor(source_batch, dtype=torch.long), + 'target': torch.tensor(target_batch, dtype=torch.long), + 'loss_mask': torch.tensor(loss_mask_batch, dtype=torch.long) + } + else: + token_batch, target_batch, loss_mask_batch, position_id_batch = self.pad_batch( + token_batch, target_batch, loss_mask_batch, position_id_batch) + return { + 'text': torch.tensor(token_batch, dtype=torch.long), + 'target': torch.tensor(target_batch, dtype=torch.long), + 'loss_mask': torch.tensor(loss_mask_batch, dtype=torch.long), + 'position_id': + torch.tensor(position_id_batch, dtype=torch.long), + 'attention_mask': + torch.tensor(attention_mask, dtype=torch.long), + 'mode': mode + } + + @staticmethod + def pad_batch(token_batch, target_batch, loss_mask_batch, + position_id_batch): + seq_lengths = list(map(len, token_batch)) + if seq_lengths.count(seq_lengths[0]) != len(seq_lengths): + max_length = max(seq_lengths) + token_batch = [ + np.concatenate( + (tokens, np.zeros(max_length - len(tokens), + dtype=np.long))) + for tokens in token_batch + ] + target_batch = [ + np.concatenate( + (targets, + np.zeros(max_length - len(targets), dtype=np.long))) + for targets in target_batch + ] + loss_mask_batch = [ + np.concatenate( + (loss_masks, + np.zeros(max_length - len(loss_masks), dtype=np.long))) + for loss_masks in loss_mask_batch + ] + position_id_batch = [ + np.concatenate((position_ids, + np.zeros( + (2, max_length - position_ids.shape[1]), + dtype=np.long)), + axis=1) for position_ids in position_id_batch + ] + return token_batch, target_batch, loss_mask_batch, position_id_batch diff --git a/modelscope/models/nlp/mglm/configure_data.py b/modelscope/models/nlp/mglm/configure_data.py new file mode 100644 index 00000000..6921de08 --- /dev/null +++ b/modelscope/models/nlp/mglm/configure_data.py @@ -0,0 +1,513 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
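+# A rough sketch of how this module is usually driven (the training script that
+# calls it is assumed and is not part of this patch):
+#
+#     tokenizer = prepare_tokenizer(args)        # pads the vocab, sets args.eod_token
+#     data_config = configure_data()             # DataConfig preloaded with defaults
+#     train, valid, test = data_config.apply(args, tokenizer)
+#
+# DataConfig.apply() fills in any attribute missing on `args` from its defaults
+# dict and then delegates to make_loaders(), which wraps the datasets built in
+# data_utils into (optionally distributed) torch DataLoaders.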
+"""parses arguments and preps data loader""" + +import copy +import os +import random +from bisect import bisect_right +from itertools import accumulate + +import numpy as np +import torch +import torch.utils.data + +from . import data_utils, mpu +from .blocklm_utils import ConstructBlockStrategy +from .data_utils.tokenization import make_tokenizer +from .utils import print_rank_0 + + +class MultiTaskDataset(torch.utils.data.Dataset): + + def __init__(self, + tasks, + datasets, + reweight=True, + temperature=0.8, + max_limit=200000): + super(MultiTaskDataset, self).__init__() + self.tasks = tasks + self.datasets = datasets + self.reweight = reweight + self.temperature = temperature + self.lens = [len(dataset) for dataset in datasets] + self.weights = np.array( + [min(length, max_limit)**temperature for length in self.lens]) + self.total_len = sum(self.lens) + self.cumulative_lens = list(accumulate(self.lens)) + if self.reweight: + print_rank_0(list(zip(self.tasks, self.lens, self.weights))) + else: + print_rank_0(list(zip(self.tasks, self.lens))) + self.weights /= self.weights.sum() + + def __len__(self): + return self.total_len * 1000 + + @staticmethod + def pet_wrapper(data): + text = data['text'] + loss_mask = data['logit_mask'] + target = data['target'] + attention_mask = data['mask'] + position_id = data['position'] + label = data['label'] + if len(text.shape) == 2: + text = text[label] + loss_mask = loss_mask[label] + target = target[label] + attention_mask = attention_mask[label] + position_id = position_id[label] + else: + target = target[label] + if not target.shape: + target = target.repeat(len(text)) + return { + 'text': text, + 'target': target, + 'loss_mask': loss_mask, + 'position_id': position_id, + 'attention_mask': attention_mask + } + + def __getitem__(self, idx): + if self.reweight: + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + dataset_idx = rng.choice( + np.arange(len(self.datasets)), p=self.weights) + dataset = self.datasets[dataset_idx] + sample_idx = rng.choice(np.arange(len(dataset))) + item = self.datasets[dataset_idx][sample_idx] + else: + dataset_idx = bisect_right(self.cumulative_lens, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_lens[dataset_idx - 1] + item = self.datasets[dataset_idx][sample_idx] + item = self.pet_wrapper(item) + return item + + +class DataConfig: + + def __init__(self, defaults=None): + super(DataConfig, self).__init__() + if defaults is None: + defaults = {} + self.defaults = defaults + + def apply(self, args, tokenizer): + if torch.distributed.get_rank() == 0: + print('configuring data') + self.apply_defaults(args) + return make_loaders(args, tokenizer) + + def set_defaults(self, **kwargs): + for k, v in kwargs.items(): + self.defaults[k] = v + + def apply_defaults(self, args): + for k, v in self.defaults.items(): + k = k.replace('-', '_') + if not hasattr(args, k): + setattr(args, k, v) + + +def prepare_tokenizer(args): + add_sentinel_token = 0 + if args.sentinel_token: + add_sentinel_token = args.max_position_embeddings + tokenizer = make_tokenizer( + args.tokenizer_type, + None, + args.tokenizer_path, + args.vocab_size, + args.tokenizer_model_type, + add_block_symbols=args.block_lm, + cache_dir=args.cache_dir, + add_sentinel_token=add_sentinel_token, + add_task_mask=args.task_mask, + add_decoder_mask=args.block_mask_prob > 0.0 + or args.context_mask_ratio > 0.0) + if mpu.get_model_parallel_rank() == 0: + num_tokens = 
tokenizer.num_tokens + eod_token = tokenizer.get_command('eos').Id + assert eod_token == tokenizer.get_command('pad').Id + before = num_tokens + after = before + multiple = args.make_vocab_size_divisible_by + while (after % multiple) != 0: + after += 1 + print_rank_0('> padded vocab (size: {}) with {} dummy ' + 'tokens (new size: {})'.format(before, after - before, + after)) + print_rank_0('> found end-of-document token: {}'.format(eod_token)) + token_counts = torch.cuda.LongTensor([after, eod_token]) + else: + token_counts = torch.cuda.LongTensor([0, 0]) + # Broadcast num tokens. + torch.distributed.broadcast( + token_counts, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + num_tokens = token_counts[0].item() + eod_token = token_counts[1].item() + args.vocab_size, args.eod_token = num_tokens, eod_token + return tokenizer + + +def make_data_loader(dataset, + tokenizer, + batch_size, + num_iters, + args, + shuffle=False, + block_collate=False): + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group()) + if args.loader_scatter is not None: + rank = rank // args.loader_scatter + world_size = world_size // args.loader_scatter + batch_size = batch_size // args.loader_scatter + distributed = world_size > 1 + if args.transformer_xl: + batch_sampler = data_utils.samplers.DistributedSequentialSampler( + len(dataset), num_iters, batch_size, rank, world_size) + else: + if shuffle: + sampler = data_utils.samplers.RandomSampler( + dataset, + replacement=True, + num_samples=batch_size * args.train_iters + * args.gradient_accumulation_steps) + else: + sampler = torch.utils.data.SequentialSampler(dataset) + drop_last = distributed + # the GPUs in the same model parallel group receive the same data + if distributed: + batch_sampler = data_utils.samplers.DistributedBatchSampler( + sampler, + batch_size, + drop_last, + rank, + world_size, + gradient_accumulation_steps=args.gradient_accumulation_steps) + else: + batch_sampler = torch.utils.data.BatchSampler( + sampler, batch_size, drop_last) + collate_fn = None + if block_collate: + collate_fn = ConstructBlockStrategy( + args, + tokenizer, + args.seq_length, + bert_prob=args.bert_prob, + gap_sentence_prob=args.gap_sentence_prob, + gap_sentence_ratio=args.gap_sentence_ratio, + gpt_infill_prob=args.gpt_infill_prob, + average_block_length=args.avg_block_length, + gpt_min_ratio=args.gpt_min_ratio, + block_mask_prob=args.block_mask_prob, + context_mask_ratio=args.context_mask_ratio, + short_seq_prob=args.short_seq_prob, + single_span_prob=args.single_span_prob, + shuffle_blocks=not args.no_shuffle_block, + block_position_encoding=not args.no_block_position, + sentinel_token=args.sentinel_token, + encoder_decoder=args.encoder_decoder, + task_mask=args.task_mask, + random_position=args.random_position, + masked_lm=args.masked_lm).construct_blocks + data_loader = torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + collate_fn=collate_fn) + + return data_loader + + +def make_tfrecord_loaders(args): + """Load train/val/test dataset from shuffled TFRecords""" + + import data_utils.tf_dl + data_set_args = { + 'batch_size': args.batch_size, + 'max_seq_len': args.seq_length, + 'max_preds_per_seq': args.max_preds_per_seq, + 'train': True, + 'num_workers': max(args.num_workers, 1), + 'seed': args.seed + args.rank + 1, + 'threaded_dl': args.num_workers > 0 + } + train = 
data_utils.tf_dl.TFRecordDataLoader(args.train_data, + **data_set_args) + data_set_args['train'] = False + if args.eval_seq_length is not None: + data_set_args['max_seq_len'] = args.eval_seq_length + if args.eval_max_preds_per_seq is not None: + data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq + valid = None + if args.valid_data is not None: + valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data, + **data_set_args) + test = None + if args.test_data is not None: + test = data_utils.tf_dl.TFRecordDataLoader(args.test_data, + **data_set_args) + tokenizer = data_utils.make_tokenizer( + args.tokenizer_type, + train, + args.tokenizer_path, + args.vocab_size, + args.tokenizer_model_type, + cache_dir=args.cache_dir) + + return (train, valid, test), tokenizer + + +def make_loaders(args, tokenizer): + """makes training/val/test""" + + if args.use_tfrecords: + return make_tfrecord_loaders(args) + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + if args.loader_scatter is not None: + assert world_size % args.loader_scatter == 0 + batch_size = args.batch_size * world_size + eval_batch_size = batch_size + if args.eval_batch_size is not None: + eval_batch_size = args.eval_batch_size * world_size + seq_length = args.seq_length + if seq_length < 0: + seq_length = seq_length * world_size + eval_seq_length = args.eval_seq_length + if eval_seq_length is not None and eval_seq_length < 0: + eval_seq_length = eval_seq_length * world_size + split = get_split(args) + data_set_args = { + 'path': args.train_data, + 'seq_length': seq_length, + 'mem_length': args.mem_length, + 'delim': args.delim, + 'text_key': args.text_key, + 'label_key': 'label', + 'ds_type': args.data_set_type, + 'split': split, + 'loose': args.loose_json, + 'max_preds_per_seq': args.max_preds_per_seq, + 'presplit_sentences': args.presplit_sentences, + 'sample_one_document': args.sample_one_document, + 'filter_english': args.filter_english, + 'pre_tokenize': not args.no_pre_tokenize, + 'tokenizer': tokenizer, + 'save_splits': args.save_splits, + 'load_splits': args.load_splits, + 'save_test_data': args.save_test_data, + 'no_lazy_loader': args.no_lazy_loader, + 'loader_scatter': args.loader_scatter, + 'data_parallel_rank': mpu.get_data_parallel_rank(), + 'non_sentence_start': args.non_sentence_start, + 'half_lazy_loader': args.half_lazy_loader + } + + eval_set_args = copy.copy(data_set_args) + eval_set_args['split'] = [1.] 
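+    # evaluation sets are never re-split: a split of [1.] keeps the whole file in
+    # a single partition, so should_split() in data_utils returns False for it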
+ # if optional eval args were set then replace their + # equivalent values in the arg dict + if eval_seq_length: + eval_set_args['seq_length'] = eval_seq_length + if args.eval_max_preds_per_seq: + eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq + if args.eval_text_key is not None: + eval_set_args['text_key'] = args.eval_text_key + + # make datasets splits and tokenizer + train, valid, test = None, None, None + + if args.train_data is not None: + train = data_utils.make_dataset(**data_set_args) + if data_utils.should_split(split): + train, valid, test = train + eval_set_args['tokenizer'] = tokenizer + + # make training and val dataset if necessary + if valid is None and args.valid_data is not None: + eval_set_args['path'] = args.valid_data + valid = data_utils.make_dataset(**eval_set_args) + eval_set_args['tokenizer'] = tokenizer + if test is None and args.test_data is not None: + eval_set_args['path'] = args.test_data + test = data_utils.make_dataset(**eval_set_args) + + # wrap datasets with data loader + use_block = args.block_lm or args.encoder_decoder + + if train is not None and args.batch_size > 0: + train = make_data_loader( + train, + tokenizer, + batch_size, + args.train_iters, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_train = True + else: + args.do_train = False + eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size + if valid is not None: + valid = make_data_loader( + valid, + tokenizer, + eval_batch_size, + args.train_iters, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_valid = True + else: + args.do_valid = False + if test is not None: + test = make_data_loader( + test, + tokenizer, + eval_batch_size, + len(test) // eval_batch_size + 1, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_test = True + else: + args.do_test = False + + return train, valid, test + + +def build_multi_task_dataset(args, tokenizer): + task_dirs = { + 'mnli': 'MNLI', + 'cola': 'CoLA', + 'mrpc': 'MRPC', + 'qnli': 'QNLI', + 'qqp': 'QQP', + 'sst2': 'SST-2', + 'agnews': 'Agnews', + 'yelp-polarity': 'yelp_review_polarity_csv', + 'yelp-full': 'yelp_review_full_csv', + 'yahoo': 'Yahoo', + 'squad': 'SQuAD', + 'race': 'RACE' + } + train, valid = None, None + if mpu.get_model_parallel_rank() == 0: + multi_seq_length = args.seq_length + if args.multi_seq_length is not None: + multi_seq_length = args.multi_seq_length + train_datasets, valid_datasets = [], [] + for task in args.multi_task_data: + task = task.lower() + data_dir = os.path.join(args.data_dir, task_dirs[task]) + train_datasets.append( + SuperGlueDataset( + args, + task, + data_dir, + multi_seq_length, + 'train', + tokenizer, + pattern_ensemble=True)) + valid_datasets.append( + SuperGlueDataset( + args, + task, + data_dir, + multi_seq_length, + 'dev', + tokenizer, + pattern_ensemble=True)) + train = MultiTaskDataset(args.multi_task_data, train_datasets) + valid = MultiTaskDataset(args.multi_task_data, valid_datasets) + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + multi_batch_size = args.batch_size * world_size + if args.multi_batch_size is not None: + multi_batch_size = args.multi_batch_size * world_size + train = make_data_loader( + train, + tokenizer, + multi_batch_size, + args.train_iters, + args, + shuffle=True) + valid = make_data_loader( + valid, + tokenizer, + multi_batch_size, + args.train_iters, + args, + shuffle=True) + return train, valid + + +def get_split(args): + """ + Get dataset splits from 
comma separated string list + """ + splits = [] + if args.split.find(',') != -1: + splits = [float(s) for s in args.split.split(',')] + elif args.split.find('/') != -1: + splits = [float(s) for s in args.split.split('/')] + else: + splits = [float(args.split)] + split_total = sum(splits) + if split_total < 1.: + splits.append(1 - split_total) + while len(splits) < 3: + splits.append(0.) + splits = splits[:3] + if args.valid_data is not None: + splits[1] = 0. + if args.test_data is not None: + splits[2] = 0. + final_sum = sum(splits) + return [s / final_sum for s in splits] + + +def configure_data(): + """add cmdline flags for configuring datasets""" + # These are options that are used by data_utils, but are either + # deprecated or not meant to be exposed to the command line user. + # These options are intneded to be set in code by specific scripts. + defaults = { + 'world_size': 1, + 'rank': -1, + 'persist_state': 0, + 'lazy': False, + 'transpose': False, + 'data_set_type': 'supervised', + 'seq_length': 256, + 'eval_seq_length': 256, + 'samples_per_shard': 100 + } + + return DataConfig(defaults=defaults) diff --git a/modelscope/models/nlp/mglm/data_utils/__init__.py b/modelscope/models/nlp/mglm/data_utils/__init__.py new file mode 100644 index 00000000..fa243cb4 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/__init__.py @@ -0,0 +1,341 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""utils for creating datasets""" +import math +import os +import random +import time + +import torch + +from . import corpora +from .datasets import (BertSentencepairDataset, BlockDataset, ConcatDataset, + GPT2Dataset, ShuffleDataset, SplitDataset, XLDataset, + split_ds) +from .lazy_loader import (LazyLoader, LazyWriter, exists_lazy, exists_scatter, + get_scatter_path) +from .samplers import DistributedBatchSampler +from .tokenization import (BertWordPieceTokenizer, CharacterLevelTokenizer, + CommandToken, GPT2BPETokenizer, Tokenization, + Tokenizer, make_tokenizer) + +TRAIN_DATA = 0 +VAL_DATA = 1 +TEST_DATA = 2 + + +def should_split(split): + """ + given split proportions checks if should split + Examples: + >>> should_split([10,0,0]) + False + >>> should_split([1,.1,.2]) + True + """ + return max(split) / sum(split) != 1. 
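+# Worked example (illustrative): get_split() in configure_data.py normalizes the
+# --split string before it reaches this check, assuming no --valid-data or
+# --test-data overrides are given:
+#
+#     '900,50,50'  ->  [0.9, 0.05, 0.05]  ->  should_split(...) is True
+#     '1'          ->  [1.0, 0.0, 0.0]    ->  should_split(...) is False
+#
+# When --valid-data / --test-data files are supplied, the corresponding
+# proportion is zeroed out and the remainder renormalized, so those sets are
+# not carved out of the training corpus here.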
+ + +def get_ext(path): + """gets path extension""" + return os.path.splitext(path)[1] + + +def get_dataset(name, + tokenizer, + pre_tokenize, + data_parallel_rank, + loader_scatter=None, + no_lazy_loader=False, + half_lazy_loader=False): + """gets dataset object based on keyword args and file at `path`""" + global_rank = torch.distributed.get_rank() + if not supported_corpus(name): + raise NotImplementedError('dataset %s is not supported' % name) + dataset = corpora.NAMED_CORPORA[name] + path = dataset.PATH + if issubclass(dataset, corpora.PromptReader): + if not (exists_lazy(path, data_type='prompt') + and exists_lazy(path, data_type='text')) and not ( + loader_scatter is not None and exists_scatter( + path, data_type='prompt', scatter_num=loader_scatter) + and exists_scatter( + path, data_type='text', scatter_num=loader_scatter)): + # create cached version of dataset for lazy loading if it doesn't exist + if global_rank == 0: + print(f'Creating lazy loader for dataset {name}') + prompt_writer = LazyWriter( + path, data_type='prompt', is_array=pre_tokenize) + text_writer = LazyWriter( + path, data_type='text', is_array=pre_tokenize) + writers = {'prompt': prompt_writer, 'text': text_writer} + reader = dataset( + writers=writers, + tokenizer=tokenizer, + tokenize=pre_tokenize) + reader.process() + prompt_writer.close() + text_writer.close() + else: + while not os.path.exists( + LazyWriter.get_len_path(path, data_type='prompt')): + time.sleep(1) + map_fn = (lambda x: x.tolist()) if pre_tokenize else None + if loader_scatter is not None: + if not (exists_scatter( + path, data_type='prompt', scatter_num=loader_scatter) + and exists_scatter( + path, data_type='text', scatter_num=loader_scatter)): + if global_rank == 0: + print(f'Creating scatter loader for dataset {name}') + prompts = LazyLoader( + path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + indices = list(range(len(texts))) + random.shuffle(indices) + segment_length = (len(indices) - 1) // loader_scatter + 1 + for i in range(loader_scatter): + scatter_path = get_scatter_path(path, scatter_rank=i) + prompt_writer = LazyWriter( + scatter_path, + data_type='prompt', + is_array=pre_tokenize) + text_writer = LazyWriter( + scatter_path, + data_type='text', + is_array=pre_tokenize) + for idx in indices[i * segment_length:(i + 1) + * segment_length]: + prompt_writer.write(prompts[idx]) + text_writer.write(texts[idx]) + prompt_writer.close() + text_writer.close() + else: + while not (exists_scatter( + path, data_type='prompt', + scatter_num=loader_scatter) and exists_scatter( + path, + data_type='text', + scatter_num=loader_scatter)): + time.sleep(1) + scatter_path = get_scatter_path( + path, scatter_rank=data_parallel_rank % loader_scatter) + print(f'Rank {global_rank} is using scatter from {scatter_path}') + prompts = LazyLoader( + scatter_path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + texts = LazyLoader( + scatter_path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + else: + prompts = LazyLoader( + path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + 
mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + text = corpora.PromptDataset( + prompt_loader=prompts, + text_loader=texts, + tokenizer=tokenizer, + to_tokenize=not pre_tokenize) + if loader_scatter is None: + if global_rank == 0: + print(f'Create dataset {name} with {len(text)} documents') + for i in range(10): + rand_id = i if i < 5 else random.randrange(len(text)) + sample_tokens = text[rand_id]['tokens'][:1024] + print(sample_tokens) + print(tokenizer.DecodeIds(sample_tokens).encode('utf-8')) + else: + for scatter_id in range(loader_scatter): + if data_parallel_rank % loader_scatter == scatter_id and data_parallel_rank // loader_scatter == 0: + print( + f'Create dataset {name} at scatter {scatter_id} with {len(text)} documents' + ) + for i in range(10): + sample_tokens = text[i]['tokens'][:1024] + print(sample_tokens) + print(tokenizer.DecodeIds(sample_tokens)) + torch.distributed.barrier() + return text + elif issubclass(dataset, corpora.KeyReader): + if not (exists_lazy(path, data_type='text') + and exists_lazy(path, data_type='mask')): + # create cached version of dataset for lazy loading if it doesn't exist + if global_rank == 0: + text_writer = LazyWriter( + path, data_type='text', is_array=pre_tokenize) + mask_writer = LazyWriter(path, data_type='mask', is_array=True) + writers = {'mask': mask_writer, 'text': text_writer} + dataset( + writers=writers, + tokenizer=tokenizer, + tokenize=pre_tokenize) + mask_writer.close() + text_writer.close() + else: + while not os.path.exists( + LazyWriter.get_len_path(path, data_type='mask')): + time.sleep(1) + map_fn = (lambda x: x.tolist()) if pre_tokenize else None + masks = LazyLoader( + path, data_type='mask', map_fn=map_fn, mem_map=True, is_array=True) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + text = corpora.KeyDataset( + mask_loader=masks, + text_loader=texts, + tokenizer=tokenizer, + to_tokenize=not pre_tokenize) + return text + + +def supported_corpus(corpus_name): + """checks if corpus name is defined in `corpora.py`""" + return corpus_name in corpora.NAMED_CORPORA + + +def make_dataset(path, + seq_length, + mem_length, + shuffle=True, + split=None, + tokenizer=None, + sample_one_document=False, + pre_tokenize=False, + ds_type='', + save_splits=None, + load_splits=None, + save_test_data=None, + no_lazy_loader=False, + loader_scatter=None, + data_parallel_rank=None, + filter_english=False, + non_sentence_start=0.0, + half_lazy_loader=False, + **kwargs): + """function to create datasets+tokenizers for common options""" + if split is None: + split = [1.] 
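+        # default: no real split requested; should_split([1.]) is False below,
+        # so the whole corpus comes back as a single (wrapped) dataset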
+ + # get one or multiple datasets and concatenate + if isinstance(path, str): + ds = get_dataset( + path, + tokenizer=tokenizer, + pre_tokenize=pre_tokenize, + no_lazy_loader=no_lazy_loader, + loader_scatter=loader_scatter, + data_parallel_rank=data_parallel_rank, + half_lazy_loader=half_lazy_loader) + else: + ds = [ + get_dataset( + p, + tokenizer=tokenizer, + pre_tokenize=pre_tokenize, + no_lazy_loader=no_lazy_loader, + loader_scatter=loader_scatter, + data_parallel_rank=data_parallel_rank, + half_lazy_loader=half_lazy_loader) for p in path + ] + ds = ConcatDataset(ds) + + # Split dataset into train/val/test (and wrap bert dataset) + def wrap_dataset(dataset): + if ds_type.lower() == 'bert': + presplit_sentences = kwargs[ + 'presplit_sentences'] if 'presplit_sentences' in kwargs else False + dataset = BertSentencepairDataset( + dataset, + max_seq_len=seq_length, + presplit_sentences=presplit_sentences) + elif ds_type.lower() == 'gpt-xl': + assert pre_tokenize + dataset = XLDataset( + dataset, + tokenizer, + max_seq_len=seq_length, + mem_len=mem_length, + sample_across_doc=not sample_one_document) + elif ds_type.lower() == 'gpt2': + dataset = GPT2Dataset( + dataset, + tokenizer, + max_seq_len=seq_length, + sample_across_doc=not sample_one_document) + elif ds_type.lower() == 'block': + dataset = BlockDataset( + dataset, + tokenizer, + max_seq_len=seq_length, + sample_across_doc=not sample_one_document, + filter_english=filter_english, + non_sentence_start=non_sentence_start) + return dataset + + if should_split(split): + ds = split_ds( + ds, + split, + shuffle=shuffle, + save_splits=save_splits, + load_splits=load_splits) + if save_test_data is not None and torch.distributed.get_rank() == 0: + test_ds = ds[-1] + with open(save_test_data, 'w', encoding='utf-8') as output: + for data in test_ds: + text = data['tokens'] + text = tokenizer.DecodeIds(text) + output.write(text) + output.write('\n') + print(f'Write test data to {save_test_data}') + ds = [wrap_dataset(d) if d is not None else None for d in ds] + else: + ds = wrap_dataset(ds) + return ds diff --git a/modelscope/models/nlp/mglm/data_utils/corpora.py b/modelscope/models/nlp/mglm/data_utils/corpora.py new file mode 100755 index 00000000..7c6f58f8 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/corpora.py @@ -0,0 +1,583 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
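+# The readers below are registered by name in NAMED_CORPORA (at the end of this
+# file) and resolved through data_utils.get_dataset(); each one points PATH at a
+# local corpus dump and implements process_line() to turn one raw record into
+# parallel (prompt, text) lists. A minimal custom reader might look roughly like
+# this hypothetical sketch (class name, PATH and the 'text' field are placeholders):
+#
+#     class my_corpus(PromptReader):
+#         PATH = '/path/to/my_corpus'
+#         assert_str = 'make sure to set PATH for my_corpus in data_utils/corpora.py'
+#
+#         def process_line(self, data, tokenizer, tokenize):
+#             text = data.get('text', '')
+#             if not text:
+#                 return [], []
+#             prompt = self.process_sample('', tokenizer, tokenize)
+#             text = self.process_sample(text, tokenizer, tokenize)
+#             return [prompt], [text]
+#
+# plus a 'my_corpus': my_corpus entry in NAMED_CORPORA so that it can be selected
+# with e.g. --train-data my_corpus.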
+"""several datasets with preset arguments""" +import os +import random +from collections import defaultdict +from multiprocessing import Process, Queue +from queue import Empty + +import json +import tqdm +from torch.utils import data + +from modelscope.models.nlp.mglm.utils import print_rank_0 +from .datasets import csv_dataset, json_dataset +from .lazy_loader import LazyLoader + +NUM_PROCESSES = 100 + + +def punctuation_standardization(string: str): + punctuation_dict = { + '\u201c': "\"", + '\u201d': "\"", + '\u2019': "'", + '\u2018': "'", + '\u2013': '-' + } + for key, value in punctuation_dict.items(): + string = string.replace(key, value) + return string + + +class KeyDataset(data.Dataset): + + def __init__(self, text_loader, mask_loader, **kwargs): + self.texts = text_loader + self.masks = mask_loader + self.is_lazy = False + if isinstance(self.texts, LazyLoader) and isinstance( + self.masks, LazyLoader): + self.text_lens = self.texts.lens + self.is_lazy = True + + def get_text_len(self, idx): + return self.text_lens[idx] + + def __getitem__(self, index): + text = self.texts[index] + mask_length = self.masks[index] + mask = [] + for i, length in enumerate(mask_length): + if i % 2 == 0: + mask += [0] * length + else: + mask += [1] * length + assert len(text) == len(mask) + return {'tokens': text, 'loss_masks': mask} + + def __len__(self): + return len(self.texts) + + +class PromptDataset(data.Dataset): + + def __init__(self, + prompt_loader, + text_loader, + tokenizer=None, + to_tokenize=False, + **kwargs): + self.prompts = prompt_loader + self.texts = text_loader + self.tokenizer = tokenizer + self.to_tokenize = to_tokenize + if isinstance(self.prompts, LazyLoader) and isinstance( + self.texts, LazyLoader): + self.prompt_lens = self.prompts.lens + self.text_lens = self.texts.lens + self.is_lazy = True + + def get_text_len(self, idx): + return self.prompt_lens[idx] + self.text_lens[idx] + + def __getitem__(self, index): + prompt = self.prompts[index] + text = self.texts[index] + if self.to_tokenize: + prompt = self.tokenizer.EncodeAsIds(prompt).tokenization + text = self.tokenizer.EncodeAsIds(text).tokenization + return { + 'tokens': prompt + text, + 'loss_masks': [0] * len(prompt) + [1] * len(text) + } + + def __len__(self): + return len(self.prompts) + + +class DataReader: + PATH = None + assert_str = None + reserve_punct = False + split_row = True + TASK_QUEUE_LIMIT = 10000000 + DONE_QUEUE_LIMIT = 10000000 + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + raise NotImplementedError + + def print_info(self, info): + pass + + def __init__(self, writers, tokenizer=None, tokenize=False, **kwargs): + print(self.PATH) + print(self.assert_str) + assert os.path.exists(self.PATH), self.assert_str + print_rank_0(f'Creating dataset from {self.PATH}') + self.tokenizer = tokenizer + self.tokenize = tokenize + self.writers = writers + + def process(self): + if os.path.isdir(self.PATH): + paths = [ + os.path.join(top, name) for top, _, names in os.walk(self.PATH) + for name in names + ] + # paths = [entry.path for entry in os.scandir(self.PATH) if + # not entry.is_dir() and not entry.name.endswith("bz2")] + else: + paths = [self.PATH] + task_queue, done_queue, info_queue = Queue( + maxsize=self.TASK_QUEUE_LIMIT), Queue( + maxsize=self.DONE_QUEUE_LIMIT), Queue() + processes = [] + for i in range(NUM_PROCESSES): + process = Process( + target=self.tokenize_worker, + args=(task_queue, done_queue, info_queue, self.tokenizer, + self.tokenize)) + process.start() + 
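+            # keep a handle on every worker: the result loop below expects one
+            # 'COMPLETE' sentinel per worker before it stops draining done_queue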
processes.append(process) + + def read_input_to_queue(): + for path in paths: + print_rank_0(f'Start reading {path}') + with open(path) as file: + items = json.load(file) + for item in items: + task_queue.put(item) + # if self.split_row: + # for row in file: + # task_queue.put(row) + # else: + # items = json.load(file) + # for item in items["RECORDS"]: + # task_queue.put(item) + print_rank_0('Read input complete') + for i in range(len(processes)): + task_queue.put('STOP') + + process = Process(target=read_input_to_queue) + process.start() + count = len(processes) + progress_bar = tqdm.tqdm() + while True: + data = done_queue.get() + if data == 'COMPLETE': + count -= 1 + if count == 0: + break + else: + self.write_result(data, self.writers) + progress_bar.update() + progress_bar.close() + self.print_info(info_queue) + + @staticmethod + def write_result(data, writers): + raise NotImplementedError + + @staticmethod + def get_token_count(contents): + return sum(map(len, contents)) + + @classmethod + def process_sample(cls, text, tokenizer, tokenize): + if isinstance(text, str) and tokenize: + if not cls.reserve_punct: + text = punctuation_standardization(text) + text = tokenizer.EncodeAsIds(text).tokenization if text else [] + return text + + @staticmethod + def trim_field(content, max_length): + if len(content) > max_length: + content = content[:max_length] + content += '......' + return content + + def process_line(self, data, tokenizer, tokenize): + raise NotImplementedError + + +class PromptReader(DataReader): + is_json = True + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + for row in iter(input.get, 'STOP'): + if row: + if self.is_json: + row = row.rstrip() + row = json.loads(row) + prompts, texts = self.process_line(row, tokenizer, tokenize) + for prompt, text in zip(prompts, texts): + output.put((prompt, text)) + output.put('COMPLETE') + + @staticmethod + def write_result(data, writers): + prompt, text = data + writers['prompt'].write(prompt) + writers['text'].write(text) + + +class KeyReader(DataReader): + PATH = '/root/data/wikipedia/wiki-key.txt' + assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + keys, contents = data['key'], data['content'] + assert len(keys) == len(contents) + for i in range(1, len(keys)): + keys[i] = ' ' + keys[i] + contents = [' ' + content for content in contents] + keys = [tokenizer.EncodeAsIds(key).tokenization for key in keys] + contents = [ + tokenizer.EncodeAsIds(content).tokenization for content in contents + ] + summary = sum(keys, []) + summary_prefix = self.process_sample('Summary: ', tokenizer, tokenize) + summary_mask = [len(summary_prefix), len(summary)] + summary = summary_prefix + summary + text, text_mask = [], [] + for key, content in zip(keys, contents): + content = content + [tokenizer.get_command('eop').Id] + text += key + text += content + text_mask.append(len(key)) + text_mask.append(len(content)) + return (summary, summary_mask), (text, text_mask) + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + for row in iter(input.get, 'STOP'): + data = json.loads(row) + summary, content = self.process_line(data, tokenizer, tokenize) + output.put((summary, content)) + output.put('COMPLETE') + + @staticmethod + def write_result(data, writers): + summary, content = data + writers['text'].write(summary[0]) + writers['mask'].write(summary[1]) + writers['text'].write(content[0]) + writers['mask'].write(content[1]) + + +class 
zhihu(PromptReader): + PATH = '/dataset/fd5061f6/data/tokenize_data/zhihu.lazy' + reserve_punct = True + assert_str = 'make sure to set PATH for zhihu data_utils/corpora.py' + qtitle_prefix = '问题:' + qcontent_prefix = '问题描述:' + user_prefix = '回答用户:' + answer_prefix = ' 回答:' + + # qtitle_prefix = [] + # qcontent_prefix = [] + # user_prefix = [] + # answer_prefix = [] + + def process_line(self, data, tokenizer, tokenize): + prompts, texts = [], [] + ans_length = len(data.get('ans-content', '')) + ans_up = data.get('ans-up-num', '') + ans_up = int(ans_up) if ans_up else 0 + if ans_length > 100 or ans_up > 1000: + qtitle = data['q_title'] + qcontent = data['q-content'] + if qcontent is None: + qcontent = '' + qcontent = self.trim_field(qcontent, max_length=100) + user = data.get('user-signature', '') + prompt = self.qtitle_prefix + qtitle + self.qcontent_prefix + qcontent + self.user_prefix + user + self.answer_prefix # noqa + text = data['ans-content'] + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + # prompt = data["q_title"] + data["q-content"] + data["user-signature"] + # text = data["ans-content"] + # prompts.append(prompt) + # texts.append(text) + return prompts, texts + + +class zhidao(PromptReader): + PATH = '/root/data/zhidao/zhidao' + reserve_punct = True + assert_str = 'make sure to set PATH for zhidao data_utils/corpora.py' + qtitle_prefix = '问题:' + qcontent_prefix = '问题描述:' + answer_prefix = '回答:' + + def process_line(self, data, tokenizer, tokenize): + if 'title' not in data: + return [], [] + prompts, texts = [], [] + qtitle = data['title'] + qcontent = data.get('content', '') + qcontent = self.trim_field(qcontent, max_length=100) + prompt = self.qtitle_prefix + qtitle + self.qcontent_prefix + qcontent + self.answer_prefix + prompt = self.process_sample(prompt, tokenizer, tokenize) + if 'best_answer' in data: + text = data['best_answer']['content'] + if len(text) > 10: + text = self.process_sample(text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + for answer in data.get('other_answers', []): + text = answer['content'] + if len(text) > 100: + text = self.process_sample(text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + return prompts, texts + + +class baike(PromptReader): + PATH = '/dataset/fd5061f6/data/tokenize_data/baike.lazy' + reserve_punct = True + assert_str = 'make sure to set PATH for baike data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + prompts, texts = [], [] + text = data.get('title', '') + data.get('abstract', '') + data.get( + 'content', '') + if text: + p, t = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(p) + texts.append(t) + return prompts, texts + + +class wikipedia(PromptReader): + """ + dataset for wikipedia with arguments configured for convenience + + command line usage: `--train-data wikipedia` + """ + # PATH = '/dataset/data/wiki.txt' + PATH = '/root/data/bert_data/wiki.txt' + assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + text = data['text'] + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + + +class TestDataset(PromptReader): + PATH = '/root/data/test.json' + assert_str = 'make sure to set PATH for wikipedia 
data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + prompt, text = data['prompt'], data['text'] + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + + +class OpenWebText(PromptReader): + PATH = '/dataset/fd5061f6/english_data/openwebtext2' + assert_str = 'make sure to set PATH for openwebtext data_utils/corpora.py' + + def __init__(self, *args, **kwargs): + import fasttext + super().__init__(*args, **kwargs) + self.model = fasttext.load_model( + '/dataset/fd5061f6/english_data/lid.176.bin') + print_rank_0('Load language detection model') + + def process_line(self, data, tokenizer, tokenize): + text = data['text'] + if len(text) > 100: + lang = self.model.predict(text.replace('\n', ''))[0][0] + if lang == '__label__en': + prompt, text = self.process_sample( + '', tokenizer, + tokenize), self.process_sample(text, tokenizer, tokenize) + return [prompt], [text] + return [], [] + + +class CCNews(PromptReader): + PATH = '/mnt/cc_news.json' + assert_str = 'make sure to set PATH for cc-news data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + text = '' + title = data.get('title', None) + description = data.get('description', None) + maintext = data.get('maintext', None) + if title: + text += title.strip() + ' ' + if description and (not maintext + or not maintext.startswith(description)): + text += description.strip() + ' ' + if maintext: + text += maintext + if len(text) > 100: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class BertData(PromptReader): + is_json = False + PATH = '/dataset/fd5061f6/english_data/wikibook' + + def process_line(self, data, tokenizer, tokenize): + if data: + prompt, text = '', data + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class Pile(PromptReader): + is_json = True + PATH = '/mnt/train' + filtered_sources = [ + 'Github', 'StackExchange', 'DM Mathematics', 'Ubuntu IRC', 'EuroParl', + 'YoutubeSubtitles', 'Enron Emails' + ] + downsample_sources = {'PubMed Central': 0.3, 'ArXiv': 0.3, 'FreeLaw': 0.3} + + def print_info(self, info): + total_dict = defaultdict(int) + while True: + try: + source_dict = info.get(block=False) + for source, length in source_dict.items(): + total_dict[source] += length + except Empty: + break + print_rank_0(total_dict) + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + source_dict = defaultdict(int) + for row in iter(input.get, 'STOP'): + row = row.rstrip() + if row: + if self.is_json: + row = json.loads(row) + prompts, texts, source = self.process_line( + row, tokenizer, tokenize) + length = 0 + for prompt, text in zip(prompts, texts): + length += len(text) + output.put((prompt, text)) + if source: + source_dict[source] += length + output.put('COMPLETE') + info.put(source_dict) + + def process_line(self, data, tokenizer, tokenize): + source = data['meta'].get('pile_set_name', None) + text = data.get('text', None) + if source and text: + if source in self.filtered_sources: + return [], [], None + elif source in self.downsample_sources and random.random( + ) > self.downsample_sources[source]: + return [], [], None + else: + prompt, text = self.process_sample( + '', tokenizer, + tokenize), self.process_sample(text, tokenizer, 
tokenize) + return [prompt], [text], source + else: + return [], [], None + + +class Stories(PromptReader): + is_json = True + PATH = '/dataset/fd5061f6/english_data/stories_31G.jsonl' + + def process_line(self, data, tokenizer, tokenize): + text = data.get('text', None) + if text: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class BertBaseData(BertData): + PATH = '/root/data/formatted_one_article_per_line' + + +class BertLargeData(BertData): + PATH = '/dataset/c07bd62b/cognitive/zhengxiao/formatted_one_article_per_line_large' + + +class WuDaoCorpus(PromptReader): + # PATH = "/dataset/fd5061f6/chinese_data/WuDao" + PATH = '/wudao' + is_json = False + reserve_punct = True + split_row = False + + def process_line(self, item, tokenizer, tokenize): + prompts, texts = [], [] + text = '' + title = item.get('title', None) + content = item.get('content', None) + if title: + text += title.strip() + ' ' + if content: + text += content + if len(text) > 100: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + return prompts, texts + + +NAMED_CORPORA = { + 'wikipedia': wikipedia, + 'wikipedia-key': KeyReader, + 'openwebtext': OpenWebText, + 'zhihu': zhihu, + 'zhidao': zhidao, + 'baike': baike, + 'test': TestDataset, + 'wikibook': BertData, + 'bert-base': BertBaseData, + 'bert-large': BertLargeData, + 'cc-news': CCNews, + 'pile': Pile, + 'stories': Stories, + 'wudao': WuDaoCorpus +} diff --git a/modelscope/models/nlp/mglm/data_utils/datasets.py b/modelscope/models/nlp/mglm/data_utils/datasets.py new file mode 100644 index 00000000..777b7d43 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/datasets.py @@ -0,0 +1,1244 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
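+# Rough composition of the classes in this file, as driven by
+# data_utils.make_dataset() in __init__.py:
+#
+#     corpora readers / json_dataset / csv_dataset      raw records -> token dicts
+#         -> ConcatDataset([...])                       several corpora as one dataset
+#         -> split_ds(ds, split) -> SplitDataset views  train / val / test subsets
+#         -> XLDataset / GPT2Dataset / BlockDataset / BertSentencepairDataset
+#            which build sequence-length-bounded samples for the chosen objective
+#
+# ConcatDataset, SplitDataset and ShuffleDataset only re-index the dataset they
+# wrap; items stay plain dicts such as {'text': ..., 'label': ...} or
+# {'tokens': ..., 'loss_masks': ...} depending on the underlying reader.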
+"""dataset objects for jsons, csvs, and BERT datasets""" + +import csv +import math +import os +import random +import time +from bisect import bisect_right +from itertools import accumulate +from operator import itemgetter + +import json +import nltk +import numpy as np +import pandas as pd +import torch +import tqdm +from nltk import tokenize +from torch.utils import data + +from modelscope.models.nlp.mglm.utils import print_rank_0 +from .lazy_loader import LazyLoader, exists_lazy + + +class ShuffleDataset(data.Dataset): + + def __init__(self, ds): + self.ds = ds + self.shuffle_ids = list(range(len(self.ds))) + random.shuffle(self.shuffle_ids) + self.is_lazy = hasattr(ds, 'is_lazy') and ds.is_lazy + if self.is_lazy: + self.prompt_lens = [ + self.ds.prompt_lens[idx] for idx in self.shuffle_ids + ] + self.text_lens = [ + self.ds.text_lens[idx] for idx in self.shuffle_ids + ] + + def __getitem__(self, idx): + return self.ds[self.shuffle_ids[idx]] + + def __len__(self): + return len(self.ds) + + +class ConcatDataset(data.Dataset): + """ + Dataset to concatenate multiple datasets. + Purpose: useful to assemble different existing datasets, possibly + large-scale datasets as the concatenation operation is done in an + on-the-fly manner. + Arguments: + datasets (sequence): List of datasets to be concatenated. + """ + + @staticmethod + def cumsum(sequence): + r, s = [], 0 + for e in sequence: + l = len(e) # noqa + r.append(l + s) + s += l + return r + + def __init__(self, datasets, **kwargs): + super(ConcatDataset, self).__init__() + assert len(datasets) > 0, 'datasets should not be an empty iterable' + self.datasets = list(datasets) + self.is_lazy = sum([ + isinstance(ds, LazyLoader) + or (hasattr(ds, 'is_lazy') and ds.is_lazy) for ds in self.datasets + ]) == len(self.datasets) + self.cumulative_sizes = self.cumsum(self.datasets) + self._X = None + self._Y = None + self._lens = None + + def get_text_len(self, idx): + dataset_idx = bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].get_text_len(sample_idx) + + def SetTokenizer(self, tokenizer): + for ds in self.datasets: + ds.SetTokenizer(tokenizer) + + def GetTokenizer(self): + return self.datasets[0].GetTokenizer() + + def __len__(self): + return self.cumulative_sizes[-1] + + def __getitem__(self, idx): + dataset_idx = bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx][sample_idx] + + @property + def lens(self): + if self._lens is None: + self._lens = [] + if self.is_lazy: + for data in self.datasets: # noqa + self._lens.extend(data.lens) + else: + for data in self.datasets: # noqa + self._lens.extend([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in data + ]) + return self._lens + + @property + def X(self): + if self._X is None: + self._X = [] + for data in self.datasets: # noqa + self._X.extend(data.X) + return self._X + + @property + def Y(self): + if self._Y is None: + self._Y = [] + for data in self.datasets: # noqa + self._Y.extend(list(data.Y)) + self._Y = np.array(self._Y) + return self._Y + + +class SplitDataset(data.Dataset): + """ + Dataset wrapper to access a subset of another dataset. + Purpose: useful to index into existing datasets, possibly + large-scale datasets as the subindexing operation is done in an + on-the-fly manner. 
+ Arguments: + ds (Dataset or array-like): List of datasets to be subindexed + split_inds (1D array-like): List of indices part of subset + """ + + def __init__(self, ds, split_inds, **kwargs): + self.split_inds = list(split_inds) + self.wrapped_data = ds + self.is_lazy = isinstance(ds, LazyLoader) or (hasattr(ds, 'is_lazy') + and ds.is_lazy) + self._X = None + self._Y = None + + def __len__(self): + return len(self.split_inds) + + def get_text_len(self, idx): + return self.wrapped_data.get_text_len(self.split_inds[idx]) + + def __getitem__(self, index): + return self.wrapped_data[self.split_inds[index]] + + def SetTokenizer(self, tokenizer): + self.wrapped_data.SetTokenizer(tokenizer) + + def GetTokenizer(self): + return self.wrapped_data.GetTokenizer() + + @property + def X(self): + if self._X is None: + self._X = itemgetter(*self.split_inds)(self.wrapped_data.X) + return self._X + + @property + def Y(self): + if self._Y is None: + self._Y = np.array( + itemgetter(*self.split_inds)(self.wrapped_data.Y)) + return self._Y + + def __iter__(self): + for idx in self.split_inds: + yield self.wrapped_data[idx] + + +def split_ds(ds, split=None, shuffle=True, save_splits=None, load_splits=None): + """ + Split a dataset into subsets given proportions of how + much to allocate per split. If a split is 0% returns None for that split. + Purpose: Useful for creating train/val/test splits + Arguments: + ds (Dataset or array-like): Data to be split. + split (1D array-like): proportions to split `ds`. `sum(splits) != 0` + shuffle (boolean): Randomly split dataset. Default: True + save_splits: save split indices to file + load_splits: load split indices from file + """ + if split is None: + split = [.8, .2, .0] + split_sum = sum(split) + if split_sum == 0: + raise Exception('Split cannot sum to 0.') + split = np.array(split) + split /= split_sum + ds_len = len(ds) + inds = np.arange(ds_len) + if shuffle: + rng = np.random.RandomState(1234) + rng.shuffle(inds) + if load_splits is not None: + inds = np.load(load_splits) + assert len(inds) == ds_len + print_rank_0(f'Load split indices from {load_splits}') + elif save_splits is not None: + if torch.distributed.get_rank() == 0: + np.save(save_splits, inds) + print(f'Save split indices to {save_splits}') + start_idx = 0 + residual_idx = 0 + rtn_ds = [None] * len(split) + for i, f in enumerate(split): + if f != 0: + proportion = ds_len * split[i] + residual_idx += proportion % 1 + split_ = int(int(proportion) + residual_idx) + split_inds = inds[start_idx:start_idx + max(split_, 1)] + rtn_ds[i] = SplitDataset(ds, split_inds) + start_idx += split_ + residual_idx %= 1 + return rtn_ds + + +class csv_dataset(data.Dataset): + """ + Class for loading datasets from csv files. + Purpose: Useful for loading data for unsupervised modeling or transfer tasks + Arguments: + path (str): Path to csv file with dataset. + tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None + preprocess_fn (callable): Callable that process a string into desired format. + delim (str): delimiter for csv. Default: ',' + binarize_sent (bool): binarize label values to 0 or 1 if they\'re on a different scale. Default: False + drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty + columns with -1 (regardless if rows are dropped based on value) Default: False + text_key (str): key to get text from csv. Default: 'sentence' + label_key (str): key to get label from json dictionary. 
Default: 'label' + Attributes: + X (list): all strings from the csv file + Y (np.ndarray): labels to train with + """ + + def __init__(self, + path, + tokenizer=None, + preprocess_fn=None, + delim=',', + binarize_sent=False, + drop_unlabeled=False, + text_key='sentence', + label_key='label', + **kwargs): + self.is_lazy = False + self.preprocess_fn = preprocess_fn + self.SetTokenizer(tokenizer) + self.path = path + self.delim = delim + self.text_key = text_key + self.label_key = label_key + self.drop_unlabeled = drop_unlabeled + + if '.tsv' in self.path: + self.delim = '\t' + + self.X = [] + self.Y = [] + try: + cols = [text_key] + if isinstance(label_key, list): + cols += label_key + else: + cols += [label_key] + data = pd.read_csv( + self.path, sep=self.delim, usecols=cols, encoding='latin-1') + except: # noqa + data = pd.read_csv( + self.path, + sep=self.delim, + usecols=[text_key], + encoding='latin-1') + + data = data.dropna(axis=0) + + self.X = data[text_key].values.tolist() + try: + self.Y = data[label_key].values + except Exception as e: # noqa + self.Y = np.ones(len(self.X)) * -1 + + if binarize_sent: + self.Y = binarize_labels(self.Y, hard=binarize_sent) + + def SetTokenizer(self, tokenizer): + if tokenizer is None: + self.using_tokenizer = False + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self.using_tokenizer = True + self._tokenizer = tokenizer + + def GetTokenizer(self): + return self._tokenizer + + @property + def tokenizer(self): + if self.using_tokenizer: + return self._tokenizer + return None + + def __len__(self): + return len(self.X) + + def __getitem__(self, index): + """process+tokenize string and return string,label,and stringlen""" + x = self.X[index] + if self.tokenizer is not None: + x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) + elif self.preprocess_fn is not None: + x = self.preprocess_fn(x) + y = self.Y[index] + if isinstance(y, str): + if self.tokenizer is not None: + y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn) + elif self.preprocess_fn is not None: + y = self.preprocess_fn(y) + return {'text': x, 'length': len(x), 'label': y} + + def write(self, writer_gen=None, path=None, skip_header=False): + """ + given a generator of metrics for each of the data points X_i, + write the metrics, text, and labels to a csv file + """ + if path is None: + path = self.path + '.results' + print('generating csv at ' + path) + with open(path, 'w') as csvfile: + c = csv.writer(csvfile, delimiter=self.delim) + if writer_gen is not None: + # if first item of generator is a header of what the metrics mean then write header to csv file + if not skip_header: + header = (self.label_key, ) + tuple( + next(writer_gen)) + (self.text_key, ) + c.writerow(header) + for i, row in enumerate(writer_gen): + row = (self.Y[i], ) + tuple(row) + (self.X[i], ) + c.writerow(row) + else: + c.writerow([self.label_key, self.text_key]) + for row in zip(self.Y, self.X): + c.writerow(row) + + +class json_dataset(data.Dataset): + """ + Class for loading datasets from a json dump. + Purpose: Useful for loading data for unsupervised modeling or transfer tasks + Arguments: + path (str): path to json file with dataset. + tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None + preprocess_fn (callable): callable function that process a string into desired format. + Takes string, maxlen=None, encode=None as arguments. Default: process_str + text_key (str): key to get text from json dictionary. 
Default: 'sentence' + label_key (str): key to get label from json dictionary. Default: 'label' + Attributes: + all_strs (list): list of all strings from the dataset + all_labels (list): list of all labels from the dataset (if they have it) + """ + + def __init__(self, + path, + tokenizer=None, + preprocess_fn=None, + binarize_sent=False, + text_key='sentence', + label_key='label', + loose_json=False, + **kwargs): + self.is_lazy = False + self.preprocess_fn = preprocess_fn + self.path = path + self.SetTokenizer(tokenizer) + self.X = [] + self.Y = [] + self.text_key = text_key + self.label_key = label_key + self.loose_json = loose_json + + for j in self.load_json_stream(self.path): + s = j[text_key] + self.X.append(s) + self.Y.append(j[label_key]) + + if binarize_sent: + self.Y = binarize_labels(self.Y, hard=binarize_sent) + + def SetTokenizer(self, tokenizer): + if tokenizer is None: + self.using_tokenizer = False + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self.using_tokenizer = True + self._tokenizer = tokenizer + + def GetTokenizer(self): + return self._tokenizer + + @property + def tokenizer(self): + if self.using_tokenizer: + return self._tokenizer + return None + + def __getitem__(self, index): + """gets the index'th string from the dataset""" + x = self.X[index] + if self.tokenizer is not None: + x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) + elif self.preprocess_fn is not None: + x = self.preprocess_fn(x) + y = self.Y[index] + if isinstance(y, str): + if self.tokenizer is not None: + y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn) + elif self.preprocess_fn is not None: + y = self.preprocess_fn(y) + return {'text': x, 'length': len(x), 'label': y} + + def __len__(self): + return len(self.X) + + def write(self, writer_gen=None, path=None, skip_header=False): + """ + given a generator of metrics for each of the data points X_i, + write the metrics, text, and labels to a json file + """ + if path is None: + path = self.path + '.results' + + if writer_gen is not None: + # if first item of generator is a header of what the metrics mean then write header to csv file + def gen_helper(): + keys = {} + keys[0] = self.label_key + if not skip_header: + for idx, k in enumerate(tuple(next(writer_gen))): + keys[idx + 1] = k + for i, row in enumerate(writer_gen): + if i == 0 and skip_header: + for idx, _ in enumerate(row): + keys[idx + 1] = 'metric_%d' % (idx, ) + j = {} + for idx, v in enumerate((self.Y[i], ) + tuple(row)): + k = keys[idx] + j[k] = v + yield j + else: + + def gen_helper(): + for y in self.Y: + j = {} + j[self.label_key] = y + yield j + + def out_stream(): + for i, j in enumerate(gen_helper()): + j[self.text_key] = self.X[i] + yield j + + self.save_json_stream(path, out_stream()) + + def save_json_stream(self, save_path, json_stream): + if self.loose_json: + with open(save_path, 'w') as f: + for i, j in enumerate(json_stream): + write_string = '' + if i != 0: + write_string = '\n' + write_string += json.dumps(j) + f.write(write_string) + else: + jsons = [j for j in json_stream] + json.dump(jsons, open(save_path, 'w'), separators=(',', ':')) + + def load_json_stream(self, load_path): + if not self.loose_json: + jsons = json.load(open(load_path, 'r')) + generator = iter(jsons) + else: + + def gen_helper(): + with open(load_path, 'r') as f: + for row in f: + yield json.loads(row) + + generator = gen_helper() + + for j in generator: + if self.label_key not in j: + j[self.label_key] = -1 + yield j + + +class XLDataset(data.Dataset): + + 
def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + mem_len=None, + sample_across_doc=True, + **kwargs): + self.ds = ds + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + if mem_len is None: + mem_len = max_seq_len + self.mem_len = mem_len + self.sample_across_doc = sample_across_doc + self.indices, self.num_samples = None, None + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_indices() + + def init_indices(self): + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['prompt']) + + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.indices = list(accumulate(lens)) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.indices[-1]}' + ) + self.num_samples = self.indices[-1] // self.max_seq_len + 1 + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + tokens, targets, loss_mask, attention_mask = self.getidx(idx) + tokens = self.pad_seq(tokens) + targets = self.pad_seq(targets) + loss_mask = self.pad_seq(loss_mask, pad_id=0) + return { + 'text': np.array(tokens), + 'target': np.array(targets), + 'loss_mask': np.array(loss_mask), + 'attention_mask': np.array(attention_mask) + } + + def getidx(self, idx): + tokens, targets, loss_masks = [], [], [] + attention_mask = np.concatenate( + (np.zeros((self.max_seq_len, self.mem_len), dtype=np.long), + np.ones((self.max_seq_len, self.max_seq_len), dtype=np.long)), + axis=1) + sample_idx = bisect_right(self.indices, idx * self.max_seq_len) + last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1] + token_offset = idx * self.max_seq_len - last_end + if token_offset != 0: + history = min(self.mem_len, token_offset) + attention_mask[:, + -self.max_seq_len - history:-self.max_seq_len] = 1 + count = 0 + while len(tokens) < self.max_seq_len and sample_idx < len(self.ds): + item = self.ds[sample_idx] + text, masks = item['tokens'], item['loss_masks'] + text = text + [self.tokenizer.get_command('eos').Id] + end = min( + len(text) - 1, token_offset + self.max_seq_len - len(tokens)) + masks = masks + [1] + if count > 0: + current = len(tokens) + attention_mask[current:, :current + self.mem_len] = 0 + tokens += text[token_offset:end] + targets += text[token_offset + 1:end + 1] + loss_masks += masks[token_offset + 1:end + 1] + count += 1 + sample_idx += 1 + token_offset = 0 + return tokens, targets, loss_masks, attention_mask + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + +class BlockDataset(data.Dataset): + + def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + sample_across_doc=True, + non_sentence_start=0.0, + filter_english=False, + **kwargs): + """ + sentence_start: the stripped article must start with a complete sentence + """ + self.ds = ds + self.ds_len = len(self.ds) + self.num_samples = 1000 * self.ds_len + self.max_seq_len = max_seq_len + self.tokenizer = tokenizer + self.sample_across_doc = sample_across_doc + self.non_sentence_start = non_sentence_start + self.filter_english = filter_english + self.weighting, self.total_len = None, None + self.is_lazy = False + if self.filter_english: + import fasttext + self.model = fasttext.load_model('/mnt/lid.176.bin') + print_rank_0('Load language detection model') + if 
hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_weighting() + + def init_weighting(self): + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.total_len}, non sentence start{self.non_sentence_start}' # noqa + ) + self.weighting = list(accumulate(lens)) + + def get_weighted_samples(self, np_rng): + while True: + idx = np_rng.randint(self.total_len) + data_idx = bisect_right(self.weighting, idx) + tokens, loss_mask = self.getidx(data_idx) + if self.filter_english: + text = self.tokenizer.DecodeIds(tokens[:1024]) + lang = self.model.predict(text.replace('\n', ''))[0][0] + if lang == '__label__en': + break + else: + break + return tokens, loss_mask + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + # init rng + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + + # get possibly weighted random index from dataset + tokens, loss_mask = self.get_weighted_samples(rng) + # truncate or pad tokens + num_tokens = len(tokens) + tokens_to_strip = num_tokens - self.max_seq_len + 1 + + # randomly choose a position for start + if tokens_to_strip > 0: + move_count = 0 + strip_left_tokens = rng.randint(tokens_to_strip) + if rng.random() > self.non_sentence_start: + if rng.random() < 0.5: + while move_count < self.max_seq_len // 2 and strip_left_tokens > 0 and not self.contains_sentence_end( # noqa + tokens[strip_left_tokens - 1]): # noqa + strip_left_tokens -= 1 + move_count += 1 + else: + while move_count < self.max_seq_len // 2 and strip_left_tokens < len( + tokens) and not self.contains_sentence_end( + tokens[strip_left_tokens - 1]): + strip_left_tokens += 1 + move_count += 1 + tokens = [self.tokenizer.get_command('ENC').Id + ] + tokens[strip_left_tokens:] + loss_mask = [0] + loss_mask[strip_left_tokens:] + if len(tokens) == 2 and tokens[1] == self.tokenizer.get_command( + 'eos').Id: + tokens, loss_mask = [], [] + tokens, loss_mask = self.right_strip_seq(tokens, loss_mask, + self.max_seq_len) + else: + tokens = [self.tokenizer.get_command('ENC').Id] + tokens + loss_mask = [0] + loss_mask + # Sample multiple documents + if self.sample_across_doc: + while len(tokens) < self.max_seq_len: + new_tokens, new_loss_mask = self.get_weighted_samples(rng) + new_tokens = [self.tokenizer.get_command('ENC').Id + ] + new_tokens + new_loss_mask = [0] + new_loss_mask + is_last = len(new_tokens) >= self.max_seq_len - len(tokens) + new_tokens, new_loss_mask = self.right_strip_seq( + new_tokens, new_loss_mask, + self.max_seq_len - len(tokens)) + tokens += new_tokens + loss_mask += new_loss_mask + if is_last: + break + return {'text': np.array(tokens), 'loss_mask': np.array(loss_mask)} + + def right_strip_seq(self, tokens, loss_mask, seq_length): + strip_right_tokens = len(tokens) - seq_length + if strip_right_tokens > 0: + while strip_right_tokens < len( + tokens) - 1 and not self.contains_sentence_end( + tokens[-strip_right_tokens - 1]): + strip_right_tokens += 1 + if len(tokens) - strip_right_tokens < seq_length // 2: + strip_right_tokens = len(tokens) - seq_length + tokens = tokens[:-strip_right_tokens] + loss_mask = loss_mask[:-strip_right_tokens] + return tokens, loss_mask + + def getidx(self, data_idx): + data = self.ds[data_idx] + tokens, 
loss_masks = data['tokens'], data['loss_masks'] + tokens = tokens + [self.tokenizer.get_command('eos').Id] + loss_masks = loss_masks + [1] + return tokens, loss_masks + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + # TODO: rewrite this function for chinese + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + if ';' in tok: + return True + if ':' in tok: + return True + if '\n' in tok: + return True + return False + + +class GPT2Dataset(data.Dataset): + + def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + num_samples=None, + weighted=True, + sample_across_doc=True, + random_across_doc_sampling=True, + sentence_start=False, + **kwargs): + """ + sentence_start: the stripped article must start with a complete sentence + """ + self.ds = ds + self.ds_len = len(self.ds) + self.num_samples = num_samples + if num_samples is None: + self.num_samples = 1000 * self.ds_len + self.max_seq_len = max_seq_len + self.tokenizer = tokenizer + self.weighted = weighted + self.sample_across_doc = sample_across_doc + self.random_across_doc_sampling = random_across_doc_sampling + self.sentence_start = sentence_start + self.weighting, self.total_len = None, None + self.is_lazy = False + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_weighting() + + def init_weighting(self): + if self.weighted: + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.total_len}' + ) + self.weighting = list(accumulate(lens)) + else: + self.weighting = None + + def get_weighted_samples(self, np_rng): + if self.weighting is not None: + idx = np_rng.randint(self.total_len) + return bisect_right(self.weighting, idx) + else: + return np_rng.randint(self.ds_len) + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + # init rng + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + + # get possibly weighted random index from dataset + data_idx = self.get_weighted_samples(rng) + # data_idx = rng.choice(self.ds_len, p=self.weighting) + tokens, loss_mask = self.getidx(data_idx) + + # truncate or pad tokens + num_tokens = len(tokens) + tokens_to_strip = num_tokens - self.max_seq_len - 1 + + # randomly choose a position for start + if tokens_to_strip > 0: + strip_left_tokens = rng.randint(tokens_to_strip + 1) + tokens = tokens[strip_left_tokens:] + loss_mask = loss_mask[strip_left_tokens:] + # if self.sentence_start: + # token_copy = list(tokens) + # not_done = True + # while (len(token_copy) > 0) and not_done: + # tok = token_copy.pop(0) + # if self.contains_sentence_end(tok): + # tokens = token_copy + # not_done = False + strip_right_rokens = len(tokens) - self.max_seq_len - 1 + if strip_right_rokens > 0: + tokens = tokens[:-strip_right_rokens] + loss_mask = loss_mask[:-strip_right_rokens] + # Sample multiple documents + if self.sample_across_doc: + while (len(tokens) < (self.max_seq_len + 1)): + if self.random_across_doc_sampling: + data_idx = 
self.get_weighted_samples(rng) + else: + data_idx = (data_idx + 1) % self.ds_len + new_tokens, new_loss_mask = self.getidx(data_idx) + tokens += new_tokens + loss_mask += new_loss_mask + tokens = tokens[:(self.max_seq_len + 1)] + loss_mask = loss_mask[:(self.max_seq_len + 1)] + + tokens = self.pad_seq(tokens) + loss_mask = self.pad_seq(loss_mask, pad_id=0) + return {'text': np.array(tokens), 'loss_mask': np.array(loss_mask)} + + def getidx(self, data_idx): + data = self.ds[data_idx] + tokens, loss_masks = data['tokens'], data['loss_masks'] + tokens = tokens + [self.tokenizer.get_command('eos').Id] + loss_masks = loss_masks + [1] + return tokens, loss_masks + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + 1 + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + # TODO: rewrite this function for chinese + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + return False + + +class BertSentencepairDataset(data.Dataset): + """ + Dataset containing sentencepairs for BERT training. Each index corresponds to a randomly generated sentence pair. + Arguments: + ds (Dataset or array-like): data corpus to use for training + max_seq_len (int): maximum sequence length to use for a sentence pair + mask_lm_prob (float): proportion of tokens to mask for masked LM + max_preds_per_seq (int): Maximum number of masked tokens per sentence pair. Default: math.ceil(max_seq_len*mask_lm_prob/10)*10 + short_seq_prob (float): Proportion of sentence pairs purposefully shorter than max_seq_len + dataset_size (int): number of random sentencepairs in the dataset. 
Default: len(ds)*(len(ds)-1) + + """ # noqa + + def __init__(self, + ds, + max_seq_len=512, + mask_lm_prob=.15, + max_preds_per_seq=None, + short_seq_prob=.01, + dataset_size=None, + presplit_sentences=False, + weighted=True, + **kwargs): + self.ds = ds + self.ds_len = len(self.ds) + self.tokenizer = self.ds.GetTokenizer() + self.vocab_words = list(self.tokenizer.text_token_vocab.values()) + self.ds.SetTokenizer(None) + self.max_seq_len = max_seq_len + self.mask_lm_prob = mask_lm_prob + if max_preds_per_seq is None: + max_preds_per_seq = math.ceil(max_seq_len * mask_lm_prob / 10) * 10 + self.max_preds_per_seq = max_preds_per_seq + self.short_seq_prob = short_seq_prob + self.dataset_size = dataset_size + if self.dataset_size is None: + self.dataset_size = self.ds_len * (self.ds_len - 1) + self.presplit_sentences = presplit_sentences + if not self.presplit_sentences: + nltk.download('punkt', download_dir='./nltk') + self.weighted = weighted + self.get_weighting() + + def get_weighting(self): + if self.weighted: + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + lens = np.array(self.ds.lens) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + self.weighting = list(accumulate(lens)) + else: + self.weighting = None + + def get_weighted_samples(self, np_rng): + if self.weighting is not None: + idx = np_rng.randint(self.total_len) + return bisect_right(self.weighting, idx) + else: + return np_rng.randint(self.ds_len) + + def __len__(self): + return self.dataset_size + + def __getitem__(self, idx): + # get rng state corresponding to index (allows deterministic random pair) + rng = random.Random(idx) + np_rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + # get seq length + target_seq_length = self.max_seq_len + short_seq = False # noqa + if rng.random() < self.short_seq_prob: + target_seq_length = rng.randint(2, target_seq_length) + short_seq = True # noqa + + # get sentence pair and label + is_random_next = None + lena = 0 + lenb = 0 + while (is_random_next is None) or (lena < 1) or (lenb < 1): + tokensa, tokensb, is_random_next = self.create_random_sentencepair( + target_seq_length, rng, np_rng) + lena = len(tokensa[0]) + lenb = len(tokensb[0]) + + # truncate sentence pair to max_seq_len + tokensa, tokensb = self.truncate_seq_pair(tokensa, tokensb, + self.max_seq_len, rng) + # join sentence pair, mask, and pad + tokens, mask, mask_labels, pad_mask = self.create_masked_lm_predictions( + tokensa, tokensb, self.mask_lm_prob, self.max_preds_per_seq, + self.vocab_words, rng) + sample = { + 'text': np.array(tokens[0]), + 'types': np.array(tokens[1]), + 'is_random': int(is_random_next), + 'mask': np.array(mask), + 'mask_labels': np.array(mask_labels), + 'pad_mask': np.array(pad_mask) + } + return sample + + def sentence_split(self, document): + """split document into sentences""" + lines = document.split('\n') + if self.presplit_sentences: + return [line for line in lines if line] + rtn = [] + for line in lines: + if line != '': + rtn.extend(tokenize.sent_tokenize(line)) + return rtn + + def sentence_tokenize(self, + sent, + sentence_num=0, + beginning=False, + ending=False): + """tokenize sentence and get token types""" + tokens = self.tokenizer.EncodeAsIds(sent).tokenization + str_type = 'str' + str(sentence_num) + token_types = [self.tokenizer.get_type(str_type).Id] * len(tokens) + return tokens, token_types + + def get_doc(self, idx): + """gets text of document corresponding 
to idx""" + rtn = self.ds[idx] + if isinstance(rtn, dict): + rtn = rtn['text'] + return rtn + + def create_random_sentencepair(self, target_seq_length, rng, np_rng): + """ + fetches a random sentencepair corresponding to rng state similar to + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L248-L294 + """ + is_random_next = None + + curr_strs = [] + curr_str_types = [] + curr_len = 0 + + while curr_len < 1: + curr_len = 0 + doc_a = None + while doc_a is None: + if self.weighted: + # doc_a_idx = np_rng.choice(self.ds_len, p=self.weighting) + doc_a_idx = self.get_weighted_samples(np_rng) + else: + doc_a_idx = rng.randint(0, self.ds_len - 1) + doc_a = self.sentence_split(self.get_doc(doc_a_idx)) + if not doc_a: + doc_a = None + + random_start_a = rng.randint(0, len(doc_a) - 1) + while random_start_a < len(doc_a): + sentence = doc_a[random_start_a] + sentence, sentence_types = self.sentence_tokenize( + sentence, 0, random_start_a == 0, + random_start_a == len(doc_a)) + curr_strs.append(sentence) + curr_str_types.append(sentence_types) + curr_len += len(sentence) + if random_start_a == len( + doc_a) - 1 or curr_len >= target_seq_length: + break + random_start_a = (random_start_a + 1) + + if curr_strs: + num_a = 1 + if len(curr_strs) >= 2: + num_a = rng.randint(0, len(curr_strs)) + + tokens_a = [] + token_types_a = [] + for j in range(num_a): + tokens_a.extend(curr_strs[j]) + token_types_a.extend(curr_str_types[j]) + + tokens_b = [] + token_types_b = [] + is_random_next = False + if len(curr_strs) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + b_len = 0 + while b_len < 1: + doc_b = None + while doc_b is None: + doc_b_idx = rng.randint(0, self.ds_len - 2) + doc_b_idx += int(doc_b_idx >= doc_a_idx) + + doc_b = self.sentence_split(self.get_doc(doc_b_idx)) + if not doc_b: + doc_b = None + + random_start_b = rng.randint(0, len(doc_b) - 1) + while random_start_b < len(doc_b): + sentence_b = doc_b[random_start_b] + new_b_tokens, new_b_types = self.sentence_tokenize( + sentence_b, 1, random_start_b == 0, + random_start_b == len(doc_b)) + b_len += len(new_b_tokens) + tokens_b.extend(new_b_tokens) + token_types_b.extend(new_b_types) + if len(tokens_b) >= target_b_length: + break + random_start_b = (random_start_b + 1) + else: + is_random_next = False + for j in range(num_a, len(curr_strs)): + tokens_b.extend(curr_strs[j]) + token_types_b.extend(curr_str_types[j]) + + return (tokens_a, token_types_a), (tokens_b, + token_types_b), is_random_next + + def truncate_seq_pair(self, a, b, max_seq_len, rng): + """ + Truncate sequence pair according to original BERT implementation: + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391 + """ + tokens_a, token_types_a = a + tokens_b, token_types_b = b + max_num_tokens = max_seq_len - 3 + while True: + len_a = len(tokens_a) + len_b = len(tokens_b) + total_length = len_a + len_b + if total_length <= max_num_tokens: + break + if len(tokens_a) > len(tokens_b): + trunc_tokens = tokens_a + trunc_types = token_types_a + else: + trunc_tokens = tokens_b + trunc_types = token_types_b + + assert len(trunc_tokens) >= 1 + + if rng.random() < 0.5: + trunc_tokens.pop(0) + trunc_types.pop(0) + else: + trunc_tokens.pop() + trunc_types.pop() + return (tokens_a, token_types_a), (tokens_b, token_types_b) + + def mask_token(self, idx, tokens, types, vocab_words, rng): + """ + helper function to mask `idx` token from `tokens` according to + section 3.3.1 of 
https://arxiv.org/pdf/1810.04805.pdf + """ + label = tokens[idx] + if rng.random() < 0.8: + new_label = self.tokenizer.get_command('MASK').Id + else: + if rng.random() < 0.5: + new_label = label + else: + new_label = rng.choice(vocab_words) + + tokens[idx] = new_label + + return label + + def pad_seq(self, seq): + """helper function to pad sequence pair""" + num_pad = max(0, self.max_seq_len - len(seq)) + pad_mask = [0] * len(seq) + [1] * num_pad + seq += [self.tokenizer.get_command('pad').Id] * num_pad + return seq, pad_mask + + def create_masked_lm_predictions(self, a, b, mask_lm_prob, + max_preds_per_seq, vocab_words, rng): + """ + Mask sequence pair for BERT training according to: + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L338 + """ + tokens_a, token_types_a = a + tokens_b, token_types_b = b + tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [ + self.tokenizer.get_command('sep').Id + ] + tokens_b + [self.tokenizer.get_command('sep').Id] + token_types = [token_types_a[0]] + token_types_a + [ + token_types_a[0] + ] + token_types_b + [token_types_b[0]] + + len_a = len(tokens_a) + len_b = len(tokens_b) + + cand_indices = [idx + 1 for idx in range(len_a) + ] + [idx + 2 + len_a for idx in range(len_b)] + + rng.shuffle(cand_indices) + + output_tokens, pad_mask = self.pad_seq(list(tokens)) + output_types, _ = self.pad_seq(list(token_types)) + + num_to_predict = min(max_preds_per_seq, + max(1, int(round(len(tokens) * mask_lm_prob)))) + + mask = [0] * len(output_tokens) + mask_labels = [-1] * len(output_tokens) + + for idx in sorted(cand_indices[:num_to_predict]): + mask[idx] = 1 + label = self.mask_token(idx, output_tokens, output_types, + vocab_words, rng) + mask_labels[idx] = label + + return (output_tokens, output_types), mask, mask_labels, pad_mask diff --git a/modelscope/models/nlp/mglm/data_utils/extraction.py b/modelscope/models/nlp/mglm/data_utils/extraction.py new file mode 100644 index 00000000..53027e4f --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/extraction.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 Zhipu.AI + +import glob +import os + +import json +import nltk + +nltk.download('punkt') + + +class NLTKSegmenter: + + def __init(self): + pass + + @staticmethod + def segment_string(article): + return nltk.tokenize.sent_tokenize(article) + + +wiki_path = 'data/extracted' +output_path = 'formatted/wiki-key.txt' +segmenter = NLTKSegmenter() +with open(output_path, 'w') as output: + for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False): + for filename in glob.glob( + os.path.join(dirname, 'wiki_*'), recursive=True): + print(filename) + article_lines = [] + article_open = False + with open(filename, mode='r', newline='\n') as file: + for line in file: + line = line.rstrip() + if '' in line: + key_sentences, contents = [], [] + key, content = None, [] + for sentences in article_lines[1:]: + if len(sentences) > 1: + if key: + if len(content) > 0 or len(contents) == 0: + key_sentences.append(key) + contents.append(content) + else: + contents[-1].append(key) + key, content = None, [] + key_sentences.append(sentences[0]) + contents.append(sentences[1:]) + elif len(sentences) > 0: + if key: + content.append(sentences[0]) + else: + key = sentences[0] + if key: + if len(content) > 0 or len(contents) == 0: + key_sentences.append(key) + contents.append(content) + else: + contents[-1].append(key) + contents = [' '.join(content) for content in contents] + article = {'key': key_sentences, 'content': contents} + 
output.write(json.dumps(article)) + output.write('\n') + article_open = False + article_lines = [] + else: + if article_open and line: + sentences = segmenter.segment_string(line) + article_lines.append(sentences) diff --git a/modelscope/models/nlp/mglm/data_utils/file_utils.py b/modelscope/models/nlp/mglm/data_utils/file_utils.py new file mode 100755 index 00000000..794e127a --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/file_utils.py @@ -0,0 +1,256 @@ +# Modified by Zhipu.AI +# This file is provided as is from: +# https://github.com/huggingface/pytorch-pretrained-BERT +# Please refer to their repository for copyright. +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import os +import shutil +import sys +import tempfile +from functools import wraps +from hashlib import sha256 +from io import open +from urllib.parse import urlparse + +import boto3 +import json +import requests +from botocore.exceptions import ClientError +from tqdm import tqdm + +try: + from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + Path.home() / '.pytorch_pretrained_bert')) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( + 'PYTORCH_PRETRAINED_BERT_CACHE', + os.path.join(os.path.expanduser('~'), '.pytorch_pretrained_bert')) + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' + etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError('file {} not found'.format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise EnvironmentError('file {} not found'.format(meta_path)) + + with open(meta_path, encoding='utf-8') as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename, cache_dir=None): + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. 
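+
+ Illustrative usage (the URL and path below are placeholders, not real resources):
+     local_path = cached_path('https://example.com/vocab.txt')  # downloaded and cached
+     local_path = cached_path('/tmp/vocab.txt')  # returned as-is if the file exists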
+ """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise EnvironmentError('file {} not found'.format(url_or_filename)) + else: + # Something unknown + raise ValueError( + 'unable to parse {} as a URL or as a local path'.format( + url_or_filename)) + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError('bad s3 path {}'.format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith('/'): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response['Error']['Code']) == 404: + raise EnvironmentError('file {} not found'.format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url): + """Check ETag on S3 object.""" + s3_resource = boto3.resource('s3') + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file): + """Pull a file directly from S3.""" + s3_resource = boto3.resource('s3') + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file): + req = requests.get(url, stream=True) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit='B', total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url, cache_dir=None): + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + # Get eTag to add to filename, if it exists. + if url.startswith('s3://'): + etag = s3_etag(url) + else: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + raise IOError( + 'HEAD request failed for url {} with status code {}'.format( + url, response.status_code)) + etag = response.headers.get('ETag') + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + if not os.path.exists(cache_path): + # Download to temporary file, then copy to cache dir once finished. 
+ # Otherwise you get corrupt cache entries if the download gets interrupted. + with tempfile.NamedTemporaryFile() as temp_file: + logger.info('%s not found in cache, downloading to %s', url, + temp_file.name) + + # GET file object + if url.startswith('s3://'): + s3_get(url, temp_file) + else: + http_get(url, temp_file) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info('copying %s to cache at %s', temp_file.name, + cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info('creating metadata file for %s', cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w', encoding='utf-8') as meta_file: + json.dump(meta, meta_file) + + logger.info('removing temp file %s', temp_file.name) + + return cache_path + + +def read_set_from_file(filename): + ''' + Extract a de-duped collection (set) of text from a file. + Expected file format is one item per line. + ''' + collection = set() + with open(filename, 'r', encoding='utf-8') as file_: + for line in file_: + collection.add(line.rstrip()) + return collection + + +def get_file_extension(path, dot=True, lower=True): + ext = os.path.splitext(path)[1] + ext = ext if dot else ext[1:] + return ext.lower() if lower else ext diff --git a/modelscope/models/nlp/mglm/data_utils/lazy_loader.py b/modelscope/models/nlp/mglm/data_utils/lazy_loader.py new file mode 100644 index 00000000..77a77a8a --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/lazy_loader.py @@ -0,0 +1,286 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""utils for loading text from disk""" +import mmap +import os +import pickle as pkl +import time +from itertools import accumulate + +import numpy as np +import torch +from torch.multiprocessing import Lock + + +def get_lazy_path(path): + """ + Gets directory path where lazy files are stored. + """ + return os.path.splitext(path)[0] + '.lazy' + + +def exists_lazy(path, data_type='data'): + """ + Check if we've already made a lazy version of this file for the `data_type` field. 
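+ A lazy version is considered to exist when both `<path>.lazy/<data_type>`
+ and `<path>.lazy/<data_type>.len.pkl` are present on disk.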
+ """ + if not os.path.exists(get_lazy_path(path)): + return False + contents = os.listdir(get_lazy_path(path)) + if data_type not in contents: + return False + if data_type + '.len.pkl' not in contents: + return False + return True + + +def get_scatter_path(path, scatter_rank): + path = os.path.splitext(path)[0] + '.scatter' + scatter_path = os.path.join(path, str(scatter_rank)) + return scatter_path + + +def exists_scatter(path, scatter_num=64, data_type='data'): + for i in range(scatter_num): + scatter_path = get_scatter_path(path, scatter_rank=i) + if not exists_lazy(scatter_path, data_type=data_type): + return False + return True + + +class LazyWriter: + + def __init__(self, + path, + data_type, + is_array=False, + array_data_type=np.int32): + lazypath = get_lazy_path(path) + if not os.path.exists(lazypath): + os.makedirs(lazypath) + self.datapath = os.path.join(lazypath, data_type) + self.lenpath = os.path.join(lazypath, data_type + '.len.pkl') + self.array_data_type = array_data_type + self.output = open(self.datapath, 'wb') + self.lengths = [] + self.is_array = is_array + + @staticmethod + def get_len_path(path, data_type): + lazypath = get_lazy_path(path) + return os.path.join(lazypath, data_type + '.len.pkl') + + def write(self, s): + if isinstance(s, dict): + s = s['text'] + if self.is_array: + encoded = np.array( + s, dtype=self.array_data_type).tobytes(order='C') + self.output.write(encoded) + self.lengths.append(len(s)) + else: + encoded = s.encode('utf-8') + self.output.write(encoded) + self.lengths.append(len(encoded)) + + def close(self): + self.output.close() + with open(self.lenpath, 'wb') as f: + pkl.dump(self.lengths, f) + + +def split_strings(strings, start, chr_lens): + """ + Split strings based on string lengths and given start. + """ + return [ + strings[i - start:j - start] + for i, j in zip([start] + chr_lens[:-1], chr_lens) + ] + + +class ProcessorTokenizer: + """ + callable class that runs a preprocessing, as well as tokenization step, + on input text. + """ + + def __init__(self, tokenizer, process_fn=None): + self.tokenizer = tokenizer + self.process_fn = process_fn + + def __call__(self, string): + if self.tokenizer is not None: + string = self.tokenizer(string, process_fn=self.process_fn) + elif self.process_fn is not None: + string = self.process_fn(string) + return string + + +class LazyLoader(object): + """ + Arguments: + path: path to directory where array entries are concatenated into one big string file + and the .len file are located + data_type (str): Some datsets have multiple fields that are stored in different paths. + `data_type` specifies which of these fields to load in this class + mem_map (boolean): Specifies whether to memory map file `path` + map_fn (callable): Fetched strings are passed through map_fn before being returned. 
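+ is_array (boolean): Whether entries are numpy arrays of `array_data_type`
+     (as written by `LazyWriter` with `is_array=True`) instead of utf-8 strings
+ array_data_type (numpy dtype): dtype of the entries when `is_array` is True
+ load_memory (boolean): Read the whole data file into memory up front instead of
+     seeking into it (or memory mapping it) on each access
+ half_load (boolean): Keep only a leading subset (the first two thirds) of the
+     stored entries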
+ + Example of lazy loader directory structure: + file.json + file.lazy/ + data_type1 + data_type1.len.pkl + data_type2 + data_type2.len.pkl + """ + + def __init__(self, + path, + data_type='data', + mem_map=False, + map_fn=None, + is_array=False, + array_data_type=np.int32, + load_memory=False, + half_load=False): + lazypath = get_lazy_path(path) + datapath = os.path.join(lazypath, data_type) + # get file where array entries are concatenated into one big string + self._file = open(datapath, 'rb') + self.file = self._file + self.is_array = is_array + self.array_data_type = array_data_type + # memory map file if necessary + lenpath = os.path.join(lazypath, data_type + '.len.pkl') + self.lens = pkl.load(open(lenpath, 'rb')) + if half_load: + self.lens = self.lens[:2 * len(self.lens) // 3] + self.ends = list(accumulate(self.lens)) + self.dumb_ends = list(self.ends) + self.mem_map = mem_map + self.load_memory = load_memory + if self.load_memory: + data_type_size = np.dtype(self.array_data_type).itemsize + if half_load: + self.file = self.file.read(sum(self.lens) * data_type_size) + else: + self.file = self.file.read() + self.file = np.ndarray( + shape=(len(self.file) // data_type_size, ), + dtype=array_data_type, + buffer=self.file, + order='C') + elif self.mem_map: + if is_array: + if self.ends[-1] == 0: + self.file = np.array([], dtype=array_data_type) + else: + self.file = np.memmap( + self.file, dtype=array_data_type, mode='r', order='C') + else: + if self.ends[-1] == 0: + self.file = bytearray() + else: + self.file = mmap.mmap( + self.file.fileno(), 0, prot=mmap.PROT_READ) + self.read_lock = Lock() + self.process_fn = map_fn + self.map_fn = map_fn + self._tokenizer = None + self.is_lazy = True + + def SetTokenizer(self, tokenizer): + """ + logic to set and remove (set to None) tokenizer. + combines preprocessing/tokenization into one callable. 
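+ When a tokenizer is supplied, each fetched string is passed through a
+ `ProcessorTokenizer`, so preprocessing and tokenization happen at read time.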
+ """ + if tokenizer is None: + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self._tokenizer = tokenizer + self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn) + + def GetTokenizer(self): + return self._tokenizer + + def __getitem__(self, index): + """ + read file and splice strings based on string ending array `self.ends` + """ + if not isinstance(index, slice): + if index == 0: + start = 0 + else: + start = self.ends[index - 1] + end = self.ends[index] + rtn = self.file_read(start, end) + if self.map_fn is not None: + rtn = self.map_fn(rtn) + else: + # if slice, fetch strings with 1 diskread and then splice in memory + chr_lens = self.ends[index] + if index.start == 0 or index.start is None: + start = 0 + else: + start = self.ends[index.start - 1] + stop = chr_lens[-1] + strings = self.file_read(start, stop) + rtn = split_strings(strings, start, chr_lens) + if self.map_fn is not None: + rtn = [self.map_fn(s) for s in rtn] + return rtn + + def __len__(self): + return len(self.ends) + + def file_read(self, start=0, end=None): + """read specified portion of file""" + data_type_size = np.dtype(self.array_data_type).itemsize + # atomic reads to avoid race conditions with multiprocess dataloader + self.read_lock.acquire() + if not self.mem_map and not self.load_memory: + # seek to start of file read + if self.is_array: + start = start * data_type_size + end = end * data_type_size if end is not None else None + self.file.seek(start) + # read to end of file if no end point provided + if end is None: + rtn = self.file.read() + # else read amount needed to reach end point + else: + rtn = self.file.read(end - start) + if self.is_array: + rtn = np.ndarray( + shape=(len(rtn) // data_type_size, ), + dtype=self.array_data_type, + buffer=rtn, + order='C') + else: + rtn = rtn.decode('utf-8', 'ignore') + else: + rtn = self.file[start:end] + if self.is_array: + rtn = rtn.copy() + else: + rtn = rtn.decode('utf-8', 'strict') + self.read_lock.release() + # TODO: @raulp figure out mem map byte string bug + # if mem map'd need to decode byte string to string + # # rtn = str(rtn) + # if self.mem_map: + # rtn = rtn.decode('unicode_escape') + return rtn diff --git a/modelscope/models/nlp/mglm/data_utils/samplers.py b/modelscope/models/nlp/mglm/data_utils/samplers.py new file mode 100644 index 00000000..c0f6e1ab --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/samplers.py @@ -0,0 +1,190 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""batch samplers that work with either random or sequential data samplers""" +import math +import os +import sys + +import numpy as np +import torch +from torch.utils import data + + +class RandomSampler(data.sampler.Sampler): + r""" + Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, + but this class lets the user set an epoch like DistributedSampler + Samples elements randomly. 
If without replacement, then sample from a shuffled dataset. + If with replacement, then user can specify ``num_samples`` to draw. + Arguments: + data_source (Dataset): dataset to sample from + num_samples (int): number of samples to draw, default=len(dataset) + replacement (bool): samples are drawn with replacement if ``True``, default=False + """ + + def __init__(self, data_source, replacement=False, num_samples=None): + super(RandomSampler, self).__init__(data_source) + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + self.epoch = -1 + + if self._num_samples is not None and replacement is False: + raise ValueError( + 'With replacement=False, num_samples should not be specified, ' + 'since a random permute will be performed.') + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError('num_samples should be a positive integer ' + 'value, but got num_samples={}'.format( + self.num_samples)) + if not isinstance(self.replacement, bool): + raise ValueError('replacement should be a boolean value, but got ' + 'replacement={}'.format(self.replacement)) + + @property + def num_samples(self): + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self): + n = len(self.data_source) + g = torch.Generator() + if self.epoch >= 0: + g.manual_seed(self.epoch) + if self.replacement: + for _ in range(self.num_samples // 32): + yield from torch.randint( + high=n, size=(32, ), dtype=torch.int64, + generator=g).tolist() + yield from torch.randint( + high=n, + size=(self.num_samples % 32, ), + dtype=torch.int64, + generator=g).tolist() + else: + yield from torch.randperm(n, generator=self.generator).tolist() + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + + +class DistributedSequentialSampler(data.sampler.Sampler): + + def __init__(self, + num_samples, + train_iters, + batch_size, + rank=-1, + world_size=2): + super().__init__(num_samples) + if rank == -1: + rank = 0 + world_size = 1 + self.num_samples = num_samples + self.rank = rank + self.world_size = world_size + self.start_iter = 0 + self.train_iters = train_iters + self.batch_size = batch_size + self.batch_bias = [ + i * (num_samples // batch_size) for i in range(batch_size) + ] + + def __iter__(self): + for idx in range(self.start_iter, self.train_iters * 10): + batch = [(idx + bias) % self.num_samples + for bias in self.batch_bias] + tbatch = self._batch(batch) + yield tbatch + + def __len__(self): + return self.train_iters + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = self.rank * self.batch_size // self.world_size + end = (self.rank + 1) * self.batch_size // self.world_size + return batch[start:end] + + +class DistributedBatchSampler(data.sampler.BatchSampler): + """ + similar to normal implementation of distributed sampler, except implementation is at the + batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary + data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler. 
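+
+ Illustrative usage (argument values are placeholders):
+     sampler = RandomSampler(dataset)
+     batch_sampler = DistributedBatchSampler(sampler, batch_size=16, drop_last=True,
+                                             rank=rank, world_size=world_size)
+     loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler)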
+ """ + + def __init__(self, + sampler, + batch_size, + drop_last, + rank=-1, + world_size=2, + wrap_last=False, + gradient_accumulation_steps=None): + super(DistributedBatchSampler, self).__init__(sampler, batch_size, + drop_last) + if rank == -1: + assert False, 'should not be here' + self.rank = rank + self.world_size = world_size + self.sampler.wrap_around = 0 + self.wrap_around = 0 + self.wrap_last = wrap_last + self.start_iter = 0 + self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps # noqa + + def __iter__(self): + batch = [] + i = 0 + for idx in self.data_iterator(self.sampler, wrap_around=False): + batch.append(idx) + if len(batch) == self.batch_size: + tbatch = self._batch(batch) + if i >= self.start_iter * self.effective_batch_size: + yield tbatch + self.start_iter = 0 + i += len(batch) + batch = [] + batch_len = len(batch) + if batch_len > 0 and not self.drop_last: + if self.wrap_last: + self.sampler.wrap_around -= (self.batch_size) + self.wrap_around += (len(batch)) + self.wrap_around %= self.batch_size + yield self._batch(batch) + if self.wrap_last: + self.sampler.wrap_around += self.batch_size + + def data_iterator(self, _iter, wrap_around=False): + """iterates through data and handles wrap around""" + for i, idx in enumerate(_iter): + if i < self.wrap_around % self.batch_size: + continue + if wrap_around: + self.wrap_around += 1 + self.wrap_around %= self.batch_size + yield idx + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = self.rank * self.batch_size // self.world_size + end = (self.rank + 1) * self.batch_size // self.world_size + return batch[start:end] diff --git a/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py b/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py new file mode 100644 index 00000000..b4d1afe3 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py @@ -0,0 +1,158 @@ +# Modified by Zhipu.AI +""" +from https://github.com/openai/gpt-2/, changed for chinese +""" +import os # yapf: disable + + +""" +SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation +systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements +subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the +extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end +system that does not depend on language-specific pre/postprocessing. 
+https://github.com/google/sentencepiece + +pip install sentencepiece + +or git clone https://github.com/google/sentencepiece.git +python setup.py install + +""" + + +def get_pairs(word): + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class Encoder: + + def __init__(self, encoder, bpe_merges): + self.encoder = encoder + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.max_len = 0 + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + if not pairs: + return token + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + return [self.encoder.get(token, 1) for token in self.tokenize(text)] + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + return text + + def tokenize(self, text): + bpe_tokens = [] + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + return [self.encoder.get(token, 1) for token in tokens] + + +class Encoder_SP: + + def __init__(self, model_path): + import sentencepiece as spm + self.sp = spm.SentencePieceProcessor() + self.sp.Load(model_path) + + def encode(self, text): + """ + text="...." + """ + return self.sp.EncodeAsIds(text) + + def decode(self, tokens): + """ + tokens=[x1,x2,...] 
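+ Returns the detokenized text as a single string.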
+ """ + text = [int(token) for token in tokens] + # print(text) + return self.sp.DecodeIds(text) + + def tokenize(self, text): + return self.sp.EncodeAsPieces(text) + + def convert_tokens_to_ids(self, tokens): + return [self.sp.PieceToId(token) for token in tokens] + + def convert_token_to_id(self, token): + return self.sp.PieceToId(token) + + def convert_id_to_token(self, idx): + return self.sp.IdToPiece(idx) + + +def get_encoder(encoder_file, bpe_file): + import json + filepath, filename = os.path.split(encoder_file) + shotname, extension = os.path.splitext(filename) + + if ('.model' == extension) and (bpe_file == ''): + return Encoder_SP(encoder_file) + else: + with open(encoder_file, 'r', encoding='utf-8') as f: + encoder = json.load(f) + with open(bpe_file, 'r', encoding='utf-8') as f: + bpe_data = f.read() + bpe_merges = [ + tuple(merge_str.split()) + for merge_str in bpe_data.split('\n')[1:-1] + ] + return Encoder( + encoder=encoder, + bpe_merges=bpe_merges, + ) + + +def from_pretrained(model_path): + return get_encoder(model_path + '/tokenizer/mglm250k/mglm250k-uni.model', + '') diff --git a/modelscope/models/nlp/mglm/data_utils/tokenization.py b/modelscope/models/nlp/mglm/data_utils/tokenization.py new file mode 100755 index 00000000..c89cc371 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/tokenization.py @@ -0,0 +1,1396 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)""" +import csv +import itertools +import os +import random +from collections import namedtuple + +import nltk +import regex as re +import sentencepiece as spm +import torch +from nltk import tokenize as nltk_tokenize + +from . import sp_tokenizer +from .tokenization_gpt2 import GPT2Tokenizer +from .wordpiece import PRETRAINED_VOCAB_ARCHIVE_MAP, BertTokenizer + + +def make_tokenizer(tokenizer_type, + corpus, + model_path=None, + vocab_size=None, + model_type=None, + pad_token=0, + character_coverage=1.0, + command_tokens=None, + type_tokens=None, + **kwargs): + """ + Helper function to instantiate a tokenizer given common combinations of options. + """ + tokenizer_class = tokenizer_type + if isinstance(tokenizer_class, str): + tokenizer_class = eval(tokenizer_class) + if tokenizer_class is BertWordPieceTokenizer: + return BertWordPieceTokenizer(model_type, **kwargs) + elif tokenizer_class is GPT2BPETokenizer: + if model_type is None: + model_type = 'gpt2' + return GPT2BPETokenizer(model_type, **kwargs) + elif tokenizer_class is ChineseSPTokenizer: + return ChineseSPTokenizer(model_path, **kwargs) + text_tokenizer = tokenizer_class( + corpus=corpus, + vocab_size=vocab_size, + model_path=model_path, + model_type=model_type, + pad_token=pad_token, + character_coverage=character_coverage) + return Tokenizer(text_tokenizer, command_tokens, type_tokens) + + +class Tokenization(object): + """ + Tokenization object to hold tokenization, (processed text),and original + text. 
Can hold tokenization as Ids or tokens. + + It also holds command tokens (pad, unk, etc.) for the tokenization. + This allows functions to pad/operate on tokenizations without having + access to the full tokenizer, just the tokenization. + + Several standard array operations are implemented (insert, append, extend). + """ + + def __init__(self, + tokenization, + text=None, + original_text=None, + command_tokens=None, + asIds=True): + self.tokenization = tokenization + self.text = text + if self.text is None: + self.text = self.tokenization + self.original_text = original_text + if self.original_text is None: + self.original_text = self.text + self.command_tokens = command_tokens + self.asIds = asIds + self.parse_command_tokens() + + def set_command_tokens(self, command_tokens): + self.command_tokens = command_tokens + return self.parse_command_tokens() + + def parse_command_tokens(self): + if self.command_tokens is None: + return + for command_token in self.command_tokens: + if self.asIds: + setattr(self, command_token.name, command_token.Id) + else: + setattr(self, command_token.name, command_token.token) + + def __getitem__(self, index): + return self.tokenization[index] + + def __len__(self): + return len(self.tokenization) + + def insert(self, idx, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.insert(idx, other.Id) + if idx == 0: + self.text = other.token + self.text + self.original_text = other.token + self.original_text + elif idx == len(self.tokenization) - 1: + self.text += other.token + self.original_text += other.token + elif isinstance(other, Tokenization): + self.tokenization = self.tokenization[: + idx] + other.tokenization + self.tokenization[ + idx:] + else: + self.tokenization = self.tokenization[: + idx] + other.tokenization + self.tokenization[ + idx:] + + def append(self, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.append(other.Id) + self.text += other.token + self.original_text += other.token + elif isinstance(other, Tokenization): + self.tokenization.extend(other.tokenization) + self.text += other.text + self.original_text += other.original_text + else: + self.tokenization.append(other) + return self + + def extend(self, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.append(other.Id) + self.text += other.token + self.original_text += other.token + elif isinstance(other, list) and isinstance(other[0], + (CommandToken, TypeToken)): + self.tokenization.extend([o.Id for o in other]) + self.text += [o.token for o in other] + self.original_text += [o.token for o in other] + elif isinstance(other, Tokenization): + self.tokenization.extend(other.tokenization) + self.text += other.text + self.original_text += other.original_text + else: + self.tokenization.extend(other) + return self + + +"""define some default command tokens for the tokenizer to use""" +token_format = '<{0}>' + +COMMAND_TUPLE = namedtuple('CommandToken', ('name', 'token', 'Id')) + + +def prep_command_tokens(tokenlist, token_format=token_format): + return [ + CommandToken(tok[0], token_format.format(tok[0]), tok[1]) + for tok in tokenlist + ] + + +class CommandToken(object): + + def __init__(self, name, token, Id, lstrip=False, rstrip=False): + self.name = name + self.token = token + self.Id = Id + self.lstrip = lstrip + self.rstrip = rstrip + + def __str__(self): + return str(COMMAND_TUPLE(self.name, self.token, self.Id)) + + +DEFAULT_COMMAND_TOKENS = [ + ('pad', 0), + ('eos', 1), + ('bos', 2), + ('unk', 3), + 
('sep', 4), + ('L2R', 5), + ('ENC', 6), + ('MASK', 7), +] +DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) +"""define some default type tokens for bert training""" + +TYPE_TUPLE = namedtuple('TypeToken', ('name', 'token', 'Id')) + + +def prep_type_tokens(tokenlist, token_format=token_format): + return [ + TypeToken(tok[0], token_format.format(tok[0]), tok[1]) + for tok in tokenlist + ] + + +class TypeToken(object): + + def __init__(self, name, token, Id): + self.name = name + self.token = token + self.Id = Id + + def __str__(self): + return str(TYPE_TUPLE(self.name, self.token, self.Id)) + + +DEFAULT_TYPE_TOKENS = [ + ('function', 0), + ('command', 1), + ('str0', 2), + ('str1', 3), + ('str2', 4), + ('embedding0', 5), + ('embedding1', 6), + ('embedding2', 7), + ('arg0', 8), + ('arg1', 9), + ('arg2', 10), +] +DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS) + + +class Tokenizer(object): + """ + Tokenizer object that handles text tokenization, command tokens, and type tokens. + + Command tokens and text tokens are stored together in one mapping of size + `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first + `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. + + Token types are stored in a separate mapping of size `len(type_tokens)`. + """ + + def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None): + # set text tokenizer + self.text_tokenizer = text_tokenizer + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = len(self.text_tokenizer) + + # set command tokens + if command_tokens is None: + command_tokens = DEFAULT_COMMAND_TOKENS + self._command_tokens = command_tokens + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + if not hasattr(self, 'num_command_tokens'): + self.num_command_tokens = len(self._command_tokens) + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_command_tokens + self.num_text_tokens + + # set type tokens + if type_tokens is None: + type_tokens = DEFAULT_TYPE_TOKENS + self.type_tokens = type_tokens + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + if not hasattr(self, 'num_type_tokens'): + self.num_type_tokens = len(self.type_tokens) + + # parse tokens and vocabs from tokenizer + self._tokens = list(self.command_token_map.keys()) + list( + self.text_tokenizer.tokens) + self._vocab = {t: Id for Id, t in self.command_id_map.items()} + self._vocab.update({ + t: Id + self.num_command_tokens + for t, Id in self.text_tokenizer.vocab.items() + }) + + self._text_tokens = list(self.text_tokenizer.tokens) + self._text_token_vocab = { + t: Id + self.num_command_tokens + for t, Id in self.text_tokenizer.vocab.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def __call__(self, text, process_fn=None): + """run preprocessing and encode text as Ids""" + return self.EncodeAsIds(text, process_fn=process_fn) + + def __len__(self): + """total number of tokens""" + return self.num_tokens + 
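+    # A minimal illustration of the id layout described in the class
+    # docstring (assuming the DEFAULT_COMMAND_TOKENS defined above; the
+    # variable names here are placeholders, not part of this module):
+    #
+    #   tok = Tokenizer(text_tokenizer)
+    #   tok.TokenToId('<pad>')   # -> 0; command ids occupy [0, num_command_tokens)
+    #   tok.TokenToId(piece)     # -> text_tokenizer.TokenToId(piece) + num_command_tokens
+    #   tok.IdToToken(3)         # -> '<unk>', since 3 < num_command_tokens
+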
+ def get_command(self, name): + """get command token corresponding to `name`""" + return self.command_name_map[name] + + def get_type(self, name): + """get type token corresponding to `name`""" + return self.type_name_map[name] + + @property + def tokens(self): + """list (or iterable) of all tokens for tokenizer""" + return self._tokens + + @property + def vocab(self): + """dictionary mapping tokens to ids for tokenizer""" + return self._vocab + + @property + def token_types(self): + """list (or iterable) of all token types for tokenizer""" + return self._token_types + + @property + def token_type_vocab(self): + """dictionary mapping token types to ids for tokenizer""" + return self._token_type_vocab + + @property + def command_tokens(self): + """list (or iterable) of all command tokens for tokenizer""" + return self._command_token_tokens + + @property + def command_token_vocab(self): + """dictionary mapping command tokens to ids for tokenizer""" + return self._command_token_vocab + + @property + def text_tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + return self._text_tokens + + @property + def text_token_vocab(self): + """dictionary mapping text tokens to ids for text tokenizer""" + return self._text_token_vocab + + def EncodeAsIds(self, text, process_fn=None): + """ + encode text using text tokenizer and shift Id values for command tokens + """ + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + + def split_on_token(tok_extended: CommandToken, text): + result = [] + tok = tok_extended.token + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + # CommandToken can control whitespace stripping around them. + # We use them for GPT2 and Roberta to have different behavior depending on the special token + # Cf. 
https://github.com/huggingface/transformers/pull/2778 + # and https://github.com/huggingface/transformers/issues/3788 + # Strip white spaces on the right + if tok_extended.rstrip and i > 0: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + sub_text = sub_text.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and i < len(split_text) - 1: + sub_text = sub_text.rstrip() # Opposite here + + if i == 0 and not sub_text: + result.append(tok) + elif i == len(split_text) - 1: + if sub_text: + result.append(sub_text) + else: + pass + else: + if sub_text: + result.append(sub_text) + result.append(tok) + return result + + def split_on_tokens(tok_list, text): + if not text.strip(): + return [] + if not tok_list: + return self.text_tokenizer.encode(text) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self._command_token_tokens: + tokenized_text.extend(split_on_token(tok, sub_text)) + else: + tokenized_text.append(sub_text) + text_list = tokenized_text + + return list( + itertools.chain.from_iterable( + (self._encode(token) + if token not in self._command_token_tokens else + [self.command_token_map[token].Id] + for token in tokenized_text))) + + no_split_tokens = self._command_tokens + Ids = split_on_tokens(no_split_tokens, processed_text) + tokenization = Tokenization(Ids, processed_text, text) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + + def _encode(self, text): + raise NotImplementedError + + def EncodeAsTokens(self, text, process_fn=None): + """ + encode text as tokens using text tokenizer + """ + tokenization = self.text_tokenizer.EncodeAsTokens( + text, process_fn=process_fn) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + + def IdToToken(self, Id, type_token=False): + """convert Id to token accounting for command and type tokens""" + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id < self.num_command_tokens: + return self.command_id_map[Id].token + return self.text_tokenizer.IdToToken(Id - self.num_command_tokens) + + def TokenToId(self, token, type_token=False): + """convert token to Id accounting for command and type tokens""" + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + if token in self.command_token_map: + return self.command_token_map[token].Id + return self.text_tokenizer.TokenToId(token) + self.num_command_tokens + + def DecodeIds(self, Ids, type_token=False): + """ + convert Ids to tokens accounting for command and type tokens, tokens + are joined and returned as a string. + """ + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. 
+ type_id_map[Id].token for Id in Ids) + rtn_strs = [] + current_str = [] + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + for Id in Ids: + if isinstance(Id, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + current_str = [] + rtn_strs.append(Id.token) + elif Id < self.num_command_tokens: + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + current_str = [] + rtn_strs.append(self.command_id_map[Id].token) + else: + current_str.append(Id - self.num_command_tokens) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + return ' '.join(rtn_strs) + + def DecodeTokens(self, Tokens, type_token=False): + """ + convert tokens to a string accounting for command and type tokens. + """ + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + rtn_strs = [] + current_str = [] + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + for t in Tokens: + if isinstance(t, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t.token) + elif t in self.command_token_map: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t) + else: + current_str.append(t) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + return ' '.join(rtn_strs) + + +class TextTokenizer(object): + """ + Interface for text tokenizer + """ + + def __init__(self): + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = 0 + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_text_tokens + + def __call__(self, text, process_fn=None): + return self.EncodeAsIds(text, process_fn) + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + raise NotImplementedError( + 'TextTokenizer tokens property not implemented') + + @property + def vocab(self): + """dictionary mapping tokens to ids""" + raise NotImplementedError( + 'TextTokenizer vocab property not implemented') + + @staticmethod + def exists(model_path): + """check if the filepath for a text tokenizer exists""" + raise NotImplementedError( + 'TextTokenizer exists method not implemented') + + def Train(self, corpus): + """train a tokenizer on a data corpus and save model for future use""" + raise NotImplementedError('TextTokenizer Train not implemented') + + def EncodeAsIds(self, text, process_fn=None): + """ + Preprocess text and encode as ids. Return a tokenization object with + original text, processed text, and id tokenization. + """ + raise NotImplementedError('TextTokenizer EncodeAsIds not implemented') + + def EncodeAsTokens(self, text, process_fn=None): + """ + Preprocess text and encode as tokens. Return a tokenization object with + original text, processed text, and token tokenization. + """ + raise NotImplementedError( + 'TextTokenizer EncodeAsTokens not implemented') + + def IdToToken(self, Id): + """Convert an Id to Token. Reverse lookup of self.vocab""" + raise NotImplementedError('TextTokenizer IdToToken not implemented') + + def TokenToId(self, token): + """Convert a Token to Id. 
Lookup of self.vocab""" + raise NotImplementedError('TextTokenizer TokenToId not implemented') + + def DecodeIds(self, Ids): + """Convert a list or tokenization object of Ids to a text string""" + raise NotImplementedError('TextTokenizer DecodeIds not implemented') + + def DecodeTokens(self, Tokens): + """Convert a list or tokenization object of tokens to a text string""" + raise NotImplementedError('TextTokenizer DecodeTokens not implemented') + + +class CharacterLevelTokenizer(TextTokenizer): + """ + Text tokenizer for ASCII-256 Character Level Tokenization. + """ + + def __init__(self, **kwargs): + self.num_text_tokens = 256 + super(CharacterLevelTokenizer, self).__init__() + self._tokens = [ + self.IdToToken(Id) for Id in range(self.num_text_tokens) + ] + self._vocab = {t: i for i, t in enumerate(self._tokens)} + + def __len__(self): + return 256 + + @staticmethod + def exists(model_path): + return True + + def Train(self, corpus): + pass + + @property + def tokens(self): + return self._tokens + + @property + def vocab(self): + return self._vocab + + def EncodeAsIds(self, text, process_fn=None): + """convert text to ascii 256 Ids""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + processed_text = str(processed_text) + tokens = [self.TokenToId(c) for c in processed_text] + return Tokenization(tokens, processed_text, text) + + def EncodeAsTokens(self, text, process_fn=None): + """convert text to ascii 256 characters""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + processed_text = str(processed_text) + tokens = [c for c in processed_text] + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id): + """ascii index to character""" + return chr(Id) + + def TokenToId(self, token): + """ascii character to index""" + return ord(token) + + def DecodeIds(self, Ids): + """converts ascii ids to tokens before joining them into text""" + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return ''.join([self.IdToToken(tok) for tok in Ids]) + + def DecodeTokens(self, Tokens): + """just concatenates ascii tokens into text""" + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return ''.join(Tokens) + + +MAX_SENTENCEPIECE_SENTENCES = 100000000 + + +def get_corpus_freq(dataset, filepath, filetype='tsv'): + """ + Take corpus, split it into sentences, and extract word frequencies. + Write frequencies to `filepath` as a tsv. Only write the first + MAX_SENTENCEPIECE_SENTENCES most common words to the file. 
+ """ + nltk.download('punkt', download_dir='./nltk') + if filetype == 'tsv': + delimiter = '\t' + else: + delimiter = ',' + + print('compute corpus frequency\n', flush=True) + + total_sentence_count = 0 + maxlen = 0 + freqs = {} + for entry in dataset: + if isinstance(entry, dict): + entry = entry['text'] + lines = entry.strip().split('\n') + for line in lines: + sentences = nltk_tokenize.sent_tokenize(line) + total_sentence_count += len(sentences) + for sentence in sentences: + maxlen = max(len(line), maxlen) + for word in sentence.split(): + if word not in freqs: + freqs[word] = 0 + freqs[word] += 1 + + print('length of freqs before truncating ' + str(len(freqs)), flush=True) + print('file path for freq ' + str(filepath), flush=True) + + freqs_sorted = {} + counter = 0 + for word, count in sorted(freqs.items(), key=lambda x: x[1], reverse=True): + if counter >= MAX_SENTENCEPIECE_SENTENCES: + break + counter += 1 + freqs_sorted[word] = count + + print( + 'length of freqs after trancating ' + str(len(freqs_sorted)), + flush=True) + + with open(filepath, 'w') as f: + writer = csv.writer(f, delimiter=delimiter) + for k, v in freqs_sorted.items(): + writer.writerow([str(k), str(v)]) + + return total_sentence_count, maxlen + + +class SentencePieceTokenizer(TextTokenizer): + """Trains and uses sentencepiece for text tokenization""" + + def __init__(self, + model_type='bpe', + vocab_size=None, + corpus=None, + model_path=None, + character_coverage=1.0, + **kwargs): + self.character_coverage = character_coverage + self.model_type = model_type.lower() + self.spm_model = model_path + self.num_text_tokens = vocab_size + make_train = not SentencePieceTokenizer.exists(self.spm_model) + if make_train: + assert corpus is not None and self.num_text_tokens is not None + self.Train(corpus, self.num_text_tokens) + self._tokens = [] + self._vocab = {} + self.load_spm_model() + super(SentencePieceTokenizer, self).__init__() + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + return self._tokens + + @property + def vocab(self): + return self._vocab + + @staticmethod + def exists(model_path): + if model_path is None: + return False + # check if path exists + dne = not os.path.exists(model_path) + # check if path.model exists + if dne and not model_path.endswith('.model'): + dne = not os.path.exists(model_path + '.model') + return not dne + + def load_spm_model(self): + """load sentencepiece model and parse vocab""" + if not os.path.exists( + self.spm_model) and not self.spm_model.endswith('.model'): + self.spm_model = self.spm_model + '.model' + self.sp = spm.SentencePieceProcessor() + self.sp.Load(self.spm_model) + self.vocab_size = self.num_text_tokens = len(self.sp) + self._tokens = [self.IdToToken(t) for t in range(self.vocab_size)] + self._vocab = {t: i for i, t in enumerate(self._tokens)} + + def Train(self, corpus, num_text_tokens): + """train sentencepiece model on corpus using word frequencies""" + self.num_text_tokens = num_text_tokens + use_model_path = self.spm_model + random_hash = str(random.randint(0, 2147483647)) + if use_model_path is None: + use_model_path = random_hash + if use_model_path.endswith('.model'): + use_model_path = use_model_path[:use_model_path.rfind('.model')] + input_path = use_model_path + '.tsv.' 
+ random_hash
+        line_count, maxlenline = get_corpus_freq(corpus, input_path)
+        line_count = min(line_count, MAX_SENTENCEPIECE_SENTENCES)
+        print(
+            'line count used as input_sentence_size ', line_count, flush=True)
+        print('training sentencepiece model', flush=True)
+        train_string = '--input={file_path} --model_prefix={model_prefix} --vocab_size={vocab_size}' \
+                       + ' --model_type={model_type} --character_coverage={character_coverage} ' \
+                       + '--input_sentence_size={input_sentence_size} ' \
+                       + '--input_format=tsv'
+        train_string = train_string.format(
+            file_path=input_path,
+            model_prefix=use_model_path,
+            vocab_size=num_text_tokens,
+            model_type=self.model_type,
+            character_coverage=self.character_coverage,
+            input_sentence_size=int(line_count))
+        print(
+            'calling spm.SentencePieceTrainer.Train(%s)' % (train_string),
+            flush=True)
+        spm.SentencePieceTrainer.Train(train_string)
+        os.remove(input_path)
+        self.spm_model = use_model_path + '.model'
+        print('sentencepiece model written to ' + self.spm_model, flush=True)
+
+    def EncodeAsIds(self, text, process_fn=None):
+        """convert text to sentencepiece Ids"""
+        processed_text = text
+        if process_fn is not None:
+            processed_text = process_fn(processed_text)
+        tokens = self.sp.EncodeAsIds(processed_text)
+        return Tokenization(tokens, processed_text, text)
+
+    def EncodeAsTokens(self, text, process_fn=None):
+        """convert text to sentencepiece tokens"""
+        processed_text = text
+        if process_fn is not None:
+            processed_text = process_fn(processed_text)
+        tokens = self.sp.EncodeAsTokens(processed_text)
+        return Tokenization(tokens, processed_text, text, asIds=False)
+
+    def IdToToken(self, Id):
+        """convert Id to sentencepiece token"""
+        return self.sp.IdToPiece(Id)
+
+    def TokenToId(self, token):
+        """convert sentencepiece token to Id"""
+        return self.sp.PieceToId(token)
+
+    def DecodeIds(self, Ids):
+        """converts ids to a text string"""
+        if isinstance(Ids, Tokenization):
+            Ids = Ids.tokenization
+        return self.sp.DecodeIds(Ids)
+
+    def DecodeTokens(self, Tokens):
+        """converts sentencepiece tokens to a text string"""
+        if isinstance(Tokens, Tokenization):
+            Tokens = Tokens.tokenization
+        return self.sp.DecodeTokens(Tokens)
+
+
+class BertWordPieceTokenizer(Tokenizer):
+    """
+    Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization
+    in BERT training. Defaults to the bert-large-uncased tokenizer.
+ """ + + def __init__(self, + tokenizer_model_type=None, + cache_dir=None, + add_block_symbols=False, + add_sentinel_token=0, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + # default to bert-large-uncased tokenizer + if tokenizer_model_type not in PRETRAINED_VOCAB_ARCHIVE_MAP: + tokenizer_model_type = 'bert-large-uncased' + if not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0: + print('loading BertWordPieceTokenizer (', tokenizer_model_type, + ') from cache_dir ', cache_dir) + do_lower_case = not ('-cased' in tokenizer_model_type + or 'chinese' in tokenizer_model_type) + self.text_tokenizer = BertTokenizer.from_pretrained( + tokenizer_model_type, + do_lower_case=do_lower_case, + cache_dir=cache_dir) + if not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0: + print('loaded', tokenizer_model_type) + # disable max len warnings by increasing max len + self.text_tokenizer.max_len = int(1e12) + + # set command tokens from wordpiece tokenizer values + self.num_command_tokens = 6 + self.num_tokens = len(self.text_tokenizer.vocab) + self.num_text_tokens = self.num_tokens - 5 + self.num_type_tokens = 2 + + self._command_tokens = [ + CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']), + CommandToken('ENC', '[CLS]', self.text_tokenizer.vocab['[CLS]']), + CommandToken('MASK', '[MASK]', + self.text_tokenizer.vocab['[MASK]']), + CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']), + CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']), + CommandToken('eos', '[PAD]', self.text_tokenizer.vocab['[PAD]']), + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_task_mask: + self._command_tokens.extend([ + CommandToken('gMASK', '[gMASK]', self.num_tokens), + CommandToken('sMASK', '[sMASK]', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + if add_sentinel_token > 0: + for i in range(1, add_sentinel_token): + self._command_tokens.extend([ + CommandToken(f'MASK{i}', f'[MASK{i}]', self.num_tokens), + CommandToken(f'sop{i}', f'<|startofpiece{i}|>', + self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + # set type tokens + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + # parse tokens and vocabs from tokenizer + + self._tokens = list(self.text_tokenizer.vocab.keys()) + self._vocab = {k: v for k, v in self.text_tokenizer.vocab.items()} + + self._text_tokens = list(self._tokens) + self._text_token_vocab = { + k: v + for k, v in self.text_tokenizer.vocab.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + 
self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def _encode(self, text): + tokens = self.text_tokenizer.tokenize(text) + ids = self.text_tokenizer.convert_tokens_to_ids(tokens) + return ids + + def EncodeAsTokens(self, text, process_fn=None): + """convert wordpiece token to Id""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.text_tokenizer.tokenize(processed_text) + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id, type_token=False): + """convert Id to sentencpiece token""" + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id in self.command_id_map: + return self.command_id_map[Id].token + return self.text_tokenizer.ids_to_tokens[Id] + + def TokenToId(self, token, type_token=False): + """convert sentencpiece token to Id""" + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + return self.text_tokenizer.vocab[token] + + def DecodeIds(self, Ids, type_token=False): + """converts ids to wordpiece tokens and joins them as a text string""" + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. + type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + Tokens = [] + for Id in Ids: + if Id in self.command_id_map: + Tokens.append(self.command_id_map[Id].token) + elif Id in self.text_tokenizer.ids_to_tokens: + Tokens.append(self.text_tokenizer.ids_to_tokens[Id]) + new_tokens = [] + for token in Tokens: + if token.startswith('##') and len(new_tokens) > 0: + new_tokens[-1] += token[2:] + else: + new_tokens.append(token) + return ' '.join(new_tokens) + + def DecodeTokens(self, Tokens, type_token=False): + """converts wordpiece tokens to a text string""" + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return ' '.join(Tokens) + + +class GPT2BPETokenizer(Tokenizer): + + def __init__(self, + model_type_or_path, + cache_dir=None, + add_block_symbols=False, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + self.text_tokenizer = GPT2Tokenizer.from_pretrained( + model_type_or_path, cache_dir=cache_dir) + + # disable max len warnings by increasing max len + self.text_tokenizer.max_len = int(1e12) + self.num_tokens = len(self.text_tokenizer.encoder) + self.num_type_tokens = 2 + if model_type_or_path.startswith('roberta'): + self.num_command_tokens = 6 + self.num_text_tokens = self.num_tokens - 3 + self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', + self.text_tokenizer.encoder['']), + CommandToken('eos', '<|endoftext|>', + self.text_tokenizer.encoder['']), + CommandToken('sep', '[SEP]', + self.text_tokenizer.encoder['']), + CommandToken('ENC', '[CLS]', + self.text_tokenizer.encoder['']), + CommandToken( + 'MASK', + '[MASK]', + self.text_tokenizer.encoder[''], + lstrip=True), + CommandToken('unk', '[UNK]', + self.text_tokenizer.encoder['']) + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + else: + self.num_command_tokens = 2 + self.num_text_tokens = self.num_tokens - 1 + 
self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']), + CommandToken('eos', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']) + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), + CommandToken('ENC', '[CLS]', self.num_tokens + 2), + CommandToken( + 'MASK', '[MASK]', self.num_tokens + 3, lstrip=True), + CommandToken('sep', '[SEP]', self.num_tokens + 4), + CommandToken('unk', '[UNK]', self.num_tokens + 5) + ]) + self.num_tokens += 6 + self.num_command_tokens += 6 + if add_block_symbols: + if add_task_mask: + self._command_tokens.extend([ + CommandToken( + 'gMASK', '[gMASK]', self.num_tokens, lstrip=True), + CommandToken( + 'sMASK', '[sMASK]', self.num_tokens + 1, lstrip=True) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + self._tokens = list(self.text_tokenizer.encoder.keys()) + self._vocab = {k: v for k, v in self.text_tokenizer.encoder.items()} + + self._text_tokens = list(self._tokens) + self._text_token_vocab = { + k: v + for k, v in self.text_tokenizer.encoder.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + for idx, tok in self.command_id_map.items(): + self.text_tokenizer.decoder[idx] = tok.token + + def EncodeAsIds(self, text, process_fn=None): + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + + def split_on_token(tok_extended: CommandToken, text): + result = [] + tok = tok_extended.token + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + # CommandToken can control whitespace stripping around them. + # We use them for GPT2 and Roberta to have different behavior depending on the special token + # Cf. 
https://github.com/huggingface/transformers/pull/2778
+                # and https://github.com/huggingface/transformers/issues/3788
+                # Strip white spaces on the right
+                if tok_extended.rstrip and i > 0:
+                    # A bit counter-intuitive but we strip the left of the string
+                    # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                    sub_text = sub_text.lstrip()
+                # Strip white spaces on the left
+                if tok_extended.lstrip and i < len(split_text) - 1:
+                    sub_text = sub_text.rstrip()  # Opposite here
+
+                if i == 0 and not sub_text:
+                    result.append(tok)
+                elif i == len(split_text) - 1:
+                    if sub_text:
+                        result.append(sub_text)
+                    else:
+                        pass
+                else:
+                    if sub_text:
+                        result.append(sub_text)
+                    result.append(tok)
+            return result
+
+        def split_on_tokens(tok_list, text):
+            if not text.strip():
+                return []
+            if not tok_list:
+                return self.text_tokenizer.encode(text)
+
+            tokenized_text = []
+            text_list = [text]
+            for tok in tok_list:
+                tokenized_text = []
+                for sub_text in text_list:
+                    if sub_text not in self._command_token_tokens:
+                        tokenized_text.extend(split_on_token(tok, sub_text))
+                    else:
+                        tokenized_text.append(sub_text)
+                text_list = tokenized_text
+
+            return list(
+                itertools.chain.from_iterable(
+                    (self.text_tokenizer.encode(token)
+                     if token not in self._command_token_tokens else
+                     [self.command_token_map[token].Id]
+                     for token in tokenized_text)))
+
+        no_split_tokens = self._command_tokens
+        Ids = split_on_tokens(no_split_tokens, processed_text)
+        tokenization = Tokenization(Ids, processed_text, text)
+        tokenization.set_command_tokens(self._command_tokens)
+        return tokenization
+
+    def _encode(self, text):
+        return self.text_tokenizer.encode(text)
+
+    def EncodeAsTokens(self, text, process_fn=None):
+        processed_text = text
+        if process_fn is not None:
+            processed_text = process_fn(processed_text)
+        tokens = []
+        for token in re.findall(self.text_tokenizer.pat, processed_text):
+            token = ''.join(self.text_tokenizer.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            tokens.extend(
+                bpe_token
+                for bpe_token in self.text_tokenizer.bpe(token).split(' '))
+        tokenization = Tokenization(tokens, processed_text, text, asIds=False)
+        tokenization.set_command_tokens(self._command_tokens)
+        return tokenization
+
+    def DecodeAsTokens(self, Ids):
+        return [self.IdToToken(x) for x in Ids]
+
+    def IdToToken(self, Id, type_token=False):
+        if isinstance(Id, (TypeToken, CommandToken)):
+            return Id.token
+        if type_token:
+            return self.type_id_map[Id].token
+        if Id in self.command_id_map:
+            return self.command_id_map[Id].token
+        return self.text_tokenizer.decoder[Id]
+
+    def TokenToId(self, token, type_token=False):
+        if isinstance(token, (TypeToken, CommandToken)):
+            return token.Id
+        if type_token:
+            return self.type_token_map[token].Id
+        return self.text_tokenizer.encoder[token]
+
+    def DecodeIds(self, Ids, type_token=False):
+        if type_token:
+            return ' '.join(
+                Id.token if isinstance(Id, TypeToken) else self.
+ type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return self.text_tokenizer.decode(Ids) + + def DecodeTokens(self, Tokens, type_token=False): + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return self.text_tokenizer.decode( + [self.TokenToId(tok) for tok in Tokens]) + + +class ChineseSPTokenizer(Tokenizer): + + def __init__(self, + model_path, + add_block_symbols=False, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + self.text_tokenizer = sp_tokenizer.from_pretrained(model_path) + + self.num_command_tokens = 0 + self.num_text_tokens = self.text_tokenizer.sp.vocab_size() + self.num_tokens = self.num_text_tokens + self.num_type_tokens = 2 + + self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', self.num_text_tokens), + CommandToken('eos', '<|endoftext|>', self.num_text_tokens), + CommandToken('sep', '[SEP]', self.num_text_tokens + 1), + CommandToken('ENC', '[CLS]', self.num_text_tokens + 2), + CommandToken( + 'MASK', '[MASK]', self.num_text_tokens + 3, lstrip=True), + CommandToken('unk', '[UNK]', self.num_text_tokens + 4) + ] + self.num_tokens += 5 + self.num_command_tokens += 6 + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens + 1), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_task_mask: + self._command_tokens.extend([ + CommandToken( + 'gMASK', '[gMASK]', self.num_tokens, lstrip=True), + CommandToken( + 'sMASK', '[sMASK]', self.num_tokens + 1, lstrip=True) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + # self._tokens = list(self.text_tokenizer.encoder.keys()) + # self._vocab = {k:v for k,v in self.text_tokenizer.encoder.items()} + # + # self._text_tokens = list(self._tokens) + # self._text_token_vocab = {k:v for k,v in self.text_tokenizer.encoder.items()} + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def _encode(self, text): + ids = self.text_tokenizer.encode(text) + return ids + + def EncodeAsTokens(self, text, process_fn=None): + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.text_tokenizer.tokenize(processed_text) + tokenization = Tokenization(tokens, processed_text, text, asIds=False) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + # return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id, 
type_token=False): + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id in self.command_id_map: + return self.command_id_map[Id].token + elif Id in self.type_id_map: + return self.type_id_map[Id].token + else: + return self.text_tokenizer.convert_id_to_token(int(Id)) + + def TokenToId(self, token, type_token=False): + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + return self.text_tokenizer.convert_token_to_id(token) + + def DecodeIds(self, Ids, type_token=False): + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. + type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + Ids = list(map(int, Ids)) + pieces = [] + last = 0 + for i, token_id in enumerate(Ids): + if token_id in self.command_id_map: + pieces.append(Ids[last:i]) + pieces.append(token_id) + last = i + 1 + pieces.append(Ids[last:]) + text = '' + for piece in pieces: + if isinstance(piece, int): + text += self.command_id_map[piece].token + elif piece: + text += self.text_tokenizer.decode(piece) + return text + + def DecodeTokens(self, Tokens, type_token=False): + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return self.text_tokenizer.decode( + [self.TokenToId(tok) for tok in Tokens]) diff --git a/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py b/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py new file mode 100644 index 00000000..d179e055 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py @@ -0,0 +1,359 @@ +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import os +import sys +from io import open + +import json +import regex as re + +from .file_utils import cached_path + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
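+    # With this fallback the decorator is a no-op, so bytes_to_unicode()
+    # below is simply recomputed on every call instead of being cached.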
+ def lru_cache(): + return lambda func: func + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': '.pytorch_pretrained_bert/gpt2-vocab.json', + 'roberta': '.pytorch_pretrained_bert/roberta-vocab.json' +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': '.pytorch_pretrained_bert/gpt2-merges.txt', + 'roberta': '.pytorch_pretrained_bert/roberta-merges.txt' +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + + @classmethod + def from_pretrained(cls, + pretrained_model_name_or_path, + cache_dir=None, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ + pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[ + pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, + VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, + MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, + SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info('loading special tokens file {}'.format( + special_tokens_file)) + # redirect to the cache, if necessary + # try: + # resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + # resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + # except EnvironmentError: + # logger.error( + # "Model name '{}' was not found in model name list ({}). 
" + # "We assumed '{}' was a path or url but couldn't find files {} and {} " + # "at this path or url.".format( + # pretrained_model_name_or_path, + # ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + # pretrained_model_name_or_path, + # vocab_file, merges_file)) + # return None + # if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + # logger.info("loading vocabulary file {}".format(vocab_file)) + # logger.info("loading merges file {}".format(merges_file)) + # else: + # logger.info("loading vocabulary file {} from cache at {}".format( + # vocab_file, resolved_vocab_file)) + # logger.info("loading merges file {} from cache at {}".format( + # merges_file, resolved_merges_file)) + resolved_vocab_file = vocab_file + resolved_merges_file = merges_file + logger.info('loading vocabulary file {}'.format(vocab_file)) + logger.info('loading merges file {}'.format(merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ + pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open( + special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls( + resolved_vocab_file, + resolved_merges_file, + special_tokens=special_tokens, + *inputs, + **kwargs) + return tokenizer + + def __init__(self, + vocab_file, + merges_file, + errors='replace', + special_tokens=None, + max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" + ) + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) + for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = { + v: k + for k, v in self.special_tokens.items() + } + logger.info('Special tokens {}'.format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend( + bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 + and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + 'Token indices sequence length is longer than the specified maximum ' + ' sequence length for this OpenAI GPT model ({} > {}). Running this' + ' sequence through the model will result in indexing errors'. 
+ format(len(ids), self.max_len)) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors=self.errors) + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error('Vocabulary path ({}) should be a directory'.format( + vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, 'w', encoding='utf-8') as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted( + self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + 'Saving vocabulary to {}: BPE merge indices are not consecutive.' + ' Please check that the tokenizer is not corrupted!'. + format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted( + self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + 'Saving special tokens vocabulary to {}: BPE indices are not consecutive.' + ' Please check that the tokenizer is not corrupted!'. + format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/modelscope/models/nlp/mglm/data_utils/wordpiece.py b/modelscope/models/nlp/mglm/data_utils/wordpiece.py new file mode 100755 index 00000000..1cecffbd --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/wordpiece.py @@ -0,0 +1,408 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes. 
Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py""" # noqa + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import collections +import logging +import os +import unicodedata +from io import open + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'bert-base-uncased': + '.pytorch_pretrained_bert/bert-base-uncased-vocab.txt', + 'bert-large-uncased': + '.pytorch_pretrained_bert/bert-large-uncased-vocab.txt', + 'bert-base-cased': + '.pytorch_pretrained_bert/bert-base-cased-vocab.txt', + 'bert-large-cased': + '.pytorch_pretrained_bert/bert-large-cased-vocab.txt', + 'bert-base-multilingual-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt', + 'bert-base-multilingual-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt', + 'bert-base-chinese': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt', +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'bert-base-uncased': 512, + 'bert-large-uncased': 512, + 'bert-base-cased': 512, + 'bert-large-cased': 512, + 'bert-base-multilingual-uncased': 512, + 'bert-base-multilingual-cased': 512, + 'bert-base-chinese': 512, +} +VOCAB_NAME = 'vocab.txt' + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, 'r', encoding='utf-8') as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, + vocab_file, + do_lower_case=True, + max_len=None, + do_basic_tokenize=True, + never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + 'model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + .format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def tokenize(self, text): + if self.do_basic_tokenize: + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + logger.warning( + 'Token indices sequence length is longer than the specified maximum ' + ' sequence length for this BERT model ({} > {}). Running this' + ' sequence through BERT will result in indexing errors'.format( + len(ids), self.max_len)) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + @classmethod + def from_pretrained(cls, + pretrained_model_name_or_path, + cache_dir=None, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ + pretrained_model_name_or_path] + else: + vocab_file = pretrained_model_name_or_path + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + 'associated to this path or url.'.format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info('loading vocabulary file {}'.format(vocab_file)) + else: + logger.info('loading vocabulary file {} from cache at {}'.format( + vocab_file, resolved_vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ + pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) + return tokenizer + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')): + """Constructs a BasicTokenizer. 
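+
+        A short illustration of the expected behaviour with the defaults
+        (result shown as a comment)::
+
+            BasicTokenizer().tokenize("Hello, World!")
+            # ['hello', ',', 'world', '!']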
+ + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in self.never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(' '.join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + if text in self.never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(' ') + output.append(char) + output.append(' ') + else: + output.append(char) + return ''.join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
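+        # For example, U+4E2D ('中') falls inside the base CJK block checked
+        # below, while Hiragana U+3042 ('あ') does not and is therefore not
+        # padded with spaces by _tokenize_chinese_chars.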
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # noqa + (cp >= 0x3400 and cp <= 0x4DBF) or # noqa + (cp >= 0x20000 and cp <= 0x2A6DF) or # noqa + (cp >= 0x2A700 and cp <= 0x2B73F) or # noqa + (cp >= 0x2B740 and cp <= 0x2B81F) or # noqa + (cp >= 0x2B820 and cp <= 0x2CEAF) or # noqa + (cp >= 0xF900 and cp <= 0xFAFF) or # noqa + (cp >= 0x2F800 and cp <= 0x2FA1F)): # noqa + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(' ') + else: + output.append(char) + return ''.join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = ''.join(chars[start:end]) + if start > 0: + substr = '##' + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == ' ' or char == '\t' or char == '\n' or char == '\r': + return True + cat = unicodedata.category(char) + if cat == 'Zs': + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == '\t' or char == '\n' or char == '\r': + return False + cat = unicodedata.category(char) + if cat.startswith('C'): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
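+    # For example, '$' (cp 36) and '^' (cp 94) fall inside the ASCII ranges
+    # checked below, so they count as punctuation here even though their
+    # Unicode category is a symbol class rather than "P*".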
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) + or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith('P'): + return True + return False diff --git a/modelscope/models/nlp/mglm/fp16/__init__.py b/modelscope/models/nlp/mglm/fp16/__init__.py new file mode 100644 index 00000000..90d20bcf --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .fp16 import * # noqa +from .fp16util import (BN_convert_float, FP16Model, clip_grad_norm, + convert_module, convert_network, + master_params_to_model_params, + model_grads_to_master_grads, network_to_half, + prep_param_lists, to_python_float, tofp16) +from .loss_scaler import * # noqa diff --git a/modelscope/models/nlp/mglm/fp16/fp16.py b/modelscope/models/nlp/mglm/fp16/fp16.py new file mode 100755 index 00000000..10fbd804 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/fp16.py @@ -0,0 +1,660 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Stable version of apex FP16 Optimizer""" +import torch +from torch import nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from .fp16util import (clip_grad_norm, master_params_to_model_params, + model_grads_to_master_grads) +from .loss_scaler import DynamicLossScaler, LossScaler + +FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) + + +def conversion_helper(val, conversion): + """Apply conversion to val. 
Recursively apply conversion if `val` is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_fp16(val): + """Convert fp32 `val` to fp16""" + + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, FLOAT_TYPES): + val = val.half() + return val + + return conversion_helper(val, half_conversion) + + +def fp16_to_fp32(val): + """Convert fp16 `val` to fp32""" + + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, HALF_TYPES): + val = val.float() + return val + + return conversion_helper(val, float_conversion) + + +class FP16_Module(nn.Module): + + def __init__(self, module): + super(FP16_Module, self).__init__() + self.add_module('module', module.half()) + + def forward(self, *inputs, **kwargs): + return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination, prefix, keep_vars) + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + +# TODO: Update overflow check + downscale to use Carl's fused kernel. +class FP16_Optimizer(object): + """ + :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, + and manage static or dynamic loss scaling and master weights in a manner transparent to the user. + For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, + and changing the call to ``backward``. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + # Name the FP16_Optimizer instance to replace the existing optimizer + # (recommended but not required): + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + # loss.backward() becomes: + optimizer.backward(loss) + ... + + Example with dynamic loss scaling:: + + ... + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # optional arg to control dynamic loss scaling behavior + # dynamic_loss_args={'scale_window' : 500}) + # Usually, dynamic_loss_args is not necessary. + + Args: + init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. + static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. + dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. 
+ dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. + verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. + + ``init_optimizer`` is expected to have been constructed in the ordinary way. + It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be + named to replace ``init_optimizer``, for two reasons: + First, it means that references to the same name + later in the file will not have to change. + Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to + modify ``init_optimizer``. If you do choose a unique name for the new + :class:`FP16_Optimizer` instance, you should only work with this new instance, + because the preexisting optimizer might no longer behave as expected. + + ``init_optimizer`` may be any Pytorch optimizer. + It may contain a mixture of fp16 and fp32 parameters organized into any number of + ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will + ingest these ``param_groups`` and remember them. + + Calls to :: + + loss.backward() + + must be replaced with :: + + optimizer.backward(loss) + + because :class:`FP16_Optimizer` requires ownership of the backward pass to implement + loss scaling and copies to master gradients. + + .. note:: + Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients + are downscaled before being applied. This means that adjusting the loss scale, or using + dynamic loss scaling, should not require retuning the learning rate or any other + hyperparameters. + + + **Advanced options** + + **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. + See docstring for :attr:`step`. + + **Gradient clipping**: Use :attr:`clip_master_grads`. + + **Multiple losses**: If your model accumulates gradients from multiple losses, + this can be made more efficient by supplying ``update_master_grads=False`` + to :attr:`backward`. See docstring for :attr:`backward`. + + **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: + + print(optimizer.loss_scale) + optimizer.loss_scale = new_loss_scale + + For static loss scaling, manually adjusting the loss scale over time is a reasonable + thing to do. During later epochs, gradients may become smaller, and a + higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss + scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting + the loss scale is not recommended. + + **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in + Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` + should still work as intended. 
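+
+    Putting the pieces above together, a minimal sketch (``model``, ``loss_fn``,
+    ``inputs`` and ``targets`` are placeholders, and ``model`` is assumed to
+    already be in half precision on the GPU, e.g. wrapped in :class:`FP16_Module`)::
+
+        optimizer = FP16_Optimizer(
+            torch.optim.SGD(model.parameters(), lr=1e-3), dynamic_loss_scale=True)
+
+        optimizer.zero_grad()
+        optimizer.backward(loss_fn(model(inputs), targets))
+        optimizer.clip_master_grads(1.0)  # clips the fp32 master grads
+        optimizer.step()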
+ """ # noqa + + def __init__(self, + init_optimizer, + static_loss_scale=1.0, + dynamic_loss_scale=False, + dynamic_loss_args=None, + verbose=False): + if not torch.cuda.is_available: + raise SystemError('Cannot use fp16 without CUDA.') + + self.verbose = verbose + + self.optimizer = init_optimizer + # init_state_dict sets up an alternative way to cast per-param state tensors. + # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. + # init_state_dict = init_optimizer.state_dict() + + self.fp16_groups = [] + self.fp32_from_fp16_groups = [] + self.fp32_from_fp32_groups = [] + for i, param_group in enumerate(self.optimizer.param_groups): + self.maybe_print( + 'FP16_Optimizer processing param group {}:'.format(i)) + fp16_params_this_group = [] + fp32_params_this_group = [] + fp32_from_fp16_params_this_group = [] + for i, param in enumerate(param_group['params']): + if param.requires_grad: + if param.type() == 'torch.cuda.HalfTensor': + self.maybe_print( + 'FP16_Optimizer received torch.cuda.HalfTensor with {}' + .format(param.size())) + fp16_params_this_group.append(param) + master_param = param.detach().clone().float() + master_param.requires_grad = True + # Copythe model parallel flag. + master_param.model_parallel = param.model_parallel + param_group['params'][i] = master_param + fp32_from_fp16_params_this_group.append(master_param) + # Reset existing state dict key to the new master param. + # We still need to recast per-param state tensors, if any, to FP32. + if param in self.optimizer.state: + self.optimizer.state[ + master_param] = self.optimizer.state.pop(param) + elif param.type() == 'torch.cuda.FloatTensor': + self.maybe_print( + 'FP16_Optimizer received torch.cuda.FloatTensor with {}' + .format(param.size())) + fp32_params_this_group.append(param) + param_group['params'][i] = param + else: + raise TypeError( + 'Wrapped parameters must be either ' + 'torch.cuda.FloatTensor or torch.cuda.HalfTensor. ' + 'Received {}'.format(param.type())) + + self.fp16_groups.append(fp16_params_this_group) + self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors + self.optimizer.load_state_dict(self.optimizer.state_dict()) + # alternative way to cast per-param state tensors: + # self.optimizer.load_state_dict(init_state_dict) + + if dynamic_loss_scale: + self.dynamic_loss_scale = True + if dynamic_loss_args is not None: + self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + else: + self.loss_scaler = DynamicLossScaler() + else: + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(static_loss_scale) + + self.overflow = False + self.first_closure_call_this_step = True + + self.clip_grad_norm = clip_grad_norm + + def maybe_print(self, msg): + if self.verbose: + print(msg) + + def __getstate__(self): + raise RuntimeError( + 'FP16_Optimizer should be serialized using state_dict().') + + def __setstate__(self, state): + raise RuntimeError( + 'FP16_Optimizer should be deserialized using load_state_dict().') + + def zero_grad(self, set_grads_to_None=False): + """ + Zero fp32 and fp16 parameter grads. + """ + # In principle, only the .grad attributes of the model params need to be zeroed, + # because gradients are copied into the FP32 master params. 
However, we zero + # all gradients owned by the optimizer, just to be safe: + for group in self.optimizer.param_groups: + for p in group['params']: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + # Zero fp16 gradients owned by the model: + for fp16_group in self.fp16_groups: + for param in fp16_group: + if set_grads_to_None: + param.grad = None + else: + if param.grad is not None: + param.grad.detach_( + ) # as in torch.optim.optimizer.zero_grad() + param.grad.zero_() + + def _check_overflow(self): + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + for group in self.fp32_from_fp32_groups: + for param in group: + params.append(param) + self.overflow = self.loss_scaler.has_overflow(params) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + def _master_params_to_model_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp16_group, fp32_from_fp16_group) + + def _model_params_to_master_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp32_from_fp16_group, fp16_group) + + # To consider: Integrate distributed with this wrapper by registering a hook on each variable + # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. + def _model_grads_to_master_grads(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) + + def _downscale_master(self): + if self.loss_scale != 1.0: + for group in self.optimizer.param_groups: + for param in group['params']: + if param.grad is not None: + param.grad.data.mul_(1. / self.loss_scale) + + def clip_master_grads(self, max_norm, norm_type=2): + """ + Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. + + Args: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the current fp32 gradients (viewed as a single vector). + + .. warning:: + Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). + """ # noqa + if not self.overflow: + fp32_params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + fp32_params.append(param) + return self.clip_grad_norm(fp32_params, max_norm, norm_type) + else: + return -1 + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. 
+ Example:: + + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict[ + 'first_closure_call_this_step'] = self.first_closure_call_this_step + state_dict['optimizer_state_dict'] = self.optimizer.state_dict() + state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups + return state_dict + + def load_state_dict(self, state_dict): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + self.first_closure_call_this_step = state_dict[ + 'first_closure_call_this_step'] + self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 2. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. + for current_group, saved_group in zip(self.fp32_from_fp16_groups, + state_dict['fp32_from_fp16']): + for current, saved in zip(current_group, saved_group): + current.data.copy_(saved.data) + + def step(self, closure=None): # could add clip option. + """ + If no closure is supplied, :attr:`step` should be called after + ``fp16_optimizer_obj.backward(loss)``. + :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to + :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params + originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run + another forward pass using their model. + + If a closure is supplied, :attr:`step` may be called without a prior call to + :attr:`backward(loss)`. + This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
+ However, the user should take care that any ``loss.backward()`` call within the closure + has been replaced by ``fp16_optimizer_obj.backward(loss)``. + + Args: + closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. + + Example with closure:: + + # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an + # existing pytorch optimizer. + for input, target in dataset: + def closure(): + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + # loss.backward() becomes: + optimizer.backward(loss) + return loss + optimizer.step(closure) + + .. warning:: + Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. + + .. _`ordinary Pytorch optimizer use`: + http://pytorch.org/docs/master/optim.html#optimizer-step-closure + """ # noqa + + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + + if self.overflow: + self.maybe_print( + 'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}' + .format(scale, self.loss_scale)) + return + + if closure is not None: + retval = self._step_with_closure(closure) + else: + retval = self.optimizer.step() + + self._master_params_to_model_params() + + return retval + + def _step_with_closure(self, closure): + + def wrapped_closure(): + # helpful for debugging + # print("Calling wrapped_closure, first_closure_call_this_step = {}" + # .format(self.first_closure_call_this_step)) + if self.first_closure_call_this_step: + # We expect that the fp16 params are initially fresh on entering self.step(), + # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() + # is called within self.optimizer.step(). + self.first_closure_call_this_step = False + else: + # If self.optimizer.step() internally calls wrapped_closure more than once, + # it may update the fp32 params after each call. However, self.optimizer + # doesn't know about the fp16 params at all. If the fp32 params get updated, + # we can't rely on self.optimizer to refresh the fp16 params. We need + # to handle that manually: + self._master_params_to_model_params() + # Our API expects the user to give us ownership of the backward() call by + # replacing all calls to loss.backward() with optimizer.backward(loss). + # This requirement holds whether or not the call to backward() is made within a closure. + # If the user is properly calling optimizer.backward(loss) within "closure," + # calling closure() here will give the fp32 master params fresh gradients + # for the optimizer to play with, so all wrapped_closure needs to do is call + # closure() and return the loss. + temp_loss = closure() + while (self.overflow): + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + self.maybe_print( + 'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, ' + 'reducing to {}'.format(scale, self.loss_scale)) + temp_loss = closure() + return temp_loss + + retval = self.optimizer.step(wrapped_closure) + + self.first_closure_call_this_step = True + + return retval + + def backward(self, loss, update_master_grads=True, retain_graph=False): + """ + :attr:`backward` performs the following conceptual steps: + + 1. fp32_loss = loss.float() (see first Note below) + 2. scaled_loss = fp32_loss*loss_scale + 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). + 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. + 5. Finally, master grads are divided by loss_scale. + + In this way, after :attr:`backward`, the master params have fresh gradients, + and :attr:`step` may be called. + + .. note:: + :attr:`backward` internally converts the loss to fp32 before applying the loss scale. + This provides some additional safety against overflow if the user has supplied an + fp16 loss value. + However, for maximum overflow safety, the user should + compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to + :attr:`backward`. + + .. warning:: + The gradients found in a model's leaves after the call to + :attr:`backward` should not be regarded as valid in general, + because it's possible + they have been scaled (and in the case of dynamic loss scaling, + the scale factor may change over time). + If the user wants to inspect gradients after a call to :attr:`backward`, + only the master gradients should be regarded as valid. These can be retrieved via + :attr:`inspect_master_grad_data()`. + + Args: + loss: The loss output by the user's model. loss may be either float or half (but see first Note above). + update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. + retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). + + Example:: + + # Ordinary operation: + optimizer.backward(loss) + + # Naive operation with multiple losses (technically valid, but less efficient): + # fp32 grads will be correct after the second call, but + # the first call incurs an unnecessary fp16->fp32 grad copy. + optimizer.backward(loss1) + optimizer.backward(loss2) + + # More efficient way to handle multiple losses: + # The fp16->fp32 grad copy is delayed until fp16 grads from all + # losses have been accumulated. + optimizer.backward(loss1, update_master_grads=False) + optimizer.backward(loss2, update_master_grads=False) + optimizer.update_master_grads() + """ # noqa + # To consider: try multiple backward passes using retain_grad=True to find + # a loss scale that works. After you find a loss scale that works, do a final dummy + # backward pass with retain_graph=False to tear down the graph. Doing this would avoid + # discarding the iteration, but probably wouldn't improve overall efficiency. + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + if update_master_grads: + self.update_master_grads() + + def update_master_grads(self): + """ + Copy the ``.grad`` attribute from stored references to fp16 parameters to + the ``.grad`` attribute of the fp32 master parameters that are directly + updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if + ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. + """ # noqa + if self.dynamic_loss_scale: + self._check_overflow() + if self.overflow: return # noqa + self._model_grads_to_master_grads() + self._downscale_master() + + def inspect_master_grad_data(self): + """ + When running with :class:`FP16_Optimizer`, + ``.grad`` attributes of a model's fp16 leaves should not be + regarded as truthful, because they might be scaled. + After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, + the fp32 master params' ``.grad`` + attributes will contain valid gradients properly divided by the loss scale. However, + because :class:`FP16_Optimizer` flattens some parameters, accessing them may be + nonintuitive. :attr:`inspect_master_grad_data` + allows those gradients to be viewed with shapes corresponding to their associated model leaves. + + Returns: + List of lists (one list for each parameter group). The list for each parameter group + is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. + """ + if self.overflow: + print( + 'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. ' + 'Gradients are currently invalid (may be inf, nan, or stale). Returning None.' + ) + return None + else: + # The optimizer owns only references to master params. + master_grads_data = [] + for param_group in self.optimizer.param_groups: + master_grads_this_group = [] + for param in param_group['params']: + if param.grad is not None: + master_grads_this_group.append(param.grad.data) + else: + master_grads_this_group.append(None) + master_grads_data.append(master_grads_this_group) + return master_grads_data + + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + return self.loss_scaler.loss_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) + + # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) diff --git a/modelscope/models/nlp/mglm/fp16/fp16util.py b/modelscope/models/nlp/mglm/fp16/fp16util.py new file mode 100644 index 00000000..3fcd3005 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/fp16util.py @@ -0,0 +1,220 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
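+
+# The utilities below convert a network to half precision while keeping
+# batchnorm layers in fp32, and maintain fp32 "master" copies of the model
+# parameters. A minimal usage sketch (``MyNet`` is a placeholder module that
+# is not defined in this file):
+#
+#     model = network_to_half(MyNet().cuda())
+#     model_params, master_params = prep_param_lists(model)
+#     # ... forward / backward ...
+#     model_grads_to_master_grads(model_params, master_params)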
+ +import torch +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable + +from modelscope.models.nlp.mglm import mpu + + +class tofp16(nn.Module): + """ + Utility module that implements:: + + def forward(self, input): + return input.half() + """ + + def __init__(self): + super(tofp16, self).__init__() + + def forward(self, input): + return input.half() + + +def BN_convert_float(module): + """ + Utility function for network_to_half(). + + Retained for legacy purposes. + """ + if isinstance( + module, + torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + module.float() + for child in module.children(): + BN_convert_float(child) + return module + + +def network_to_half(network): + """ + Convert model to half precision in a batchnorm-safe way. + + Retained for legacy purposes. It is recommended to use FP16Model. + """ + return nn.Sequential(tofp16(), BN_convert_float(network.half())) + + +def convert_module(module, dtype): + """ + Converts a module's immediate parameters and buffers to dtype. + """ + for param in module.parameters(recurse=False): + if param is not None: + if param.data.dtype.is_floating_point: + param.data = param.data.to(dtype=dtype) + if param._grad is not None and param._grad.data.dtype.is_floating_point: + param._grad.data = param._grad.data.to(dtype=dtype) + + for buf in module.buffers(recurse=False): + if buf is not None and buf.data.dtype.is_floating_point: + buf.data = buf.data.to(dtype=dtype) + + +def convert_network(network, dtype): + """ + Converts a network's parameters and buffers to dtype. + """ + for module in network.modules(): + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm + ) and module.affine is True: + continue + convert_module(module, dtype) + return network + + +class FP16Model(nn.Module): + """ + Convert model to half precision in a batchnorm-safe way. + """ + + def __init__(self, network): + super(FP16Model, self).__init__() + self.network = convert_network(network, dtype=torch.half) + + def forward(self, *inputs): + inputs = tuple(t.half() for t in inputs) + return self.network(*inputs) + + +def backwards_debug_hook(grad): + raise RuntimeError( + 'master_params recieved a gradient in the backward pass!') + + +def prep_param_lists(model, flat_master=False): + """ + Creates a list of FP32 master parameters for a given model, as in + `Training Neural Networks with Mixed Precision: Real Examples`_. + + Args: + model (torch.nn.Module): Existing Pytorch model + flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. + Returns: + A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. + + Example:: + + model_params, master_params = prep_param_lists(model) + + .. warning:: + Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. + + .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: + http://on-demand.gputechconf.com/gtc/2018/video/S81012/ + """ # noqa + model_params = [ + param for param in model.parameters() if param.requires_grad + ] + + if flat_master: + # Give the user some more useful error messages + try: + # flatten_dense_tensors returns a contiguous flat array. + # http://pytorch.org/docs/master/_modules/torch/_utils.html + master_params = _flatten_dense_tensors( + [param.data for param in model_params]).float() + except: # noqa + print( + 'Error in prep_param_lists: model may contain a mixture of parameters ' + 'of different types. Use flat_master=False, or use F16_Optimizer.' + ) + raise + master_params = torch.nn.Parameter(master_params) + master_params.requires_grad = True + # master_params.register_hook(backwards_debug_hook) + if master_params.grad is None: + master_params.grad = master_params.new(*master_params.size()) + return model_params, [master_params] + else: + master_params = [ + param.clone().float().detach() for param in model_params + ] + for param in master_params: + param.requires_grad = True + return model_params, master_params + + +def model_grads_to_master_grads(model_params, + master_params, + flat_master=False): + """ + Copy model gradients to master gradients. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. + """ # noqa + if flat_master: + # The flattening may incur one more deep copy than is necessary. + master_params[0].grad.data.copy_( + _flatten_dense_tensors([p.grad.data for p in model_params])) + else: + for model, master in zip(model_params, master_params): + if model.grad is not None: + if master.grad is None: + master.grad = Variable( + master.data.new(*master.data.size())) + master.grad.data.copy_(model.grad.data) + else: + master.grad = None + + +def master_params_to_model_params(model_params, + master_params, + flat_master=False): + """ + Copy master parameters to model parameters. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. + """ # noqa + if flat_master: + for model, master in zip( + model_params, + _unflatten_dense_tensors(master_params[0].data, model_params)): + model.data.copy_(master) + else: + for model, master in zip(model_params, master_params): + model.data.copy_(master.data) + + +# Backward compatibility fixes + + +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) + +clip_grad_norm = mpu.clip_grad_norm diff --git a/modelscope/models/nlp/mglm/fp16/loss_scaler.py b/modelscope/models/nlp/mglm/fp16/loss_scaler.py new file mode 100755 index 00000000..721571b3 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/loss_scaler.py @@ -0,0 +1,245 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from modelscope.models.nlp.mglm import mpu + + +# item() is a recent addition, so this helps with backward compatibility. +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +class LossScaler: + """ + Class that manages a static loss scale. This class is intended to interact with + :class:`FP16_Optimizer`, and should not be directly manipulated by the user. + + Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to + :class:`FP16_Optimizer`'s constructor. + + Args: + scale (float, optional, default=1.0): The loss scale. + """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
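+
+    Example of overriding the defaults through :class:`FP16_Optimizer` (the
+    concrete values here are illustrative only)::
+
+        optimizer = FP16_Optimizer(optimizer,
+                                   dynamic_loss_scale=True,
+                                   dynamic_loss_args={'init_scale': 2**16,
+                                                      'scale_window': 500})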
+ """ # noqa + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan( + p.grad.data): + return True + + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + overflow_gpu = torch.cuda.ByteTensor([overflow]) + torch.distributed.all_reduce( + overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) + overflow = overflow_gpu[0].item() + return bool(overflow) + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if 'value cannot be converted' not in instance.args[0]: + raise + return True + else: + if cpu_sum == float( + 'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether the gradient overflowed + def update_scale(self, overflow): + + if not hasattr(self, 'min_scale'): + self.min_scale = 1 + if not hasattr(self, 'delayed_shift'): + self.delayed_shift = 1 + if not hasattr(self, 'cur_hysteresis'): + self.cur_hysteresis = 1 + if not hasattr(self, 'consecutive_hysteresis'): + self.consecutive_hysteresis = True + if overflow: + # self.cur_scale /= self.scale_factor + if self.delayed_shift == 1 or self.cur_hysteresis == 1: + self.cur_scale = max(self.cur_scale / self.scale_factor, + self.min_scale) + else: + self.cur_hysteresis -= 1 + self.last_overflow_iter = self.cur_iter + else: + if self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + if (self.cur_iter + - self.last_overflow_iter) % self.scale_window == 0: + if not self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +""" +TO-DO separate out into an example. 
+if __name__ == "__main__": + import torch + from torch.autograd import Variable + from dynamic_loss_scaler import DynamicLossScaler + + # N is batch size; D_in is input dimension; + # H is hidden dimension; D_out is output dimension. + N, D_in, H, D_out = 64, 1000, 100, 10 + + # Create random Tensors to hold inputs and outputs, and wrap them in Variables. + x = Variable(torch.randn(N, D_in), requires_grad=False) + y = Variable(torch.randn(N, D_out), requires_grad=False) + + w1 = Variable(torch.randn(D_in, H), requires_grad=True) + w2 = Variable(torch.randn(H, D_out), requires_grad=True) + parameters = [w1, w2] + + learning_rate = 1e-6 + optimizer = torch.optim.SGD(parameters, lr=learning_rate) + loss_scaler = DynamicLossScaler() + + for t in range(500): + y_pred = x.mm(w1).clamp(min=0).mm(w2) + loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale + print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) + print('Iter {} scaled loss: {}'.format(t, loss.data[0])) + print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) + + # Run backprop + optimizer.zero_grad() + loss.backward() + + # Check for overflow + has_overflow = DynamicLossScaler.has_overflow(parameters) + + # If no overflow, unscale grad and update as usual + if not has_overflow: + for param in parameters: + param.grad.data.mul_(1. / loss_scaler.loss_scale) + optimizer.step() + # Otherwise, don't do anything -- ie, skip iteration + else: + print('OVERFLOW!') + + # Update loss scale for next iteration + loss_scaler.update_scale(has_overflow) + +""" diff --git a/modelscope/models/nlp/mglm/generation_utils.py b/modelscope/models/nlp/mglm/generation_utils.py new file mode 100644 index 00000000..6db75b2d --- /dev/null +++ b/modelscope/models/nlp/mglm/generation_utils.py @@ -0,0 +1,483 @@ +# Copyright 2020 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from collections import UserDict +from typing import Iterable, List, Optional, Tuple + +import torch + +PROCESS_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses. + next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + :obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses. 
+ next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + + Return: + :obj:`UserDict`: A dictionary composed of the fields as defined above: + + - **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated + scores of all non-finished beams. + - **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens + to be added to the non-finished beam_hypotheses. + - **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices + indicating to which beam the next tokens shall be added. + +""" + +FINALIZE_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The final scores of all non-finished beams. + final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The last tokens to be added to the non-finished beam_hypotheses. + final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + + Return: + :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated + sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all + batches finished early due to the :obj:`eos_token_id`. + +""" + + +class BeamScorer(ABC): + """ + Abstract base class for all beam scorers that are used for :meth:`~transformers.PretrainedModel.beam_search` and + :meth:`~transformers.PretrainedModel.beam_sample`. + """ + + @abstractmethod + def process(self, input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, + **kwargs) -> Tuple[torch.Tensor]: + raise NotImplementedError('This is an abstract method.') + + @abstractmethod + def finalize(self, input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, **kwargs) -> torch.LongTensor: + raise NotImplementedError('This is an abstract method.') + + +class BeamSearchScorer(BeamScorer): + r""" + :class:`transformers.BeamScorer` implementing standard beam search decoding. + + Adapted in part from `Facebook's XLM beam search code + `__. + + Args: + batch_size (:obj:`int`): + Batch Size of :obj:`input_ids` for which beam search decoding is run in parallel. + max_length (:obj:`int`): + The maximum length of the sequence to be generated. + num_beams (:obj:`int`): + Number of beams for beam search. 
+ device (:obj:`torch.device`): + Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of + :obj:`BeamSearchScorer` will be allocated. + length_penalty (:obj:`float`, `optional`, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the + model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer + sequences. + do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. + num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1): + The number of beam hypotheses that shall be returned upon calling + :meth:`~transformer.BeamSearchScorer.finalize`. + """ + + def __init__( + self, + batch_size: int, + max_length: int, + num_beams: int, + device: torch.device, + length_penalty: Optional[float] = 1.0, + do_early_stopping: Optional[bool] = False, + num_beam_hyps_to_keep: Optional[int] = 1, + ): + self.max_length = max_length + self.num_beams = num_beams + self.device = device + self.length_penalty = length_penalty + self.do_early_stopping = do_early_stopping + self.num_beam_hyps_to_keep = num_beam_hyps_to_keep + + self._is_init = False + self._beam_hyps = [ + BeamHypotheses( + num_beams=self.num_beams, + max_length=self.max_length, + length_penalty=self.length_penalty, + early_stopping=self.do_early_stopping, + ) for _ in range(batch_size) + ] + self._done = torch.tensor([False for _ in range(batch_size)], + dtype=torch.bool, + device=self.device) + + # if not isinstance(num_beams, int) or num_beams <= 1: + # raise ValueError( + # ) + + @property + def is_done(self) -> bool: + return self._done.all() + + def process(self, + input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, + next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + mems=None) -> Tuple[torch.Tensor]: + cur_len = input_ids.shape[-1] + batch_size = len(self._beam_hyps) + assert batch_size == (input_ids.shape[0] // self.num_beams) + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + device = next_scores.device + next_beam_scores = torch.zeros((batch_size, self.num_beams), + dtype=next_scores.dtype, + device=device) + next_beam_tokens = torch.zeros((batch_size, self.num_beams), + dtype=next_tokens.dtype, + device=device) + next_beam_indices = torch.zeros((batch_size, self.num_beams), + dtype=next_indices.dtype, + device=device) + + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + assert ( + len(beam_hyp) >= self.num_beams + ), 'Batch can only be done if at least {} beams have been generated'.format( + self.num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), 'generated beams >= num_beams -> eos_token_id and pad_token have to be defined' + # pad the batch + next_beam_scores[batch_idx, :] = 0 + next_beam_tokens[batch_idx, :] = pad_token_id + next_beam_indices[batch_idx, :] = 0 + continue + + # next tokens for this sentence + beam_idx = 0 + for beam_token_rank, (next_token, next_score, + next_index) in enumerate( + zip(next_tokens[batch_idx], + next_scores[batch_idx], + next_indices[batch_idx])): + batch_beam_idx = batch_idx * self.num_beams + next_index + # add to generated hypotheses if end of sentence + if (eos_token_id is not None) and (next_token.item() + in eos_token_id): + # if beam_token does 
not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.num_beams + if is_beam_token_worse_than_top_num_beams: + continue + beam_hyp.add( + input_ids[batch_beam_idx].clone(), + next_score.item(), + mems=[mem[[next_index.item()]] + for mem in mems] if mems else None) + else: + # add next predicted token since it is not eos_token + next_beam_scores[batch_idx, beam_idx] = next_score + next_beam_tokens[batch_idx, beam_idx] = next_token + next_beam_indices[batch_idx, beam_idx] = batch_beam_idx + beam_idx += 1 + + # once the beam for next step is full, don't add more tokens to it. + if beam_idx == self.num_beams: + break + + if beam_idx < self.num_beams: + raise ValueError( + f'At most {self.num_beams} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected.' # noqa + ) # noqa + + # Check if we are done so that we can save a pad step if all(done) + self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( + next_scores[batch_idx].max().item(), cur_len) + + return UserDict({ + 'next_beam_scores': next_beam_scores.view(-1), + 'next_beam_tokens': next_beam_tokens.view(-1), + 'next_beam_indices': next_beam_indices.view(-1), + }) + + def finalize(self, + input_ids: torch.LongTensor, + final_beam_scores: torch.FloatTensor, + final_beam_tokens: torch.LongTensor, + final_beam_indices: torch.LongTensor, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + mems=None) -> Tuple[torch.LongTensor, List[torch.Tensor]]: + batch_size = len(self._beam_hyps) + + # finalize all open beam hypotheses and add to generated hypotheses + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + continue + + # need to add best num_beams hypotheses to generated hyps + for beam_id in range(self.num_beams): + batch_beam_idx = batch_idx * self.num_beams + beam_id + final_score = final_beam_scores[batch_beam_idx].item() + final_tokens = input_ids[batch_beam_idx] + beam_hyp.add( + final_tokens, + final_score, + mems=[mem[[batch_beam_idx]] + for mem in mems] if mems else None) + + # select the best hypotheses + sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) + best = [] + + # retrieve best hypotheses + for i, beam_hyp in enumerate(self._beam_hyps): + sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0]) + for j in range(self.num_beam_hyps_to_keep): + best_hyp, mems = sorted_hyps.pop()[1:] + sent_lengths[self.num_beam_hyps_to_keep * i + + j] = len(best_hyp) + best.append((best_hyp, mems)) + + # prepare for adding eos + sent_max_len = min(sent_lengths.max().item(), self.max_length) + decoded: torch.LongTensor = input_ids.new( + batch_size * self.num_beam_hyps_to_keep, sent_max_len) + # shorter batches are padded if needed + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, '`pad_token_id` has to be defined' + decoded.fill_(pad_token_id) + + # fill with hypotheses and eos_token_id if the latter fits in + mems = [] + for i, (hypo, mem) in enumerate(best): + decoded[i, :sent_lengths[i]] = hypo + if sent_lengths[i] < sent_max_len: + decoded[i, sent_lengths[i]] = eos_token_id + mems.append(mem) + mems = [ + torch.cat([mem[i] for mem in mems], dim=0) + for i in range(len(mems[0])) + ] if mems and mems[0] else None + return decoded, mems + + +class BeamHypotheses: + + def __init__(self, num_beams: int, max_length: int, length_penalty: float, + early_stopping: bool): 
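+        # Hypotheses are ranked by a length-penalized log probability,
+        #   score = sum_logprobs / max(len(hyp), 1) ** length_penalty,
+        # as computed in add() below; worst_score tracks the current cut-off.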
+ """ + Initialize n-best list of hypotheses. + """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp: torch.LongTensor, sum_logprobs: float, mems=None): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / (max(hyp.shape[-1], 1)**self.length_penalty) + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp, mems)) + if len(self) > self.num_beams: + sorted_next_scores = sorted([ + (s, idx) for idx, (s, _, _) in enumerate(self.beams) + ]) + del self.beams[sorted_next_scores[0][1]] + self.worst_score = sorted_next_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool: + """ + If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst + one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len**self.length_penalty + ret = self.worst_score >= cur_score + return ret + + +class LogitsProcessor(ABC): + """Abstract base class for all logit processors that can be applied during generation.""" + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + """Torch method for processing logits.""" + raise NotImplementedError( + f'{self.__class__} is an abstract class. Only classes inheriting this class can be called.' + ) + + +class LogitsProcessorList(list): + """ + This class can be used to create a list of :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from + list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsProcessor` to the inputs. + """ + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + for processor in self: + scores = processor(input_ids, scores) + return scores + + +class MinLengthLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0. + + Args: + min_length (:obj:`int`): + The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`. + eos_token_id (:obj:`int`): + The id of the `end-of-sequence` token. 
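+
+    A minimal usage sketch (the ids below are illustrative)::
+
+        processor = MinLengthLogitsProcessor(min_length=10, eos_token_id=2)
+        scores = processor(input_ids, scores)  # EOS logit set to -inf while cur_len < 10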
+ """ + + def __init__(self, min_length: int, eos_token_id: int): + if not isinstance(min_length, int) or min_length < 0: + raise ValueError( + f'`min_length` has to be a positive integer, but is {min_length}' + ) + + if not isinstance(eos_token_id, int) or eos_token_id < 0: + raise ValueError( + f'`eos_token_id` has to be a positive integer, but is {eos_token_id}' + ) + + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + cur_len = input_ids.shape[-1] + if cur_len < self.min_length: + scores[:, self.eos_token_id] = -float('inf') + return scores + + +class NoRepeatNGramLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq + `__. + + Args: + ngram_size (:obj:`int`): + All ngrams of size :obj:`ngram_size` can only occur once. + """ + + def __init__(self, ngram_size: int): + if not isinstance(ngram_size, int) or ngram_size <= 0: + raise ValueError( + f'`ngram_size` has to be a strictly positive integer, but is {ngram_size}' + ) + self.ngram_size = ngram_size + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + num_batch_hypotheses = scores.shape[0] + cur_len = input_ids.shape[-1] + banned_batch_tokens = self._calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, cur_len) + + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float('inf') + + return scores + + def _calc_banned_ngram_tokens(self, prev_input_ids: torch.Tensor, + num_hypos: int, + cur_len: int) -> List[Iterable[int]]: + """Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < self.ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] + for i in range(self.ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get( + prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - self.ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, + start_idx:cur_len].tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [ + _get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos) + ] + return banned_tokens diff --git a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py new file mode 100644 index 00000000..ea1dfb5a --- /dev/null +++ b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py @@ -0,0 +1,469 @@ +# Copyright (c) 2022 Zhipu.AI + +import os +import random +from os import path as osp +from typing import Dict + +import numpy as np +import torch +import torch.nn.functional as F + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from . 
import mpu +from .arguments import get_args +from .generation_utils import BeamSearchScorer +from .train_utils import get_model +from .utils import load_checkpoint + +__all__ = ['MGLMForTextSummarization'] + + +def setup_args(args): + args.block_lm = True + args.task_mask = True + args.cloze_eval = True + args.num_layers = 24 + args.hidden_size = 1536 + args.num_attention_heads = 16 + args.max_position_embeddings = 1024 + args.tokenizer_type = 'ChineseSPTokenizer' + args.load_pretrained = '' + args.DDP_impl = 'none' + args.model_parallel_size = 1 + args.fp16 = True + args.cache_dir = 'cache' + args.out_seq_length = 200 + args.seq_length = 512 + args.temperature = 0.9 + args.top_k = 2 + args.top_p = 0.8 + args.frequency_penalty = 0.1 + args.presence_penalty = 0.1 + args.mem_length = args.seq_length + args.mem_length - 1 + return args + + +def setup_model(args): + """Setup model and optimizer.""" + + model = get_model(args, model_type='generation') + + if args.load_pretrained is not None: + args.no_load_optim = True + args.load = args.load_pretrained + _ = load_checkpoint(model, None, None, args) + + return model + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def get_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + loss_mask=None, + attention_mask=None, + set_loss_mask=False, + mem_length=None): + # Extract batch size and sequence length. + batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if mem_length: + if attention_mask is None: + attention_mask = torch.ones( + (1, seq_length, seq_length + mem_length), device=data.device) + attention_mask = torch.tril( + torch.triu(attention_mask, 1 - seq_length + mem_length), + mem_length) + else: + if reset_attention_mask: + att_mask_batch = batch_size + else: + att_mask_batch = 1 + if attention_mask is None: + attention_mask = torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device) + attention_mask = torch.tril(attention_mask) + attention_mask = attention_mask.unsqueeze(1) + + # Loss mask. + if loss_mask is None: + loss_mask = torch.ones( + data.size(), dtype=torch.float, device=data.device) + + # Position ids. + position_ids = torch.arange( + seq_length, dtype=torch.long, device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + if set_loss_mask: + loss_mask[data == eod_token] = 0.0 + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + return attention_mask, loss_mask, position_ids + + +def initialize_distributed(args): + """Initialize torch.distributed.""" + + # Manually set the device ids. 
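+    # With the defaults read below (MASTER_ADDR=localhost, MASTER_PORT=6000),
+    # init_method resolves to 'tcp://localhost:6000'; override both environment
+    # variables for multi-node runs.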
+ device = args.rank % torch.cuda.device_count() + if args.local_rank is not None: + device = args.local_rank + torch.cuda.set_device(device) + # Call the init process + init_method = 'tcp://' + args.master_ip = os.getenv('MASTER_ADDR', 'localhost') + args.master_port = os.getenv('MASTER_PORT', '6000') + init_method += args.master_ip + ':' + args.master_port + torch.distributed.init_process_group( + backend=args.distributed_backend, + world_size=args.world_size, + rank=args.rank, + init_method=init_method) + + # Set the model-parallel / data-parallel communicators. + mpu.initialize_model_parallel(args.model_parallel_size) + + # Optional DeepSpeed Activation Checkpointing Features + # + if hasattr( + args, 'deepspeed' + ) and args.deepspeed and args.deepspeed_activation_checkpointing: + set_deepspeed_activation_checkpointing(args) + + +def get_batch(context_tokens, device, args): + tokens = context_tokens + tokens = tokens.view(args.batch_size, -1).contiguous() + tokens = tokens.to(device) + + # Get the masks and postition ids. + if args.block_lm: + attention_mask = torch.tensor([tokens.size(1)], + device=device, + dtype=torch.long) + position_ids = torch.arange( + tokens.size(1), device=device, dtype=torch.long) + if not args.no_block_position: + block_position_ids = torch.zeros( + tokens.size(1), device=device, dtype=torch.long) + position_ids = torch.stack((position_ids, block_position_ids), + dim=0) + position_ids = position_ids.unsqueeze(0) + else: + attention_mask, loss_mask, position_ids = get_masks_and_position_ids( + tokens, + args.eod_token, + reset_position_ids=False, + reset_attention_mask=False, + set_loss_mask=False, + mem_length=args.mem_length) + + return tokens, attention_mask, position_ids + + +def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + # This function has been mostly taken from huggingface conversational ai code at + # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313 + + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # convert to 1D + logits = logits.view(logits.size()[1]).contiguous() + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + # going back to 2D + logits = logits.view(1, -1).contiguous() + + return logits + + +def sample_sequence(model, + tokenizer, + context_tokens, + context_length, + args, + device, + mems=None, + end_tokens=None): + if not args.block_lm: + context_tokens, attention_mask, position_ids = get_batch( + context_tokens, device, args) + tokens = torch.empty((args.num_beams, 0), + device=context_tokens.device, + dtype=torch.long) + else: + tokens = context_tokens.new_full((1, 1), + tokenizer.get_command('sop').Id) + counter = 0 + if mems is None: + mems = [] + if end_tokens is None: + end_tokens = [args.eod_token] + + last_beam_num = 1 + 
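+    # Sampling loop: each step temperature-scales the logits, applies
+    # frequency/presence penalties to tokens generated so far, filters with
+    # top-k/top-p, then samples one next token until an end token or
+    # out_seq_length is reached.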
output_tokens_list = [] + generated_tokens_list = [] + + while counter < args.out_seq_length: + if counter == 0 and not args.block_lm: + next_token_logits, *mems = model(context_tokens, position_ids, + attention_mask, *mems) + else: + if args.block_lm: + if args.no_block_position: + position_ids = context_tokens.new_full( + (last_beam_num, 1), context_length + counter) + else: + position_ids = context_tokens.new_ones(last_beam_num, 2, 1) + position_ids[:, 0] = context_length + position_ids[:, 1] = counter + 1 + attention_mask = context_tokens.new_zeros( + [1], device=context_tokens.device, dtype=torch.long) + else: + position_ids = context_tokens.new_ones((last_beam_num, 1)) * ( + context_length + counter - 1) + attention_mask = context_tokens.new_ones( + last_beam_num, + 1, + 1, + args.mem_length + 1, + device=context_tokens.device, + dtype=torch.float) + last_token = tokens[:, -1:] + next_token_logits, *mems = model(last_token, position_ids, + attention_mask, *mems) + next_token_logits = next_token_logits[:, -1] + + next_token_logits /= args.temperature + frequency_count = torch.zeros(next_token_logits.shape) + for tk in output_tokens_list: + frequency_count[0][tk] += 1 + + next_token_logits -= (args.frequency_penalty + * frequency_count).to(device) + next_token_logits -= ( + args.presence_penalty * # noqa + (frequency_count > 0)).to(device) + + next_token_logits = top_k_logits( + next_token_logits, top_k=args.top_k, top_p=args.top_p) + log_probs = F.softmax(next_token_logits, dim=-1) + prev = torch.multinomial(log_probs, num_samples=1)[0] + is_end = prev.item() in end_tokens + if is_end: + break + decode_tokens = tokenizer.DecodeIds([prev.item()]) # noqa + generated_tokens_list.append(prev.item()) + prev = prev.view(1, 1) + tokens = prev if tokens is None else torch.cat((tokens, prev), dim=1) + counter += 1 + output_tokens_list = tokens.view(-1).contiguous() + return torch.cat((context_tokens, tokens), dim=1), mems + + +def read_context(tokenizer, args, context): + terminate_runs, skip_run = 0, 0 # noqa + if mpu.get_model_parallel_rank() == 0: + while True: + # raw_text = input("\nContext prompt (stop to exit) >>> ") + raw_text = context + if not raw_text: + print('Prompt should not be empty!') + break + # if raw_text == "stop": + # terminate_runs = 1 + # break + generation_mask = '[gMASK]' if args.task_mask else '[MASK]' + if args.block_lm and 'MASK]' not in raw_text: + raw_text += ' ' + generation_mask + # output.write(raw_text) + context_tokens = tokenizer.EncodeAsIds(raw_text).tokenization + if args.block_lm: + context_tokens = [tokenizer.get_command('ENC').Id + ] + context_tokens + if not raw_text.endswith('[gMASK]'): + context_tokens = context_tokens + [ + tokenizer.get_command('eos').Id + ] + context_length = len(context_tokens) + + if context_length >= args.seq_length: + print('\nContext length', context_length, + '\nPlease give smaller context than the window length!') + break + break + else: + context_length = 0 + + terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) + torch.distributed.broadcast( + terminate_runs_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + terminate_runs = terminate_runs_tensor[0].item() + + if terminate_runs == 1: + return terminate_runs, None, None, None + + context_length_tensor = torch.cuda.LongTensor([context_length]) + + torch.distributed.broadcast( + context_length_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + context_length = context_length_tensor[0].item() 
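+    # Rank 0 holds the real tokenization; the other model-parallel ranks allocate
+    # a placeholder of the broadcast length and receive the token ids below.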
+ if mpu.get_model_parallel_rank() == 0: + context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + else: + context_tokens_tensor = torch.cuda.LongTensor([0] * context_length) + torch.distributed.broadcast( + context_tokens_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + if mpu.get_model_parallel_rank() != 0: + raw_text = tokenizer.DecodeIds(context_tokens_tensor.tolist()) + return terminate_runs, raw_text, context_tokens_tensor, context_length + + +@MODELS.register_module(Tasks.text_summarization, module_name=Models.mglm) +class MGLMForTextSummarization(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text summarization model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + from .configure_data import prepare_tokenizer + # Disable CuDNN. + torch.backends.cudnn.enabled = False + # Arguments. + self.args = setup_args(get_args()) + self.args.load_pretrained = model_dir + # Pytorch distributed. + try: + initialize_distributed(self.args) + except (RuntimeError): + print('group process initialized twice') + # Random seeds for reproducability. + set_random_seed(self.args.seed) + # setting default batch size to 1 + self.args.batch_size = 1 + self.args.tokenizer_path = model_dir + self.tokenizer = prepare_tokenizer(self.args) + self.model = setup_model(self.args) + self.cfg = Config.from_file( + osp.join(model_dir, ModelFile.CONFIGURATION)) + + def forward(self, input: Dict[str, str]) -> Dict[str, str]: + pass + + def generate(self, input: Dict[str, str]) -> Dict[str, str]: + model = self.model + tokenizer = self.tokenizer + args = self.args + device = torch.cuda.current_device() + model.eval() + + context = input['text'] + self.cfg.model.prompt + with torch.no_grad(): + terminate_runs, raw_text, context_tokens_tensor, context_length = read_context( + tokenizer, args, context) + mems = [] + tokens, attention_mask, position_ids = get_batch( + context_tokens_tensor, device, args) + mask_tokens = ['MASK', 'sMASK', 'gMASK' + ] if args.task_mask else ['MASK'] + mask_tokens = [ + tokenizer.get_command(token).Id for token in mask_tokens + ] + end_tokens = [tokenizer.get_command('eop').Id, args.eod_token] + + mask_positions = [] + for token in mask_tokens: + mask_positions += (context_tokens_tensor == token).nonzero( + as_tuple=True)[0].tolist() + mask_positions.sort() + if args.no_block_position: + for mask_position in mask_positions: + position_ids[0, mask_position + 1:] += args.out_seq_length + _, *mems = model(tokens, position_ids, attention_mask, *mems) + for mask_position in mask_positions: + if args.no_block_position: + position = position_ids[0, mask_position].item() + else: + position = mask_position + tokens, mems, = sample_sequence( + model, + tokenizer, + tokens, + position, + args, + device, + mems=mems, + end_tokens=end_tokens) + output_tokens_list = tokens.view(-1).contiguous() + trim_decode_tokens = tokenizer.DecodeIds( + output_tokens_list.tolist()) + res = trim_decode_tokens.split('<|startofpiece|>')[-1] + print(res) + return {OutputKeys.TEXT: res} diff --git a/modelscope/models/nlp/mglm/model/__init__.py b/modelscope/models/nlp/mglm/model/__init__.py new file mode 100755 index 00000000..84c55ae3 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .distributed import (DistributedDataParallel, + PyTorchDistributedDataParallel) +from .downstream import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, + GLMForSequenceClassification, GLMForSingleTokenCloze) +from .modeling_glm import (GLMModel, + glm_get_params_for_weight_decay_optimization) diff --git a/modelscope/models/nlp/mglm/model/distributed.py b/modelscope/models/nlp/mglm/model/distributed.py new file mode 100755 index 00000000..a3c84e9f --- /dev/null +++ b/modelscope/models/nlp/mglm/model/distributed.py @@ -0,0 +1,127 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed as dist +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable +from torch.nn.modules import Module +from torch.nn.parallel.distributed import DistributedDataParallel as DDP + +from modelscope.models.nlp.mglm import mpu + + +class PyTorchDistributedDataParallel(DDP): + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + sd = self.module.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + +class DistributedDataParallel(Module): + + def __init__(self, module): + super(DistributedDataParallel, self).__init__() + self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False + + self.module = module + self.data_parallel_group = mpu.get_data_parallel_group() + src_rank = mpu.get_model_parallel_rank() + for p in self.module.parameters(): + if torch.is_tensor(p): + dist.broadcast(p, src_rank, group=self.data_parallel_group) + + def allreduce_params(reduce_after=True, + no_scale=False, + fp32_allreduce=False): + if (self.needs_reduction): + self.needs_reduction = False + buckets = {} + for name, param in self.module.named_parameters(): + if param.requires_grad and param.grad is not None: + tp = (param.data.type()) + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + if self.warn_on_half: + if torch.cuda.HalfTensor in buckets: + print( + 'WARNING: gloo dist backend for half parameters may be extremely slow. It is recommended to use the NCCL backend in this case.' 
# noqa + ) + self.warn_on_half = False + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + if fp32_allreduce: + coalesced = coalesced.float() + if not no_scale and not reduce_after: + coalesced /= dist.get_world_size( + group=self.data_parallel_group) + dist.all_reduce(coalesced, group=self.data_parallel_group) + torch.cuda.synchronize() + if not no_scale and reduce_after: + coalesced /= dist.get_world_size( + group=self.data_parallel_group) + for buf, synced in zip( + grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + self.hook_handles = [] + self.hooks = [] + for param in list(self.module.parameters()): + + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(allreduce_params) + + self.allreduce_params = allreduce_params + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + sd = self.module.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + ''' + def _sync_buffers(self): + buffers = list(self.module._all_buffers()) + if len(buffers) > 0: + # cross-node buffer sync + flat_buffers = _flatten_dense_tensors(buffers) + dist.broadcast(flat_buffers, 0) + for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): + buf.copy_(synced) + def train(self, mode=True): + # Clear NCCL communicator and CUDA event cache of the default group ID, + # These cache will be recreated at the later call. This is currently a + # work-around for a potential NCCL deadlock. 
+ if dist._backend == dist.dist_backend.NCCL: + dist._clear_group_cache() + super(DistributedDataParallel, self).train(mode) + self.module.train(mode) + ''' diff --git a/modelscope/models/nlp/mglm/model/downstream.py b/modelscope/models/nlp/mglm/model/downstream.py new file mode 100644 index 00000000..61b1e807 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/downstream.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022 Zhipu.AI +"""Multiple choice model.""" + +import torch +import torch.nn + +from .modeling_glm import GLMModel + + +class GLMForMultiTokenCloze(torch.nn.Module): + + def __init__(self, + language_model: GLMModel, + take_softmax=True, + length_penalty=0.0): + super(GLMForMultiTokenCloze, self).__init__() + self.model = language_model + self.take_softmax = take_softmax + self.length_penalty = length_penalty + + def state_dict(self, destination=None, prefix='', keep_vars=False): + # [h.remove() for h in self.hook_handles] + sd = self.model.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.model.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.model.named_parameters(prefix=prefix, recurse=recurse) + + def forward(self, + input_ids, + position_ids, + attention_mask, + target_ids=None, + logit_mask=None, + prompt_pos=None): + if target_ids is None: + return self.model(input_ids, position_ids, attention_mask) + num_choices = None + if len(input_ids.shape) == 3: + batch_size, num_choices = input_ids.shape[:2] + input_ids = input_ids.reshape(-1, input_ids.size(-1)) + attention_mask = attention_mask.reshape(-1, + *attention_mask.size()[2:]) + position_ids = position_ids.reshape(-1, *position_ids.size()[2:]) + target_ids = target_ids.reshape(-1, target_ids.size(-1)) + logit_mask = logit_mask.reshape(-1, logit_mask.size(-1)) + if prompt_pos is not None: + prompt_pos = prompt_pos.reshape(-1, prompt_pos.size(-1)) + outputs, *mems = self.model( + input_ids, position_ids, attention_mask, prompt_pos=prompt_pos) + if self.take_softmax: + outputs = torch.nn.functional.log_softmax(outputs, dim=-1) + # select the target logits + batch_ids = torch.arange( + target_ids.size(0), dtype=torch.long, device=target_ids.device) + batch_ids = batch_ids.unsqueeze(1).expand_as(target_ids) + seq_ids = torch.arange( + target_ids.size(-1), dtype=torch.long, device=target_ids.device) + seq_ids = seq_ids.unsqueeze(0).expand_as(target_ids) + logits = outputs[batch_ids, seq_ids, target_ids] + logits = (logits * logit_mask).sum(dim=1) + if self.length_penalty > 0.0: + logits = logits / logit_mask.sum(dim=1)**self.length_penalty + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) + + +class GLMForMultiTokenClozeFast(torch.nn.Module): + + def __init__(self, language_model, take_softmax=True, length_penalty=0.0): + super(GLMForMultiTokenClozeFast, self).__init__() + self.model = language_model + self.take_softmax = take_softmax + self.length_penalty = length_penalty + + def forward(self, input_ids, position_ids, attention_mask, dec_input_ids, + dec_position_ids, dec_attention_mask, dec_target_ids, + dec_logit_mask): + # encoder + outputs, *mems = self.model( + input_ids, + position_ids, + attention_mask, + return_memory=True, + detach_memory=False) + batch_size, num_choices, max_dec_len = dec_input_ids.size() + max_enc_len = input_ids.size(-1) + + enc_mems = [] + for hidden in mems: + hidden = hidden.unsqueeze(1).expand(-1, 
num_choices, -1, + -1).reshape( + batch_size * num_choices, + *hidden.size()[1:]) + enc_mems.append(hidden) + + def build_dec_mask_matrix(seq_length, sep, memory_length=0): + m = enc_mems[0].new_ones((1, seq_length, seq_length)) + m = torch.tril(m) + + # sep = dec_attention_mask + ids = torch.arange( + memory_length, device=sep.device, dtype=sep.dtype).view(1, -1) + mask = ids < sep.view(-1, 1) # batch * mem + mask = mask.unsqueeze(1).float().expand(-1, seq_length, -1) + + m = m.expand(batch_size * num_choices, -1, -1) + m = torch.cat((mask, m), dim=2) + m = m.unsqueeze(1) + return m + + dec_input_ids = dec_input_ids.reshape(-1, max_dec_len) + dec_position_ids = dec_position_ids.reshape( + -1, + *dec_position_ids.size()[2:]) + # dec_attention_mask = dec_attention_mask.reshape(-1, *dec_attention_mask.size()[2:]).unsqueeze(1) + dec_attention_mask = build_dec_mask_matrix( + max_dec_len, dec_attention_mask.reshape(-1), max_enc_len) + dec_target_ids = dec_target_ids.reshape(-1, dec_target_ids.size(-1)) + dec_logit_mask = dec_logit_mask.reshape(-1, dec_logit_mask.size(-1)) + + outputs, *mems = self.model(dec_input_ids, dec_position_ids, + dec_attention_mask, *enc_mems) + if self.take_softmax: + outputs = torch.nn.functional.log_softmax(outputs, dim=-1) + + batch_ids = torch.arange( + dec_target_ids.size(0), + dtype=torch.long, + device=dec_target_ids.device) + batch_ids = batch_ids.unsqueeze(1).expand_as(dec_target_ids) + seq_ids = torch.arange( + dec_target_ids.size(-1), + dtype=torch.long, + device=dec_target_ids.device) + seq_ids = seq_ids.unsqueeze(0).expand_as(dec_target_ids) + logits = outputs[batch_ids, seq_ids, dec_target_ids] + logits = (logits * dec_logit_mask).sum(dim=1) + if self.length_penalty > 0.0: + logits = logits / dec_logit_mask.sum(dim=1)**self.length_penalty + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) + + +class GLMForSingleTokenCloze(torch.nn.Module): + + def __init__(self, language_model, take_softmax=False): + super().__init__() + self.model = language_model + self.take_softmax = take_softmax + + def state_dict(self, destination=None, prefix='', keep_vars=False): + # [h.remove() for h in self.hook_handles] + sd = self.model.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.model.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.model.named_parameters(prefix=prefix, recurse=recurse) + + def forward(self, + input_ids, + position_ids, + attention_mask, + target_ids=None, + logit_mask=None, + prompt_pos=None): + if target_ids is None: + return self.model(input_ids, position_ids, attention_mask) + assert len(input_ids.shape) == 2 + outputs, *mems = self.model( + input_ids, position_ids, attention_mask, prompt_pos=prompt_pos) + batch_ids = torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device) + target_logits = outputs[batch_ids, attention_mask] + if self.take_softmax: + target_prob = torch.nn.functional.log_softmax( + target_logits, dim=-1) + else: + target_prob = target_logits + batch_ids = batch_ids.unsqueeze(1).expand_as(target_ids) + output = target_prob[batch_ids, target_ids] + + return (output, target_logits, *mems) + + +class GLMForSequenceClassification(torch.nn.Module): + + def __init__(self, + language_model, + hidden_size, + hidden_dropout, + pool_token, + num_class=1): + super().__init__() + self.pool_token = pool_token + 
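+        # pool_token selects the hidden state fed to the classification head in
+        # forward(): 'start' and 'pad' index via attention_mask, 'cls' uses token 0.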
self.model = language_model + self.num_class = num_class + # Multi-choice head. + self.pool_layer = torch.nn.Linear(hidden_size, hidden_size) + self.multichoice_dropout = torch.nn.Dropout(hidden_dropout) + self.multichoice_head = torch.nn.Linear(hidden_size, num_class) + + def forward(self, input_ids, position_ids, attention_mask): + num_choices = None + if len(input_ids.shape) == 3: + assert self.num_class == 1 + batch_size, num_choices = input_ids.shape[:2] + input_ids = input_ids.reshape(-1, input_ids.size(-1)) + attention_mask = attention_mask.reshape(-1, + *attention_mask.size()[2:]) + position_ids = position_ids.reshape(-1, *position_ids.size()[2:]) + outputs, *mems = self.model(input_ids, position_ids, attention_mask) + if self.pool_token == 'start': + output = outputs[torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device), attention_mask] + elif self.pool_token == 'pad': + output = outputs[torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device), attention_mask - 1] + elif self.pool_token == 'cls': + output = outputs[:, 0] + else: + raise NotImplementedError + output = torch.tanh(self.pool_layer(output)) + multichoice_output = self.multichoice_dropout(output) + logits = self.multichoice_head(multichoice_output) + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) diff --git a/modelscope/models/nlp/mglm/model/modeling_bert.py b/modelscope/models/nlp/mglm/model/modeling_bert.py new file mode 100644 index 00000000..965f82a7 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/modeling_bert.py @@ -0,0 +1,1576 @@ +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import copy +import logging +import math +import os +import shutil +import tarfile +import tempfile + +import json +import mpu +import torch +import torch.nn.functional as F +from data_utils.file_utils import cached_path +from torch import nn +from torch.nn import CrossEntropyLoss + +# from torch.utils.checkpoint import checkpoint + + +def normal_init_method(mean, std): + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=mean, std=std) + + return init_ + + +def scaled_init_method(mean, std, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = std / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=mean, std=std) + + return init_ + + +def bert_extended_attention_mask(attention_mask): + # We create a 3D attention mask from a 2D tensor mask. 
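+    # e.g. a padding mask [[1, 1, 0]] yields the pairwise product
+    # [[1, 1, 0], [1, 1, 0], [0, 0, 0]], unsqueezed to shape [1, 1, 3, 3].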
+ # [b, 1, s] + attention_mask_b1s = attention_mask.unsqueeze(1) + # [b, s, 1] + attention_mask_bs1 = attention_mask.unsqueeze(2) + # [b, s, s] + attention_mask_bss = attention_mask_b1s * attention_mask_bs1 + # [b, 1, s, s] + extended_attention_mask = attention_mask_bss.unsqueeze(1) + + return extended_attention_mask + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': + '/root/data/bert-base-uncased.tar.gz', + 'bert-large-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz', + 'bert-base-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz', + 'bert-large-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz', + 'bert-base-multilingual-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz', + 'bert-base-multilingual-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz', + 'bert-base-chinese': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz', +} +CONFIG_NAME = 'bert_config.json' +WEIGHTS_NAME = 'pytorch_model.bin' +TF_WEIGHTS_NAME = 'model.ckpt' + + +def load_tf_weights_in_bert(model, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print( + 'Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + print('Converting TensorFlow checkpoint from {}'.format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print('Loading TF weight {} with shape {}'.format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ['adam_v', 'adam_m'] for n in name): + print('Skipping {}'.format('/'.join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) # noqa + else: + l = [m_name] # noqa + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print('Initialize PyTorch weight {}'.format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. 
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {'gelu': gelu, 'relu': torch.nn.functional.relu, 'swish': swish} + + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + deep_init=False, + fp32_layernorm=False, + fp32_embedding=False, + fp32_tokentypes=False, + layernorm_epsilon=1e-12): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + if isinstance(vocab_size_or_config_json_file, str): + with open( + vocab_size_or_config_json_file, 'r', + encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.deep_init = deep_init + self.fp32_layernorm = fp32_layernorm + self.fp32_embedding = fp32_embedding + self.layernorm_epsilon = layernorm_epsilon + self.fp32_tokentypes = fp32_tokentypes + else: + raise ValueError( + 'First argument must be either a vocabulary size (int)' + 'or the path to a pretrained model config file (str)') + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, 'r', encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n' + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except ImportError: + print( + 'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.' + ) + + class BertLayerNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size) + # self.word_embeddings = mpu.VocabParallelEmbedding( + # config.vocab_size, config.hidden_size, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.fp32_layernorm = config.fp32_layernorm + self.fp32_embedding = config.fp32_embedding + self.fp32_tokentypes = config.fp32_tokentypes + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange( + seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if not self.fp32_tokentypes: + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + if self.fp32_embedding and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_embedding: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + else: + embeddings = words_embeddings.float() + position_embeddings.float( + ) + token_type_embeddings.float() + if self.fp32_tokentypes and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_tokentypes: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + mixed_query_layer = 
self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + previous_type = attention_probs.type() # noqa + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super(BertSelfOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method( # noqa + mean=0.0, std=config.initializer_range) + self.dense = nn.Linear( + config.hidden_size, config.hidden_size, bias=True) + # self.dense = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=config.hidden_size, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=init_method) + self.fp32_layernorm = config.fp32_layernorm + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + # self.self = mpu.BertParallelSelfAttention( + # hidden_size=config.hidden_size, + # num_attention_heads=config.num_attention_heads, + # dropout_prob=config.attention_probs_dropout_prob, + # output_parallel=True, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + self_output = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear( + config.hidden_size, config.intermediate_size, bias=True) + # self.dense = mpu.ColumnParallelLinear( + # 
input_size=config.hidden_size, + # output_size=config.intermediate_size, + # bias=True, + # gather_output=False, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super(BertOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method( # noqa + mean=0.0, std=config.initializer_range) + self.dense = nn.Linear( + config.intermediate_size, config.hidden_size, bias=True) + # self.dense = mpu.RowParallelLinear( + # input_size=config.intermediate_size, + # output_size=config.hidden_size, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=init_method) + self.fp32_layernorm = config.fp32_layernorm + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super(BertEncoder, self).__init__() + # layer = BertLayer(config) + # self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + # def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + # all_encoder_layers = [] + # for layer_module in self.layer: + # hidden_states = layer_module(hidden_states, attention_mask) + # if output_all_encoded_layers: + # all_encoder_layers.append(hidden_states) + # if not output_all_encoded_layers: + # all_encoder_layers.append(hidden_states) + # return all_encoder_layers + def forward(self, + hidden_states, + attention_mask, + output_all_encoded_layers=True, + checkpoint_activations=False): + all_encoder_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1]) + return x_ + + return custom_forward + + if checkpoint_activations: + l = 0 # noqa + num_layers = len(self.layer) + chunk_length = 1 # math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = mpu.checkpoint( + custom(l, l + chunk_length), 
hidden_states, + attention_mask * 1) + l += chunk_length # noqa + # decoder layers + else: + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask) + + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + + if not output_all_encoded_layers or checkpoint_activations: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.fp32_layernorm = config.fp32_layernorm + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + previous_type = hidden_states.type() + if self.fp32_layernorm: + hidden_states = hidden_states.float() + hidden_states = self.LayerNorm(hidden_states) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
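+ # In the reference (non-model-parallel) implementation the weight of the
+ # decoder defined below is shared with the word-embedding matrix, which is
+ # why its shape is taken from bert_model_embedding_weights
+ # ([vocab_size, hidden_size]); the model-parallel variants of the tied
+ # weight and bias are kept below as comments.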
+ self.decoder = nn.Linear( + bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + # self.decoder_weight = bert_model_embedding_weights + # self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + # self.bias.model_parallel = True + self.fp32_embedding = config.fp32_embedding + self.fp32_layernorm = config.fp32_layernorm + + def convert_to_type(tensor): + if self.fp32_embedding: + return tensor.half() + else: + return tensor + + self.type_converter = convert_to_type + self.converted = False + + def forward(self, hidden_states): + if not self.converted: + self.converted = True + if self.fp32_embedding: + self.transform.half() + if self.fp32_layernorm: + self.transform.LayerNorm.float() + hidden_states = self.transform(self.type_converter(hidden_states)) + hidden_states = self.decoder(hidden_states) + self.bias + # hidden_states = mpu.copy_to_model_parallel_region(hidden_states) + # hidden_states = F.linear(self.type_converter(hidden_states), + # self.type_converter(self.decoder_weight), + # self.type_converter(self.bias)) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + for p in self.seq_relationship.parameters(): + if p is None: + continue + pooled_output = pooled_output.type_as(p) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class PreTrainedBertModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedBertModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + 'Parameter config in `{}(config)` should be an instance of class `BertConfig`. ' + 'To create a model from a Google pretrained model use ' + '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format( + self.__class__.__name__, self.__class__.__name__)) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. 
+ """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, + pretrained_model_name, + state_dict=None, + cache_dir=None, + fp32_layernorm=False, + fp32_embedding=False, + layernorm_epsilon=1e-12, + fp32_tokentypes=False, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ # noqa + if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] + else: + archive_file = pretrained_model_name + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path( + archive_file, cache_dir=cache_dir) + except FileNotFoundError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + 'associated to this path or url.'.format( + pretrained_model_name, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) + return None + if resolved_archive_file == archive_file: + logger.info('loading archive file {}'.format(archive_file)) + else: + logger.info('loading archive file {} from cache at {}'.format( + archive_file, resolved_archive_file)) + tempdir = None + if os.path.isdir(resolved_archive_file): + serialization_dir = resolved_archive_file + else: + # Extract archive to temp dir + tempdir = tempfile.mkdtemp() + logger.info('extracting archive file {} to temp dir {}'.format( + resolved_archive_file, tempdir)) + with tarfile.open(resolved_archive_file, 'r:gz') as archive: + archive.extractall(tempdir) + serialization_dir = tempdir + # Load config + config_file = os.path.join(serialization_dir, CONFIG_NAME) + config = BertConfig.from_json_file(config_file) + config.fp32_layernorm = fp32_layernorm + config.fp32_embedding = fp32_embedding + config.layernorm_epsilon = layernorm_epsilon + config.fp32_tokentypes = fp32_tokentypes + logger.info('Model config {}'.format(config)) + # Instantiate model. 
+ model = cls(config, *inputs, **kwargs) + if state_dict is None: + weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(weights_path) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, + True, missing_keys, unexpected_keys, + error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + if len(missing_keys) > 0: + print('Weights of {} not initialized from pretrained model: {}'. + format(model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + print('Weights from pretrained model not used in {}: {}'.format( + model.__class__.__name__, unexpected_keys)) + if tempdir: + # Clean up temp dir + shutil.rmtree(tempdir) + return model + + +class BertModel(PreTrainedBertModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). + + Params: + config: a BertConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLF`) to train on the Next-Sentence task (see BERT's paper). + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ # noqa + + def __init__(self, config): + super(BertModel, self).__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + output_all_encoded_layers=True, + checkpoint_activations=False): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=next(self.encoder.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder( + embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) + sequence_output = encoded_layers[-1] + for p in self.pooler.parameters(): + if p is None: + continue + sequence_output = sequence_output.type_as(p) + break + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers or checkpoint_activations: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class BertForPreTraining(PreTrainedBertModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. 
+ + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `masked_lm_labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `masked_lm_labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForPreTraining(config) + masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForPreTraining, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads( + config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + masked_lm_labels=None, + next_sentence_label=None, + checkpoint_activations=False): + sequence_output, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size).float(), + masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2).float(), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + return total_loss + else: + return prediction_scores, seq_relationship_score + + +class BertForMaskedLM(PreTrainedBertModel): + """BERT model with the masked language modeling head. + This module comprises the BERT model followed by the masked language modeling head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + + Outputs: + if `masked_lm_labels` is not `None`: + Outputs the masked language modeling loss. + if `masked_lm_labels` is `None`: + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForMaskedLM(config) + masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForMaskedLM, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config, + self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + masked_lm_labels=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + prediction_scores = self.cls(sequence_output) + + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + masked_lm_labels.view(-1)) + return masked_lm_loss + else: + return prediction_scores + + +class BertForNextSentencePrediction(PreTrainedBertModel): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `next_sentence_label` is not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `next_sentence_label` is `None`: + Outputs the next sentence classification logits of shape [batch_size, 2]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForNextSentencePrediction(config) + seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + next_sentence_label=None, + checkpoint_activations=False): + _, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + seq_relationship_score = self.cls(pooled_output) + + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + return next_sentence_loss + else: + return seq_relationship_score + + +class BertForSequenceClassification(PreTrainedBertModel): + """BERT model for classification. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForSequenceClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + _, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForMultipleChoice(PreTrainedBertModel): + """BERT model for multiple choice tasks. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_choices`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` + and type 1 corresponds to a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) + input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) + token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_choices = 2 + + model = BertForMultipleChoice(config, num_choices) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForMultipleChoice, self).__init__(config) + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + batch_size, num_choices = input_ids.shape[:2] + flat_input_ids = input_ids.reshape(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.reshape(-1, + token_type_ids.size(-1)) + flat_attention_mask = attention_mask.reshape(-1, + attention_mask.size(-1)) + _, pooled_output = self.bert( + flat_input_ids, + flat_token_type_ids, + flat_attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape(-1, num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + else: + return reshaped_logits + + +class BertForTokenClassification(PreTrainedBertModel): + """BERT model for token-level classification. + This module is composed of the BERT model with a linear layer on top of + the full hidden state of the last layer. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForTokenClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + # self.classifier = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=num_labels, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + with mpu.get_cuda_rng_tracker().fork(): + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForQuestionAnswering(PreTrainedBertModel): + """BERT model for Question Answering (span extraction). + This module is composed of the BERT model with a linear layer on top of + the sequence output that computes start_logits and end_logits + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. 
+ + Outputs: + if `start_positions` and `end_positions` are not `None`: + Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. + if `start_positions` or `end_positions` is `None`: + Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end + position tokens of shape [batch_size, sequence_length]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForQuestionAnswering(config) + start_logits, end_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForQuestionAnswering, self).__init__(config) + self.bert = BertModel(config) + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + # self.qa_outputs = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=2, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + start_positions=None, + end_positions=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + else: + return start_logits, end_logits diff --git a/modelscope/models/nlp/mglm/model/modeling_glm.py b/modelscope/models/nlp/mglm/model/modeling_glm.py new file mode 100644 index 00000000..80f61cef --- /dev/null +++ b/modelscope/models/nlp/mglm/model/modeling_glm.py @@ -0,0 +1,245 @@ +# Modified by Zhipu.AI +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GPT-2 model.""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.nlp.mglm import mpu +from modelscope.models.nlp.mglm.model.prompt import PromptSpell +from modelscope.models.nlp.mglm.utils import print_rank_0 + + +def init_method_normal(std=0.02): + """Init method based on normal distribution. + + This is only used for embeddings. The transformer has its + own initializer. + """ + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GLMModel(torch.nn.Module): + """GLM Language model. + + The output of the forward method are the logits (parallel or + serial depending on the `parallel_output` flag. + """ + + def __init__( + self, + num_layers, + vocab_size, + hidden_size, + num_attention_heads, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + max_sequence_length, + max_memory_length, + checkpoint_activations, + checkpoint_num_layers=1, + parallel_output=True, + relative_encoding=False, + block_position_encoding=False, + output_predict=True, + spell_length=None, + spell_func='lstm', + attention_scale=1.0, + ): + + super(GLMModel, self).__init__() + + self.parallel_output = parallel_output + self.output_predict = output_predict + self.hidden_size = hidden_size + + init_method = init_method_normal(std=0.02) + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init_method) + + # Transformer + self.transformer = mpu.GPT2ParallelTransformer( + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers, + attention_scale=attention_scale, + relative_encoding=relative_encoding, + block_position_encoding=block_position_encoding) + if spell_length is not None: + self.prompt_spell = PromptSpell(spell_length, self.hidden_size, + spell_func) + + def freeze_transformer(self, tune_prefix_layers=None): + log_str = 'Freeze transformer' + self.word_embeddings.requires_grad_(False) + self.transformer.requires_grad_(False) + if tune_prefix_layers is not None: + log_str += f' tune {tune_prefix_layers} prefix layers' + for i in range(tune_prefix_layers): + self.transformer.layers[i].requires_grad_(True) + print_rank_0(log_str) + + def forward(self, + input_ids, + position_ids, + attention_mask, + *mems, + return_memory=False, + detach_memory=True, + prompt_pos=None): + # Embeddings. + batch_size = input_ids.size(0) + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + if prompt_pos is not None: + embeddings = embeddings.clone() + prompt_embeds = self.prompt_spell() + batch_index = torch.arange( + batch_size, device=input_ids.device).unsqueeze(1) + embeddings[batch_index, prompt_pos] = prompt_embeds + # Transformer. 
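+ # mems, when given, carries hidden states cached from earlier segments
+ # (bounded by max_memory_length); the first element of the transformer
+ # output is the final hidden states, which are projected onto the
+ # vocabulary below when output_predict is set.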
+ transformer_output = self.transformer( + embeddings, + position_ids, + attention_mask, + mems, + return_memory=return_memory, + detach_memory=detach_memory) + logits, hidden_layers = transformer_output + outputs = hidden_layers + + if self.output_predict: + # Parallel logits. + logits_parallel = mpu.copy_to_model_parallel_region(logits) + logits_parallel = F.linear(logits_parallel, + self.word_embeddings.weight) + + if self.parallel_output: + return (logits_parallel, *outputs) + + return (mpu.gather_from_model_parallel_region(logits_parallel), + *outputs) + else: + return (logits, *outputs) + + +class EncoderDecoder(torch.nn.Module): + """Seq2Seq Transformer Model + The output of the forward method are the logits (parallel or serial depending on the `parallel_output` flag). + """ + + def __init__(self, + num_layers, + vocab_size, + hidden_size, + num_attention_heads, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + max_sequence_length, + max_memory_length, + checkpoint_activations, + checkpoint_num_layers=1, + parallel_output=True, + output_predict=True): + super(EncoderDecoder, self).__init__() + + self.parallel_output = parallel_output + self.output_predict = output_predict + + init_method = init_method_normal(std=0.02) + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init_method) + + # Transformer + self.encoder = mpu.GPT2ParallelTransformer( + num_layers, hidden_size, num_attention_heads, max_sequence_length, + max_memory_length, embedding_dropout_prob, attention_dropout_prob, + output_dropout_prob, checkpoint_activations, checkpoint_num_layers) + self.decoder = mpu.GPT2ParallelTransformer( + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers, + use_decoder_layer=True) + + def forward(self, source_ids, target_ids, source_position_ids, + target_position_ids, source_mask, target_mask): + # Embeddings. + source_embeddings = self.word_embeddings(source_ids) + target_embeddings = self.word_embeddings(target_ids) + + # Transformer. + encoder_output, _ = self.encoder(source_embeddings, + source_position_ids, source_mask) + decoder_output, _ = self.decoder(target_embeddings, + target_position_ids, target_mask) + if self.output_predict: + # Parallel logits. 
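+ # Decoder hidden states are projected onto the shared word-embedding
+ # matrix; each model-parallel rank produces its own vocabulary shard,
+ # which is returned as-is (parallel_output=True) or gathered across
+ # ranks.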
+ output_parallel = mpu.copy_to_model_parallel_region(decoder_output) + logits_parallel = F.linear(output_parallel, + self.word_embeddings.weight) + + if self.parallel_output: + return (logits_parallel, ) + + return (mpu.gather_from_model_parallel_region(logits_parallel), ) + else: + return (decoder_output, ) + + +def glm_get_params_for_weight_decay_optimization(module): + weight_decay_params = {'params': []} + no_weight_decay_params = {'params': [], 'weight_decay': 0.0} + for module_ in module.modules(): + if isinstance(module_, (mpu.LayerNorm, torch.nn.LayerNorm)): + no_weight_decay_params['params'].extend([ + p for p in list(module_._parameters.values()) + if p is not None and p.requires_grad + ]) + else: + weight_decay_params['params'].extend([ + p for n, p in list(module_._parameters.items()) + if p is not None and p.requires_grad and n != 'bias' + ]) + no_weight_decay_params['params'].extend([ + p for n, p in list(module_._parameters.items()) + if p is not None and p.requires_grad and n == 'bias' + ]) + + return weight_decay_params, no_weight_decay_params diff --git a/modelscope/models/nlp/mglm/model/prompt.py b/modelscope/models/nlp/mglm/model/prompt.py new file mode 100644 index 00000000..a29ceda0 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/prompt.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 Zhipu.AI + +import random + +import torch + + +class PromptSpell(torch.nn.Module): + + def __init__(self, spell_length, hidden_size, spell_func): + super(PromptSpell, self).__init__() + self.spell_length = spell_length + self.hidden_size = hidden_size + self.spell_embeddings = torch.nn.Embedding(self.spell_length, + self.hidden_size) + self.spell_func = spell_func + if self.spell_func == 'lstm': + self.lstm_head = torch.nn.LSTM( + input_size=self.hidden_size, + hidden_size=self.hidden_size, + num_layers=2, + # dropout=self.lstm_dropout, + bidirectional=True, + batch_first=True) # .to(torch.device("cuda")) + self.mlp_head = torch.nn.Sequential( + torch.nn.Linear(2 * self.hidden_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.hidden_size)) + elif self.spell_func == 'mlp': + self.mlp_head = torch.nn.Sequential( + torch.nn.Linear(self.hidden_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.hidden_size)) + elif self.spell_func != 'none': + raise NotImplementedError('Prompt function ' + self.spell_func) + + def init_embedding(self, word_embeddings=None, task_tokens=None): + num_words = 5000 + with torch.no_grad(): + for i in range(self.spell_length): + rand_token = random.randrange(num_words) + if task_tokens is None: + target_embedding = word_embeddings[rand_token] + else: + word_embedding = word_embeddings[rand_token] + task_token = random.choice(task_tokens) + task_embedding = word_embeddings[task_token] + ratio = random.random() + target_embedding = word_embedding * ratio + task_embedding * ( + 1 - ratio) + self.spell_embeddings.weight.data[i] = target_embedding + + def forward(self): + prompt_embeds = self.spell_embeddings.weight.unsqueeze(0) + if self.spell_func == 'lstm': + prompt_embeds = self.lstm_head(prompt_embeds)[0] + if self.spell_func == 'lstm' or self.spell_func == 'mlp': + prompt_embeds = self.mlp_head(prompt_embeds) + return prompt_embeds diff --git a/modelscope/models/nlp/mglm/mpu/__init__.py b/modelscope/models/nlp/mglm/mpu/__init__.py new file mode 100755 index 00000000..8cca4e2c --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model parallel utility interface.""" + +from .cross_entropy import vocab_parallel_cross_entropy +from .data import broadcast_data +from .grads import clip_grad_norm +from .initialize import (destroy_model_parallel, get_data_parallel_group, + get_data_parallel_rank, get_data_parallel_world_size, + get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_src_rank, + get_model_parallel_world_size, + initialize_model_parallel, + model_parallel_is_initialized) +from .layers import (ColumnParallelLinear, ParallelEmbedding, + RowParallelLinear, VocabParallelEmbedding) +from .mappings import (copy_to_model_parallel_region, + gather_from_model_parallel_region, + reduce_from_model_parallel_region, + scatter_to_model_parallel_region) +from .random import (checkpoint, get_cuda_rng_tracker, + model_parallel_cuda_manual_seed, + partition_activations_in_checkpoint) +from .transformer import (BertParallelSelfAttention, + BertParallelTransformerLayer, + GPT2ParallelTransformer, LayerNorm) diff --git a/modelscope/models/nlp/mglm/mpu/cross_entropy.py b/modelscope/models/nlp/mglm/mpu/cross_entropy.py new file mode 100644 index 00000000..2ebcf7a8 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/cross_entropy.py @@ -0,0 +1,110 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import (get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_world_size) +from .utils import VocabUtility + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + + @staticmethod + def forward(ctx, vocab_parallel_logits, target): + + # Copy so the input remains unchanged. + logits = vocab_parallel_logits.clone() + # Maximum value along vocab dimension across all GPUs. + logits_max = torch.max(logits, dim=-1)[0] + torch.distributed.all_reduce( + logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_model_parallel_group()) + # Subtract the maximum value. + logits.sub_(logits_max.unsqueeze(dim=-1)) + # Sum of exponential of logits along vocab dimension across all GPUs. 
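+ # (Each rank holds only its own vocabulary shard, e.g. with 2 model-parallel
+ # ranks and a 30000-token vocab, rank 0 scores tokens [0, 15000) and rank 1
+ # scores [15000, 30000); the exp-sum below is therefore all-reduced to
+ # recover the full-vocabulary softmax normalizer.)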
+ exp_logits = logits.exp() + sum_exp_logits = exp_logits.sum(dim=-1) + torch.distributed.all_reduce( + sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + + # Get the partition's vocab indecies + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_model_parallel_rank() + world_size = get_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range( + partition_vocab_size, rank, world_size) + + # Create a mask of valid vocab ids (1 means it needs to be masked). + target_mask = (target < vocab_start_index) | ( + target >= vocab_end_index) + masked_target = target.clone() - vocab_start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. + logits_2d = logits.view(-1, partition_vocab_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange( + start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + torch.distributed.all_reduce( + predicted_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Store softmax, target-mask and masked-target for backward pass. + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + + # Retreive tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + # All the inputs have softmax as thier gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange( + start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, + masked_target_1d] -= (1.0 - target_mask.view(-1).float()) + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None + + +def vocab_parallel_cross_entropy(vocab_parallel_logits, target): + """Helper function for the cross entropy.""" + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/modelscope/models/nlp/mglm/mpu/data.py b/modelscope/models/nlp/mglm/mpu/data.py new file mode 100644 index 00000000..6f595f0f --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/data.py @@ -0,0 +1,117 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import (get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_src_rank) + +_MAX_DATA_DIM = 5 + + +def _check_data_types(keys, data, target_dtype): + """Check that all the keys have the same target data type.""" + for key in keys: + assert data[key].dtype == target_dtype, '{} has data type {} which '\ + 'is different than {}'.format(key, data[key].dtype, target_dtype) + + +def _build_key_size_numel_dictionaries(keys, data): + """Build the size on rank 0 and broadcast.""" + max_dim = _MAX_DATA_DIM + sizes = [0 for _ in range(max_dim) for _ in keys] + + # Pack the sizes on rank zero. + if get_model_parallel_rank() == 0: + offset = 0 + for key in keys: + assert data[key].dim( + ) < max_dim, 'you should increase MAX_DATA_DIM' + size = data[key].size() + for i, s in enumerate(size): + sizes[i + offset] = s + offset += max_dim + + # Move to GPU and broadcast. + sizes_cuda = torch.cuda.LongTensor(sizes) + torch.distributed.broadcast( + sizes_cuda, + get_model_parallel_src_rank(), + group=get_model_parallel_group()) + + # Move back to cpu and unpack. + sizes_cpu = sizes_cuda.cpu() + key_size = {} + key_numel = {} + total_numel = 0 + offset = 0 + for key in keys: + i = 0 + size = [] + numel = 1 + while sizes_cpu[offset + i] > 0: + this_size = sizes_cpu[offset + i] + size.append(this_size) + numel *= this_size + i += 1 + key_size[key] = size + key_numel[key] = numel + total_numel += numel + offset += max_dim + + return key_size, key_numel, total_numel + + +def broadcast_data(keys, data, datatype): + """Broadcast data from rank zero of each model parallel group to the + members of the same model parallel group. + + Arguments: + keys: list of keys in the data disctionary to be broadcasted + data: data dictionary of string keys and cpu tensor values. + datatype: torch data type of all tensors in data associated + with keys. + """ + # Build (key, size) and (key, number of elements) dictionaries along + # with the total number of elements on all ranks. + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries( + keys, data) + + # Pack on rank zero. + if get_model_parallel_rank() == 0: + # Check that all keys have the same data type. + _check_data_types(keys, data, datatype) + # Flatten the data associated with the keys + flatten_data = torch.cat( + [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + else: + flatten_data = torch.empty( + total_numel, device=torch.cuda.current_device(), dtype=datatype) + + # Boradcast + torch.distributed.broadcast( + flatten_data, + get_model_parallel_src_rank(), + group=get_model_parallel_group()) + + # Unpack + output = {} + offset = 0 + for key in keys: + size = key_size[key] + numel = key_numel[key] + output[key] = flatten_data.narrow(0, offset, numel).view(size) + offset += numel + + return output diff --git a/modelscope/models/nlp/mglm/mpu/grads.py b/modelscope/models/nlp/mglm/mpu/grads.py new file mode 100644 index 00000000..a7dc6c5c --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/grads.py @@ -0,0 +1,72 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
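Editorial aside on broadcast_data above: it avoids one broadcast per key by packing every tensor into a single flat buffer (plus a fixed-size integer buffer describing the shapes) and letting each rank carve the originals back out with narrow() and view(). A single-process sketch of that pack/unpack round trip, with an illustrative payload:

import torch

data = {                                    # illustrative payload, one dtype for all keys
    'tokens': torch.arange(12).view(3, 4),
    'mask': torch.ones(3, 4, dtype=torch.long),
    'labels': torch.tensor([1, 0, 1]),
}
keys = list(data)
key_size = {k: list(data[k].size()) for k in keys}
key_numel = {k: data[k].numel() for k in keys}

# What rank 0 would broadcast: one contiguous buffer for all keys.
flat = torch.cat([data[k].contiguous().view(-1) for k in keys])

# What every rank does after the broadcast: slice and reshape.
out, offset = {}, 0
for k in keys:
    out[k] = flat.narrow(0, offset, key_numel[k]).view(key_size[k])
    offset += key_numel[k]

assert all(torch.equal(out[k], data[k]) for k in keys)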
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import torch +from torch._six import inf + +from .initialize import get_model_parallel_group, get_model_parallel_rank + + +def clip_grad_norm(parameters, max_norm, norm_type=2): + """Clips gradient norm of an iterable of parameters. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + max_norm = float(max_norm) + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + # Take max across all GPUs. + torch.distributed.all_reduce( + total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=get_model_parallel_group()) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0 + for p in parameters: + if p.model_parallel or (get_model_parallel_rank() == 0): + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item()**norm_type + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + torch.distributed.all_reduce( + total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + total_norm = total_norm_cuda[0].item()**(1. / norm_type) + clip_coef = max_norm / (total_norm + 1e-6) + if clip_coef < 1: + for p in parameters: + p.grad.data.mul_(clip_coef) + return total_norm diff --git a/modelscope/models/nlp/mglm/mpu/initialize.py b/modelscope/models/nlp/mglm/mpu/initialize.py new file mode 100644 index 00000000..33f8dbda --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/initialize.py @@ -0,0 +1,130 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model and data parallel groups.""" + +import torch + +from .utils import ensure_divisibility + +# Model parallel group that the current rank belongs to. +_MODEL_PARALLEL_GROUP = None +# Data parallel group that the current rank belongs to. 
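Editorial aside on clip_grad_norm above: the non-inf branch relies on p-norms being combinable from per-rank pieces. Every rank sums ||g||^p over the gradients it owns (parameters that are not model parallel are counted only on rank 0 so they are not added once per rank), the sums are all-reduced, and the p-th root gives the global norm. A small sketch of that combination rule, with list slices standing in for the ranks:

import torch

norm_type = 2.0
max_norm = 1.0
grads_by_rank = [                     # illustrative per-rank gradient shards
    [torch.randn(8), torch.randn(4)],
    [torch.randn(8), torch.randn(4)],
]

# Local sums of ||g||^p, then the "all-reduce SUM" and the p-th root.
partial = [sum(g.norm(norm_type).item() ** norm_type for g in shard)
           for shard in grads_by_rank]
total_norm = sum(partial) ** (1.0 / norm_type)
reference = torch.cat([g for shard in grads_by_rank for g in shard]).norm(norm_type).item()
assert abs(total_norm - reference) < 1e-4

# Same clipping rule as above, applied by every rank to its own shard.
clip_coef = max_norm / (total_norm + 1e-6)
if clip_coef < 1:
    for shard in grads_by_rank:
        for g in shard:
            g.mul_(clip_coef)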
+_DATA_PARALLEL_GROUP = None + + +def initialize_model_parallel(model_parallel_size_): + """ + Initialize model data parallel groups. + + Arguments: + model_parallel_size: number of GPUs used to parallelize model. + + Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we + use 2 GPUs to parallelize the model. The present function will + create 4 model parallel groups and 2 data parallel grous as: + 4 model parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7] + 2 data parallel groups: + [g0, g2, g4, g6], [g1, g3, g5, g7] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + if torch.distributed.get_rank() == 0: + print('> initializing model parallel with size {}'.format( + model_parallel_size_)) + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size = torch.distributed.get_world_size() + model_parallel_size = min(model_parallel_size_, world_size) + ensure_divisibility(world_size, model_parallel_size) + rank = torch.distributed.get_rank() + + # Build the data parallel groups. + global _DATA_PARALLEL_GROUP + assert _DATA_PARALLEL_GROUP is None, \ + 'data parallel group is already initialized' + for i in range(model_parallel_size): + ranks = range(i, world_size, model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank % model_parallel_size): + _DATA_PARALLEL_GROUP = group + + # Build the model parallel groups. + global _MODEL_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is None, \ + 'model parallel group is already initialized' + for i in range(world_size // model_parallel_size): + ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank // model_parallel_size): + _MODEL_PARALLEL_GROUP = group + + +def model_parallel_is_initialized(): + """Check if model and data parallel groups are initialized.""" + if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: + return False + return True + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + assert _MODEL_PARALLEL_GROUP is not None, \ + 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, \ + 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_model_parallel_world_size(): + """Return world size for the model parallel group.""" + return torch.distributed.get_world_size(group=get_model_parallel_group()) + + +def get_model_parallel_rank(): + """Return my rank for the model parallel group.""" + return torch.distributed.get_rank(group=get_model_parallel_group()) + + +def get_model_parallel_src_rank(): + """Calculate the global rank corresponding to a local rank zeor + in the model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return torch.distributed.get_world_size(group=get_data_parallel_group()) + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return 
torch.distributed.get_rank(group=get_data_parallel_group()) + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None diff --git a/modelscope/models/nlp/mglm/mpu/layers.py b/modelscope/models/nlp/mglm/mpu/layers.py new file mode 100644 index 00000000..4eb94b50 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/layers.py @@ -0,0 +1,357 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import math + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm +from torch.nn.parameter import Parameter + +from .initialize import get_model_parallel_rank, get_model_parallel_world_size +from .mappings import (copy_to_model_parallel_region, + gather_from_model_parallel_region, + reduce_from_model_parallel_region, + scatter_to_model_parallel_region) +from .random import get_cuda_rng_tracker +from .utils import VocabUtility, divide, split_tensor_along_last_dim + + +def _initialize_affine_weight(weight, + output_size, + input_size, + per_partition_size, + partition_dim, + init_method, + stride=1, + return_master_weight=False): + """Initialize affine weight for model parallel. + + Build the master weight on all processes and scatter + the relevant chunk.""" + # If we only use 1 process for model parallelism, bypass scatter. + world_size = get_model_parallel_world_size() + if world_size == 1: + init_method(weight) + if return_master_weight: + return weight + return None + + # Initialize master weight + master_weight = torch.empty( + output_size, input_size, dtype=weight.dtype, requires_grad=False) + init_method(master_weight) + + # Split and copy + per_partition_per_stride_size = divide(per_partition_size, stride) + weight_list = torch.split( + master_weight, per_partition_per_stride_size, dim=partition_dim) + rank = get_model_parallel_rank() + my_weight_list = weight_list[rank::world_size] + + with torch.no_grad(): + torch.cat(my_weight_list, dim=partition_dim, out=weight) + if return_master_weight: + return master_weight + return None + + +class VocabParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the vocabulary dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + init_method: method to initialize weights. + """ + + def __init__(self, + num_embeddings, + embedding_dim, + init_method=init.xavier_normal_): + super(VocabParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set the detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2. 
+ self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + # Divide the weight matrix along the vocaburaly dimension. + self.vocab_start_index, self.vocab_end_index = \ + VocabUtility.vocab_range_from_global_vocab_size( + self.num_embeddings, get_model_parallel_rank(), + get_model_parallel_world_size()) + self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index # noqa + + # Allocate weights. + self.weight = Parameter( + torch.Tensor(self.num_embeddings_per_partition, + self.embedding_dim)) + self.weight.model_parallel = True + # And initialize. + _initialize_affine_weight(self.weight, self.num_embeddings, + self.embedding_dim, + self.num_embeddings_per_partition, 0, + init_method) + + def forward(self, input_): + # Build the mask. + input_mask = (input_ < self.vocab_start_index) | \ + (input_ >= self.vocab_end_index) + # Mask the input. + masked_input = input_.clone() - self.vocab_start_index + masked_input[input_mask] = 0 + # Get the embeddings. + output_parallel = F.embedding(masked_input, self.weight, + self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, + self.sparse) + # Mask the output embedding. + output_parallel[input_mask, :] = 0.0 + # Reduce across all the model parallel GPUs. + output = reduce_from_model_parallel_region(output_parallel) + return output + + +class ParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the embedding dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + init_method: method to initialize weights. + """ + + def __init__(self, + num_embeddings, + embedding_dim, + init_method=init.xavier_normal_, + keep_master_weight_for_test=False): + super(ParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set some detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2. + self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + # Divide the weight matrix along the embedding dimension. + world_size = get_model_parallel_world_size() + self.embedding_dim_per_partition = divide(self.embedding_dim, + world_size) + + # Allocate weights. + self.weight = Parameter( + torch.Tensor(self.num_embeddings, + self.embedding_dim_per_partition)) + self.weight.model_parallel = True + # And initialize. + _initialize_affine_weight( + self.weight, + self.num_embeddings, + self.embedding_dim, + self.embedding_dim_per_partition, + 1, + init_method, + stride=1, + return_master_weight=False) + + def forward(self, input_): + input_parallel = copy_to_model_parallel_region(input_) + output_parallel = F.embedding(input_parallel, self.weight, + self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, + self.sparse) + output = gather_from_model_parallel_region(output_parallel) + return output + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. 
+ bias: If true, add bias + gather_output: If true, call all-gether on output and make Y avaiable + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + """ + + def __init__(self, + input_size, + output_size, + bias=True, + gather_output=True, + init_method=init.xavier_normal_, + stride=1, + keep_master_weight_for_test=False): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.output_size_per_partition = divide(output_size, world_size) + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + self.weight = Parameter( + torch.Tensor(self.output_size_per_partition, self.input_size)) + self.weight.model_parallel = True + if bias: + self.bias = Parameter(torch.Tensor(self.output_size_per_partition)) + self.bias.model_parallel = True + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + # Initialize weight. + self.master_weight = _initialize_affine_weight( + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test) + + def forward(self, input_): + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = F.linear(input_parallel, self.weight, self.bias) + if self.gather_output: + # All-gather across the partitions. + output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + return output + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + """ + + def __init__(self, + input_size, + output_size, + bias=True, + input_is_parallel=False, + init_method=init.xavier_normal_, + stride=1, + keep_master_weight_for_test=False): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.input_size_per_partition = divide(input_size, world_size) + + # Parameters. 
+ # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + self.weight = Parameter( + torch.Tensor(self.output_size, self.input_size_per_partition)) + self.weight.model_parallel = True + if bias: + self.bias = Parameter(torch.Tensor(self.output_size)) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + # Initialize weight. + self.master_weight = _initialize_affine_weight( + self.weight, + self.output_size, + self.input_size, + self.input_size_per_partition, + 1, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test) + + def forward(self, input_): + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = F.linear(input_parallel, self.weight) + # All-reduce across all the partitions. + output_ = reduce_from_model_parallel_region(output_parallel) + if self.bias is not None: + output = output_ + self.bias + else: + output = output_ + return output diff --git a/modelscope/models/nlp/mglm/mpu/mappings.py b/modelscope/models/nlp/mglm/mpu/mappings.py new file mode 100644 index 00000000..b3056dd7 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/mappings.py @@ -0,0 +1,144 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import get_model_parallel_group +from .utils import split_tensor_along_last_dim + + +def _reduce(input_): + """All-reduce the the input tensor across model parallel group.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # All-reduce. + torch.distributed.all_reduce(input_, group=group) + + return input_ + + +def _split(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Split along last dimension. + world_size = torch.distributed.get_world_size(group=group) + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. + rank = torch.distributed.get_rank(group=group) + output = input_list[rank].contiguous() + + return output + + +def _gather(input_): + """Gather tensors and concatinate along the last dimension.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Size and dimension. 
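Editorial aside on the two linear layers above: both shard the weight that F.linear actually sees (the transpose of the A in the docstrings). ColumnParallelLinear splits it along dim 0 and concatenates the per-rank outputs; RowParallelLinear splits it along dim 1, feeds each rank its slice of the input, and sums the partial products. A single-process check of both decompositions against a dense F.linear, with illustrative sizes:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
world_size, batch, d_in, d_out = 2, 5, 6, 8
X = torch.randn(batch, d_in)
W = torch.randn(d_out, d_in)             # dense weight as F.linear expects it

# Column parallel: shard W along dim 0, every rank sees the full X,
# the per-rank outputs are concatenated (the gather) along the last dim.
col_shards = torch.split(W, d_out // world_size, dim=0)
Y_col = torch.cat([F.linear(X, w) for w in col_shards], dim=-1)

# Row parallel: shard W along dim 1 and X along its last dim (the scatter),
# the per-rank partial products are summed (the all-reduce).
row_shards = torch.split(W, d_in // world_size, dim=1)
x_shards = torch.split(X, d_in // world_size, dim=-1)
Y_row = sum(F.linear(x, w) for x, w in zip(x_shards, row_shards))

Y_ref = F.linear(X, W)
assert torch.allclose(Y_col, Y_ref, atol=1e-6)
assert torch.allclose(Y_row, Y_ref, atol=1e-6)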
+ last_dim = input_.dim() - 1 + rank = torch.distributed.get_rank(group=group) + world_size = torch.distributed.get_world_size(group=group) + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather(tensor_list, input_, group=group) + + # Note: torch.cat already creates a contiguous tensor. + output = torch.cat(tensor_list, dim=last_dim).contiguous() + + return output + + +class _CopyToModelParallelRegion(torch.autograd.Function): + """Pass the input to the model parallel region.""" + + @staticmethod + def forward(ctx, input_): + return input_ + + @staticmethod + def backward(ctx, grad_output): + return _reduce(grad_output) + + +class _ReduceFromModelParallelRegion(torch.autograd.Function): + """All-redcue the input from the model parallel region.""" + + @staticmethod + def forward(ctx, input_): + return _reduce(input_) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +class _ScatterToModelParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + + @staticmethod + def forward(ctx, input_): + return _split(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather(grad_output) + + +class _GatherFromModelParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatinate.""" + + @staticmethod + def forward(ctx, input_): + return _gather(input_) + + @staticmethod + def backward(ctx, grad_output): + return _split(grad_output) + + +# ----------------- +# Helper functions. +# ----------------- + + +def copy_to_model_parallel_region(input_): + return _CopyToModelParallelRegion.apply(input_) + + +def reduce_from_model_parallel_region(input_): + return _ReduceFromModelParallelRegion.apply(input_) + + +def scatter_to_model_parallel_region(input_): + return _ScatterToModelParallelRegion.apply(input_) + + +def gather_from_model_parallel_region(input_): + return _GatherFromModelParallelRegion.apply(input_) diff --git a/modelscope/models/nlp/mglm/mpu/random.py b/modelscope/models/nlp/mglm/mpu/random.py new file mode 100755 index 00000000..2cdf236d --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/random.py @@ -0,0 +1,408 @@ +# Modified by Samyam Rajbhandari +# Used to partition the activations stored for backward propagation +# Therefore reduces the memory consumption + +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
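Editorial aside on mappings.py above: the four autograd functions form conjugate pairs. copy_to_model_parallel_region is an identity forward with an all-reduce backward, reduce_from_model_parallel_region is the opposite, and the scatter/gather pair swaps _split and _gather between forward and backward. A single-process sketch of the split/gather pair along the last dimension (world_size and the chosen rank are illustrative):

import torch

world_size, rank = 4, 2
x = torch.randn(3, 8)

chunks = torch.split(x, x.size(-1) // world_size, dim=-1)   # what _split produces
local = chunks[rank].contiguous()                           # the slice this rank keeps
restored = torch.cat(chunks, dim=-1)                        # what _gather rebuilds

assert local.shape == (3, 2)
assert torch.equal(restored, x)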
+ +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch +import contextlib + +import torch +import torch.distributed as dist +from torch import _C +from torch.cuda import _lazy_call +from torch.cuda import device as device_ctx_manager + +from .initialize import (get_data_parallel_rank, get_model_parallel_group, + get_model_parallel_rank, + get_model_parallel_world_size) + +# from torch.utils.checkpoint import detach_variable + +PARTITION_ACTIVATIONS = False +PA_CORRECTNESS_TEST = False + + +def see_memory_usage(message, force=False): + if not force: + return + dist.barrier() + if dist.get_rank() == 0: + print(message) + print('Memory Allocated ', + torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Max Memory Allocated ', + torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Cache Allocated ', + torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes') + print('Max cache Allocated ', + torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + 'GigaBytes') + print(' ') + # input("Press Any Key To Continue ..") + + +mp_rank = None # get_model_parallel_rank() +mp_size = None # get_model_parallel_world_size() +mp_group = None # get_model_parallel_group() + +# Default name for the model parallel rng tracker. +_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' +transport_stream = None +cuda_device = None + + +def detach_variable(inputs, device=None): + if isinstance(inputs, tuple): + out = [] + for inp in inputs: + if not isinstance(inp, torch.Tensor): + out.append(inp) + continue + + requires_grad = inp.requires_grad + + if device is not None: + x = inp.to(device=device) + else: + x = inp + + x = x.detach() + x.requires_grad = requires_grad + out.append(x) + return tuple(out) + else: + raise RuntimeError( + 'Only tuple of tensors is supported. Got Unsupported input type: ', + type(inputs).__name__) + + +def _set_cuda_rng_state(new_state, device=-1): + """Sets the random number generator state of the current GPU. + + Argumentss: + new_state (torch.ByteTensor): The desired state + This function is adapted from PyTorch repo (torch.cuda.set_rng_state) + with a single change: the input state is not cloned. Cloning caused + major performance issues for +4 GPU cases. + """ + if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): + # older PyTorch + def cb(): + with device_ctx_manager(device): + _C._cuda_setRNGState(new_state) + else: + # newer PyTorch + if device == -1: + device = torch.device('cuda') + elif isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device('cuda', device) + + def cb(): + idx = device.index + if idx is None: + idx = torch.cuda.current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.set_state(new_state) + + _lazy_call(cb) + + +class CudaRNGStatesTracker: + """Tracker for the cuda RNG states. + + Using the `add` method, a cuda rng state is initialized based on + the input `seed` and is assigned to `name`. Later, by forking the + rng state, we can perform operations and return to our starting + cuda state. + """ + + def __init__(self): + # Map from a string name to the cuda rng state. + self.states_ = {} + # Seeds are just for book keeping and ensure no seed is set twice. + self.seeds_ = set() + + def reset(self): + """Set to the initial state (no tracker).""" + self.states_ = {} + self.seeds_ = set() + + def get_states(self): + """Get rng states. 
Copy the dictionary so we have direct + pointers to the states, not just a pointer to the dictionary.""" + states = {} + for name in self.states_: + states[name] = self.states_[name] + return states + + def set_states(self, states): + """Set the rng states. For efficiency purposes, we do not check + the size of seed for compatibility.""" + self.states_ = states + + def add(self, name, seed): + """Track the rng state.""" + # Check seed is not already used. + if seed in self.seeds_: + raise Exception('seed {} already exists'.format(seed)) + self.seeds_.add(seed) + # Check that state is not already defined. + if name in self.states_: + raise Exception('cuda rng state {} already exists'.format(name)) + # Get the current rng state. + orig_rng_state = torch.cuda.get_rng_state() + # Set the new state and store it. + torch.cuda.manual_seed(seed) + self.states_[name] = torch.cuda.get_rng_state() + # Reset rng state to what it was. + _set_cuda_rng_state(orig_rng_state) + + @contextlib.contextmanager + def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): + """Fork the cuda rng state, perform operations, and exit with + the original state.""" + # Check if we have added the state + if name not in self.states_: + raise Exception('cuda rng state {} is not added'.format(name)) + # Store current rng state. + orig_cuda_rng_state = torch.cuda.get_rng_state() + # Set rng state to the desired one + _set_cuda_rng_state(self.states_[name]) + # Do the stuff we wanted to do. + try: + yield + finally: + # Update the current rng state for later use. + self.states_[name] = torch.cuda.get_rng_state() + # And set the state to the original state we started with. + _set_cuda_rng_state(orig_cuda_rng_state) + + +# RNG tracker object. +_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + + +def get_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _CUDA_RNG_STATE_TRACKER + + +def model_parallel_cuda_manual_seed(seed): + """Initialize model parallel cuda seed. + + This function should be called after the model parallel is + initialized. Also, no torch.cuda.manual_seed should be called + after this function. Basically, this is replacement for that + function. + Two set of RNG states are tracked: + default state: This is for data parallelism and is the same among a + set of model parallel GPUs but different across + different model paralle groups. This is used for + example for dropout in the non-model-parallel regions. + model-parallel state: This state is different among a set of model + parallel GPUs, but the same across data parallel + groups. This is used for example for dropout in + model parallel regions. + """ + # 2718 is just for fun and any POSITIVE value will work. + offset = seed + 2718 + model_parallel_seed = offset + get_model_parallel_rank() + # Data parallel gets the original sedd. + data_parallel_seed = seed + + if torch.distributed.get_rank() == 0: + print( + '> initializing model parallel cuda seeds on global rank {}, ' + 'model parallel rank {}, and data parallel rank {} with ' + 'model parallel seed: {} and data parallel seed: {}'.format( + torch.distributed.get_rank(), get_model_parallel_rank(), + get_data_parallel_rank(), model_parallel_seed, + data_parallel_seed), + flush=True) + _CUDA_RNG_STATE_TRACKER.reset() + # Set the default state. + torch.cuda.manual_seed(data_parallel_seed) + # and model parallel state. 
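Editorial aside on the seed layout being set up here: every rank keeps the user seed for the default (data-parallel) generator, while the tracked model-parallel state is offset by the model-parallel rank, so ranks in the same data-parallel group reseed identically. A tiny sketch for the docstring's situation of 8 GPUs and model_parallel_size=2 (sizes illustrative):

seed, world_size, model_parallel_size = 1234, 8, 2
for global_rank in range(world_size):
    mp_rank = global_rank % model_parallel_size      # rank inside its model parallel group
    data_parallel_seed = seed                        # identical on every rank
    model_parallel_seed = seed + 2718 + mp_rank      # differs only across the mp group
    print(global_rank, data_parallel_seed, model_parallel_seed)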
+ _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, + model_parallel_seed) + + +def get_partition_start(item): + global mp_rank, mp_size, mp_group + partition_size = get_partition_size(item) + start = partition_size * mp_rank + return int(start) + + +def get_partition_size(item): + global mp_rank, mp_size, mp_group + size = item.numel() + partition_size = size / mp_size + return int(partition_size) + + +def get_full_inputs(tensors): + inputs = [] + for i in range(int(len(tensors) / 2) - 1): + item = tensors[2 * i] + size = tensors[2 * i + 1] + partition_size = item.numel() + tensor_size = partition_size * mp_size + flat_tensor = torch.zeros([tensor_size], + dtype=item.dtype, + device=item.device) + partitions = [] + for i in range(mp_size): + part_i = flat_tensor.narrow(0, partition_size * i, partition_size) + if i == mp_rank: + part_i.copy_(item) + partitions.append(part_i) + dist.all_gather(partitions, partitions[mp_rank], group=mp_group) + input_tensor = flat_tensor.view(list(size.numpy())) + item.data = input_tensor.data + + inputs.append(item) + inputs.append(tensors[-2]) + + return tuple(inputs) + + +class CheckpointFunction(torch.autograd.Function): + """This function is adapted from torch.utils.checkpoint with + two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + """ + + @staticmethod + def forward(ctx, run_function, *args): + ctx.run_function = run_function + global mp_rank, mp_size, mp_group + if mp_rank is None: + mp_rank = get_model_parallel_rank() + mp_size = get_model_parallel_world_size() + mp_group = get_model_parallel_group() + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + if cuda_device is None: + if dist.get_rank() == 0: + print( + f'Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}' + ) + + cuda_device = torch.cuda.current_device() + # The transport stream is used to overlap the allgather communication for the activations + # with the computation in the backward pass + transport_stream = torch.cuda.Stream(device=cuda_device) + + if PARTITION_ACTIVATIONS: + inputs = [ + item.detach().contiguous().view(-1).narrow( + 0, get_partition_start(item), + get_partition_size(item)).clone() for item in args[:-1] + ] + inputs.append(args[-1]) + + # just in case something funky is happening such as reuse of inputs + inputs_cuda = [item.to(cuda_device) for item in args] + + # Copy the rng states. 
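Editorial aside on the activation partitioning used above: when PARTITION_ACTIVATIONS is enabled, each saved activation is flattened and only a contiguous 1/mp_size slice is kept per rank; get_full_inputs later all-gathers the slices and views them back into the original shape. A single-process sketch of that slice-and-rebuild round trip (mp_size and the tensor shape are illustrative):

import torch

mp_size = 4
activation = torch.randn(2, 3, 8)            # 48 elements, divisible by mp_size
flat = activation.contiguous().view(-1)
partition_size = flat.numel() // mp_size

# What each rank stores for the backward pass (its narrow() of the flat tensor).
slices = [flat.narrow(0, r * partition_size, partition_size).clone()
          for r in range(mp_size)]

# What the all_gather + view in get_full_inputs recovers.
rebuilt = torch.cat(slices).view(activation.size())
assert torch.equal(rebuilt, activation)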
+ ctx.fwd_cpu_rng_state = torch.get_rng_state() + ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # ctx.save_for_backward(*args) + with torch.no_grad(): + outputs = run_function(*inputs_cuda) + + del inputs_cuda + + if PARTITION_ACTIVATIONS: + new_args = [] + for arg, inp in zip(args, inputs): + size = torch.tensor(arg.size()) + arg.data = inp.data + new_args.append(arg) + new_args.append(size) + ctx.save_for_backward(*new_args) + else: + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError('Checkpointing is not compatible with .grad(), ' + 'please use .backward() if possible') + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + + if PARTITION_ACTIVATIONS: + with torch.cuda.stream(transport_stream): + inputs = get_full_inputs(ctx.saved_tensors) + detached_inputs = detach_variable(inputs) + else: + inputs = ctx.saved_tensors + detached_inputs = detach_variable(inputs) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. + torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + if PARTITION_ACTIVATIONS: + current_stream = torch.cuda.current_stream() + current_stream.wait_stream(transport_stream) + + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. + torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, torch.Tensor): + outputs = (outputs, ) + torch.autograd.backward(outputs, args) + return (None, ) + tuple(inp.grad for inp in detached_inputs) + + +def checkpoint(function, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, *args) + + +def partition_activations_in_checkpoint(partition_activation): + global PARTITION_ACTIVATIONS + PARTITION_ACTIVATIONS = partition_activation + if dist.get_rank() == 0: + print( + f'**************Partition Activations {PARTITION_ACTIVATIONS}************' + ) diff --git a/modelscope/models/nlp/mglm/mpu/tests/__init__.py b/modelscope/models/nlp/mglm/mpu/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/mglm/mpu/tests/commons.py b/modelscope/models/nlp/mglm/mpu/tests/commons.py new file mode 100644 index 00000000..ecfd5e72 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/commons.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
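Editorial aside: a hedged usage sketch for the checkpoint() and partition_activations_in_checkpoint() entry points defined above. The layer objects and their argument list are placeholders, not part of this patch, and calling the function requires torch.distributed plus initialize_model_parallel to already be set up:

from modelscope.models.nlp.mglm.mpu import (checkpoint,
                                            partition_activations_in_checkpoint)

def forward_layers(layers, hidden_states, attention_mask, partition=True):
    # Optionally shard the activations saved for backward across the model
    # parallel group (uses torch.distributed, so the process group and
    # initialize_model_parallel must already be initialized).
    partition_activations_in_checkpoint(partition)
    # Each layer's forward is re-run during backward under the saved RNG
    # states instead of keeping its intermediate activations in memory.
    for layer in layers:
        hidden_states = checkpoint(layer, hidden_states, attention_mask)
    return hidden_states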
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import random + +import mpu +import numpy +import torch + + +class IdentityLayer(torch.nn.Module): + + def __init__(self, size, scale=1.0): + super(IdentityLayer, self).__init__() + self.weight = torch.nn.Parameter(scale * torch.randn(size)) + + def forward(self): + return self.weight + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def initialize_distributed(backend='nccl'): + """Initialize torch.distributed.""" + # Get local rank in case it is provided. + parser = argparse.ArgumentParser() + parser.add_argument( + '--local_rank', + type=int, + default=None, + help='local rank passed from distributed launcher') + args = parser.parse_args() + local_rank = args.local_rank + + # Get rank and world size. + rank = int(os.getenv('RANK', '0')) + world_size = int(os.getenv('WORLD_SIZE', '1')) + + print('> initializing torch.distributed with local rank: {}, ' + 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) + + # Set the device id. + device = rank % torch.cuda.device_count() + if local_rank is not None: + device = local_rank + torch.cuda.set_device(device) + + # Call the init process. + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend=backend, + world_size=world_size, + rank=rank, + init_method=init_method) + + +def print_separator(message): + torch.distributed.barrier() + filler_len = (78 - len(message)) // 2 + filler = '-' * filler_len + string = '\n' + filler + ' {} '.format(message) + filler + if torch.distributed.get_rank() == 0: + print(string, flush=True) + torch.distributed.barrier() diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py b/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py new file mode 100644 index 00000000..47fd1d7e --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py @@ -0,0 +1,106 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import sys + +import mpu +import torch +import torch.nn.functional as F +from commons import (IdentityLayer, initialize_distributed, print_separator, + set_random_seed) +from mpu.cross_entropy import vocab_parallel_cross_entropy + +sys.path.append('../..') + + +def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, + seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + target = torch.cuda.LongTensor(size=(batch_size, + seq_length)).random_(0, vocab_size) + loss = F.cross_entropy( + logits.view(-1, + logits.size()[-1]), target.view(-1), + reduction='none').view_as(target).mean() + loss.backward() + return loss, identity.weight.grad + + +def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + logits_parallel = mpu.scatter_to_model_parallel_region(logits) + target = torch.cuda.LongTensor(size=(batch_size, + seq_length)).random_(0, vocab_size) + loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() + loss.backward() + return loss, identity.weight.grad + + +def test_cross_entropy(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cross entropy with model parallel size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 13 + seq_length = 17 + vocab_size_per_partition = 11 + logits_scale = 1000.0 + vocab_size = vocab_size_per_partition * model_parallel_size + seed = 1234 + + loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed) + + error = loss_torch.sub_(loss_mpu).abs().max() + print(' max error in loss on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = grad_torch.sub_(grad_mpu).abs().max() + print(' max error in grad on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test cross entropy') + test_cross_entropy(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_data.py b/modelscope/models/nlp/mglm/mpu/tests/test_data.py new file mode 100644 index 00000000..66575300 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_data.py @@ -0,0 +1,91 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator +from mpu import data as data_utils + +sys.path.append('../..') + + +def test_boradcast_data(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print( + '> testing boradcast_data with model parallel size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + torch.manual_seed(1234 + mpu.get_data_parallel_rank()) + model_parallel_size = mpu.get_model_parallel_world_size() + + key_size_t = { + 'key1': [7, 11], + 'key2': [8, 2, 1], + 'key3': [13], + 'key4': [5, 1, 2], + 'key5': [5, 12] + } + keys = list(key_size_t.keys()) + + data = {} + data_t = {} + for key in key_size_t: + data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) + data_t[key] = data[key].clone() + data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) + data_t['keyX'] = data['keyX'].clone() + if mpu.get_model_parallel_rank() != 0: + data = None + + data_utils._check_data_types(keys, data_t, torch.int64) + key_size, key_numel, \ + total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) + for key in keys: + assert key_size[key] == key_size_t[key] + total_numel_t = 0 + for key in keys: + target_size = functools.reduce(operator.mul, key_size_t[key], 1) + assert key_numel[key] == target_size + total_numel_t += target_size + assert total_numel == total_numel_t + + data_b = data_utils.broadcast_data(keys, data, torch.int64) + for key in keys: + tensor = data_t[key].cuda() + assert data_b[key].sub(tensor).abs().max() == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test test boradcast data') + test_boradcast_data(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py b/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py new file mode 100644 index 00000000..df62d213 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py @@ -0,0 +1,95 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator + +sys.path.append('../..') + + +def test_initialize_model_parallel(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing initialize_model_parallel with size {} ...'.format( + model_parallel_size)) + model_parallel_size_ = min(model_parallel_size, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size_) + assert mpu.model_parallel_is_initialized() + + # Checks. + def check(group, world_size, rank): + assert world_size == torch.distributed.get_world_size(group=group) + assert rank == torch.distributed.get_rank(group=group) + + # Model parallel. + world_size = model_parallel_size_ + rank = torch.distributed.get_rank() % model_parallel_size_ + assert world_size == mpu.get_model_parallel_world_size() + assert rank == mpu.get_model_parallel_rank() + check(mpu.get_model_parallel_group(), world_size, rank) + + # Data parallel. + world_size = torch.distributed.get_world_size() // model_parallel_size_ + rank = torch.distributed.get_rank() // model_parallel_size + assert world_size == mpu.get_data_parallel_world_size() + assert rank == mpu.get_data_parallel_rank() + check(mpu.get_data_parallel_group(), world_size, rank) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_get_model_parallel_src_rank(model_parallel_size_): + + if torch.distributed.get_rank() == 0: + print('> testing get_model_parallel_src_rank with size {} ...'.format( + model_parallel_size_)) + model_parallel_size = min(model_parallel_size_, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size) + assert mpu.model_parallel_is_initialized() + + # Checks + src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() + assert mpu.get_model_parallel_src_rank() == src_rank + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test initialize model parallel') + test_initialize_model_parallel(model_parallel_size) + print_separator('test model parallel source rank') + test_get_model_parallel_src_rank(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_layers.py b/modelscope/models/nlp/mglm/mpu/tests/test_layers.py new file mode 100644 index 00000000..2dbc987a --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_layers.py @@ -0,0 +1,533 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
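Editorial aside on the rank arithmetic exercised by test_initialize_model_parallel and test_get_model_parallel_src_rank above: it can be spelled out without any GPUs, since initialize_model_parallel carves the world into contiguous model parallel groups and strided data parallel groups. A pure-Python sketch for the 8-rank, model_parallel_size=2 layout from the initialize.py docstring:

world_size, model_parallel_size = 8, 2

model_parallel_groups = [list(range(i * model_parallel_size, (i + 1) * model_parallel_size))
                         for i in range(world_size // model_parallel_size)]
data_parallel_groups = [list(range(i, world_size, model_parallel_size))
                        for i in range(model_parallel_size)]

print(model_parallel_groups)   # [[0, 1], [2, 3], [4, 5], [6, 7]]
print(data_parallel_groups)    # [[0, 2, 4, 6], [1, 3, 5, 7]]

# The identities checked by the tests, for an arbitrary rank:
rank = 5
mp_rank = rank % model_parallel_size        # rank inside its model parallel group
dp_rank = rank // model_parallel_size       # rank inside its data parallel group
src_rank = (rank // model_parallel_size) * model_parallel_size
assert rank in model_parallel_groups[dp_rank]
assert rank in data_parallel_groups[mp_rank]
assert src_rank == model_parallel_groups[dp_rank][0]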
+ +import random +import sys + +import mpu +import torch +import torch.nn.init as init +from commons import initialize_distributed, print_separator, set_random_seed +from mpu import layers +from torch.nn.parameter import Parameter + +sys.path.append('../..') + + +def test_parallel_embedding(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing parallel embedding with model parallel size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 17 + seq_length = 23 + vocab_size = 48 + hidden_size = 16 + seed = 1236 + + set_random_seed(123) + input_data = torch.LongTensor(size=(batch_size, seq_length)).random_( + 0, vocab_size).cuda() + loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() + + set_random_seed(seed) + embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() + + output = embedding_original(input_data) + loss_original = torch.mul(output, loss_weight).sum() + loss_original.backward() + + set_random_seed(seed) + embedding_parallel = layers.ParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_parallel(input_data) + loss_parallel = torch.mul(output, loss_weight).sum() + loss_parallel.backward() + + set_random_seed(seed) + embedding_vocab_parallel = layers.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_vocab_parallel(input_data) + loss_vocab_parallel = torch.mul(output, loss_weight).sum() + loss_vocab_parallel.backward() + + torch.distributed.barrier() + error = loss_parallel.sub(loss_original).abs() + print(' error in loss (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + torch.distributed.barrier() + error = loss_vocab_parallel.sub(loss_original).abs() + print(' error in loss (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + hidden_size // model_parallel_size, + 1)[mpu.get_model_parallel_rank()] + error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max() + print(' error in grad (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + vocab_size // model_parallel_size, + 0)[mpu.get_model_parallel_rank()] + error = embedding_vocab_parallel.weight.grad.sub( + weight_grad_orig).abs().max() + print(' error in grad (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_initialize_affine_weight(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing initialize_affine_weight with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + + # 
--------------- + # Column parallel + # --------------- + weight = torch.empty(output_size_coeff, input_size) + set_random_seed(seed) + layers._initialize_affine_weight(weight, output_size, input_size, + output_size_coeff, 0, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split( + master_weight, output_size_coeff, dim=0)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' column parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # ------------ + # Row parallel + # ------------ + weight = torch.empty(output_size, input_size_coeff) + set_random_seed(seed) + mpu.layers._initialize_affine_weight(weight, output_size, input_size, + input_size_coeff, 1, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split( + master_weight, input_size_coeff, dim=1)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' row parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer2D(torch.nn.Module): + + def __init__(self, m, n): + super(IdentityLayer2D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n)) + torch.nn.init.xavier_normal_(self.weight) + + def forward(self): + return self.weight + + +def test_column_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing ColumnParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.ColumnParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
+ dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_model_parallel_rank() + my_dLdA = torch.split( + dLdA, output_size_coeff, dim=0)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + my_dLdb = torch.split( + dLdb, output_size_coeff, dim=0)[rank].contiguous().clone() + error = my_dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def test_row_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing RowParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.RowParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
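+    # Same hand-derived reference gradients as in the column-parallel test,
+    # but RowParallelLinear splits A along the input dimension, so each rank
+    # is checked against a column slice of dL/dA; the bias is not partitioned
+    # and is compared without slicing.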
+ dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_model_parallel_rank() + my_dLdA = torch.split( + dLdA, input_size_coeff, dim=1)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer3D(torch.nn.Module): + + def __init__(self, m, n, k): + super(IdentityLayer3D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n, k)) + torch.nn.init.xavier_normal_(self.weight) + + def forward(self): + return self.weight + + +def parallel_self_attention(model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, + sequence_length): + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size( + ) # noqa + hidden_size = hidden_size_per_att_head * num_att_heads + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads, + dropout_prob).cuda() + loss_weight = torch.randn([batch_size, sequence_length, + hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = attention_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, model_parallel_size, loss, \ + attention_layer, identity_layer + + +def test_parallel_self_attention(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelSelfAttention with model parallel ' + 'size: {}'.format(model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + dropout_prob = 0.0 # has to be zero + batch_size = 5 + sequence_length = 13 + + rank_1, hideen_size_1, model_parallel_size_1, loss_1, \ + attention_layer_1, identity_layer_1 = parallel_self_attention( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) + + rank, hidden_size, model_parallel_size, loss, \ + attention_layer, identity_layer = parallel_self_attention( + model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) + assert hideen_size_1 == hidden_size + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + 
torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + my_lin_grad_list = torch.split( + attention_layer_1.query_key_value.weight.grad, + hidden_size // model_parallel_size, 0)[rank::model_parallel_size] + my_lin_grad = torch.cat(my_lin_grad_list, dim=0) + error = my_lin_grad.sub( + attention_layer.query_key_value.weight.grad).abs().max() + torch.distributed.barrier() + print(' weight gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def parallel_transformer(model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, + sequence_length): + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size( + ) + hidden_size = hidden_size_per_att_head * num_att_heads + intermediate_size = 4 * hidden_size + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + transformer_layer = mpu.BertParallelTransformerLayer( + hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, + torch.nn.functional.relu, 1.0e-5).cuda() + + loss_weight = torch.randn([batch_size, sequence_length, + hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = transformer_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer + + +def test_parallel_transformer_layer(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelTransformerLayer with model parallel ' + 'size: {}'.format(model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + batch_size = 5 + sequence_length = 13 + + rank_1, hidden_size_1, model_parallel_size_1, loss_1, \ + transformer_layer_1, identity_layer_1 = parallel_transformer( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer = parallel_transformer( + model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +if __name__ == '__main__': + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = 
False + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + print_separator('test initialize affine weight') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_initialize_affine_weight(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test parallel embedding') + test_parallel_embedding(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test column-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_column_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test row-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_row_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel self-attention') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_self_attention(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel transformer') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_transformer_layer(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_random.py b/modelscope/models/nlp/mglm/mpu/tests/test_random.py new file mode 100644 index 00000000..55cc2351 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_random.py @@ -0,0 +1,206 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator + +sys.path.append('../..') + + +def test_set_cuda_rng_state(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing set_rng_state with size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + size = 123 + seed = 1234 + torch.cuda.manual_seed(seed) + tensor = torch.cuda.FloatTensor(size) + + # Get the state + rng_state = torch.cuda.get_rng_state() + rng_state_copy = rng_state.clone() + + # Do some stuff. + for _ in range(5): + torch.randn(size, out=tensor) + result_1 = tensor.clone() + + assert rng_state.sub(rng_state_copy).max() == 0 + assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 + + # State should be different. + new_rng_state = torch.cuda.get_rng_state() + max_diff = new_rng_state.sub(rng_state).max() + print( + ' max diff in rng state (should be non-zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), max_diff)) + assert max_diff > 0 + + # Reset the rng state and do the same stuff. 
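+    # Restoring the saved state (twice, to also exercise a repeated restore)
+    # and regenerating must reproduce result_1 exactly, and must leave the
+    # saved rng_state tensor itself unchanged.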
+ mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + result_2 = tensor.clone() + + # Results should be the same + error = result_2.sub(result_1).abs().max() + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Input state should have remained intact. + error = rng_state.sub(rng_state_copy).max() + print(' max error in rng state (should be zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), error)) + assert error == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_cuda_rng_tracker(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cuda rng tracker with size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed_1 = 1234 + seed_2 = 4321 + size = [12, 21] + tensor = torch.cuda.FloatTensor(size) + + # Set to seed_1 and generate two tensors. + torch.cuda.manual_seed(seed_1) + torch.randn(size, out=tensor) + target_11 = tensor.clone() + torch.randn(size, out=tensor) + target_12 = tensor.clone() + + # Set to seed_2 and generate two tensors. + torch.cuda.manual_seed(seed_2) + torch.randn(size, out=tensor) + target_21 = tensor.clone() + torch.randn(size, out=tensor) + target_22 = tensor.clone() + + # Now if we interleave seed_1 and seed_2, + # we should still get the same tensors + torch.cuda.manual_seed(seed_1) + mpu.get_cuda_rng_tracker().add('test', seed_2) + + torch.randn(size, out=tensor) + result_11 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_21 = tensor.clone() + + torch.randn(size, out=tensor) + result_12 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_22 = tensor.clone() + + diff = result_11.sub(result_21).abs().max() + diff = min(diff, result_12.sub(result_22).abs().max()) + print(' max diff in generated tensors (should be non-zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) + assert diff > 1.0e-6 + error = max( + result_11.sub(target_11).abs().max(), + result_12.sub(target_12).abs().max()) + error = max(error, result_21.sub(target_21).abs().max()) + error = max(error, result_22.sub(target_22).abs().max()) + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_model_parallel_cuda_manual_seed(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing model parallel cuda manual seed with size {} ...'. 
+ format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + mpu.model_parallel_cuda_manual_seed(12345) + assert torch.cuda.initial_seed() == 12345 + with mpu.get_cuda_rng_tracker().fork(): + assert torch.cuda.initial_seed() == (12345 + 2718 + + mpu.get_model_parallel_rank()) + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test set rng state') + test_set_cuda_rng_state(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test cuda rng tracker') + test_cuda_rng_tracker(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test model parallel cuda manual seed') + test_model_parallel_cuda_manual_seed(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/transformer.py b/modelscope/models/nlp/mglm/mpu/transformer.py new file mode 100755 index 00000000..c12b2e10 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/transformer.py @@ -0,0 +1,1200 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Transformer.""" + +import math + +import deepspeed +import torch +import torch.nn.init as init +from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +from .initialize import get_model_parallel_world_size +from .layers import ColumnParallelLinear, RowParallelLinear +from .mappings import gather_from_model_parallel_region +from .random import checkpoint, get_cuda_rng_tracker +from .utils import divide, split_tensor_along_last_dim + + +class PositionalEmbedding(torch.nn.Module): + + def __init__(self, hidden_size): + super(PositionalEmbedding, self).__init__() + + self.hidden_size = hidden_size + + inv_freq = 1 / ( + 10000**(torch.arange(0.0, hidden_size, 2.0) / hidden_size)) # noqa + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + + +class ParallelCrossAttention(torch.nn.Module): + """Parallel cross-attention layer for Transformer""" + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=None): + super(ParallelCrossAttention, self).__init__() + # Set output layer initialization if not provided. 
+ if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + # Strided linear layer. + self.query = ColumnParallelLinear( + hidden_size, + hidden_size, + gather_output=False, + init_method=init_method) + self.key_value = ColumnParallelLinear( + hidden_size, + 2 * hidden_size, + stride=2, + gather_output=False, + init_method=init_method) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear( + hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, encoder_states, cross_mask): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. [b, s, hp] + mixed_query_layer = self.query(hidden_states) + mixed_x_layer = self.key_value(encoder_states) + (mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 2) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + if cross_mask is not None: + # Apply the left to right attention mask. + attention_scores = torch.mul(attention_scores, cross_mask) - \ + 10000.0 * (1.0 - cross_mask) # noqa + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + with get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. 
[b, s, h] + output = self.dense(context_layer) + output = self.output_dropout(output) + + return output + + +class ParallelSelfAttention(torch.nn.Module): + """Parallel self-attention layer for GPT2. + + Self-attention layer takes input with size [b, s, h] where b is + the batch size, s is the sequence lenght, and h is the hidden size + and creates output of the same size. + Arguments: + hidden_size: total hidden size of the layer (h). + num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size to be divisible by n. + attention_dropout_prob: dropout probability for the attention scores. + init_method: weight initialization. + output_layer_init_method: output layer initialization. If None, use + `init_method`. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=None, + relative_encoding=False, + performer=False, + attention_scale=1.0): + super(ParallelSelfAttention, self).__init__() + self.performer = performer + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + self.relative_encoding = relative_encoding + self.attention_scale = attention_scale + # Strided linear layer. + self.query_key_value = ColumnParallelLinear( + hidden_size, + 3 * hidden_size, + stride=3, + gather_output=False, + init_method=init_method) + if relative_encoding: + self.relative = ColumnParallelLinear( + hidden_size, + hidden_size, + gather_output=False, + init_method=init_method) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear( + hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. 
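+        Moving the head dimension ahead of the sequence dimension lets the
+        attention scores be computed with a single batched matmul.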
+ """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + @staticmethod + def _rel_shift(x, zero_triu=False): + # ql x kl x bsz x h + # bsz x h x ql x kl + zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1), + device=x.device, + dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2)) + + x = x_padded[:, :, 1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None] + + return x + + def forward(self, + hidden_states, + ltor_mask, + position_embeddings=None, + r_w_bias=None, + r_r_bias=None, + mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. [b, s, hp] + query_length = hidden_states.size(1) + + if mem is None: + mixed_x_layer = self.query_key_value(hidden_states) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim( + mixed_x_layer, 3) + else: + cat = torch.cat((mem, hidden_states), 1) + mixed_x_layer = self.query_key_value(cat) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim( + mixed_x_layer, 3) + mixed_query_layer = mixed_query_layer[:, -query_length:] + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + if self.relative_encoding: + relative_layer = self.relative(position_embeddings) + relative_layer = self._transpose_for_scores( + relative_layer) # 1 (bsz) x n_head x klen x d_head + # Raw attention scores. [b, np, qs, ks] + rw_head_q = query_layer + r_w_bias.unsqueeze(1) + ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2)) + rr_head_q = query_layer + r_r_bias.unsqueeze(1) + bd_score = torch.matmul(rr_head_q, + relative_layer.transpose(-1, -2)) + bd_score = self._rel_shift(bd_score) # qlen x klen x bsz x n_head + # bd_score = bd_score.permute(2, 3, 0, 1) # bsz n_head qlen klen + + attention_scores = ac_score + bd_score + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + else: + if self.attention_scale > 1.0: + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul( + query_layer / math.sqrt(self.attention_scale), + key_layer.transpose(-1, -2) + / math.sqrt(self.hidden_size_per_attention_head + * self.attention_scale)) + else: + attention_scores = torch.matmul( + query_layer, + key_layer.transpose(-1, -2) + / math.sqrt(self.hidden_size_per_attention_head)) + + # Apply the left to right attention mask. + attention_scores = torch.mul(attention_scores, ltor_mask) + if self.attention_scale > 1.0: + max_attention_scores = attention_scores.max( + dim=-1, keepdim=True)[0] + attention_scores -= max_attention_scores + attention_scores *= self.attention_scale + # if torch.distributed.get_rank() == 0: + # print(min_attention_scores, attention_scores.max().item()) + attention_scores = attention_scores + (-65504.0) * (1.0 - ltor_mask) + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
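+        # fork() switches to the model-parallel RNG stream (seeded per rank),
+        # so every partition draws an independent dropout mask while random
+        # ops outside the fork stay identical across model-parallel ranks.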
+ with get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. [b, s, h] + output = self.dense(context_layer) + output = self.output_dropout(output) + + return output + + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * ( + 1.0 + torch.tanh(0.7978845608028654 * x * # noqa + (1.0 + 0.044715 * x * x))) # noqa + + +def gelu(x): + return gelu_impl(x) + + +class ParallelMLP(torch.nn.Module): + """MLP for GPT2. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform gelu transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. + + Arguments: + hidden_size: The hidden size of the self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layer initialization. If None, + use `init_method`. + """ + + def __init__(self, + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=None): + super(ParallelMLP, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Project to 4h. + self.dense_h_to_4h = ColumnParallelLinear( + hidden_size, + 4 * hidden_size, + gather_output=False, + init_method=init_method) + # Project back to h. + self.dense_4h_to_h = RowParallelLinear( + 4 * hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.dropout = torch.nn.Dropout(output_dropout_prob) + + def forward(self, hidden_states): + # [b, s, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = gelu(intermediate_parallel) + + # [b, s, h] + output = self.dense_4h_to_h(intermediate_parallel) + output = self.dropout(output) + return output + + +class ParallelDecoderLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. 
+ """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None): + super(ParallelDecoderLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.self_attention = ParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + # Layernorm after the self attention. + self.post_self_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + self.cross_attention = ParallelCrossAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + # Layernorm after the cross attention. + self.post_attention_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + # MLP + self.mlp = ParallelMLP( + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, + hidden_states, + encoder_states, + ltor_mask, + cross_mask=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + self_attention_output = self.self_attention(layernorm_output, + ltor_mask) + # Residual connection. + self_layernorm_input = hidden_states + self_attention_output + # Layer norm post the self attention. + self_layernorm_output = self.post_self_layernorm(self_layernorm_input) + # Cross attention + attention_output = self.cross_attention(self_layernorm_output, + encoder_states, cross_mask) + # Residual connection + layernorm_input = self_layernorm_input + attention_output + # Layer norm post the cross attention + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. + output = layernorm_input + mlp_output + return output + + +class ParallelTransformerLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. 
+ """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None, + relative_encoding=False, + performer=False, + attention_scale=1.0): + super(ParallelTransformerLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.attention = ParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding, + performer=performer, + attention_scale=attention_scale) + + # Layernorm on the input data. + self.post_attention_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + # MLP + self.mlp = ParallelMLP( + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, + hidden_states, + ltor_mask, + position_embeddings=None, + r_w_bias=None, + r_r_bias=None, + mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + mem = self.input_layernorm(mem) if mem is not None else None + # Self attention. + attention_output = self.attention(layernorm_output, ltor_mask, + position_embeddings, r_w_bias, + r_r_bias, mem) + # Residual connection. + layernorm_input = hidden_states + attention_output + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. + output = layernorm_input + mlp_output + + return output + + +def unscaled_init_method(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GPT2ParallelTransformer(torch.nn.Module): + """GPT-2 transformer. + + This module takes input from embedding layer and it's output can + be used directly by a logit layer. It consists of L (num-layers) + blocks of: + layer norm + self attention + residual connection + layer norm + mlp + residual connection + followed by a final layer norm. + + Arguments: + num_layers: Number of transformer layers. + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + checkpoint_activations: if True, checkpoint activations. + checkpoint_num_layers: number of layers to checkpoint. This + is basically the chunk size in checkpoitning. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method_std: standard deviation of the init method which has + the form N(0, std). 
+ use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers) + scaling for the output weights ( + output of self attention and mlp). + """ + + def __init__( + self, + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + relative_encoding=False, + block_position_encoding=False, + performer=False, + use_decoder_layer=False, + attention_scale=1.0, + ): + super(GPT2ParallelTransformer, self).__init__() + self.hidden_size = hidden_size + # Store activation checkpoiting flag. + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.max_memory_length = max_memory_length + self.performer = performer + self.use_decoder_layer = use_decoder_layer + assert not (performer and relative_encoding) + + output_layer_init_method = None + if use_scaled_init_for_output_weights: + output_layer_init_method = scaled_init_method( + init_method_std, num_layers) + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + self.relative_encoding = relative_encoding + self.block_position_encoding = block_position_encoding + if relative_encoding: + # Relative position embedding + self.position_embeddings = PositionalEmbedding(hidden_size) + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_attention_head = divide( + hidden_size, num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + self.r_w_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head)) + self.r_w_bias.model_parallel = True + self.r_r_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head)) + self.r_r_bias.model_parallel = True + # Always initialize bias to zero. + with torch.no_grad(): + self.r_w_bias.zero_() + self.r_r_bias.zero_() + else: + # Position embedding (serial). + if block_position_encoding: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + self.block_position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + torch.nn.init.normal_( + self.block_position_embeddings.weight, + mean=0.0, + std=init_method_std) + else: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, hidden_size) + # Initialize the position embeddings. + torch.nn.init.normal_( + self.position_embeddings.weight, mean=0.0, std=init_method_std) + + def get_layer(): + if use_decoder_layer: + return ParallelDecoderLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method) + else: + return ParallelTransformerLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding, + performer=performer, + attention_scale=attention_scale) + + # Transformer layers. + self.layers = torch.nn.ModuleList( + [get_layer() for _ in range(num_layers)]) + + # Final layer norm before output. 
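+        # Applied to the last layer's output in forward() so the logit layer
+        # always consumes normalised hidden states.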
+ self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def forward(self, + hidden_states, + position_ids, + attention_mask, + memory_states=None, + encoder_states=None, + return_memory=False, + detach_memory=True): + batch_size, query_length = hidden_states.size()[:2] + memory_length = memory_states[0].size(1) if memory_states else 0 + key_length = query_length + memory_length + # attention mask is the beginning postion of B region, \in [0, query_len) + is_scalar = torch.numel(attention_mask) == 1 + is_sep = is_scalar or torch.numel(attention_mask) == batch_size + if self.performer: + assert is_scalar, 'attention_mask should be a scalar to indicate the seperation position.' + assert memory_length == 0, 'Do not support transformer-xl.' + if is_sep: + sep = attention_mask.item() if is_scalar else attention_mask + + # conventional transformer + def build_mask_matrix(seq_length, sep, memory_length=0): + m = hidden_states.new_ones((1, seq_length, seq_length)) + m = torch.tril(m) + if is_scalar: + m[0, :, :sep] = 1 + else: + m = m.expand(batch_size, -1, -1) + ids = torch.arange( + seq_length, device=sep.device, + dtype=sep.dtype).view(1, -1) + mask = ids < sep.view(-1, 1) + m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1) + if memory_length > 0: + m = m.expand(batch_size, -1, -1) + m = torch.cat( + (hidden_states.new_ones((batch_size, seq_length, + memory_length)), m), # noqa + dim=2) # noqa + m = m.unsqueeze(1) + return m + + if not self.performer: + attention_mask = build_mask_matrix( + query_length, sep, memory_length=memory_length) + else: + attention_mask = attention_mask[:, :, :, + -query_length - memory_length:] + + if self.relative_encoding: + position_sequence = torch.arange( + key_length - 1, + -1, + -1.0, + device=hidden_states.device, + dtype=hidden_states.dtype) + position_embeddings = self.position_embeddings(position_sequence) + # Apply dropout + position_embeddings = self.embedding_dropout(position_embeddings) + else: + if self.block_position_encoding: + position_ids, block_position_ids = position_ids[:, + 0], position_ids[:, + 1] + position_embeddings = self.position_embeddings(position_ids) + hidden_states = hidden_states + position_embeddings + if self.block_position_encoding: + block_position_embeddings = self.block_position_embeddings( + block_position_ids) + hidden_states = hidden_states + block_position_embeddings + hidden_states = self.embedding_dropout(hidden_states) + + def check_detach(_hidden_states): + if detach_memory: + return _hidden_states.detach() + return _hidden_states + + if self.max_memory_length > 0 or return_memory: + mem_layers = [check_detach(hidden_states)] + else: + mem_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers_ = self.layers[start:end] + x_, inputs = inputs[0], inputs[1:] + if self.relative_encoding: + inputs, mems_ = inputs[:4], inputs[4:] + else: + inputs, mems_ = inputs[:1], inputs[1:] + for i, layer in enumerate(layers_): + mem_i_ = mems_[i] if mems_ else None + x_ = layer(x_, *inputs, mem=mem_i_) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(x_)) + return x_ + + return custom_forward + + if self.checkpoint_activations: + l = 0 # noqa + num_layers = len(self.layers) + chunk_length = self.checkpoint_num_layers + while l < num_layers: + args = 
[hidden_states, attention_mask + ] if not self.use_decoder_layer else [ + hidden_states, + encoder_states, + attention_mask # noqa + ] # noqa + if self.relative_encoding: + args += [position_embeddings, self.r_w_bias, self.r_r_bias] + if memory_states: + args += memory_states[l:l + chunk_length] + hidden_states = checkpoint(custom(l, l + chunk_length), *args) + l += chunk_length # noqa + else: + for i, layer in enumerate(self.layers): + args = [hidden_states, attention_mask + ] if not self.use_decoder_layer else [ + hidden_states, + encoder_states, + attention_mask # noqa + ] # noqa + if self.relative_encoding: + args += [position_embeddings, self.r_w_bias, self.r_r_bias] + mem_i = memory_states[i] if memory_states else None + hidden_states = layer(*args, mem=mem_i) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(hidden_states)) + + # Final layer norm. + output = self.final_layernorm(hidden_states) + if self.max_memory_length > 0 or return_memory: + mem_layers = self.update_mems( + mem_layers, memory_states, return_memory=return_memory) + + return (output, mem_layers) + + def update_mems(self, hiddens, mems, return_memory=False): + memory_length = mems[0].size(1) if mems else 0 + query_length = hiddens[0].size(1) + new_memory_length = memory_length + query_length + if not return_memory: + new_memory_length = min(self.max_memory_length, new_memory_length) + new_mems = [] + # with torch.no_grad(): + for i in range(len(hiddens)): + if new_memory_length <= query_length: + new_mems.append(hiddens[i][:, -new_memory_length:]) + else: + new_mems.append( + torch.cat((mems[i][:, -new_memory_length + query_length:], + hiddens[i]), + dim=1)) + return new_mems + + +class BertParallelSelfAttention(torch.nn.Module): + """Parallel self-attention layer for BERT. + + Self-attention layer takes input with size [b, s, h] where b is + the batch size, s is the sequence lenght, and h is the hidden size + and creates output of the same size. + Arguments: + hidden_size: total hidden size of the layer (h). + num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size be divisible by n. + dropout_prob: dropout probability for the attention scores. + output_parallel: If true, no all-gather is done on the output and + the output values will be per partition. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + + def __init__(self, + hidden_size, + num_attention_heads, + dropout_prob, + output_parallel=False, + init_method=init.xavier_normal_): + super(BertParallelSelfAttention, self).__init__() + # Input configuration. + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.dropout_prob = dropout_prob + self.output_parallel = output_parallel + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + # Strided linear layer. + self.query_key_value = ColumnParallelLinear( + hidden_size, + 3 * hidden_size, + stride=3, + gather_output=False, + init_method=init_method) + # Dropout. 
Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.dropout = torch.nn.Dropout(dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + + # Attention heads. [b, s, hp] + mixed_x_layer = self.query_key_value(hidden_states) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + + # Raw attention scores. [b, np, s, s] + norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head)) + attention_scores = torch.matmul( + query_layer / norm_factor, + key_layer.transpose(-1, -2) / norm_factor) + # Apply the attention mask. + attention_scores += attention_mask + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + with get_cuda_rng_tracker().fork(): + attention_probs = self.dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size_per_partition, ) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. [b, s, h] + if self.output_parallel: + output = context_layer + else: + output = gather_from_model_parallel_region(context_layer) + + return output + + +class BertParallelTransformerOutput(torch.nn.Module): + """The output layer used after self attention and intermediate + parts of transformer layer.""" + + def __init__(self, + input_size, + output_size, + dropout_prob, + layernorm_epsilon=1.0e-12, + input_is_parallel=False, + init_method=init.xavier_normal_): + super(BertParallelTransformerOutput, self).__init__() + # Components. + self.dense = RowParallelLinear( + input_size, + output_size, + input_is_parallel=input_is_parallel, + init_method=init_method) + self.dropout = torch.nn.Dropout(dropout_prob) + self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + layernorm_input = hidden_states + input_tensor + hidden_states = self.layernorm(layernorm_input) + return hidden_states + + +class BertParallelTransformerLayer(torch.nn.Module): + """A single layer transformer for Bert. 
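+    It runs BertParallelSelfAttention, then an intermediate ColumnParallelLinear
+    with an activation, wrapping each with a BertParallelTransformerOutput
+    (dense projection, dropout, residual add and LayerNorm).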
+ + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + intermediate_size: size of the intermediate state after + self attention. In both BERT and GPT + this is set to be 4 times the hidden + size. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + intermediate_activation_fn: activation function for output + of intermediate. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + """ + + def __init__(self, + hidden_size, + intermediate_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + intermediate_activation_fn, + layernorm_epsilon, + init_method=init.xavier_normal_): + super(BertParallelTransformerLayer, self).__init__() + + # Self attention. + self.attention = BertParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_parallel=True, + init_method=init_method) + # Self attention output. + self.self_output = BertParallelTransformerOutput( + hidden_size, + hidden_size, + output_dropout_prob, + layernorm_epsilon=layernorm_epsilon, + input_is_parallel=True, + init_method=init_method) + # Intermediate. + self.intermediate = ColumnParallelLinear( + hidden_size, + intermediate_size, + gather_output=False, + init_method=init_method) + self.intermediate_activation_fn = intermediate_activation_fn + # Output. + self.output = BertParallelTransformerOutput( + intermediate_size, + hidden_size, + output_dropout_prob, + layernorm_epsilon=layernorm_epsilon, + input_is_parallel=True, + init_method=init_method) + + def forward(self, hidden_states, attention_mask): + # [b, s, hp] + attention_output_parallel = self.attention(hidden_states, + attention_mask) + # [b, s, h] + attention_self_output = self.self_output(attention_output_parallel, + hidden_states) + # [b, s, ip] + intermediate_output_parallel = self.intermediate(attention_self_output) + intermediate_output_parallel = self.intermediate_activation_fn( + intermediate_output_parallel) + # [b, s, h] + layer_output = self.output(intermediate_output_parallel, + attention_self_output) + + return layer_output diff --git a/modelscope/models/nlp/mglm/mpu/utils.py b/modelscope/models/nlp/mglm/mpu/utils.py new file mode 100644 index 00000000..76c37a2b --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/utils.py @@ -0,0 +1,70 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
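+"""Tensor-partitioning helpers shared by the mpu package: divisibility checks,
+last-dimension splitting, and per-rank vocabulary range computation."""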
+ +import torch + + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, '{} is not divisible by {}'.format( + numerator, denominator) + + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator + + +def split_tensor_along_last_dim(tensor, + num_partitions, + contiguous_split_chunks=False): + """Split a tensor along its last dimension. + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = divide(tensor.size()[last_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class VocabUtility: + """Split the vocabulary into `world_size` chunks amd return the + first and last index of the vocabulary belonging to the `rank` + partition: Note that indecies in [fist, last)""" + + @staticmethod + def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, + rank, world_size): + index_f = rank * per_partition_vocab_size + index_l = index_f + per_partition_vocab_size + return index_f, index_l + + @staticmethod + def vocab_range_from_global_vocab_size(global_vocab_size, rank, + world_size): + per_partition_vocab_size = divide(global_vocab_size, world_size) + return VocabUtility.vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, world_size) diff --git a/modelscope/models/nlp/mglm/process_grid.py b/modelscope/models/nlp/mglm/process_grid.py new file mode 100644 index 00000000..d425c970 --- /dev/null +++ b/modelscope/models/nlp/mglm/process_grid.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022 Zhipu.AI + +import glob +import os +import statistics +import sys + +import json + +path_pattern = sys.argv[1] +target_type = sys.argv[2] +best_value, best_result, best_name = None, None, None +mean_result = {} +print(path_pattern) +for dir_path in glob.glob(path_pattern, recursive=True): + entry = os.path.basename(dir_path) + valid_result = None + test_found = os.path.exists(os.path.join(dir_path, 'test_results.json')) + valid_path = os.path.join(dir_path, 'results.json') + if os.path.exists(valid_path): + print(entry) + with open(valid_path) as file: + valid_result = json.load(file) + else: + print(f'{entry} no validation results') + continue + if not test_found: + print(f'{entry} not tested yet') + if target_type == 'max': + metric = sys.argv[3] + metric_value = valid_result[metric] + if best_value is None or metric_value > best_value: + best_value = metric_value + best_result = valid_result + best_name = entry + elif target_type == 'mean' or target_type == 'median': + if mean_result: + for metric, value in valid_result.items(): + if metric not in ['type', 'epoch']: + mean_result[metric].append(value) + else: + mean_result = { + metric: [value] + for metric, value in valid_result.items() + if metric not in ['type', 'epoch'] + } + +if target_type == 'max': + print(f'Best result found at {best_name}: {best_result}') +elif target_type == 'mean': + mean_result = { + metric: sum(value) / len(value) + for metric, 
value in mean_result.items() + } + print(f'Mean result {mean_result}') +elif target_type == 'median': + mean_result = { + metric: statistics.median(value) + for metric, value in mean_result.items() + } + print(f'Mean result {mean_result}') diff --git a/modelscope/models/nlp/mglm/requirements.txt b/modelscope/models/nlp/mglm/requirements.txt new file mode 100644 index 00000000..e44ae5d1 --- /dev/null +++ b/modelscope/models/nlp/mglm/requirements.txt @@ -0,0 +1,22 @@ +boto3 +botocore +deepspeed +fasttext +filelock +ftfy +langdetect +lsh +matplotlib +mpi4py +nltk +pandas +regex +requests +rouge_score +scikit_learn +scipy +sentencepiece +termcolor +tldextract +tqdm +transformers diff --git a/modelscope/models/nlp/mglm/run_test.py b/modelscope/models/nlp/mglm/run_test.py new file mode 100644 index 00000000..2f568265 --- /dev/null +++ b/modelscope/models/nlp/mglm/run_test.py @@ -0,0 +1,10 @@ +# Copyright (c) 2022 Zhipu.AI + +import sys + +if sys.argv[1] == 'block': + from test.test_block import main + main() +elif sys.argv[1] == 'rel_shift': + from test.test_rel_shift import main + main() diff --git a/modelscope/models/nlp/mglm/tasks/data_utils.py b/modelscope/models/nlp/mglm/tasks/data_utils.py new file mode 100644 index 00000000..179d304e --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/data_utils.py @@ -0,0 +1,389 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tasks data utility.""" +import copy +import pickle +import re +from typing import Dict, List, Optional + +import json +import numpy as np +import torch +import torch.utils.data +from torch.utils.data.dataloader import default_collate + +from modelscope.models.nlp.mglm import mpu + + +def clean_text(text): + """Remove new lines and multiple spaces and adjust end of sentence dot.""" + + text = text.replace('\n', ' ') + text = re.sub(r'\s+', ' ', text) + for _ in range(3): + text = text.replace(' . ', '. ') + + return text + + +class InputExample(object): + """A raw input example consisting of one or two segments of text and a label""" + + def __init__(self, + guid, + text_a, + text_b=None, + label=None, + logits=None, + meta: Optional[Dict] = None, + idx=-1, + num_choices=1): + """ + Create a new InputExample. 
+ + :param guid: a unique textual identifier + :param text_a: the sequence of text + :param text_b: an optional, second sequence of text + :param label: an optional label + :param logits: an optional list of per-class logits + :param meta: an optional dictionary to store arbitrary meta information + :param idx: an optional numeric index + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + self.logits = logits + self.idx = idx + self.num_choices = num_choices + self.meta = meta if meta else {} + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serialize this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serialize this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n' + + @staticmethod + def load_examples(path: str) -> List['InputExample']: + """Load a set of input examples from a file""" + with open(path, 'rb') as fh: + return pickle.load(fh) + + @staticmethod + def save_examples(examples: List['InputExample'], path: str) -> None: + """Save a set of input examples to a file""" + with open(path, 'wb') as fh: + pickle.dump(examples, fh) + + +def num_special_tokens_to_add(text_a_ids, + text_b_ids, + answer_ids, + add_cls, + add_sep, + add_piece, + add_eos=True): + num_tokens = 0 + if add_cls: + num_tokens += 1 + if text_b_ids and add_sep: + num_tokens += 1 + if add_eos: + num_tokens += 1 + if not answer_ids and add_piece: + num_tokens += 1 + return num_tokens + + +def build_input_from_ids(text_a_ids, + text_b_ids, + answer_ids, + max_seq_length, + tokenizer, + args=None, + add_cls=True, + add_sep=False, + add_piece=False, + add_eos=True, + mask_id=None): + if mask_id is None: + mask_id = tokenizer.get_command('MASK').Id + eos_id = tokenizer.get_command('eos').Id + cls_id = tokenizer.get_command('ENC').Id + sep_id = tokenizer.get_command('sep').Id + ids = [] + types = [] + paddings = [] + # CLS + if add_cls: + ids.append(cls_id) + types.append(0) + paddings.append(1) + # A + len_text_a = len(text_a_ids) + ids.extend(text_a_ids) + types.extend([0] * len_text_a) + paddings.extend([1] * len_text_a) + # B + if text_b_ids is not None: + # SEP + if add_sep: + ids.append(sep_id) + types.append(0) + paddings.append(1) + len_text_b = len(text_b_ids) + ids.extend(text_b_ids) + types.extend([1] * len_text_b) + paddings.extend([1] * len_text_b) + eos_length = 1 if add_eos else 0 + # Cap the size. 
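+    # Layout of the encoder ids built above (assuming add_cls=True and a text_b
+    # segment with add_sep=True): [CLS] a_1 ... a_n [SEP] b_1 ... b_m, with token
+    # types 0 for segment A and 1 for segment B. If appending the optional eos
+    # token would overflow max_seq_length, the sequence is truncated to
+    # max_seq_length - 1 first.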
+ if len(ids) >= max_seq_length - eos_length: + max_seq_length_m1 = max_seq_length - 1 + ids = ids[0:max_seq_length_m1] + types = types[0:max_seq_length_m1] + paddings = paddings[0:max_seq_length_m1] + end_type = 0 if text_b_ids is None else 1 + if add_eos: + ids.append(eos_id) + types.append(end_type) + paddings.append(1) + sep = len(ids) + target_ids = [0] * len(ids) + loss_masks = [0] * len(ids) + position_ids = list(range(len(ids))) + block_position_ids = [0] * len(ids) + # Piece + if add_piece or answer_ids is not None: + sop_id = tokenizer.get_command('sop').Id + mask_position = ids.index( + mask_id + ) if not args.sentinel_token else args.max_position_embeddings + ids.append(sop_id) + types.append(end_type) + paddings.append(1) + position_ids.append(mask_position) + block_position_ids.append(1) + if answer_ids is not None: + len_answer = len(answer_ids) + ids.extend(answer_ids[:-1]) + types.extend([end_type] * (len_answer - 1)) + paddings.extend([1] * (len_answer - 1)) + position_ids.extend([mask_position] * (len_answer - 1)) + if not args.no_block_position: + block_position_ids.extend(range(2, len(answer_ids) + 1)) + else: + block_position_ids.extend([1] * (len(answer_ids) - 1)) + target_ids.extend(answer_ids) + loss_masks.extend([1] * len(answer_ids)) + else: + target_ids.append(0) + loss_masks.append(1) + # Padding. + padding_length = max_seq_length - len(ids) + if padding_length > 0: + ids.extend([eos_id] * padding_length) + types.extend([eos_id] * padding_length) + paddings.extend([0] * padding_length) + position_ids.extend([0] * padding_length) + block_position_ids.extend([0] * padding_length) + target_ids.extend([0] * padding_length) + loss_masks.extend([0] * padding_length) + if not args.masked_lm: + position_ids = [position_ids, block_position_ids] + return ids, types, paddings, position_ids, sep, target_ids, loss_masks + + +def build_decoder_input(enc_ids, answer_ids, max_seq_length, + max_dec_seq_length, tokenizer): + mask_id = tokenizer.get_command('MASK').Id + eos_id = tokenizer.get_command('eos').Id + sop_id = tokenizer.get_command('sop').Id + enc_len = len(enc_ids) # noqa + masks = [] + # TODO: it probably takes too much memory + # for i in range(max_dec_seq_length): + # m = [1]*enc_len + [0]*(max_seq_length - enc_len) + [1]*(i+1) + [0]*(max_dec_seq_length-1-i) + # masks.append(m) + mask_position = enc_ids.index(mask_id) + len_answer = len(answer_ids) + ids = [sop_id] + answer_ids[:-1] + types = [0] * len_answer # not used + paddings = [1] * len_answer + position_ids = [mask_position] * len_answer + block_position_ids = list(range(1, len_answer + 1)) + target_ids = answer_ids + loss_masks = [1] * len_answer + # Padding. 
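+    # Pad the decoder sequence up to max_dec_seq_length: ids are padded with the
+    # eos token, all other fields with zeros, and the padded positions are
+    # excluded from the loss via loss_masks = 0.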
+ padding_length = max_dec_seq_length - len(ids) + if padding_length > 0: + ids.extend([eos_id] * padding_length) + types.extend([0] * padding_length) + paddings.extend([0] * padding_length) + position_ids.extend([0] * padding_length) + block_position_ids.extend([0] * padding_length) + target_ids.extend([0] * padding_length) + loss_masks.extend([0] * padding_length) + position_ids = [position_ids, block_position_ids] + return ids, types, paddings, position_ids, masks, target_ids, loss_masks + + +def build_sample(ids, + types=None, + paddings=None, + positions=None, + masks=None, + label=None, + unique_id=None, + target=None, + logit_mask=None, + segment_ids=None, + prompt_ids=None): + """Convert to numpy and return a sample consumed by the batch producer.""" + + ids_np = np.array(ids, dtype=np.int64) + sample = {'text': ids_np, 'label': int(label)} + if types is not None: + types_np = np.array(types, dtype=np.int64) + sample['types'] = types_np + if paddings is not None: + paddings_np = np.array(paddings, dtype=np.int64) + sample['padding_mask'] = paddings_np + if positions is not None: + positions_np = np.array(positions, dtype=np.int64) + sample['position'] = positions_np + if masks is not None: + masks_np = np.array(masks, dtype=np.int64) + sample['mask'] = masks_np + if target is not None: + target_np = np.array(target, dtype=np.int64) + sample['target'] = target_np + if logit_mask is not None: + logit_mask_np = np.array(logit_mask, dtype=np.int64) + sample['logit_mask'] = logit_mask_np + if segment_ids is not None: + segment_ids = np.array(segment_ids, dtype=np.int64) + sample['segment_id'] = segment_ids + if prompt_ids is not None: + prompt_ids = np.array(prompt_ids, dtype=np.int64) + sample['prompt_pos'] = prompt_ids + if unique_id is not None: + sample['uid'] = unique_id + return sample + + +def build_decoder_sample(sample, dec_ids, dec_position, dec_masks, dec_target, + dec_logit_mask): + sample['dec_text'] = np.array(dec_ids) + sample['dec_position'] = np.array(dec_position) + sample['dec_mask'] = np.array(dec_masks) + sample['dec_target'] = np.array(dec_target) + sample['dec_logit_mask'] = np.array(dec_logit_mask) + return sample + + +def my_collate(batch): + new_batch = [{key: value + for key, value in sample.items() if key != 'uid'} + for sample in batch] + text_list = [sample['text'] for sample in batch] + + def pad_choice_dim(data, choice_num): + if len(data) < choice_num: + data = np.concatenate([data] + + [data[0:1]] * (choice_num - len(data))) + return data + + if len(text_list[0].shape) == 2: + choice_nums = list(map(len, text_list)) + max_choice_num = max(choice_nums) + for i, sample in enumerate(new_batch): + for key, value in sample.items(): + if key != 'label': + sample[key] = pad_choice_dim(value, max_choice_num) + else: + sample[key] = value + sample['loss_mask'] = np.array( + [1] * choice_nums[i] + [0] * (max_choice_num - choice_nums[i]), + dtype=np.int64) + + if 'dec_text' in new_batch[0]: + choice_nums = [len(sample['dec_text']) for sample in new_batch] + if choice_nums.count(choice_nums[0]) != len(choice_nums): + max_choice_num = max(choice_nums) + for i, sample in enumerate(new_batch): + for key, value in sample.items(): + if key.startswith('dec_'): + sample[key] = pad_choice_dim(value, max_choice_num) + sample['loss_mask'] = np.array( + [1] * choice_nums[i] + [0] * # noqa + (max_choice_num - choice_nums[i]), + dtype=np.int64) + + new_batch = default_collate(new_batch) + if 'uid' in batch[0]: + uid_list = [sample['uid'] for sample in batch] + new_batch['uid'] 
= uid_list + return new_batch + + +class FakeDataloader: + + def __init__(self, num_iters): + self.num_iters = num_iters + + def __iter__(self): + if self.num_iters is not None: + for _ in range(self.num_iters): + yield None + else: + while True: + yield None + + +def build_data_loader(dataset, + batch_size, + num_workers, + drop_last, + shuffle=True, + only_rank0=False): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + + # Sampler. + if only_rank0: + rank, world_size = 0, 1 + else: + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=shuffle) + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + collate_fn=my_collate) + + return data_loader diff --git a/modelscope/models/nlp/mglm/tasks/eval_utils.py b/modelscope/models/nlp/mglm/tasks/eval_utils.py new file mode 100644 index 00000000..da23a884 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/eval_utils.py @@ -0,0 +1,249 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation utilities.""" + +import datetime +import os +import random +import time +from collections import OrderedDict +from typing import List + +import mpu +import torch +from finetune_glm import process_batch +from sklearn.metrics import f1_score +from tasks.data_utils import InputExample, build_data_loader +from utils import debug_finetune_data, get_spare_port, print_rank_0 + + +def accuracy_metric(predictions, labels, examples): + count = 0 + num_predictions = max(len(predictions), 1) + assert len(predictions) == len(labels) + for prediction, label in zip(predictions, labels): + count += prediction == label + return count * 100.0 / num_predictions + + +def f1_metric(predictions, labels, examples): + return f1_score(labels, predictions) + + +def f1_macro_metric(predictions, labels, examples): + return f1_score(labels, predictions, average='macro') + + +global_tokenizer = None + + +def accuracy_func_provider(single_dataset_provider, + metric_dict, + args, + is_test=False, + eval_func=None, + output_func=None, + only_rank0=True, + tokenizer=None): + """Provide function that calculates accuracies.""" + # Build dataloaders. 
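+    # One dataloader is built per validation/test split below; when only_rank0 is
+    # set, every rank other than rank 0 returns None and skips evaluation.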
+ global global_tokenizer + global_tokenizer = tokenizer + if only_rank0 and torch.distributed.is_initialized( + ) and torch.distributed.get_rank() != 0: + return None + if is_test and not args.eval_valid: + datapaths = args.test_data if args.test_data is not None else ['test'] + else: + datapaths = args.valid_data if args.valid_data is not None else ['dev'] + if eval_func is None: + eval_func = multichoice_evaluate + dataloaders = [] + eval_batch_size = args.eval_batch_size if args.eval_batch_size else args.batch_size + for datapath in datapaths: + dataset = single_dataset_provider(datapath) + dataloader = build_data_loader( + dataset, + eval_batch_size, + num_workers=args.num_workers, + drop_last=False, + shuffle=False, + only_rank0=only_rank0) + dataloaders.append((dataset.dataset_name, dataloader)) + + def metrics_func(model, + epoch, + output_predictions=False, + summary_writer=None): + print_rank_0('calculating metrics ...') + score_dict = OrderedDict([(key, 0.0) for key in metric_dict + ]) if isinstance(metric_dict, dict) else { + metric_dict: 0.0 + } # noqa + total = 0 + for name, dataloader in dataloaders: + example_dict = None + if hasattr(dataloader.dataset, 'examples'): + example_dict = dataloader.dataset.examples + start_time = time.time() + predictions, labels, examples = eval_func(model, dataloader, + example_dict, args) + elapsed_time = time.time() - start_time + if output_predictions and torch.distributed.get_rank() == 0: + filename = os.path.join(args.log_dir, name + '.jsonl') + output_func(predictions, examples, filename) + total_count = len(predictions) + single_dict = { + key: metric(predictions, labels, examples) + for key, metric in metric_dict.items() + } + output_str = ' > |epoch: {}| metrics for {}: total {}'.format( + epoch, name, total_count) + for key, value in single_dict.items(): + output_str += ' {} = {:.4f} %'.format(key, value) + if summary_writer is not None and epoch >= 0 and not is_test and len( + dataloaders) > 1: + summary_writer.add_scalar(f'Train/valid_{name}_{key}', + value, epoch) + output_str += ' elapsed time (sec): {:.3f}'.format(elapsed_time) + if len(dataloaders) > 1: + print_rank_0(output_str) + for key in score_dict: + score_dict[key] += single_dict[key] * total_count + total += total_count + score_dict = { + key: score / float(total) + for key, score in score_dict.items() + } + output_str = ' >> |epoch: {}| overall: total = {}'.format(epoch, total) + for key, score in score_dict.items(): + output_str += ' {} = {:.4f}'.format(key, score) + if summary_writer is not None and epoch >= 0 and not is_test: + summary_writer.add_scalar(f'Train/valid_{key}', score, epoch) + print_rank_0(output_str) + return score_dict + + return metrics_func + + +segment_length = 10 + + +def multichoice_evaluate(model, dataloader, example_dict, args): + """Calculate correct over total answers and return prediction if the + `output_predictions` is true.""" + model.eval() + port = get_spare_port(args) + print_rank_0(f'Using port {port}') + store = torch.distributed.TCPStore(args.master_ip, port, + torch.distributed.get_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + # file_path = os.path.join("/cache", args.experiment_name + "_store") + # print_rank_0(f"Using file store at {file_path}") + # store = torch.distributed.FileStore(file_path, torch.distributed.get_world_size()) + with torch.no_grad(): + # For all the batches in the dataset. + for _, batch in enumerate(dataloader): + # Run the model forward. 
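+            # The model inputs assembled below depend on the evaluation mode:
+            # pretrained BERT, cloze-style evaluation (with an optional separate
+            # decoder when fast_decode is set), or plain classification.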
+ data = process_batch(batch, args) + if args.pretrained_bert: + tokens, types, labels_, attention_mask = data['text'], data[ + 'types'], data['label'], data['padding_mask'] + inputs = [tokens, types, attention_mask] + elif args.cloze_eval: + tokens, labels_, position_ids = data['text'], data[ + 'label'], data['position'] + attention_mask, target_ids, logit_mask = data['mask'], data[ + 'target'], data['logit_mask'] + if not args.fast_decode: + inputs = [ + tokens, position_ids, attention_mask, target_ids, + logit_mask + ] + if args.continuous_prompt: + prompt_pos = data['prompt_pos'] + inputs.append(prompt_pos) + else: + dec_input_ids, dec_position_ids, dec_attention_mask = data[ + 'dec_text'], data['dec_position'], data['dec_mask'] + dec_target_ids, dec_logit_mask = data['dec_target'], data[ + 'dec_logit_mask'] + inputs = [ + tokens, position_ids, attention_mask, dec_input_ids, + dec_position_ids, dec_attention_mask, dec_target_ids, + dec_logit_mask + ] + else: + tokens, labels_, position_ids, attention_mask = data[ + 'text'], data['label'], data['position'], data['mask'] + inputs = [tokens, position_ids, attention_mask] + if len(inputs[0].shape + ) == 3 and inputs[0].size(1) > segment_length: + logit_list = [] + for i in range((inputs[0].size(1) - 1) // segment_length + 1): + input_batch = [ + arg[:, i * segment_length:(i + 1) * segment_length] + for arg in inputs + ] + if args.pretrained_bert: + logits = model(*input_batch) + else: + logits, *mems = model(*input_batch) + logit_list.append(logits) + logits = torch.cat(logit_list, dim=1) + elif args.cloze_eval and args.fast_decode: + logit_list = [] + num_choices = inputs[3].size(1) + for i in range((num_choices - 1) // segment_length + 1): + input_batch = inputs[:3] + [ + arg[:, i * segment_length:(i + 1) * segment_length] + for arg in inputs[3:] + ] + logits, *mems = model(*input_batch) + logit_list.append(logits) + logits = torch.cat(logit_list, dim=1) + else: + if args.pretrained_bert: + logits = model(*inputs) + else: + logits, *mems = model(*inputs) + if 'segment_id' in data: + from torch_scatter import scatter_sum + if 'loss_mask' in data: + logits = logits * data['loss_mask'] + logits = scatter_sum(logits, data['segment_id'], dim=1) + elif 'loss_mask' in data: + loss_mask = data['loss_mask'] + logits = logits * loss_mask - 10000.0 * (1.0 - loss_mask) + uid_list = batch['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + predicted = torch.argmax(logits, dim=-1).tolist() + labels = labels_.tolist() + if args.task.lower() == 'wsc': + predicted = [1 if pred == 0 else 0 for pred in predicted] + if mpu.get_model_parallel_rank() == 0: + for uid, prediction, label in zip(uid_list, predicted, labels): + store.set(uid, str((prediction, label))) + model.train() + torch.distributed.barrier() + predictions, labels, examples = [], [], [] + for uid, example in example_dict.items(): + prediction, label = eval(store.get(uid)) + predictions.append(prediction) + labels.append(label) + examples.append(example) + torch.distributed.barrier() + return predictions, labels, examples diff --git a/modelscope/models/nlp/mglm/tasks/language_model/dataset.py b/modelscope/models/nlp/mglm/tasks/language_model/dataset.py new file mode 100644 index 00000000..cfdfa714 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/dataset.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022 Zhipu.AI + +import math +from bisect import bisect_right +from itertools import accumulate + +import json +import numpy as np +import torch 
+from tasks.data_utils import build_input_from_ids, num_special_tokens_to_add +from tasks.language_model.detokenizer import get_detokenizer +from utils import print_rank_0 + + +class LMDataset(torch.utils.data.Dataset): + + def __init__(self, args, documents, tokenizer, num_original_tokens, + num_tokenized_tokens): + self.args = args + self.documents = documents + self.max_seq_len = args.seq_length - 1 + self.tokenizer = tokenizer + self.overalapping_eval = args.overlapping_eval + if self.overalapping_eval is None: + self.overalapping_eval = self.max_seq_len + self.overalapping_eval = max(1, self.overalapping_eval) + self.num_original_tokens = num_original_tokens + self.num_tokenized_tokens = num_tokenized_tokens + # remove first sequence tokens + targets = [ + max(len(tokens) - self.max_seq_len, 0) for tokens in self.documents + ] + self.num_sequences = [ + max(math.ceil(target / self.overalapping_eval) + 1, 1) + for target in targets + ] + self.weights = list(accumulate(self.num_sequences)) + self.left_weights = [0] + self.weights[:-1] + self.unidirectional = args.unidirectional + self.block_lm = args.block_lm + mask_token = 'gMASK' if args.task_mask else 'MASK' + self.mask_id = self.tokenizer.get_command(mask_token).Id + + def __len__(self): + return sum(self.num_sequences) + + def __getitem__(self, idx): + document_idx = bisect_right(self.weights, idx) + idx = idx - self.left_weights[document_idx] + start_idx = idx * self.overalapping_eval + end_idx = start_idx + self.max_seq_len + tokens = self.documents[document_idx][start_idx:end_idx] + if self.block_lm: + if idx == 0 or self.unidirectional: + prompt, text = tokens[:1], tokens[1:] + else: + prompt_length = self.max_seq_len - self.overalapping_eval + prompt, text = tokens[:prompt_length], tokens[prompt_length:] + prompt = prompt + [self.mask_id] + num_special_tokens = num_special_tokens_to_add( + prompt, + None, + text, + add_cls=True, + add_sep=False, + add_piece=True, + add_eos=False) + data = build_input_from_ids( + prompt, + None, + text, + self.max_seq_len + num_special_tokens + 1, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + add_eos=False, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if idx != 0 and self.unidirectional: + loss_masks = np.array(loss_masks, dtype=np.int64) + loss_masks[:-self.overalapping_eval] = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64) + } + else: + loss_masks = [1] * len(tokens) + if len(tokens) < self.max_seq_len: + tokens = tokens + [0] * (self.max_seq_len - len(tokens)) + loss_masks = loss_masks + [0] * ( + self.max_seq_len - len(loss_masks)) + if idx != 0: + loss_masks = np.array(loss_masks, dtype=np.int64) + loss_masks[:-self.overalapping_eval] = 0 + return { + 'text': np.array(tokens, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64) + } + + +class LambadaDataset(torch.utils.data.Dataset): + + def __init__(self, args, tokenizer, strict=True): + data_path = args.valid_data[0] + print_rank_0( + '> building lambada dataset from {} ...'.format(data_path)) + self.args = args + self.max_seq_length = args.seq_length + self.tokenizer = tokenizer + self.pad_idx = tokenizer.get_command('pad').Id + self.strict = strict + self.block_lm = args.block_lm + self.unidirectional = 
args.unidirectional + mask_token = 'gMASK' if args.task_mask else 'MASK' + self.mask_id = self.tokenizer.get_command(mask_token).Id + + self.tokens = [] + self.labels = [] + with open(data_path, 'r') as f: + for line in f.readlines(): + text = json.loads(line)['text'] + tokens, labels = self.get_tokens(text) + self.tokens.append(tokens) + self.labels.append(labels) + + def get_tokens(self, text): + if not self.strict: + tokens = self.tokenizer.EncodeAsIds(text).tokenization + return tokens[:-1], [tokens[-1]] + last_token = text.split()[-1] + start_idx = text.rfind(last_token) + beginning_tokens = self.tokenizer.EncodeAsIds( + text[:start_idx].strip()).tokenization + last_token = self.tokenizer.EncodeAsIds(' ' + last_token).tokenization + return beginning_tokens, last_token + + def __len__(self): + return len(self.tokens) + + def __getitem__(self, idx): + tokens, answer = self.tokens[idx], self.labels[idx] + if self.block_lm: + if self.unidirectional: + tokens, answer_tokens = tokens[:1], tokens[1:] + answer + else: + answer_tokens = answer + tokens = tokens + [self.mask_id] + num_special_tokens = num_special_tokens_to_add( + tokens, + None, + answer_tokens, + add_cls=True, + add_sep=False, + add_piece=True) + left_shift = len(tokens) + len( + answer_tokens) + num_special_tokens - self.max_seq_length + if left_shift > 0: + tokens = tokens[left_shift:] + data = build_input_from_ids( + tokens, + None, + answer_tokens, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if self.unidirectional: + loss_masks = np.array(loss_masks, dtype=np.int64) + last_index = len(loss_masks) + while loss_masks[last_index - 1] == 0: + last_index -= 1 + loss_masks[:last_index - len(answer)] = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64) + } + else: + left_shift = len(tokens) - self.max_seq_length + if left_shift > 0: + tokens = tokens[left_shift:] + ids = tokens + answer + if len(ids) < self.max_seq_length: + ids = ids + [0] * (self.max_seq_length - len(ids)) + loss_masks = [0] * len(tokens) + [1] * len(answer) + if len(loss_masks) < self.max_seq_length: + loss_masks = loss_masks + [0] * ( + self.max_seq_length - len(loss_masks)) + return { + 'text': np.array(ids, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64) + } + + +def build_lambada_dataset(tokenizer, args): + """Build lambada dataset.""" + assert len(args.valid_data) == 1 + val_dataset = LambadaDataset(args, tokenizer, strict=True) + print_rank_0(' > found {} samples, {} label tokens.'.format( + len(val_dataset), sum(map(len, val_dataset.labels)))) + return val_dataset + + +def build_lm_dataset(tokenizer, args): + documents = [] + num_tokens, num_original_tokens = 0, 0 + with open(args.valid_data[0], encoding='utf-8') as file: + for line in file: + tokens = tokenizer.EncodeAsIds(line.strip()).tokenization + num_tokens += len(tokens) + num_original_tokens += len(line.strip().split(' ')) + documents.append(tokens) + val_dataset = LMDataset(args, documents, tokenizer, num_original_tokens, + num_tokens) + print_rank_0( + ' > number of document: {}, number of original tokens {}, number of detokenized tokens: {}' + .format(len(documents), num_original_tokens, num_tokens)) 
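+    # num_original_tokens counts whitespace-separated words while num_tokens
+    # counts tokenizer ids; LMDataset keeps both so that evaluation can report a
+    # token-ratio-adjusted perplexity.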
+ return val_dataset + + +def build_wikitext103_dataset(tokenizer, args): + """""" + + assert len(args.valid_data) == 1 + with open(args.valid_data[0], 'rb') as reader: + entire_data = reader.read().decode('utf-8') + num_original_tokens = len(entire_data.strip().split(' ')) + entire_data = get_detokenizer('wikitext')(entire_data) + print_rank_0(entire_data[:1024]) + tokenized_data = tokenizer.EncodeAsIds(entire_data).tokenization + num_tokenized_tokens = len(tokenized_data) + + val_dataset = LMDataset(args, [tokenized_data], tokenizer, + num_original_tokens, num_tokenized_tokens) + print_rank_0(' > number of original tokens: {}, number of detokenized ' + 'tokens: {}'.format(num_original_tokens, + num_tokenized_tokens)) + return val_dataset diff --git a/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py b/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py new file mode 100755 index 00000000..dc1524de --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022 Zhipu.AI + +import re + + +def ptb_detokenizer(string): + string = string.replace(" '", "'") + string = string.replace(' \n', '\n') + string = string.replace('\n ', '\n') + string = string.replace(" n't", "n't") + string = string.replace(' N ', '1 ') + string = string.replace('$ 1', '$1') + string = string.replace('# 1', '#1') + return string + + +def wikitext_detokenizer(string): + # contractions + string = string.replace("s '", "s'") + string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) + # number separators + string = string.replace(' @-@ ', '-') + string = string.replace(' @,@ ', ',') + string = string.replace(' @.@ ', '.') + # punctuation + string = string.replace(' : ', ': ') + string = string.replace(' ; ', '; ') + string = string.replace(' . ', '. ') + string = string.replace(' ! ', '! ') + string = string.replace(' ? ', '? ') + string = string.replace(' , ', ', ') + # double brackets + string = re.sub(r'\(\s*([^\)]*?)\s*\)', r'(\1)', string) + string = re.sub(r'\[\s*([^\]]*?)\s*\]', r'[\1]', string) + string = re.sub(r'{\s*([^}]*?)\s*}', r'{\1}', string) + string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) + string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) + # miscellaneous + string = string.replace('= = = =', '====') + string = string.replace('= = =', '===') + string = string.replace('= =', '==') + string = string.replace(' ' + chr(176) + ' ', chr(176)) + string = string.replace(' \n', '\n') + string = string.replace('\n ', '\n') + string = string.replace(' N ', ' 1 ') + string = string.replace(" 's", "'s") + + return string + + +def lambada_detokenizer(string): + return string + + +def get_detokenizer(dataset): + return DETOKENIZERS[dataset] + + +DETOKENIZERS = { + 'ptb': ptb_detokenizer, + 'wikitext': wikitext_detokenizer, + 'lambada': lambada_detokenizer, +} diff --git a/modelscope/models/nlp/mglm/tasks/language_model/finetune.py b/modelscope/models/nlp/mglm/tasks/language_model/finetune.py new file mode 100644 index 00000000..b6089e6f --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/finetune.py @@ -0,0 +1,254 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GPT2 zero-shot evaluation.""" + +import functools +import math + +import mpu +import torch +from finetune_glm import finetune +from pretrain_glm import get_batch +from tasks.data_utils import build_data_loader +from tasks.language_model.dataset import (build_lambada_dataset, + build_lm_dataset, + build_wikitext103_dataset) +from utils import print_rank_0 + +global_tokenizer = None + + +def lm_forward_step(data, model, args, timers, mems, eval_metric=None): + """Forward step.""" + + # Get the batch. + if timers is not None: + timers('batch generator').start() + if 'mask' in data: + data['attention_mask'] = data.pop('mask') + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data, args) + if timers is not None: + timers('batch generator').stop() + + def print_masked_text(batch_id): + block_position_ids = position_ids[:, 1] + position_ids_ = position_ids[:, 0] + output_tokens = [] + sep = attention_mask[batch_id].item() + for i, token in enumerate(tokens[batch_id, :sep].tolist()): + if global_tokenizer is not None: + token = global_tokenizer.IdToToken(token) + if token.startswith('[MASK'): + token = f'[{position_ids_[batch_id, i].item()}, {token}]' + if token.startswith('##') and len( + output_tokens) > 0 and not output_tokens[-1].endswith( + ']'): + output_tokens[-1] += token[2:] + else: + output_tokens.append(token) + else: + output_tokens.append(str(token)) + print(' '.join(output_tokens)) + last_index = None + for i in range(sep, tokens.size(1)): + if global_tokenizer.IdToToken( + tokens[batch_id, i].item()).startswith('<|startofpiece'): + if last_index is not None: + print( + global_tokenizer.DecodeIds( + tokens[batch_id, last_index:i].tolist()), '|', + global_tokenizer.DecodeIds( + labels[batch_id, last_index:i].tolist())), + print(position_ids_[batch_id, last_index:i].tolist(), + block_position_ids[batch_id, last_index:i].tolist()) + last_index = i + if last_index is not None: + print( + global_tokenizer.DecodeIds(tokens[batch_id, + last_index:].tolist()), '|', + global_tokenizer.DecodeIds(labels[batch_id, + last_index:].tolist())) + print(position_ids_[batch_id, last_index:].tolist(), + block_position_ids[batch_id, last_index:].tolist()) + + # Forward model. 
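+    # When continuous prompts are used, the prompt token positions from the batch
+    # are forwarded to the model via the prompt_pos keyword argument.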
+ if args.continuous_prompt: + prompt_pos = data['prompt_pos'].long().cuda() + logits, *mems = model( + tokens, position_ids, attention_mask, *mems, prompt_pos=prompt_pos) + else: + logits, *mems = model(tokens, position_ids, attention_mask, *mems) + + if eval_metric is None or eval_metric == 'loss': + losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), + labels) + loss_mask = loss_mask.view(-1) + # The loss is not normalized for fair comparison + loss = torch.sum(losses.view(-1) * loss_mask) + if eval_metric is None: + loss = loss / loss_mask.sum() + return loss, mems, 'bert' + elif eval_metric == 'accuracy' or eval_metric == 'classify': + logits = mpu.gather_from_model_parallel_region(logits) + outputs = torch.argmax(logits, -1) + correct = (outputs == labels).float() + correct[(1 - loss_mask).bool()] = 1 + correct = correct.prod(-1) + if eval_metric == 'accuracy': + correct = correct.sum() + return correct, mems, 'bert' + else: + raise NotImplementedError( + 'Metric {} not implemented'.format(eval_metric)) + + +def classify_evaluate(model, dataloader, example_dict, args): + """Evaluation.""" + # Turn on evaluation mode which disables dropout. + model.eval() + predictions, labels, examples = [], [], [] + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(dataloader): + # Forward evaluation. + output, _, _ = lm_forward_step( + batch, model, args, None, [], eval_metric='classify') + uid_list = batch['uid'] + example_batch = [example_dict[uid] for uid in uid_list] + predictions.extend(output.long().tolist()) + label = batch['label'].tolist() + labels.extend(label) + examples.extend(example_batch) + return predictions, labels, examples + + +def evaluate(model, dataloader, eval_metric, args): + """Evaluation.""" + # Turn on evaluation mode which disables dropout. + model.eval() + total_output, total_count = 0.0, 0 + total_tokens = 0 + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(dataloader): + if (iteration + 1) % args.log_interval == 0: + print_rank_0('> working on iteration: {}'.format(iteration)) + # Forward evaluation. + output, _, _ = lm_forward_step( + batch, model, args, None, [], eval_metric=eval_metric) + count = batch['text'].size(0) + count = torch.cuda.LongTensor([count]) + # Reduce across processes. + torch.distributed.all_reduce( + output, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce( + count, group=mpu.get_data_parallel_group()) + + total_output += output.item() + total_count += count.item() + total_tokens += batch['loss_mask'].sum().item() + totals = torch.cuda.FloatTensor([total_output, total_tokens]) + torch.distributed.all_reduce(totals, group=mpu.get_data_parallel_group()) + total_output, total_tokens = totals.tolist() + print(total_tokens) + return {eval_metric: total_output}, total_count + + +def evaluate_and_print_results(data_loader, model, eval_metric, args): + """Evaluate and print results on screen.""" + + # Evaluate and get results. 
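+    # evaluate() returns the metric summed over the dataset together with the
+    # sample count; the branches below normalize it into per-token loss and
+    # perplexity or into accuracy, and assemble the string printed on rank 0.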
+ output, _ = evaluate(model, data_loader, eval_metric, args) + + string = '' + if eval_metric == 'loss': + output = output['loss'] + num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens + num_original_tokens = data_loader.dataset.num_original_tokens + val_loss = output / (num_tokenized_tokens - 1) + ppl = math.exp(min(20, val_loss)) + token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1) + adjusted_ppl = math.exp(min(20, val_loss * token_ratio)) + string += 'avg loss: {:.4E} | '.format(val_loss) + string += 'ppl: {:.4E} | '.format(ppl) + string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) + string += 'token ratio: {} |'.format(token_ratio) + score_dict = { + 'avg loss': val_loss, + 'ppl': ppl, + 'adjusted ppl': adjusted_ppl + } + + elif eval_metric == 'accuracy': + output = output['accuracy'] + num_examples = len(data_loader.dataset) + acc = output / num_examples * 100 + string += 'number correct: {} | '.format(output) + string += 'total examples: {} | '.format(num_examples) + string += 'avg accuracy: {:.2f}'.format(acc) + score_dict = {'accuracy': acc} + else: + raise NotImplementedError('evaluation method for {} metric is not ' + 'implemented yet.'.format(eval_metric)) + + length = len(string) + 1 + print_rank_0('-' * length) + print_rank_0(string) + print_rank_0('-' * length) + return score_dict + + +def metrics_func_provider(args, tokenizer, is_test): + """Privde metrics callback function.""" + + if args.task.lower() == 'lambda': + eval_metric = 'accuracy' + dataset = build_lambada_dataset(tokenizer, args) + elif args.task == 'wikitext': + eval_metric = 'loss' + dataset = build_wikitext103_dataset(tokenizer, args) + elif args.task == 'language_model': + eval_metric = 'loss' + dataset = build_lm_dataset(tokenizer, args) + else: + raise NotImplementedError('{} task is not implemented.'.format( + args.task)) + # Data stuff + dataloader = build_data_loader( + dataset, + args.eval_batch_size, + args.num_workers, + drop_last=False, + shuffle=False) + + def metrics_func(model, + epoch, + output_predictions=False, + summary_writer=None): + return evaluate_and_print_results( + dataloader, model, eval_metric=eval_metric, args=args) + + global global_tokenizer + global_tokenizer = tokenizer + return metrics_func + + +def main(args): + """Main program.""" + finetune( + args, + None, {}, + end_of_epoch_callback_provider=metrics_func_provider, + forward_step=lm_forward_step) diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py b/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py new file mode 100644 index 00000000..6a4e275f --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py @@ -0,0 +1,667 @@ +# Copyright (c) 2022 Zhipu.AI + +import os +import random + +import json +import numpy as np +import torch +import torch.utils.data +from data_utils.corpora import punctuation_standardization +from tasks.data_utils import InputExample +from tqdm import tqdm +from utils import print_rank_0 + + +def gigaword_detokenize(string, is_target=False): + _tok_dict = { + '(': '-lrb-', + ')': '-rrb-', + '[': '-lsb-', + ']': '-rsb-', + '{': '-lcb-', + '}': '-rcb-', + '&': '&', + '<': '<', + '>': '>' + } + string = string.replace('UNK', '[UNK]') + string = string.replace('', '[UNK]') + for key, value in _tok_dict.items(): + string = string.replace(value, key) + # string = string.replace("''", "\"") + # string = string.replace("``", "\"") + # string = string.replace("`", "'") + # string = string.replace(" n't", "n't") + # string = string.replace(" 's", "'s") + 
# string = string.replace(" 'd", "'d") + # string = string.replace(" 'll", "'ll") + return string + + +def cnndm_detokenize(string, is_target=False): + _tok_dict = { + '(': '-LRB-', + ')': '-RRB-', + '[': '-LSB-', + ']': '-RSB-', + '{': '-LCB-', + '}': '-RCB-' + } + if not is_target: + string = string.replace('', '') + else: + string = string.replace('', '[SEP]') + for key, value in _tok_dict.items(): + string = string.replace(value, key) + string = string.replace("''", "\"") + string = string.replace('``', "\"") + string = string.replace('`', "'") + string = string.replace(" n't", "n't") + string = string.replace(" 's", "'s") + string = string.replace(" 'd", "'d") + string = string.replace(" 'll", "'ll") + return string + + +def blanklm_detokenize(string, is_target=False): + string = string.replace('_UNK', '[UNK]') + string = string.replace('', '[MASK]') + return string + + +class SummmaryProcessor: + + def __init__(self, task, data_dir, tokenizer): + self.task = task + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'val' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0( + f'Creating {self.task}-{split} dataset from {self.data_dir}') + if self.task == 'gigaword': + detokenizer = gigaword_detokenize + elif self.task == 'cnn_dm': + detokenizer = cnndm_detokenize + else: + detokenizer = None + source_texts, target_texts = [], [] + with open( + os.path.join(self.data_dir, f'{filename}.source'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = punctuation_standardization(line) + line = detokenizer(line) if detokenizer else line + source_texts.append(line) + with open( + os.path.join(self.data_dir, f'{filename}.target'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = punctuation_standardization(line) + line = detokenizer( + line, is_target=True) if detokenizer else line + target_texts.append(line) + assert len(source_texts) == len(target_texts) + example_list = [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = { + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds(target_text).tokenization) + } + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + if idx < 10: + print_rank_0( + (source_text.encode('utf-8'), target_text.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + return example_list + + +class SQuADProcessor: + + def __init__(self, data_dir, tokenizer): + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + filename = 'train.json' + elif split == 'dev': + filename = 'dev.json' + elif split == 'test': + filename = 'test.json' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating SQuAD-{split} dataset from {self.data_dir}') + example_list = [] + idx = 0 + with open( + os.path.join(self.data_dir, filename), + encoding='utf-8') as file: + dataset = json.load(file) + for paragraphs in dataset: + for paragraph in paragraphs['paragraphs']: + context = paragraph['context'] + for qa in paragraph['qas']: + question = qa['question'] + answers = {answer['text'] for answer in qa['answers']} + answer_starts = { + answer['text']: answer['answer_start'] + 
for answer in qa['answers'] + } + for answer in answers: + guid = '%s-%s' % (split, idx) + meta = { + 'answer_start': + answer_starts[answer], + 'answer': + answer, + 'question': + question, + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds( + question).tokenization) + } + example = InputExample( + guid=guid, text_a=context, meta=meta) + if idx < 10: + print_rank_0((context.encode('utf-8'), + answer.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + idx += 1 + print_rank_0(f'Creating {len(example_list)} examples for {split}') + return example_list + + +class XSumProcessor: + + def __init__(self, data_dir, tokenizer): + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + key = 'train' + elif split == 'dev': + key = 'validation' + elif split == 'test': + key = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating XSUM-{split} dataset from {self.data_dir}') + with open( + os.path.join( + self.data_dir, + 'XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json')) as file: + id_list = json.load(file) + id_list = id_list[key] + source_texts, target_texts = [], [] + for i, idx in enumerate(id_list): + with open(os.path.join(self.data_dir, f'{idx}.summary')) as file: + key, sentences = None, [] + source_text, target_text = None, None + for line in file: + line = line.strip() + if line.startswith('[SN]'): + if key is not None: + if key == 'RESTBODY': + source_text = ' '.join(sentences) + elif key == 'FIRST-SENTENCE': + target_text = ' '.join(sentences) + key = line[4:-4] + sentences = [] + elif line: + sentences.append(line) + if key is not None: + if key == 'RESTBODY': + source_text = ' '.join(sentences) + elif key == 'FIRST-SENTENCE': + target_text = ' '.join(sentences) + source_texts.append(source_text) + target_texts.append(target_text) + if (i + 1) % 1000 == 0: + print_rank_0(f'Complete {i + 1} examples') + assert len(source_texts) == len(target_texts) + example_list = [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = { + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds(target_text).tokenization) + } + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + if idx < 10: + print_rank_0( + (source_text.encode('utf-8'), target_text.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + return example_list + + +class Seq2SeqDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + self.task, self.data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + self.tokenizer = tokenizer + self.dataset_name = split + if self.task in ['gigaword', 'cnn_dm', 'cnn_dm_original']: + self.processor = SummmaryProcessor(self.task, self.data_dir, + tokenizer) + elif self.task in ['xsum']: + self.processor = XSumProcessor(self.data_dir, tokenizer) + elif self.task in ['squad_generation']: + self.processor = SQuADProcessor(self.data_dir, tokenizer) + else: + raise NotImplementedError + example_list = self.processor.create_examples(split) + self.example_list = example_list + self.examples = {example.guid: example for example in example_list} + + print_rank_0(f'Return {len(self.examples)} {split} examples') + + def __len__(self): + 
return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + cls_id = self.tokenizer.get_command('ENC').Id + mask_token = 'sMASK' if self.args.task_mask else 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + pad_id = self.tokenizer.get_command('pad').Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + if self.task in ['gigaword', 'cnn_dm', 'cnn_dm_original', 'xsum']: + source_text, target_text = example.text_a, example.text_b + source_tokens = self.tokenizer.EncodeAsIds( + ' ' + source_text).tokenization + prompt = [cls_id, mask_id + ] + self.tokenizer.EncodeAsIds(' Content:').tokenization + if len(source_tokens) > self.max_src_length - len(prompt): + source_tokens = source_tokens[:self.max_src_length + - len(prompt)] + source_tokens = prompt + source_tokens + elif self.task == 'squad_generation': + source_text = example.text_a + target_text, answer = example.meta['question'], example.meta[ + 'answer'] + source_tokens = self.tokenizer.EncodeAsIds( + source_text.rstrip() + ' Question:').tokenization + answer_tokens = self.tokenizer.EncodeAsIds(' Answer: ' + + answer).tokenization + if len(source_tokens + ) > self.max_src_length - len(answer_tokens) - 2: + max_src_length = self.max_src_length - len(answer_tokens) - 2 + answer_pattern = self.tokenizer.EncodeAsIds( + ' ' + answer).tokenization + + def sub_finder(mylist, pattern): + matches = [] + for i in range(len(mylist)): + if mylist[i] == pattern[0] and mylist[ + i:i + len(pattern)] == pattern: + matches.append(i) + return matches + + answer_indices = sub_finder(source_tokens, answer_pattern) + if len(answer_indices) == 0: + print(f'Answer {answer} not exists in the source text') + source_tokens = source_tokens[:max_src_length] + else: + start_index = max(answer_indices[0] - max_src_length // 2, + 0) + source_tokens = source_tokens[start_index:start_index + + max_src_length] + source_tokens = [cls_id] + source_tokens + [mask_id + ] + answer_tokens + else: + raise NotImplementedError + if len(source_tokens) < self.max_src_length: + source_tokens = source_tokens + [pad_id] * ( + self.max_src_length - len(source_tokens)) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + mask_pos = source_tokens.index(mask_id) + if self.split == 'train': + target_tokens = self.tokenizer.EncodeAsIds( + ' ' + target_text).tokenization + target_tokens = target_tokens + [eop_id] + if len(target_tokens) > self.max_tgt_length: + target_tokens = target_tokens[:self.max_tgt_length] + loss_mask = [1] * len(target_tokens) + if len(target_tokens) < self.max_tgt_length: + loss_mask += [0] * (self.max_tgt_length - len(target_tokens)) + target_tokens += [pad_id] * ( + self.max_tgt_length - len(target_tokens)) + tokens = source_tokens + [sop_id] + target_tokens[:-1] + loss_mask = [0] * len(source_tokens) + loss_mask + target_ids = [0] * len(source_tokens) + target_tokens + position_ids += [mask_pos] * len(target_tokens) + if self.args.no_block_position: + block_position_ids += [1] * len(target_tokens) + else: + block_position_ids += list(range(1, len(target_tokens) + 1)) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': 
example.guid + } + else: + tokens = source_tokens + [sop_id] + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + +class ExtractionDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + task, data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + self.tokenizer = tokenizer + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'valid' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating {task}-{split} dataset from {data_dir}') + self.dataset_name = split + source_texts, target_texts = [], [] + with open( + os.path.join(data_dir, f'{filename}.source'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + source_texts.append(line) + with open( + os.path.join(data_dir, f'{filename}.target'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + target_texts.append(line) + self.examples, self.example_list = {}, [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = {'ref': target_text} + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + self.examples[guid] = example + self.example_list.append(example) + print_rank_0(f'Return {len(self.examples)} {split} examples') + + def __len__(self): + return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + source_text, target_text = example.text_a, example.text_b + mask_token = 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + pad_id = self.tokenizer.get_command('pad').Id + + def pad_to(text, max_len, pad_id): + if len(text) > max_len: + text = text[:max_len] + else: + text = text + [pad_id] * (max_len - len(text)) + return text + + source_tokens = self.tokenizer.EncodeAsIds(source_text).tokenization + masked_tgt = target_text.split('|') + source_tokens = pad_to(source_tokens, self.max_src_length, pad_id) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + if self.split == 'train': + mask_positions = [ + i for i, x in enumerate(source_tokens) if x == mask_id + ] + assert len(mask_positions) <= len(masked_tgt) + tokens = source_tokens + target_ids = [0] * len(source_tokens) + loss_mask = [0] * len(source_tokens) + for i, mask_pos in enumerate(mask_positions): + tgt_text = masked_tgt[i] + tgt_tokens = self.tokenizer.EncodeAsIds( + ' ' + tgt_text).tokenization + tokens += [sop_id] + tgt_tokens + target_ids += tgt_tokens + [eop_id] + loss_mask += [1] * (len(tgt_tokens) + 1) + position_ids += [mask_pos] * (len(tgt_tokens) + 1) + block_position_ids += [ + i + 1 for i in range(len(tgt_tokens) + 1) + ] + tokens = pad_to(tokens, self.max_src_length + self.max_tgt_length, + pad_id) + target_ids = pad_to(target_ids, + self.max_src_length + self.max_tgt_length, + pad_id) + loss_mask = 
pad_to(loss_mask, + self.max_src_length + self.max_tgt_length, 0) + position_ids = pad_to(position_ids, + self.max_src_length + self.max_tgt_length, 0) + block_position_ids = pad_to( + block_position_ids, self.max_src_length + self.max_tgt_length, + 0) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + else: + tokens = source_tokens + [sop_id] + mask_pos = source_tokens.index(mask_id) + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + +class BlankLMDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + task, data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + assert args.tokenizer_type == 'BertWordPieceTokenizer' + self.tokenizer = tokenizer + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'valid' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating {task}-{split} dataset from {data_dir}') + self.dataset_name = split + detokenizer = blanklm_detokenize + source_texts, target_texts = [], [] + with open( + os.path.join(data_dir, f'{filename}.txt'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = detokenizer(line) if detokenizer else line + target_texts.append(line) + if split == 'test': + with open( + os.path.join( + data_dir, + f'blank/test.maskratio{args.blank_maskratio:.1f}.blank' + ), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = detokenizer(line) if detokenizer else line + source_texts.append(line) + else: + source_texts = target_texts + self.examples, self.example_list = {}, [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + # if idx > 10000: + # break + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = {'ref': target_text} + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + self.examples[guid] = example + self.example_list.append(example) + print_rank_0(f'Return {len(self.examples)} {split} examples') + self.random = random.Random(args.seed) + + def __len__(self): + return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + source_text, target_text = example.text_a, example.text_b # noqa + mask_token = 'gMASK' if self.args.task_mask else 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + pad_id = self.tokenizer.get_command('pad').Id + if self.split in ['train', 'dev']: + masked_src, masked_tgt = self.mask_text(source_text) + source_text = masked_src + + def pad_to(text, max_len, pad_id): + if len(text) > max_len: + text = text[:max_len] + else: + text = text + [pad_id] * (max_len - len(text)) + return text + 
+ source_tokens = self.tokenizer.EncodeAsIds(' ' + + source_text).tokenization + source_tokens = pad_to(source_tokens, self.max_src_length, pad_id) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + if self.split in ['train', 'dev']: + mask_positions = [ + i for i, x in enumerate(source_tokens) if x == mask_id + ] + assert len(mask_positions) <= len(masked_tgt) + tokens = source_tokens + target_ids = [0] * len(source_tokens) + loss_mask = [0] * len(source_tokens) + for i, mask_pos in enumerate(mask_positions): + tgt_text = masked_tgt[i] + tgt_tokens = self.tokenizer.EncodeAsIds( + ' ' + tgt_text).tokenization + tokens += [sop_id] + tgt_tokens + target_ids += tgt_tokens + [eop_id] + loss_mask += [1] * (len(tgt_tokens) + 1) + position_ids += [mask_pos] * (len(tgt_tokens) + 1) + block_position_ids += [ + i + 1 for i in range(len(tgt_tokens) + 1) + ] + max_length = self.max_src_length + int( + self.max_src_length * self.args.blank_maskratio) + tokens = pad_to(tokens, max_length, pad_id) + target_ids = pad_to(target_ids, max_length, pad_id) + loss_mask = pad_to(loss_mask, max_length, 0) + position_ids = pad_to(position_ids, max_length, 0) + block_position_ids = pad_to(block_position_ids, max_length, 0) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + else: + tokens = source_tokens + [sop_id] + mask_pos = source_tokens.index(mask_id) + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + def mask_text(self, text): + tokens = text.split() + mask_ratio = self.args.blank_maskratio + n = len(tokens) + indices = sorted(self.random.sample(range(n), int(n * mask_ratio))) + masked_src, masked_tgt = '', [] + for i, idx in enumerate(indices): + if i == 0 or idx != indices[i - 1] + 1: + masked_tgt.append('') + masked_tgt[-1] += ' ' + tokens[idx] + tokens[idx] = '[MASK]' + for i, token in enumerate(tokens): + if i != 0 and token == '[MASK]' and tokens[i - 1] == '[MASK]': + continue + masked_src += ' ' + token + return masked_src, masked_tgt diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py b/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py new file mode 100644 index 00000000..5fd28b89 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py @@ -0,0 +1,538 @@ +# Copyright (c) 2022 Zhipu.AI + +import datetime +import random +import string + +import mpu +import torch +import torch.nn.functional as F +from generation_utils import (BeamSearchScorer, LogitsProcessorList, + MinLengthLogitsProcessor, + NoRepeatNGramLogitsProcessor) +from rouge_score import rouge_scorer +from utils import print_rank_0 + + +def _is_digit(w): + for ch in w: + if not (ch.isdigit() or ch == ','): + return False + return True + + +gigaword_tok_dict = { + '(': '-lrb-', + ')': '-rrb-', + '[': '-lsb-', + ']': '-rsb-', + '{': '-lcb-', + '}': '-rcb-', + '[UNK]': 'UNK', + '&': '&', + '<': '<', + '>': '>' +} + +cnndm_tok_dict = { + '(': '-LRB-', + 
')': '-RRB-', + '[': '-LSB-', + ']': '-RSB-', + '{': '-LCB-', + '}': '-RCB-' +} + + +def fix_tokenization(text, dataset): + if dataset == 'cnn_dm_org': + return text + if dataset == 'gigaword': + text = text.replace('[UNK]', 'UNK') + return text + input_tokens = text.split() + output_tokens = [] + has_left_quote = False + has_left_single_quote = False + + i = 0 + prev_dash = False + while i < len(input_tokens): + tok = input_tokens[i] + flag_prev_dash = False + if tok == "\"": + if has_left_quote: + output_tokens.append("''") + else: + output_tokens.append('``') + has_left_quote = not has_left_quote + i += 1 + elif tok == "'" and len( + output_tokens) > 0 and output_tokens[-1].endswith( + 'n') and i < len(input_tokens) - 1 and input_tokens[ + i + 1] == 't': # noqa + output_tokens[-1] = output_tokens[-1][:-1] + output_tokens.append("n't") + i += 2 + elif tok == "'" and i < len(input_tokens) - 1 and input_tokens[ + i + 1] in ('s', 'd', 'll'): + output_tokens.append("'" + input_tokens[i + 1]) + i += 2 + elif tok == "'": + if has_left_single_quote: + output_tokens.append("'") + else: + output_tokens.append('`') + has_left_single_quote = not has_left_single_quote + i += 1 + elif tok == '.' and i < len(input_tokens) - 2 and input_tokens[ + i + 1] == '.' and input_tokens[i + 2] == '.': + output_tokens.append('...') + i += 3 + elif tok == ',' and len(output_tokens) > 0 and _is_digit( + output_tokens[-1]) and i < len(input_tokens) - 1 and _is_digit( + input_tokens[i + 1]): + # $ 3 , 000 -> $ 3,000 + output_tokens[-1] += ',' + input_tokens[i + 1] + i += 2 + elif tok == '.' and len(output_tokens) > 0 and output_tokens[-1].isdigit() and i < len(input_tokens) - 1 and \ + input_tokens[i + 1].isdigit(): + # 3 . 03 -> $ 3.03 + output_tokens[-1] += '.' + input_tokens[i + 1] + i += 2 + elif tok == '.' and len(output_tokens) > 0 and len( + output_tokens[-1]) == 1 and output_tokens[-1].isalpha( # noqa + ) and i < len(input_tokens) - 2 and len( # noqa + input_tokens[i + 1]) == 1 and input_tokens[ + i + 1].isalpha( # noqa + ) and input_tokens[i + 2] == '.': # noqa + # U . N . -> U.N. 
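+                # Greedily absorb the following "<letter> ." pairs so the whole
+                # abbreviation collapses into a single token.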
+ k = i + 3 + while k + 2 < len(input_tokens): + if len(input_tokens[k + 1]) == 1 and input_tokens[ + k + 1].isalpha() and input_tokens[k + 2] == '.': + k += 2 + else: + break + output_tokens[-1] += ''.join(input_tokens[i:k]) + i = k + elif tok == '-': + if i < len(input_tokens) - 1 and input_tokens[i + 1] == '-': + output_tokens.append('--') + i += 2 + elif i == len(input_tokens) - 1 or i == 0: + output_tokens.append('-') + i += 1 + elif output_tokens[-1] not in string.punctuation and input_tokens[ + i + 1][0] not in string.punctuation: + output_tokens[-1] += '-' + i += 1 + flag_prev_dash = True + else: + output_tokens.append('-') + i += 1 + elif prev_dash and len( + output_tokens) > 0 and tok[0] not in string.punctuation: + output_tokens[-1] += tok + i += 1 + else: + output_tokens.append(tok) + i += 1 + prev_dash = flag_prev_dash + return ' '.join(output_tokens) + + +def count_tokens(tokens): + counter = {} + for t in tokens: + if t in counter.keys(): + counter[t] += 1 + else: + counter[t] = 1 + return counter + + +def get_f1(text_a, text_b): + tokens_a = text_a.lower().split() + tokens_b = text_b.lower().split() + if len(tokens_a) == 0 or len(tokens_b) == 0: + return 1 if len(tokens_a) == len(tokens_b) else 0 + set_a = count_tokens(tokens_a) + set_b = count_tokens(tokens_b) + match = 0 + for token in set_a.keys(): + if token in set_b.keys(): + match += min(set_a[token], set_b[token]) + p = match / len(tokens_a) + r = match / len(tokens_b) + return 2.0 * p * r / (p + r + 1e-5) + + +def remove_duplicate(l_list, duplicate_rate): + tk_list = [l.lower().split() for l in l_list] # noqa + r_list = [] + history_set = set() + for i, w_list in enumerate(tk_list): + w_set = set(w_list) + if len(w_set & history_set) / len(w_set) <= duplicate_rate: + r_list.append(l_list[i]) + history_set |= w_set + return r_list + + +def rouge_metric(predictions, + labels, + examples, + metric='rouge-1', + duplicate_rate=0.7, + dataset='cnn_dm'): + metric_dict = { + 'rouge-1': 'rouge1', + 'rouge-2': 'rouge2', + 'rouge-l': 'rougeLsum' + } + refs = [example.meta['ref'] for example in examples] + ref_list = [] + for ref in refs: + ref = ref.strip().split('[SEP]') + ref = [fix_tokenization(sentence, dataset=dataset) for sentence in ref] + ref = '\n'.join(ref) + ref_list.append(ref) + pred_list = [] + for prediction in predictions: + buf = [] + for sentence in prediction.strip().split('[SEP]'): + sentence = fix_tokenization(sentence, dataset=dataset) + if any(get_f1(sentence, s) > 1.0 for s in buf): + continue + s_len = len(sentence.split()) + if s_len <= 4: + continue + buf.append(sentence) + if duplicate_rate and duplicate_rate < 1: + buf = remove_duplicate(buf, duplicate_rate) + line = '\n'.join(buf) + pred_list.append(line) + if torch.distributed.get_rank() == 0: + import json + with open('./results.json', 'w') as output: + for ref, pred in zip(ref_list, pred_list): + output.write(json.dumps({'ref': ref, 'pred': pred}) + '\n') + scorer = rouge_scorer.RougeScorer([metric_dict[metric]], use_stemmer=True) + scores = [ + scorer.score(pred, ref) for pred, ref in zip(pred_list, ref_list) + ] + scores = [score[metric_dict[metric]].fmeasure for score in scores] + scores = sum(scores) / len(scores) + return scores + + +def process_batch(batch, args): + """Process batch and produce inputs for the model.""" + tokens = batch['text'].long().cuda() + attention_mask = batch['attention_mask'].long().cuda() + position_ids = batch['position_id'].long().cuda() + return tokens, attention_mask, position_ids + + +class DecoderEvaluater: 
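+    """Beam-search decoder used for seq2seq evaluation.
+
+    Predictions are generated with the logits processors configured below and
+    gathered across data-parallel ranks through a temporary TCPStore.
+    """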
+ + def __init__(self, args, tokenizer): + self.tokenizer = tokenizer + self.start_token = tokenizer.get_command('sop').Id + self.end_token = tokenizer.get_command('eop').Id + self.mask_token = tokenizer.get_command( + 'sMASK').Id if args.task_mask else tokenizer.get_command('MASK').Id + self.pad_token = tokenizer.get_command('pad').Id + self.processors = LogitsProcessorList() + if args.min_tgt_length > 0: + processor = MinLengthLogitsProcessor(args.min_tgt_length, + self.end_token) + self.processors.append(processor) + if args.no_repeat_ngram_size > 0: + processor = NoRepeatNGramLogitsProcessor(args.no_repeat_ngram_size) + self.processors.append(processor) + + def evaluate(self, model, dataloader, example_dict, args): + """Calculate correct over total answers and return prediction if the + `output_predictions` is true.""" + model.eval() + store = torch.distributed.TCPStore(args.master_ip, + 18931 + random.randint(0, 10000), + mpu.get_data_parallel_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + print_rank_0('Distributed store created') + with torch.no_grad(): + # For all the batches in the dataset. + for idx, data in enumerate(dataloader): + tokens, attention_mask, position_ids = process_batch( + data, args) + batch_size = tokens.size(0) + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + max_length=args.out_seq_length, + num_beams=args.num_beams, + device=tokens.device, + length_penalty=args.length_penalty, + do_early_stopping=False, + ) + beam_scores = torch.zeros((batch_size, args.num_beams), + dtype=torch.float, + device=tokens.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * args.num_beams, )) + # Run the model forward. + counter = 0 + while counter < args.tgt_seq_length: + if counter == 0: + next_token_logits, *mems = model( + tokens, + position_ids, + attention_mask, + return_memory=True) + seq_length = next_token_logits.size(1) + next_token_logits = next_token_logits[:, -1] + next_token_logits = next_token_logits.unsqueeze( + 1).repeat(1, args.num_beams, + 1).view(batch_size * args.num_beams, -1) + mems = [ + mem.unsqueeze(1).repeat( + 1, args.num_beams, 1, + 1).view(batch_size * args.num_beams, + seq_length, -1) for mem in mems + ] + position_ids = tokens.new_ones(batch_size, + args.num_beams, 2, 1) + for i, text in enumerate(tokens.tolist()): + mask_pos = text.index(self.mask_token) + position_ids[i, :, 0] = mask_pos + position_ids = position_ids.reshape( + batch_size * args.num_beams, 2, 1) + tokens = tokens.new_zeros(batch_size * args.num_beams, + 0) + attention_mask = tokens.new_zeros( + [batch_size * args.num_beams]) + else: + if not args.no_block_position: + position_ids[:, 1] = counter + 1 + last_token = tokens[:, -1:] + next_token_logits, *mems = model( + last_token, + position_ids, + attention_mask, + *mems, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + next_token_scores = F.log_softmax( + next_token_logits, dim=-1) + next_token_scores = self.processors( + tokens, next_token_scores) + next_token_scores = next_token_scores + beam_scores[:, None].expand_as( + next_token_scores) + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view( + batch_size, args.num_beams * vocab_size) + + probs = F.softmax(next_token_scores, dim=-1) + if args.select_topk: + _, next_tokens = torch.topk( + probs, k=2 * args.num_beams, dim=-1, largest=True) + else: + next_tokens = torch.multinomial( + probs, num_samples=2 * args.num_beams) + next_token_scores = 
torch.gather(next_token_scores, -1, + next_tokens) + next_token_scores, _indices = torch.sort( + next_token_scores, descending=True, dim=1) + next_tokens = torch.gather(next_tokens, -1, _indices) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + # stateless + beam_outputs = beam_scorer.process( + tokens, + next_token_scores, + next_tokens, + next_indices, + eos_token_id=self.end_token, + pad_token_id=self.pad_token) + beam_scores = beam_outputs['next_beam_scores'] + beam_next_tokens = beam_outputs['next_beam_tokens'] + beam_idx = beam_outputs['next_beam_indices'] + beam_next_tokens = beam_next_tokens.unsqueeze(-1) + tokens = torch.cat([tokens[beam_idx, :], beam_next_tokens], + dim=-1) + mems = [mem[beam_idx] for mem in mems] if mems else [] + if beam_scorer.is_done: + break + counter += 1 + tokens, _ = beam_scorer.finalize( + tokens, + beam_scores, + next_tokens, + next_indices, + eos_token_id=self.end_token, + pad_token_id=self.pad_token) + predictions = [] + for text in tokens.tolist(): + text = [ + token for token in text + if token not in [self.end_token, self.pad_token] + ] + text = self.tokenizer.DecodeIds(text) + predictions.append(text) + uid_list = data['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + for uid, prediction in zip(uid_list, predictions): + store.set(uid, prediction) + if (idx + 1) % args.log_interval == 0: + print_rank_0(f'Iteration {idx + 1} / {len(dataloader)}') + model.train() + torch.distributed.barrier() + print_rank_0('Evaluation completed') + predictions, examples = [], [] + for uid, example in example_dict.items(): + predictions.append(store.get(uid).decode('utf-8')) + examples.append(example) + torch.distributed.barrier() + return predictions, [], examples + + +def blanklm_fix_tokenization(text): + text = text.replace('` `', '``') + text = text.replace("\' \'", "\'\'") + text = text.replace("n \' t", "n\'t") + text = text.replace("\' s", "\'s") + text = text.replace("\' m", "\'m") + text = text.replace("\' re", "\'re") + text = text.replace('. . .', '...') + text = text.replace(' . .', ' ..') + text = text.replace('- -', '--') + text = text.replace('u . s .', 'u.s.') + text = text.replace('u . k .', 'u.k.') + text = text.replace('e . 
g .', 'e.g.') + return text + + +class BlankLMEvaluater(DecoderEvaluater): + + def evaluate(self, model, dataloader, example_dict, args): + model.eval() + store = torch.distributed.TCPStore(args.master_ip, + 18931 + random.randint(0, 10000), + mpu.get_data_parallel_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + print_rank_0('Distributed store created') + + with torch.no_grad(): + for idx, data in enumerate(dataloader): + tokens, attention_mask, position_ids = process_batch( + data, args) + src_tokens = tokens + batch_size = tokens.size(0) + mask_positions = [] + current_mask = [] + for text in tokens.tolist(): + mask_positions.append([ + i for i, x in enumerate(text) if x == self.mask_token + ]) + current_mask.append(0) + # print(self.tokenizer.DecodeIds(text)) + # print(mask_positions[-1]) + counter = 0 + done = [False] * batch_size + while counter < args.tgt_seq_length: + if counter == 0: + # print(tokens) + # print(position_ids) + next_token_logits, *mems = model( + tokens, + position_ids, + attention_mask, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + position_ids = tokens.new_ones(batch_size, 2, 1) + for i, text in enumerate(tokens.tolist()): + mask_pos = mask_positions[i][current_mask[i]] + position_ids[i, 0] = mask_pos + tokens = tokens.new_zeros(batch_size, 0) + attention_mask = tokens.new_zeros(batch_size) + else: + position_ids[:, 1] = position_ids[:, 1] + 1 + last_token = tokens[:, -1:] + next_token_logits, *mems = model( + last_token, + position_ids, + attention_mask, + *mems, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + next_token_scores = F.log_softmax( + next_token_logits, dim=-1) + next_token_scores = self.processors( + tokens, next_token_scores) + next_tokens = next_token_scores.max(dim=-1)[1] + # print(self.tokenizer.DecodeIds(next_tokens.tolist())) + for i, next_token in enumerate(next_tokens.tolist()): + if next_token == self.end_token: + if current_mask[i] + 1 < len(mask_positions[i]): + current_mask[i] += 1 + next_tokens[i] = self.start_token + position_ids[i, 0] = mask_positions[i][ + current_mask[i]] + position_ids[i, 1] = 0 + else: + done[i] = True + if done[i]: + next_tokens[i] = self.pad_token + if all(done): + break + tokens = torch.cat( + [tokens, next_tokens.unsqueeze(-1)], dim=-1) + counter += 1 + predictions = [] + for i, text in enumerate(tokens.tolist()): + text = [ + token for token in text + if token not in [self.end_token, self.pad_token] + ] + blanks = [[]] + for token in text: + if token == self.start_token: + blanks.append([]) + else: + blanks[-1].append(token) + output_tokens = [] + current_blank = 0 + for token in src_tokens[i].tolist(): + if token == self.mask_token: + if current_blank < len(blanks): + output_tokens += blanks[current_blank] + current_blank += 1 + else: + if token not in [self.pad_token]: + output_tokens.append(token) + text = self.tokenizer.DecodeIds(output_tokens[:-1]) + text = blanklm_fix_tokenization(text) + predictions.append(text) + # print(text) + uid_list = data['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + for uid, prediction in zip(uid_list, predictions): + store.set(uid, prediction) + if (idx + 1) % args.log_interval == 0: + print_rank_0(f'Iteration {idx + 1} / {len(dataloader)}') + + model.train() + torch.distributed.barrier() + print_rank_0('Evaluation completed') + predictions, examples = [], [] + for uid, example in example_dict.items(): + 
predictions.append(store.get(uid).decode('utf-8')) + examples.append(example) + torch.distributed.barrier() + return predictions, [], examples diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py b/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py new file mode 100644 index 00000000..4c0c28e7 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Race.""" +import functools +from collections import OrderedDict + +import mpu +import torch +from finetune_glm import finetune +from pretrain_glm import get_batch +from tasks.eval_utils import accuracy_func_provider +from tasks.seq2seq.dataset import (BlankLMDataset, ExtractionDataset, + Seq2SeqDataset) +from tasks.seq2seq.evaluate import (BlankLMEvaluater, DecoderEvaluater, + rouge_metric) + +global_tokenizer = None + + +def seq2seq_forward_step(data, model, args, timers, mems): + """Forward step.""" + + # Get the batch. + if timers is not None: + timers('batch generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data, args) + if timers is not None: + timers('batch generator').stop() + # Forward model. + logits, *mems = model(tokens, position_ids, attention_mask, *mems) + # logits, loss_mask = logits[:, args.src_seq_length:], loss_mask[:, args.src_seq_length:] + # target_ids = target_ids[:, args.src_seq_length:] + losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), + labels) + if args.label_smoothing > 0.0: + epsilon = args.label_smoothing + smooth_loss = -torch.nn.functional.log_softmax( + logits, dim=-1).mean(dim=-1) + losses = (1 - epsilon) * losses + epsilon * smooth_loss + loss_mask = loss_mask.reshape(-1) + # The loss is not normalized for fair comparison + loss = torch.sum(losses.reshape(-1) * loss_mask) / loss_mask.sum() + return loss, mems, 'bert' + + +def train_valid_datasets_provider(args, tokenizer): + """Provide train and validation datasets.""" + if args.task.lower() == 'blank': + train_dataset = BlankLMDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + elif args.task.lower() == 'extraction': + train_dataset = ExtractionDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + else: + train_dataset = Seq2SeqDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + global global_tokenizer + global_tokenizer = tokenizer + return train_dataset, valid_dataset + + +def metrics_func_provider(args, tokenizer, is_test): + """Provide metrics callback function.""" + + def single_dataset_provider(split): + if args.task.lower() == 'blank': + return BlankLMDataset(args, split=split, tokenizer=tokenizer) + elif args.task.lower() == 'extraction': + return ExtractionDataset(args, split=split, tokenizer=tokenizer) + else: + return Seq2SeqDataset(args, split=split, tokenizer=tokenizer) + + if args.task.lower() in ['blank', 'extraction']: + evaluater = 
BlankLMEvaluater(args, tokenizer)
+        eval_func = evaluater.evaluate
+        metric_dict = {}
+    else:
+        evaluater = DecoderEvaluater(args, tokenizer)
+        eval_func = evaluater.evaluate
+        if args.tokenizer_type == 'BertWordPieceTokenizer':
+            dataset = 'cnn_dm'
+        elif args.task.lower() == 'gigaword':
+            dataset = 'gigaword'
+        else:
+            dataset = 'cnn_dm_org'
+        metric_dict = OrderedDict({
+            'rouge-1':
+            functools.partial(rouge_metric, metric='rouge-1', dataset=dataset),
+            'rouge-2':
+            functools.partial(rouge_metric, metric='rouge-2', dataset=dataset),
+            'rouge-l':
+            functools.partial(rouge_metric, metric='rouge-l', dataset=dataset)
+        })
+
+    def output_func(predictions, examples, output_file):
+        with open(output_file + '.hyps', 'w', encoding='utf-8') as output:
+            for prediction in predictions:
+                output.write(prediction)
+                output.write('\n')
+        with open(output_file + '.refs', 'w', encoding='utf-8') as output:
+            for example in examples:
+                output.write(example.meta['ref'])
+                output.write('\n')
+        if args.task.lower() == 'squad_generation':
+            with open(
+                    output_file + '.source', 'w', encoding='utf-8') as output:
+                for example in examples:
+                    output.write(
+                        example.text_a.replace('\n', ' ') + ' Answer: '
+                        + example.meta['answer'])
+                    output.write('\n')
+
+    return accuracy_func_provider(
+        single_dataset_provider,
+        metric_dict,
+        args,
+        is_test=is_test,
+        eval_func=eval_func,
+        output_func=output_func,
+        only_rank0=False)
+
+
+def main(args):
+    if args.src_seq_length > args.max_position_embeddings:
+        args.max_position_embeddings = args.src_seq_length
+    if args.task.lower() in [
+            'cnn_dm', 'cnn_dm_original', 'gigaword', 'blank',
+            'squad_generation', 'xsum', 'extraction'
+    ]:
+        finetune(
+            args,
+            train_valid_datasets_provider, {},
+            end_of_epoch_callback_provider=metrics_func_provider,
+            forward_step=seq2seq_forward_step)
+    else:
+        raise NotImplementedError(args.task)
diff --git a/modelscope/models/nlp/mglm/tasks/superglue/README.md b/modelscope/models/nlp/mglm/tasks/superglue/README.md
new file mode 100644
index 00000000..94aab0e9
--- /dev/null
+++ b/modelscope/models/nlp/mglm/tasks/superglue/README.md
@@ -0,0 +1,137 @@
+# Use GLM for your NLU tasks
+To use GLM for your own NLU tasks, you should implement a subclass of `DataProcessor` in [tasks/superglue/dataset.py](dataset.py) and a subclass of `PVP` in [tasks/superglue/pvp.py](pvp.py). You should also specify the evaluation metrics for your task (see Section 4 below). We will take the RTE and ReCoRD tasks in SuperGLUE as examples.
+
+## 1. Design your patterns
+RTE is an NLI task in which the model is required to predict text entailment between a premise and a hypothesis. The label can be `entailment` or `not_entailment`. One sample from the training set is
+```
+premise: No Weapons of Mass Destruction Found in Iraq Yet.
+hypothesis: Weapons of Mass Destruction Found in Iraq.
+label: not_entailment
+```
+We design the pattern as
+```
+"`hypothesis`"?, [MASK], "`premise`"
+```
+GLM predicts "Yes" for `entailment` and "No" for `not_entailment`. "Yes" and "No" are called verbalizers for `entailment` and `not_entailment`.
+
+ReCoRD is a multi-choice QA task. Each example consists of a news article and a Cloze-style question about the article in which one entity is masked out. The system must predict the masked out entity from a list of possible entities in the provided passage. We directly adopt the cloze-style question as our pattern and use GLM to predict the masked entity.
+
+## 2. 
Implement subclass of `DataProcessor`
+A subclass of `DataProcessor` should implement `get_train_examples`, `get_dev_examples` and `get_test_examples`, which return the examples of the train, dev, and test sets. The returned value is a list of `InputExample`. It should also implement `get_labels` to return the list of possible labels. Here we take the `RteProcessor` as an example:
+```python
+class RteProcessor(DataProcessor):
+    """Processor for the RTE data set."""
+
+    def get_train_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "train.jsonl"), "train")
+
+    def get_dev_examples(self, data_dir, for_train=False):
+        return self._create_examples(os.path.join(data_dir, "val.jsonl"), "dev")
+
+    def get_test_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "test.jsonl"), "test")
+
+    def get_unlabeled_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "unlabeled.jsonl"), "unlabeled")
+
+    def get_labels(self):
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, path: str, set_type: str, hypothesis_name: str = "hypothesis",
+                         premise_name: str = "premise") -> List[InputExample]:
+        examples = []
+
+        with open(path, encoding='utf8') as f:
+            for line_idx, line in enumerate(f):
+                example_json = json.loads(line)
+                idx = example_json['idx']
+                if isinstance(idx, str):
+                    try:
+                        idx = int(idx)
+                    except ValueError:
+                        idx = line_idx
+                label = example_json.get('label')
+                guid = "%s-%s" % (set_type, idx)
+                text_a = example_json[premise_name]
+                text_b = example_json[hypothesis_name]
+
+                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx)
+                examples.append(example)
+
+        return examples
+```
+After that, you should add the implemented class to ``PROCESSORS`` at the end of [tasks/superglue/dataset.py](dataset.py):
+```python
+PROCESSORS = {
+    ...
+    "rte": RteProcessor
+}
+```
+
+## 3. Implement subclass of `PVP`
+To implement a subclass of `PVP`, you should first decide whether your verbalizers are single-token or multi-token. The verbalizers in RTE, "Yes" and "No", are single-token. In contrast, the verbalizers in ReCoRD are multi-token, as one entity can be tokenized into multiple tokens by a WordPiece or BPE tokenizer.
+
+For a single-token task, you should set `is_multi_token=False` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `verbalize` to return the verbalizer given a label. Take `RtePVP` as an example:
+```python
+class RtePVP(PVP):
+    is_multi_token = False
+    VERBALIZER = {
+        "not_entailment": [" No"],
+        "entailment": [" Yes"]
+    }
+
+    @property
+    def spell_length(self):
+        return self.pattern_id
+
+    def get_parts(self, example: InputExample) -> FilledPattern:
+        # switch text_a and text_b to get the correct order
+        text_a = example.text_a
+        text_b = example.text_b.rstrip(string.punctuation)
+        return ['"', self.shortenable(text_b), '" ?'], [[self.mask], ', "', self.shortenable(text_a), '"']
+
+    def verbalize(self, label) -> List[str]:
+        return RtePVP.VERBALIZER[label]
+```
+We use `PVP.shortenable` to mark the segments that can be truncated when the input exceeds the maximum sequence length.
+
+For a multi-token task, you should set `is_multi_token=True` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `get_answers` to return the candidates. 
Take `ReCoRDPVP` as an example: +```python +class RecordPVP(PVP): + is_multi_token = True + + def get_answers(self, example: InputExample): + choices = example.meta['candidates'] + choices = [" " + choice for choice in choices] + return choices + + def get_parts(self, example: InputExample) -> FilledPattern: + premise = self.shortenable(example.text_a) + + assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token' + question_a, question_b = example.text_b.split('@placeholder') + return [premise, " " + question_a.rstrip(), [self.mask], question_b], [] +``` +After that, you should implement the class to `PVPS` at the end of [tasks/superglue/pvp.py](pvp.py): +```python +PVPS = { + ... + 'rte': RtePVP, + 'record': RecordPVP +} +``` +## 4. Run the experiment +To run the experiment for your new task, you should create a config file like [config_tasks/task_rte.sh](/config_tasks/task_rte.sh). You should also specify the evaluation metrics for the task in `DEFAULT_METRICS` of [tasks/superglue/finetune.py](finetune.py): +```python +DEFAULT_METRICS = { + ... + "record": [("EM", qa_exact_match), ("F1", qa_f1)], + "rte": [("accuracy", accuracy_metric)] +} +``` +Then you can run the experiment with [finetune_superglue.sh](/scripts/finetune_superglue.sh): +```shell +bash scripts/finetune_superglue.sh \ + config_tasks/model_blocklm_large.sh \ + config_tasks/task_rte.sh +``` diff --git a/modelscope/models/nlp/mglm/tasks/superglue/__init__.py b/modelscope/models/nlp/mglm/tasks/superglue/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/mglm/tasks/superglue/dataset.py b/modelscope/models/nlp/mglm/tasks/superglue/dataset.py new file mode 100644 index 00000000..36367671 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/dataset.py @@ -0,0 +1,1475 @@ +# Copyright (c) 2022 Zhipu.AI +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains the logic for loading training and test data for all tasks. 
+""" + +import copy +import csv +import glob +import os +import random +import re +from abc import ABC, abstractmethod +from collections import Counter, defaultdict +from typing import Callable, Dict, List + +import json +import numpy as np +import pandas as pd +from data_utils import (build_input_from_ids, build_sample, + num_special_tokens_to_add) +from data_utils.corpora import punctuation_standardization +from torch.utils.data import Dataset +from tqdm import tqdm +from utils import print_rank_0 + +from modelscope.models.nlp.mglm.tasks.data_utils import InputExample +from modelscope.models.nlp.mglm.tasks.superglue.pvp import PVPS + +TRAIN_SET = 'train' +DEV_SET = 'dev' +TEST_SET = 'test' +TRUE_DEV_SET = 'true_dev' +UNLABELED_SET = 'unlabeled' + +SPLIT_TYPES = [TRAIN_SET, DEV_SET, TEST_SET, TRUE_DEV_SET, UNLABELED_SET] + + +def get_output_func(task_name, args): + return PROCESSORS[task_name](args).output_prediction + + +def read_tsv(path, **kwargs): + return pd.read_csv( + path, + sep='\t', + quoting=csv.QUOTE_NONE, + dtype=str, + na_filter=False, + **kwargs) + + +class SuperGlueDataset(Dataset): + + def __init__(self, + args, + task_name, + data_dir, + seq_length, + split, + tokenizer, + for_train=False, + pattern_ensemble=False, + pattern_text=False): + self.processor = PROCESSORS[task_name](args) + args.variable_num_choices = self.processor.variable_num_choices + print_rank_0( + f'Creating {task_name} dataset from file at {data_dir} (split={split})' + ) + self.dataset_name = f'{task_name}-{split}' + self.cloze_eval = args.cloze_eval + self.seq_length = seq_length + self.tokenizer = tokenizer + self.pattern_ensemble = pattern_ensemble + self.pattern_text = pattern_text + if pattern_text: + assert self.cloze_eval, 'Labeled examples only exist in cloze evaluation' + self.args = args + if split == DEV_SET: + example_list = self.processor.get_dev_examples( + data_dir, for_train=for_train) + elif split == TEST_SET: + example_list = self.processor.get_test_examples(data_dir) + elif split == TRUE_DEV_SET: + example_list = self.processor.get_true_dev_examples(data_dir) + elif split == TRAIN_SET: + if task_name == 'wsc': + example_list = self.processor.get_train_examples( + data_dir, cloze_eval=args.cloze_eval) + else: + example_list = self.processor.get_train_examples(data_dir) + elif split == UNLABELED_SET: + example_list = self.processor.get_unlabeled_examples(data_dir) + for example in example_list: + example.label = self.processor.get_labels()[0] + else: + raise ValueError( + f"'split' must be one of {SPLIT_TYPES}, got '{split}' instead") + if split == TEST_SET: + self.labeled = False + else: + self.labeled = True + + label_distribution = Counter(example.label for example in example_list) + print_rank_0( + f'Returning {len(example_list)} {split} examples with label dist.: {list(label_distribution.items())}' + ) + self.samples = [] + example_list.sort(key=lambda x: x.num_choices) + self.example_list = example_list + if self.cloze_eval: + if self.pattern_ensemble: + pattern_ids = PVPS[task_name].available_patterns() + self.pvps = [] + for pattern_id in pattern_ids: + self.pvps.append(PVPS[task_name]( + args, + tokenizer, + self.processor.get_labels(), + seq_length, + pattern_id=pattern_id, + num_prompt_tokens=args.num_prompt_tokens, + is_multi_token=args.multi_token, + max_segment_length=args.segment_length, + fast_decode=args.fast_decode, + split=split)) + else: + self.pvp = PVPS[task_name]( + args, + tokenizer, + self.processor.get_labels(), + seq_length, + pattern_id=args.pattern_id, + 
num_prompt_tokens=args.num_prompt_tokens, + is_multi_token=args.multi_token, + max_segment_length=args.segment_length, + fast_decode=args.fast_decode, + split=split) + self.examples = {example.guid: example for example in example_list} + + def __len__(self): + if self.cloze_eval and self.pattern_ensemble: + return len(self.example_list) * len(self.pvps) + else: + return len(self.example_list) + + def __getitem__(self, idx): + sample_idx = idx % len(self.example_list) + example = self.example_list[sample_idx] + if self.cloze_eval: + kwargs = {} + if self.pattern_text: + kwargs = {'labeled': True, 'priming': True} + if self.pattern_ensemble: + pvp_idx = idx // len(self.example_list) + sample = self.pvps[pvp_idx].encode(example, **kwargs) + else: + sample = self.pvp.encode(example, **kwargs) + if self.pattern_text: + eos_id = self.tokenizer.get_command('eos').Id + cls_id = self.tokenizer.get_command('ENC').Id + input_ids = [cls_id] + sample + [eos_id] + sample = { + 'text': input_ids, + 'loss_mask': np.array([1] * len(input_ids)) + } + else: + sample = self.processor.encode(example, self.tokenizer, + self.seq_length, self.args) + return sample + + +class DataProcessor(ABC): + """ + Abstract class that provides methods for loading training, testing, development and unlabeled examples for a given + task + """ + + def __init__(self, args): + self.args = args + self.num_truncated = 0 + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + for prediction, example in zip(predictions, examples): + prediction = self.get_labels()[prediction] + data = {'idx': example.idx, 'label': prediction} + output.write(json.dumps(data) + '\n') + + @property + def variable_num_choices(self): + return False + + @abstractmethod + def get_train_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the train set.""" + pass + + @abstractmethod + def get_dev_examples(self, + data_dir, + for_train=False) -> List[InputExample]: + """Get a collection of `InputExample`s for the dev set.""" + pass + + def get_test_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the test set.""" + return [] + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the unlabeled set.""" + return [] + + @abstractmethod + def get_labels(self) -> List[str]: + """Get the list of labels for this data set.""" + pass + + def get_classifier_input(self, example: InputExample, tokenizer): + return example.text_a, example.text_b + + def encode(self, example: InputExample, tokenizer, seq_length, args): + text_a, text_b = self.get_classifier_input(example, tokenizer) + tokens_a = tokenizer.EncodeAsIds(text_a).tokenization + tokens_b = tokenizer.EncodeAsIds(text_b).tokenization + num_special_tokens = num_special_tokens_to_add( + tokens_a, + tokens_b, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if len(tokens_a) + len(tokens_b) + num_special_tokens > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + seq_length, + tokenizer, + args=args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + label = 0 + if example.label is not None: + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + ids, + label=label, + types=types, + paddings=paddings, + unique_id=example.guid) 
+ else: + sample = build_sample( + ids, + positions=position_ids, + masks=sep, + label=label, + unique_id=example.guid) + return sample + + +class SuperGLUEProcessor(DataProcessor): + + def __init__(self, args): + super(SuperGLUEProcessor, self).__init__(args) + self.few_superglue = args.few_superglue + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.jsonl'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + if self.few_superglue: + return self._create_examples( + os.path.join(data_dir, 'dev32.jsonl'), 'dev') + else: + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'dev') + + def get_test_examples(self, data_dir): + if self.few_superglue: + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'test') + else: + return self._create_examples( + os.path.join(data_dir, 'test.jsonl'), 'test') + + def get_unlabeled_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'unlabeled.jsonl'), 'unlabeled') + + def _create_examples(self, *args, **kwargs): + pass + + +class RteProcessor(SuperGLUEProcessor): + """Processor for the RTE data set.""" + + def get_labels(self): + return ['entailment', 'not_entailment'] + + def _create_examples(self, + path: str, + set_type: str, + hypothesis_name: str = 'hypothesis', + premise_name: str = 'premise') -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line_idx, line in enumerate(f): + example_json = json.loads(line) + idx = example_json['idx'] + if isinstance(idx, str): + try: + idx = int(idx) + except ValueError: + idx = line_idx + label = example_json.get('label') + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization( + example_json[premise_name]) + text_b = punctuation_standardization( + example_json[hypothesis_name]) + + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx) + examples.append(example) + + return examples + + +class AxGProcessor(RteProcessor): + """Processor for the AX-G diagnostic data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-g.jsonl'), 'train') + + def get_test_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-g.jsonl'), 'test') + + +class AxBProcessor(RteProcessor): + """Processor for the AX-B diagnostic data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-b.jsonl'), 'train') + + def get_test_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-b.jsonl'), 'test') + + def _create_examples(self, + path, + set_type, + hypothesis_name='sentence2', + premise_name='sentence1'): + return super()._create_examples(path, set_type, hypothesis_name, + premise_name) + + +class CbProcessor(RteProcessor): + """Processor for the CB data set.""" + + def get_labels(self): + return ['entailment', 'contradiction', 'neutral'] + + +class WicProcessor(SuperGLUEProcessor): + """Processor for the WiC data set.""" + + def get_labels(self): + return ['false', 'true'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + if isinstance(idx, str): + idx = int(idx) + label = 'true' if example_json.get('label') else 'false' + guid = '%s-%s' % (set_type, 
idx) + text_a = punctuation_standardization(example_json['sentence1']) + text_b = punctuation_standardization(example_json['sentence2']) + meta = {'word': example_json['word']} + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx, + meta=meta) + examples.append(example) + return examples + + def get_classifier_input(self, example: InputExample, tokenizer): + text_a = example.meta['word'] + ': ' + example.text_a + return text_a, example.text_b + + +class WscProcessor(SuperGLUEProcessor): + """Processor for the WSC data set.""" + + @property + def variable_num_choices(self): + return self.args.wsc_negative + + def get_train_examples(self, data_dir, cloze_eval=True): + return self._create_examples( + os.path.join(data_dir, 'train.jsonl'), + 'train', + cloze_eval=cloze_eval) + + def get_labels(self): + return ['False', 'True'] + + def get_classifier_input(self, example: InputExample, tokenizer): + target = example.meta['span1_text'] + pronoun_idx = example.meta['span2_index'] + + # mark the pronoun with asterisks + words_a = example.text_a.split() + words_a[pronoun_idx] = '*' + words_a[pronoun_idx] + '*' + text_a = ' '.join(words_a) + text_b = target + return text_a, text_b + + def _create_examples(self, + path: str, + set_type: str, + cloze_eval=True) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + label = str( + example_json['label']) if 'label' in example_json else None + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization(example_json['text']) + meta = { + 'span1_text': example_json['target']['span1_text'], + 'span2_text': example_json['target']['span2_text'], + 'span1_index': example_json['target']['span1_index'], + 'span2_index': example_json['target']['span2_index'] + } + if 'candidates' in example_json: + candidates = [ + cand['text'] for cand in example_json['candidates'] + ] + # candidates = list(set(candidates)) + filtered = [] + for i, cand in enumerate(candidates): + if cand not in candidates[:i]: + filtered.append(cand) + candidates = filtered + + # the indices in the dataset are wrong for some examples, so we manually fix them + span1_index, span1_text = meta['span1_index'], meta[ + 'span1_text'] + span2_index, span2_text = meta['span2_index'], meta[ + 'span2_text'] + words_a = text_a.split() + words_a_lower = text_a.lower().split() + words_span1_text = span1_text.lower().split() + span1_len = len(words_span1_text) + + if words_a_lower[span1_index:span1_index + + span1_len] != words_span1_text: + for offset in [-1, +1]: + if words_a_lower[span1_index + offset:span1_index + + span1_len + + offset] == words_span1_text: + span1_index += offset + + # if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text: + # print_rank_0(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected " + # f"'{words_span1_text}' at index {span1_index} for '{words_a}'") + + if words_a[span2_index] != span2_text: + for offset in [-1, +1]: + if words_a[span2_index + offset] == span2_text: + span2_index += offset + + if words_a[span2_index] != span2_text and words_a[ + span2_index].startswith(span2_text): + words_a = words_a[:span2_index] \ + + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] + words_a[span2_index + 1:] # noqa + + assert words_a[span2_index] == span2_text, \ + f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for 
'{words_a}'" + + text_a = ' '.join(words_a) + meta['span1_index'], meta[ + 'span2_index'] = span1_index, span2_index + + if self.args.task == 'wsc1': + example = InputExample( + guid=guid, + text_a=text_a, + text_b=span1_text, + label=label, + meta=meta, + idx=idx) + examples.append(example) + if set_type == 'train' and label == 'True': + for cand in candidates: + example = InputExample( + guid=guid, + text_a=text_a, + text_b=cand, + label='False', + meta=meta, + idx=idx) + examples.append(example) + continue + + if cloze_eval and set_type == 'train' and label != 'True': + continue + if set_type == 'train' and 'candidates' in example_json and len( + candidates) > 9: + for i in range(0, len(candidates), 9): + _meta = copy.deepcopy(meta) + _meta['candidates'] = candidates[i:i + 9] + if len(_meta['candidates']) < 9: + _meta['candidates'] += candidates[:9 - len( + _meta['candidates'])] + example = InputExample( + guid=guid, + text_a=text_a, + label=label, + meta=_meta, + idx=idx) + examples.append(example) + else: + if 'candidates' in example_json: + meta['candidates'] = candidates + example = InputExample( + guid=guid, + text_a=text_a, + label=label, + meta=meta, + idx=idx) + examples.append(example) + + return examples + + +class BoolQProcessor(SuperGLUEProcessor): + """Processor for the BoolQ data set.""" + + def get_labels(self): + return ['false', 'true'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + label = str(example_json['label']).lower( + ) if 'label' in example_json else None + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization(example_json['passage']) + text_b = punctuation_standardization(example_json['question']) + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx) + examples.append(example) + + return examples + + +class CopaProcessor(SuperGLUEProcessor): + """Processor for the COPA data set.""" + + def get_labels(self): + return [0, 1] + + def encode(self, example: InputExample, tokenizer, seq_length, args): + if args.pretrained_bert: + ids_list, types_list, paddings_list = [], [], [] + else: + ids_list, positions_list, sep_list = [], [], [] + question = example.meta['question'] + joiner = 'because' if question == 'cause' else 'so' + text_a = punctuation_standardization(example.text_a) + ' ' + joiner + tokens_a = tokenizer.EncodeAsIds(text_a).tokenization + for choice in [example.meta['choice1'], example.meta['choice2']]: + choice = punctuation_standardization(choice) + tokens_b = tokenizer.EncodeAsIds(choice).tokenization + num_special_tokens = num_special_tokens_to_add( + tokens_a, + tokens_b, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if len(tokens_a) + len(tokens_b) + num_special_tokens > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + seq_length, + tokenizer, + args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if args.pretrained_bert: + ids_list.append(ids) + types_list.append(types) + paddings_list.append(paddings) + else: + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + label = 0 + if example.label is not None: + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + 
ids_list, + label=label, + types=types_list, + paddings=paddings_list, + unique_id=example.guid) + else: + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + unique_id=example.guid) + return sample + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + label = example_json[ + 'label'] if 'label' in example_json else None + idx = example_json['idx'] + guid = '%s-%s' % (set_type, idx) + text_a = example_json['premise'] + meta = { + 'choice1': example_json['choice1'], + 'choice2': example_json['choice2'], + 'question': example_json['question'] + } + example = InputExample( + guid=guid, text_a=text_a, label=label, meta=meta, idx=idx) + examples.append(example) + + if set_type == 'train' or set_type == 'unlabeled': + mirror_examples = [] + for ex in examples: + label = 1 if ex.label == 0 else 0 + meta = { + 'choice1': ex.meta['choice2'], + 'choice2': ex.meta['choice1'], + 'question': ex.meta['question'] + } + mirror_example = InputExample( + guid=ex.guid + 'm', + text_a=ex.text_a, + label=label, + meta=meta) + mirror_examples.append(mirror_example) + examples += mirror_examples + print_rank_0( + f'Added {len(mirror_examples)} mirror examples, total size is {len(examples)}...' + ) + return examples + + +class MultiRcProcessor(SuperGLUEProcessor): + """Processor for the MultiRC data set.""" + + def get_labels(self): + return [0, 1] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + + passage_idx = example_json['idx'] + text = punctuation_standardization( + example_json['passage']['text']) + questions = example_json['passage']['questions'] + for question_json in questions: + question = punctuation_standardization( + question_json['question']) + question_idx = question_json['idx'] + answers = question_json['answers'] + for answer_json in answers: + label = answer_json[ + 'label'] if 'label' in answer_json else None + answer_idx = answer_json['idx'] + guid = f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}' + meta = { + 'passage_idx': + passage_idx, + 'question_idx': + question_idx, + 'answer_idx': + answer_idx, + 'answer': + punctuation_standardization(answer_json['text']) + } + idx = [passage_idx, question_idx, answer_idx] + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label=label, + meta=meta, + idx=idx) + examples.append(example) + + question_indices = list( + set(example.meta['question_idx'] for example in examples)) + label_distribution = Counter(example.label for example in examples) + print_rank_0( + f'Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label ' + f'distribution {list(label_distribution.items())}') + return examples + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + passage_dict = defaultdict(list) + for prediction, example in zip(predictions, examples): + passage_dict[example.meta['passage_idx']].append( + (prediction, example)) + for passage_idx, data in passage_dict.items(): + question_dict = defaultdict(list) + passage_data = { + 'idx': passage_idx, + 'passage': { + 'questions': [] + } + } + for prediction, example in data: + question_dict[example.meta['question_idx']].append( + (prediction, 
example)) + for question_idx, data in question_dict.items(): + question_data = {'idx': question_idx, 'answers': []} + for prediction, example in data: + prediction = self.get_labels()[prediction] + question_data['answers'].append({ + 'idx': + example.meta['answer_idx'], + 'label': + prediction + }) + passage_data['passage']['questions'].append(question_data) + output.write(json.dumps(passage_data) + '\n') + + def get_classifier_input(self, example: InputExample, tokenizer): + text_a = example.text_a + text_b = ' '.join([example.text_b, 'answer:', example.meta['answer']]) + return text_a, text_b + + +class RaceProcessor(DataProcessor): + + @property + def variable_num_choices(self): + return True + + def get_labels(self): + return ['A', 'B', 'C', 'D'] + + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'train'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev'), 'dev', for_train=for_train) + + def get_test_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'test'), 'test') + + @staticmethod + def _create_examples(path, + set_type, + for_train=False) -> List[InputExample]: + examples = [] + + def clean_text(text): + """Remove new lines and multiple spaces and adjust end of sentence dot.""" + + text = text.replace('\n', ' ') + text = re.sub(r'\s+', ' ', text) + for _ in range(3): + text = text.replace(' . ', '. ') + + return text + + filenames = glob.glob(os.path.join( + path, 'middle', '*.txt')) + glob.glob( + os.path.join(path, 'high', '*.txt')) + for filename in filenames: + with open(filename, 'r') as f: + for line in f: + data = json.loads(line) + idx = data['id'] + context = data['article'] + questions = data['questions'] + choices = data['options'] + answers = data['answers'] + # Check the length. 
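+                    # (each question must come with exactly one answer and one option list)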
+ assert len(questions) == len(answers) + assert len(questions) == len(choices) + + context = clean_text(context) + for question_idx, question in enumerate(questions): + answer = answers[question_idx] + choice = choices[question_idx] + guid = f'{set_type}-p{idx}-q{question_idx}' + ex_idx = [set_type, idx, question_idx] + meta = {'choices': choice} + example = InputExample( + guid=guid, + text_a=context, + text_b=question, + label=answer, + meta=meta, + idx=ex_idx) + examples.append(example) + return examples + + +class RecordProcessor(SuperGLUEProcessor): + """Processor for the ReCoRD data set.""" + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'dev', for_train=for_train) + + @property + def variable_num_choices(self): + return True + + def get_labels(self): + return ['0', '1'] + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + for prediction, example in zip(predictions, examples): + prediction = example.meta['candidates'][prediction] + data = {'idx': example.idx, 'label': prediction} + output.write(json.dumps(data) + '\n') + + def encode(self, example: InputExample, tokenizer, seq_length, args): + if args.pretrained_bert: + ids_list, types_list, paddings_list = [], [], [] + else: + ids_list, positions_list, sep_list = [], [], [] + tokens_a = tokenizer.EncodeAsIds(example.text_a).tokenization + tokens_b = tokenizer.EncodeAsIds( + example.text_b).tokenization if example.text_b else None + for answer in example.meta['candidates']: + answer_ids = tokenizer.EncodeAsIds(answer).tokenization + total_length = len(tokens_a) + len(tokens_b) + len(answer_ids) + total_length += num_special_tokens_to_add( + tokens_a, + tokens_b + answer_ids, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if total_length > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b + answer_ids, + None, + seq_length, + tokenizer, + args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if args.pretrained_bert: + ids_list.append(ids) + types_list.append(types) + paddings_list.append(paddings) + else: + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + ids_list, + label=label, + types=types_list, + paddings=paddings_list, + unique_id=example.guid) + else: + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + unique_id=example.guid) + return sample + + @staticmethod + def _create_examples(path, + set_type, + seed=42, + max_train_candidates_per_question: int = 10, + for_train=False) -> List[InputExample]: + examples = [] + + entity_shuffler = random.Random(seed) + + with open(path, encoding='utf8') as f: + for idx, line in enumerate(f): + example_json = json.loads(line) + + idx = example_json['idx'] + text = punctuation_standardization( + example_json['passage']['text']) + entities = set() + + for entity_json in example_json['passage']['entities']: + start = entity_json['start'] + end = entity_json['end'] + entity = punctuation_standardization(text[start:end + 1]) + entities.add(entity) + + entities = list(entities) + entities.sort() + + text = text.replace( + '@highlight\n', '- ' + ) # we follow the GPT-3 paper wrt @highlight annotations + questions = 
example_json['qas'] + + for question_json in questions: + question = punctuation_standardization( + question_json['query']) + question_idx = question_json['idx'] + answers = set() + + for answer_json in question_json.get('answers', []): + answer = punctuation_standardization( + answer_json['text']) + answers.add(answer) + + answers = list(answers) + + if set_type == 'train' or for_train: + # create a single example per *correct* answer + for answer_idx, answer in enumerate(answers): + candidates = [ + ent for ent in entities if ent not in answers + ] + if len(candidates + ) > max_train_candidates_per_question - 1: + entity_shuffler.shuffle(candidates) + candidates = candidates[: + max_train_candidates_per_question + - 1] + + guid = f'{set_type}-p{idx}-q{question_idx}-a{answer_idx}' + meta = { + 'passage_idx': idx, + 'question_idx': question_idx, + 'candidates': [answer] + candidates, + 'answers': [answer] + } + ex_idx = [idx, question_idx, answer_idx] + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label='0', + meta=meta, + idx=ex_idx, + num_choices=len(candidates) + 1) + examples.append(example) + + else: + # create just one example with *all* correct answers and *all* answer candidates + guid = f'{set_type}-p{idx}-q{question_idx}' + meta = { + 'passage_idx': idx, + 'question_idx': question_idx, + 'candidates': entities, + 'answers': answers + } + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label='1', + meta=meta, + idx=question_idx, + num_choices=len(entities)) + examples.append(example) + + question_indices = list( + set(example.meta['question_idx'] for example in examples)) + label_distribution = Counter(example.label for example in examples) + print_rank_0( + f'Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label ' + f'distribution {list(label_distribution.items())}') + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.tsv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev_matched.tsv'), 'dev_matched') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test_matched.tsv'), 'test_matched') + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['contradiction', 'entailment', 'neutral'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['sentence1']) + text_b = punctuation_standardization(row['sentence2']) + label = row.get('gold_label', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev_mismatched.tsv'), 'dev_mismatched') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test_mismatched.tsv'), 
'test_mismatched') + + +class AgnewsProcessor(DataProcessor): + """Processor for the AG news data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2', '3', '4'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path) as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, headline, body = row + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization( + headline.replace('\\', ' ')) + text_b = punctuation_standardization(body.replace('\\', ' ')) + + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class YahooAnswersProcessor(DataProcessor): + """Processor for the Yahoo Answers data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, question_title, question_body, answer = row + guid = '%s-%s' % (set_type, idx) + text_a = ' '.join([ + question_title.replace('\\n', ' ').replace('\\', ' '), + question_body.replace('\\n', ' ').replace('\\', ' ') + ]) + text_a = punctuation_standardization(text_a) + text_b = answer.replace('\\n', ' ').replace('\\', ' ') + text_b = punctuation_standardization(text_b) + + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class YelpPolarityProcessor(DataProcessor): + """Processor for the YELP binary classification set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path) as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, body = row + guid = '%s-%s' % (set_type, idx) + text_a = body.replace('\\n', ' ').replace('\\', ' ') + text_a = punctuation_standardization(text_a) + + example = 
InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class YelpFullProcessor(YelpPolarityProcessor): + """Processor for the YELP full classification set.""" + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_labels(self): + return ['1', '2', '3', '4', '5'] + + +class XStanceProcessor(DataProcessor): + """Processor for the X-Stance data set.""" + + def __init__(self, args, language: str = None): + super().__init__(args) + if language is not None: + assert language in ['de', 'fr'] + self.language = language + + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'train.jsonl')) + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.jsonl')) + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['FAVOR', 'AGAINST'] + + def _create_examples(self, path: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + label = example_json['label'] + id_ = example_json['id'] + text_a = punctuation_standardization(example_json['question']) + text_b = punctuation_standardization(example_json['comment']) + language = example_json['language'] + + if self.language is not None and language != self.language: + continue + + example = InputExample( + guid=id_, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class Sst2Processor(DataProcessor): + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.tsv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'dev.tsv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test.tsv'), 'test') + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['sentence']) + label = row.get('label', None) + example = InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class ColaProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + if set_type != 'test': + df = read_tsv(path, header=None) + else: + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + if set_type != 'test': + text_a = punctuation_standardization(row[3]) + label = row[1] + else: + text_a = punctuation_standardization(row['sentence']) + label = None + example = InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class MrpcProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = 
punctuation_standardization(row['#1 String']) + text_b = punctuation_standardization(row['#2 String']) + label = row.get('Quality', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class QqpProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['question1']) + text_b = punctuation_standardization(row['question2']) + label = row.get('is_duplicate', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class QnliProcessor(Sst2Processor): + + def get_labels(self): + return ['entailment', 'not_entailment'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['question']) + text_b = punctuation_standardization(row['sentence']) + label = row.get('label', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class SquadProcessor(DataProcessor): + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train-v2.0.json'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev-v2.0.json'), 'dev') + + def get_labels(self): + return ['0'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + with open(path) as f: + data = json.load(f)['data'] + + for idx, passage in enumerate(data): + for pid, paragraph in enumerate(passage['paragraphs']): + context = paragraph['context'] + for qid, qas in enumerate(paragraph['qas']): + if len(qas['answers']) == 0: + continue + guid = f'{set_type}-{idx}-{pid}-{qid}' + example = InputExample( + guid=guid, + text_a=context, + text_b=qas['question'], + label='0', + meta={'answer': qas['answers'][0]}) + examples.append(example) + + return examples + + +CLASSIFICATION_DATASETS = {'wic', 'rte', 'cb', 'boolq', 'multirc', 'wsc'} +MULTI_CHOICE_DATASETS = {'copa', 'record'} + +PROCESSORS = { + 'mnli': MnliProcessor, + 'mnli-mm': MnliMismatchedProcessor, + 'agnews': AgnewsProcessor, + 'yahoo': YahooAnswersProcessor, + 'yelp-polarity': YelpPolarityProcessor, + 'yelp-full': YelpFullProcessor, + 'xstance-de': lambda: XStanceProcessor('de'), + 'xstance-fr': lambda: XStanceProcessor('fr'), + 'xstance': XStanceProcessor, + 'wic': WicProcessor, + 'rte': RteProcessor, + 'cb': CbProcessor, + 'wsc': WscProcessor, + 'wsc1': WscProcessor, + 'boolq': BoolQProcessor, + 'copa': CopaProcessor, + 'multirc': MultiRcProcessor, + 'record': RecordProcessor, + 'ax-g': AxGProcessor, + 'ax-b': AxBProcessor, + 'sst2': Sst2Processor, + 'cola': ColaProcessor, + 'mrpc': MrpcProcessor, + 'qqp': QqpProcessor, + 'qnli': QnliProcessor, + 'squad': SquadProcessor, + 'race': RaceProcessor, + 'squad': SquadProcessor +} # type: Dict[str,Callable[[1],DataProcessor]] diff --git a/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py b/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py new file mode 100644 index 00000000..145fb45b --- /dev/null +++ 
b/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 Zhipu.AI +""" +Official evaluation script for ReCoRD v1.0. +(Some functions are adopted from the SQuAD evaluation script.) +""" + +from __future__ import print_function +import functools +import re +import string +from collections import Counter, defaultdict +from typing import List + +from tasks.data_utils import InputExample + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + if not ground_truths: + return 0.0 + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def qa_evaluate(predictions, labels, examples: List[InputExample], metric): + assert len(examples) == len(predictions) + score = 0.0 + for example, prediction in zip(examples, predictions): + ground_truths = example.meta['answers'] + prediction = example.meta['candidates'][prediction] + if ground_truths: + score += metric_max_over_ground_truths(metric, prediction, + ground_truths) + score = 100.0 * score / len(predictions) + return score + + +def multirc_em(predictions, labels, examples: List[InputExample]): + """Compute the exact match (EM) for a sequence of predictions and actual labels""" + question_ids = [example.meta['question_idx'] for example in examples] + unique_questions = set(question_ids) + + q_actuals = list(zip(question_ids, labels)) + q_predictions = list(zip(question_ids, predictions)) + + actuals_per_question = defaultdict(list) + predictions_per_question = defaultdict(list) + + for qid, val in q_actuals: + actuals_per_question[qid].append(val) + for qid, val in q_predictions: + predictions_per_question[qid].append(val) + + em = 0 + for qid in unique_questions: + if actuals_per_question[qid] == predictions_per_question[qid]: + em += 1 + em /= len(unique_questions) + return em + + +qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) +qa_f1 = functools.partial(qa_evaluate, metric=f1_score) diff --git a/modelscope/models/nlp/mglm/tasks/superglue/finetune.py b/modelscope/models/nlp/mglm/tasks/superglue/finetune.py new file mode 100644 index 00000000..371705ff --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/finetune.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Race.""" + +from collections import OrderedDict + +from finetune_glm import finetune +from tasks.eval_utils import (accuracy_func_provider, accuracy_metric, + f1_macro_metric, f1_metric) +from tasks.superglue.dataset import (CLASSIFICATION_DATASETS, + MULTI_CHOICE_DATASETS, PROCESSORS, + SuperGlueDataset, get_output_func) +from tasks.superglue.evaluate import multirc_em, qa_exact_match, qa_f1 +from tasks.superglue.pvp import PVPS + +DEFAULT_METRICS = { + 'record': [('EM', qa_exact_match), ('F1', qa_f1)], + 'copa': [('accuracy', accuracy_metric)], + 'rte': [('accuracy', accuracy_metric)], + 'boolq': [('accuracy', accuracy_metric)], + 'wic': [('accuracy', accuracy_metric)], + 'wsc': [('accuracy', accuracy_metric)], + 'cb': [('accuracy', accuracy_metric), ('f1-macro', f1_macro_metric)], + 'multirc': [('f1a', f1_metric), ('em', multirc_em), + ('acc', accuracy_metric)], + 'mnli': [('accuracy', accuracy_metric)], + 'sst2': [('accuracy', accuracy_metric)], + 'qnli': [('accuracy', accuracy_metric)], + 'qqp': [('accuracy', accuracy_metric)], + 'mrpc': [('accuracy', accuracy_metric)], + 'cola': [('accuracy', accuracy_metric)], + 'squad': [('accuracy', accuracy_metric)], +} + + +def train_valid_datasets_provider(args, tokenizer, pattern_text=False): + """Provide train and validation datasets.""" + task_name = args.task.lower() + data_dir = args.data_dir + train_dataset = SuperGlueDataset( + args, + task_name, + data_dir, + args.seq_length, + 'train', + tokenizer, + pattern_text=pattern_text) + valid_dataset = SuperGlueDataset( + args, + task_name, + data_dir, + args.seq_length, + 'dev', + tokenizer, + for_train=True, + pattern_text=pattern_text) + + return train_dataset, valid_dataset + + +def metrics_func_provider(args, tokenizer, is_test): + """Privde metrics callback function.""" + + def single_dataset_provider(split): + return SuperGlueDataset(args, args.task.lower(), args.data_dir, + args.seq_length, split, tokenizer) + + output_func = get_output_func(args.task.lower(), args) + eval_func = None + if args.task.lower() in ['wsc', 'squad' + ] and args.cloze_eval and not args.wsc_negative: + from tasks.language_model.finetune import classify_evaluate + eval_func = classify_evaluate + metric_dict = OrderedDict(DEFAULT_METRICS[args.task.lower()]) + return accuracy_func_provider( + single_dataset_provider, + metric_dict, + args, + is_test=is_test, + eval_func=eval_func, + output_func=output_func, + only_rank0=False, + tokenizer=tokenizer) + + +def main(args): + model_kwargs = {} + processor = PROCESSORS[args.task.lower()](args) + pvp = PVPS[args.task.lower()]( + args, + None, + processor.get_labels(), + args.seq_length, + pattern_id=args.pattern_id, + is_multi_token=args.multi_token, + num_prompt_tokens=args.num_prompt_tokens) + if args.continuous_prompt: + model_kwargs['spell_length'] = pvp.spell_length + if args.task.lower() in ['wsc', 'squad' + ] and args.cloze_eval and not args.wsc_negative: + from tasks.language_model.finetune import lm_forward_step 
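+        # Cloze-style WSC/SQuAD evaluation runs through the language-model code path,
+        # so its forward step is reused for fine-tuning below.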
+ finetune( + args, + train_valid_datasets_provider, + model_kwargs, + end_of_epoch_callback_provider=metrics_func_provider, + forward_step=lm_forward_step) + else: + if args.cloze_eval: + multi_token = pvp.is_multi_token + else: + multi_token = args.task.lower() in MULTI_CHOICE_DATASETS + args.multi_token = multi_token + if not multi_token: + model_kwargs[ + 'model_type'] = 'multiple_choice' if args.cloze_eval else 'classification' + model_kwargs['multi_token'] = False + model_kwargs['num_labels'] = len(processor.get_labels()) + else: + model_kwargs['model_type'] = 'multiple_choice' + model_kwargs['multi_token'] = True + model_kwargs['num_labels'] = 1 + finetune( + args, + train_valid_datasets_provider, + model_kwargs, + end_of_epoch_callback_provider=metrics_func_provider) diff --git a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py new file mode 100644 index 00000000..ff394172 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py @@ -0,0 +1,1541 @@ +# Copyright (c) 2022 Zhipu.AI +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains the pattern-verbalizer pairs (PVPs) for all tasks. +""" +import copy +import math +import random +import string +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import Dict, List, Tuple, Union + +import numpy as np +from tasks.data_utils import (InputExample, build_decoder_input, + build_decoder_sample, build_input_from_ids, + build_sample, num_special_tokens_to_add) +from utils import print_rank_0 + +FilledPattern = Tuple[List[Union[str, Tuple[str, bool]]], + List[Union[str, Tuple[str, bool]]]] + + +class PVP(ABC): + """ + This class contains functions to apply patterns and verbalizers as required by PET. Each task requires its own + custom implementation of a PVP. + """ + + def __init__(self, + args, + tokenizer, + label_list, + max_seq_length, + pattern_id: int = 0, + verbalizer_file: str = None, + seed: int = 42, + is_multi_token=False, + max_segment_length=0, + fast_decode: bool = False, + split='train', + num_prompt_tokens=0): + """ + Create a new PVP. 
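+        A PVP turns an InputExample into model-ready token ids by filling in a
+        task-specific pattern and mapping each label to its verbalizer tokens.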
+ + :param args: the args + :param tokenizer: the tokenizer + :param label_list: the list of labels + :param max_seq_length: the maximum length of the sequence + :param pattern_id: the pattern id to use + :param seed: a seed to be used for generating random numbers if necessary + :param is_multi_token: if the verbalizers contain multiple tokens + :param fast_decode: whether to use the fast decode mode for multi-token tasks + :param continuous_prompt: whether to use continuous prompt optimization + """ + self.args = args + self.tokenizer = tokenizer + self.label_list = label_list + self.max_seq_length = max_seq_length + self.pattern_id = pattern_id + self.num_prompt_tokens = num_prompt_tokens + self.rng = random.Random(seed) + self.num_truncated = 0 + self.fast_decode = fast_decode + self.split = split + self.max_dec_seq_length = 16 + self._is_multi_token = is_multi_token + self.max_segment_length = max_segment_length + self.task_mask = args.task_mask + self.continuous_prompt = args.continuous_prompt + self.prefix_prompt = args.prefix_prompt + if self.continuous_prompt: + print_rank_0( + f'Prompt tokens in pvp {self.num_prompt_tokens} spell length {self.spell_length}' + ) + + if verbalizer_file: + self.verbalize = PVP._load_verbalizer_from_file( + verbalizer_file, self.pattern_id) + + @property + def is_multi_token(self): + return self._is_multi_token + + @property + def spell_length(self): + return 0 + + @property + def mask(self) -> str: + """Return the underlying LM's mask token""" + return self.tokenizer.get_command('MASK').Id + + @property + def mask_id(self) -> int: + """Return the underlying LM's mask id""" + return self.tokenizer.get_command('MASK').Id + + @property + def max_num_verbalizers(self) -> int: + """Return the maximum number of verbalizers across all labels""" + return max(len(self.verbalize(label)) for label in self.label_list) + + @staticmethod + def shortenable(s): + """Return an instance of this string that is marked as shortenable""" + return s, True + + @staticmethod + def remove_final_punc(s: Union[str, Tuple[str, bool]]): + """Remove the final punctuation mark""" + if isinstance(s, tuple): + return PVP.remove_final_punc(s[0]), s[1] + return s.rstrip(string.punctuation) + + @staticmethod + def lowercase_first(s: Union[str, Tuple[str, bool]]): + """Lowercase the first character""" + if isinstance(s, tuple): + return PVP.lowercase_first(s[0]), s[1] + return s[0].lower() + s[1:] + + @staticmethod + def uppercase_first(s: Union[str, Tuple[str, bool]]): + """Lowercase the first character""" + if isinstance(s, tuple): + return PVP.uppercase_first(s[0]), s[1] + return s[0].upper() + s[1:] + + @staticmethod + def available_patterns(): + return [0] + + def replace_prompt_tokens(self, parts_a, parts_b): + if not self.continuous_prompt: + parts_a = [part for part in parts_a if part is not None] + parts_b = [part for part in parts_b if part is not None] + return parts_a, parts_b + num_prompt_tokens = self.num_prompt_tokens + num_pos = 0 + for parts in (parts_a, parts_b): + for part in parts: + if part is None: + num_pos += 1 + avg_prompt_tokens = math.ceil(num_prompt_tokens / num_pos) + new_parts_a, new_parts_b = [], [] + for part in parts_a: + if part is None: + if num_prompt_tokens > 0: + if num_prompt_tokens >= avg_prompt_tokens: + new_parts_a.append(avg_prompt_tokens) + num_prompt_tokens -= avg_prompt_tokens + else: + new_parts_a.append(num_prompt_tokens) + num_prompt_tokens = 0 + else: + new_parts_a.append(part) + for part in parts_b: + if part is None: + if 
num_prompt_tokens > 0: + if num_prompt_tokens >= avg_prompt_tokens: + new_parts_b.append(avg_prompt_tokens) + num_prompt_tokens -= avg_prompt_tokens + else: + new_parts_b.append(num_prompt_tokens) + num_prompt_tokens = 0 + else: + new_parts_b.append(part) + return new_parts_a, new_parts_b + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. + + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + raw_parts_a, raw_parts_b = self.get_parts(example) + + raw_parts_a = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_a + ] + prompt_id = tokenizer.num_tokens + + def encode_input(raw_parts): + parts = [] + for x, s in raw_parts: + if isinstance(x, str): + x = tokenizer.EncodeAsIds(x) + elif isinstance(x, int): + x = [prompt_id] * x + else: + pass + parts.append((x, s)) + return parts + + parts_a = encode_input(raw_parts_a) + if self.prefix_prompt > 0: + parts_a = [([prompt_id] * self.prefix_prompt, False)] + parts_a + + parts_b = None + if raw_parts_b: + raw_parts_b = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_b + ] + parts_b = encode_input(raw_parts_b) + + if self.is_multi_token: + answers = self.get_answers(example) + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + + if not self.fast_decode: + ids_list, positions_list, sep_list, mask_list, target_list, prompt_list = [], [], [], [], [], [] + segment_id_list = [] + if priming: + answer = answers[label] + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + self.num_truncated += self.truncate( + parts_a, + parts_b, + answer_ids, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in parts_a for token_id in part + ] + tokens_b = [ + token_id for part, _ in parts_b for token_id in part + ] if parts_b else None + input_ids = tokens_a + if tokens_b: + input_ids += tokens_b + if labeled: + mask_idx = input_ids.index(self.mask_id) + input_ids = input_ids[: + mask_idx] + answer_ids + input_ids[ + mask_idx + 1:] + return input_ids + else: + for idx, answer in enumerate(answers): + this_parts_a, this_parts_b = copy.deepcopy( + parts_a), copy.deepcopy(parts_b) + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [ + tokenizer.get_command('eop').Id + ] + self.num_truncated += self.truncate( + this_parts_a, + this_parts_b, + answer_ids, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in this_parts_a + for token_id in part + ] + tokens_b = [ + token_id for part, _ in this_parts_b + for token_id in part + ] if parts_b else None + if self.max_segment_length > 0: + num_segments = (len(answer_ids) + - 1) // self.max_segment_length + 1 + segments = [ + answer_ids[index + * self.max_segment_length:(index + + 1) + * self.max_segment_length] + for index in range(num_segments) + ] + segment_id_list += [idx] * len(segments) + else: + segments = [answer_ids] + for segment in segments: + data = build_input_from_ids( + tokens_a, + tokens_b, + segment, + self.max_seq_length, + 
self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) + if token == prompt_id + ] + ids = [ + idx if idx != prompt_id else 0 for idx in ids + ] + prompt_list.append(prompt_pos) + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + target_list.append(target_ids) + mask_list.append(loss_masks) + if self.mask in tokens_a: + mask_pos = tokens_a.index(self.mask) + tokens_a = tokens_a[: + mask_pos] + segment + tokens_a[ + mask_pos:] + else: + mask_pos = tokens_b.index(self.mask) + tokens_b = tokens_b[: + mask_pos] + segment + tokens_b[ + mask_pos:] + segment_id_list = segment_id_list if segment_id_list else None + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + logit_mask=mask_list, + target=target_list, + unique_id=example.guid, + segment_ids=segment_id_list, + prompt_ids=prompt_list) + return sample + else: + this_parts_a, this_parts_b = copy.deepcopy( + parts_a), copy.deepcopy(parts_b) + self.num_truncated += self.truncate( + this_parts_a, + this_parts_b, + None, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in this_parts_a for token_id in part + ] + tokens_b = [ + token_id for part, _ in this_parts_b for token_id in part + ] if parts_b else None + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + sample = build_sample( + ids, + positions=position_ids, + masks=sep, + label=label, + unique_id=example.guid) + + ids_list, positions_list, mask_list, target_list, logit_mask_list = [], [], [], [], [] + for answer in answers: + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [tokenizer.get_command('eop').Id] + answer_ids = answer_ids[:self.max_dec_seq_length] + data = build_decoder_input(ids, answer_ids, + self.max_seq_length, + self.max_dec_seq_length, + tokenizer) + dec_ids, _, _, dec_position_ids, _, dec_target_ids, dec_loss_masks = data + ids_list.append(dec_ids) + positions_list.append(dec_position_ids) + mask_list.append(sep) + target_list.append(dec_target_ids) + logit_mask_list.append(dec_loss_masks) + + sample = build_decoder_sample(sample, ids_list, positions_list, + mask_list, target_list, + logit_mask_list) + return sample + + else: + self.num_truncated += self.truncate( + parts_a, parts_b, [], max_length=self.max_seq_length) + + tokens_a = [token_id for part, _ in parts_a for token_id in part] + tokens_b = [token_id for part, _ in parts_b + for token_id in part] if parts_b else None + if priming: + input_ids = tokens_a + if tokens_b: + input_ids += tokens_b + if labeled: + mask_idx = input_ids.index(self.mask_id) + verbalizer = self.verbalize(example.label) + assert len( + verbalizer + ) == 1, 'priming only supports one verbalization per label' + verbalizer = verbalizer[0] + verbalizer_id = get_verbalization_ids( + verbalizer, self.tokenizer, force_single_token=True) + input_ids[mask_idx] = verbalizer_id + return input_ids + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, 
target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) if token == prompt_id + ] + ids = [token if token != prompt_id else 0 for token in ids] + target_ids = self.get_verbalizer_ids() + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + sample = build_sample( + ids=ids, + positions=position_ids, + target=target_ids, + masks=sep, + logit_mask=loss_masks, + label=label, + unique_id=example.guid, + prompt_ids=prompt_pos) + return sample + + @staticmethod + def _seq_length(parts: List[Tuple[List[int], bool]], + only_shortenable: bool = False): + return sum([ + len(x) for x, shortenable in parts + if not only_shortenable or shortenable + ]) if parts else 0 + + @staticmethod + def _remove_last(parts: List[Tuple[List[int], bool]]): + last_idx = max(idx for idx, (seq, shortenable) in enumerate(parts) + if shortenable and seq) + parts[last_idx] = (parts[last_idx][0][:-1], parts[last_idx][1]) + + def truncate(self, parts_a: List[Tuple[List[int], bool]], + parts_b: List[Tuple[List[int], bool]], answer: List[int], + max_length: int): + """Truncate two sequences of text to a predefined total maximum length""" + total_len = self._seq_length(parts_a) + self._seq_length(parts_b) + if answer: + total_len += len(answer) + total_len += num_special_tokens_to_add( + parts_a, + parts_b, + answer, + add_cls=True, + add_sep=False, + add_piece=True) + num_tokens_to_remove = total_len - max_length + + if num_tokens_to_remove <= 0: + return False + + for _ in range(num_tokens_to_remove): + if self._seq_length( + parts_a, only_shortenable=True) > self._seq_length( + parts_b, only_shortenable=True): + self._remove_last(parts_a) + else: + self._remove_last(parts_b) + return True + + @abstractmethod + def get_parts(self, example: InputExample) -> FilledPattern: + """ + Given an input example, apply a pattern to obtain two text sequences (text_a and text_b) containing exactly one + mask token (or one consecutive sequence of mask tokens for PET with multiple masks). If a task requires only a + single sequence of text, the second sequence should be an empty list. + + :param example: the input example to process + :return: Two sequences of text. All text segments can optionally be marked as being shortenable. + """ + pass + + def get_answers(self, example: InputExample): + return [self.verbalize(label)[0] for label in self.label_list] + + def get_verbalizer_ids(self): + target_ids = [] + for label in self.label_list: + verbalizer = self.verbalize(label)[0] + verbalizer_id = get_verbalization_ids( + verbalizer, self.tokenizer, force_single_token=True) + target_ids.append(verbalizer_id) + return target_ids + + @abstractmethod + def verbalize(self, label) -> List[str]: + """ + Return all verbalizations for a given label. 
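+        For example, ``RtePVP`` below verbalizes 'entailment' as ' Yes' and
+        'not_entailment' as ' No'.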
+ + :param label: the label + :return: the list of verbalizations + """ + pass + + def get_mask_positions(self, input_ids: List[int]) -> List[int]: + label_idx = input_ids.index(self.mask_id) + labels = [-1] * len(input_ids) + labels[label_idx] = 1 + return labels + + @staticmethod + def _load_verbalizer_from_file(path: str, pattern_id: int): + + verbalizers = defaultdict( + dict) # type: Dict[int, Dict[str, List[str]]] + current_pattern_id = None + + with open(path, 'r') as fh: + for line in fh.read().splitlines(): + if line.isdigit(): + current_pattern_id = int(line) + elif line: + label, *realizations = line.split() + verbalizers[current_pattern_id][label] = realizations + + print_rank_0( + 'Automatically loaded the following verbalizer: \n {}'.format( + verbalizers[pattern_id])) + + def verbalize(label) -> List[str]: + return verbalizers[pattern_id][label] + + return verbalize + + +class CopaPVP(PVP): + + @staticmethod + def available_patterns(): + return [0, 1] + + @property + def is_multi_token(self): + return True + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + @property + def mask(self) -> str: + """Return the underlying LM's mask token""" + mask_token = 'MASK' + return self.tokenizer.get_command(mask_token).Id + + @property + def mask_id(self) -> int: + """Return the underlying LM's mask id""" + mask_token = 'MASK' + return self.tokenizer.get_command(mask_token).Id + + def get_answers(self, example: InputExample): + choice1 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + return [choice1, choice2] + + def get_parts(self, example: InputExample) -> FilledPattern: + assert self.pattern_id in [0, 1, 2, 3] + premise = self.remove_final_punc( + self.shortenable(' ' + example.text_a)) + choice1 = self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + + question = example.meta['question'] + assert question in ['cause', 'effect'] + if question == 'cause': + joiner = ' because' + else: + joiner = ', so' + if self.pattern_id == 0: + parts_a, parts_b = [ + None, '"', choice1, '" or "', choice2, '"?', None, premise, + joiner, None, [self.mask], '.' + ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + None, choice1, ' or', ' ' + choice2, '?', None, premise, + joiner, None, [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, '"', choice1, '" or "', choice2, '"', None, premise, + joiner, [self.mask], '.', None + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + return [] + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. 
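+        For pattern ids >= 2 (without continuous prompts), each COPA choice is
+        appended after the mask token, building one candidate sequence per choice.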
+ + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + if self.continuous_prompt or self.pattern_id < 2: + return super().encode(example, priming=priming, labeled=labeled) + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + premise = self.remove_final_punc(self.shortenable(example.text_a)) + choice1 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + question = example.meta['question'] + assert question in ['cause', 'effect'] + answer = ' because' if question == 'cause' else ' so' + answer_ids = [ + get_verbalization_ids(answer, tokenizer, force_single_token=True) + ] + if self.is_multi_token: + answer_ids.append(tokenizer.get_command('eop').Id) + + ids_list, positions_list, sep_list, mask_list, target_list = [], [], [], [], [] + + for choice in [choice1, choice2]: + parts = [ + '"', choice1[1:], '" or "', choice2[1:], '"?', premise, + [self.mask], choice + ] + parts = [x if isinstance(x, tuple) else (x, False) for x in parts] + parts = [(tokenizer.EncodeAsIds(x).tokenization if isinstance( + x, str) else x, s) for x, s in parts if x] + self.num_truncated += self.truncate( + parts, None, answer_ids, max_length=self.max_seq_length) + tokens_a = [token_id for part, _ in parts for token_id in part] + data = build_input_from_ids( + tokens_a, + None, + answer_ids, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + target_list.append(target_ids) + mask_list.append(loss_masks) + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + logit_mask=mask_list, + target=target_list, + unique_id=example.guid) + return sample + + +class WscPVP(PVP): + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + @property + def is_multi_token(self): + return True + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_answers(self, example: InputExample): + target = ' ' + example.meta['span1_text'] + answers = [target] + if 'candidates' in example.meta: + candidates = example.meta['candidates'] + # if len(candidates) > 10: + # random.shuffle(candidates) + # candidates = candidates[:10] + answers += [' ' + cand for cand in candidates] + return answers + + def get_parts(self, example: InputExample) -> FilledPattern: + pronoun = example.meta['span2_text'] + pronoun_idx = example.meta['span2_index'] + + words_a = example.text_a.split() + words_a[pronoun_idx] = '*' + words_a[pronoun_idx] + '*' + text_a = ' '.join(words_a) + text_a = self.shortenable(text_a) + + if self.pattern_id == 0: + parts_a, parts_b = [ + None, text_a, + None, " The pronoun '*" + pronoun + "*' refers to", None, + [self.mask], '.' 
+ ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + None, text_a, None, " In the previous sentence, the pronoun '*" + + pronoun + "*' refers to", None, [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, text_a, None, + " Question: In the passage above, what does the pronoun '*" + + pronoun + "*' refer to?", None, ' Answer:', [self.mask], '.' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. + + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + if self.args.loss_func in ['generative', 'mix']: + sample = super().encode(example, priming=priming, labeled=labeled) + if self.split == 'train': + sample['label'] = 0 + return sample + + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + prompt_id = tokenizer.num_tokens + raw_parts_a, raw_parts_b = self.get_parts(example) + + raw_parts_a = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_a + ] + + def encode_input(raw_parts): + parts = [] + for x, s in raw_parts: + if isinstance(x, str): + x = tokenizer.EncodeAsIds(x) + elif isinstance(x, int): + x = [prompt_id] * x + else: + pass + parts.append((x, s)) + return parts + + parts_a = encode_input(raw_parts_a) + if self.prefix_prompt > 0: + parts_a = [([prompt_id] * self.prefix_prompt, False)] + parts_a + parts_b = None + if raw_parts_b: + raw_parts_b = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_b + ] + parts_b = encode_input(raw_parts_b) + answer = self.get_answers(example)[0] + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [tokenizer.get_command('eop').Id] + self.num_truncated += self.truncate( + parts_a, parts_b, answer_ids, max_length=self.max_seq_length) + tokens_a = [token_id for part, _ in parts_a for token_id in part] + tokens_b = [token_id for part, _ in parts_b + for token_id in part] if parts_b else None + data = build_input_from_ids( + tokens_a, + tokens_b, + answer_ids, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) if token == prompt_id + ] + ids = [token if token != prompt_id else 0 for token in ids] + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'prompt_pos': np.array(prompt_pos, dtype=np.int64), + 'label': label, + 'uid': example.guid + } + + def verbalize(self, label) -> List[str]: + return [] + + +class RecordPVP(PVP): + + @property + def is_multi_token(self): + return True + + def get_answers(self, example: InputExample): + choices = 
example.meta['candidates'] + choices = [' ' + choice for choice in choices] + return choices + + def get_parts(self, example: InputExample) -> FilledPattern: + premise = self.shortenable(example.text_a) + + assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token' + question_a, question_b = example.text_b.split('@placeholder') + return [premise, ' ' + question_a.rstrip(), [self.mask], + question_b], [] + + def verbalize(self, label) -> List[str]: + return [] + + +class RacePVP(PVP): + + @property + def is_multi_token(self): + return True + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_answers(self, example: InputExample): + choices = example.meta['choices'] + choices = [' ' + choice for choice in choices] + return choices + + def get_parts(self, example: InputExample) -> FilledPattern: + context = self.shortenable(example.text_a) + question = ' ' + example.text_b + + if '_' in question: + left, right = question.split('_', maxsplit=1) + if self.pattern_id == 0: + return [context], [ + self.shortenable(left.rstrip()), [self.mask], + self.shortenable(right) + ] + else: + left = left.rstrip() + if left: + left = self.lowercase_first(left) + return [context], [ + ' Based on the previous passage,', + self.shortenable(left), [self.mask], + self.shortenable(right) + ] + else: + if self.pattern_id == 0: + return [context], [ + ' Question:', + self.shortenable(question), ' Answer:', [self.mask] + ] + else: + return [context], [ + ' Based on the previous passage,', + self.shortenable(question), [self.mask] + ] + + def verbalize(self, label) -> List[str]: + return [] + + +class RtePVP(PVP): + VERBALIZER = {'not_entailment': [' No'], 'entailment': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + # switch text_a and text_b to get the correct order + text_a = example.text_a + text_b = example.text_b.rstrip(string.punctuation) + if self.pattern_id == 0: + parts_a, parts_b = [None, '"', + self.shortenable(text_b), '" ?'], [ + None, [self.mask], ',', None, ' "', + self.shortenable(text_a), '"' + ] # noqa + elif self.pattern_id == 1: + parts_a, parts_b = [None, self.shortenable(text_b), '?'], [ + None, [self.mask], ',', None, + self.shortenable(' ' + text_a) + ] + elif self.pattern_id == 2: + parts_a, parts_b = [None, '"', + self.shortenable(text_b), '" ?'], [ + None, [self.mask], '. 
"', None, + self.shortenable(text_a), '"' + ] # noqa + elif self.pattern_id == 3: + parts_a, parts_b = [None, self.shortenable(text_b), '?'], [ + None, [self.mask], '.', None, + self.shortenable(' ' + text_a) + ] + elif self.pattern_id == 4: + parts_a, parts_b = [ + None, + self.shortenable(text_a), None, ' question:', + self.shortenable(' ' + text_b), ' True or False?', None, + ' answer:', [self.mask] + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 4: + return [' true'] if label == 'entailment' else [' false'] + return RtePVP.VERBALIZER[label] + + +class CbPVP(RtePVP): + VERBALIZER = { + 'contradiction': [' No'], + 'entailment': [' Yes'], + 'neutral': [' Maybe'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + def get_parts(self, example: InputExample) -> FilledPattern: + if self.pattern_id == 4: + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(' ' + example.text_b) + parts_a, parts_b = [ + None, text_a, None, ' question:', text_b, + ' true, false or neither?', None, ' answer:', [self.mask] + ], [] + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + return super().get_parts(example) + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 4: + return [' true'] if label == 'entailment' else [ + ' false' + ] if label == 'contradiction' else [' neither'] + return CbPVP.VERBALIZER[label] + + +class BoolQPVP(PVP): + VERBALIZER_A = {'false': [' No'], 'true': [' Yes']} + + VERBALIZER_B = {'false': [' false'], 'true': [' true']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + passage = example.text_a + question = example.text_b + + if self.pattern_id < 2: + parts_a, parts_b = [ + None, + self.shortenable(passage), None, ' Question:', + self.shortenable(' ' + question), '? Answer:', None, + [self.mask], '.' + ], [] + elif self.pattern_id < 4: + parts_a, parts_b = [ + None, + self.shortenable(passage), ' Based on the previous passage,', + None, + self.shortenable(' ' + question), '?', None, [self.mask], '.' 
+ ], [] + elif self.pattern_id < 6: + parts_a, parts_b = [ + 'Based on the following passage', None, + self.shortenable(' ' + question), '?', None, [self.mask], '.', + None, + self.shortenable(' ' + passage) + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0 or self.pattern_id == 2 or self.pattern_id == 4: + return BoolQPVP.VERBALIZER_A[label] + else: + return BoolQPVP.VERBALIZER_B[label] + + +class MultiRcPVP(PVP): + VERBALIZER = {0: [' No'], 1: [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + passage = self.remove_final_punc( + self.shortenable(example.text_a.rstrip())) + question = self.remove_final_punc(example.text_b.rstrip()) + answer = example.meta['answer'] + if self.pattern_id == 0: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question + '?', None, + ' Is it', ' ' + answer, '?', None, [self.mask], '.' + ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question, '?', + None, ' Is the correct answer "', answer, '"?', None, + [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + passage, '. Based on the previous passage,', None, + ' ' + question, '?', None, ' Is "', answer, + '" a correct answer?', None, [self.mask], '.' + ], [] + elif self.pattern_id == 3: + parts_a, parts_b = [ + None, passage, None, ' ' + question, '- [', [self.mask], ']', + None, answer + ], [] + elif self.pattern_id == 4: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question, '?', None, + ' ' + answer, '?', None, [self.mask], '.' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 3: + return [' False'] if label == 0 else [' True'] + return MultiRcPVP.VERBALIZER[label] + + +class WicPVP(PVP): + VERBALIZER_A = {'false': [' No'], 'true': [' Yes']} + VERBALIZER_B = {'false': ['2'], 'true': ['b']} + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = example.text_a + text_b = example.text_b + word = example.meta['word'] + + if self.pattern_id == 0: + parts_a, parts_b = [ + None, + self.shortenable('"' + text_a + '" / "' + text_b + '"'), None, + ' Similar sense of "' + word + '"?', None, [self.mask], '.' 
+ ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + self.shortenable(text_a), None, + self.shortenable(' ' + text_b), None, + ' Does ' + word + ' have the same meaning in both sentences?', + None, [self.mask] + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, word, ' .', None, ' Sense (1) (a) "', + self.shortenable(text_a), '"', None, ' (', [self.mask], ') "', + text_b, '"' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 2: + return WicPVP.VERBALIZER_B[label] + return WicPVP.VERBALIZER_A[label] + + +class AgnewsPVP(PVP): + VERBALIZER = { + '1': [' World'], + '2': [' Sports'], + '3': [' Business'], + '4': [' Tech'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0: + return [[self.mask], ':', text_a, text_b], [] + elif self.pattern_id == 1: + return [[self.mask], ' News:', text_a, text_b], [] + elif self.pattern_id == 2: + return [text_a, '(', [self.mask], ')', text_b], [] + elif self.pattern_id == 3: + return [text_a, text_b, '(', [self.mask], ')'], [] + elif self.pattern_id == 4: + return ['[ Category:', [self.mask], ']', text_a, text_b], [] + elif self.pattern_id == 5: + return [[self.mask], '-', text_a, text_b], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return AgnewsPVP.VERBALIZER[label] + + +class YahooPVP(PVP): + VERBALIZER = { + '1': [' Society'], + '2': [' Science'], + '3': [' Health'], + '4': [' Education'], + '5': [' Computer'], + '6': [' Sports'], + '7': [' Business'], + '8': [' Entertainment'], + '9': [' Relationship'], + '10': [' Politics'], + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0: + return [[self.mask], ':', text_a, text_b], [] + elif self.pattern_id == 1: + return [[self.mask], ' Question:', text_a, text_b], [] + elif self.pattern_id == 2: + return [text_a, '(', [self.mask], ')', text_b], [] + elif self.pattern_id == 3: + return [text_a, text_b, '(', [self.mask], ')'], [] + elif self.pattern_id == 4: + return ['[ Category:', [self.mask], ']', text_a, text_b], [] + elif self.pattern_id == 5: + return [[self.mask], '-', text_a, text_b], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return YahooPVP.VERBALIZER[label] + + +class MnliPVP(PVP): + VERBALIZER_A = { + 'contradiction': [' Wrong'], + 'entailment': [' Right'], + 'neutral': [' Maybe'] + } + VERBALIZER_B = { + 'contradiction': [' No'], + 'entailment': [' Yes'], + 'neutral': [' Maybe'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(self.remove_final_punc(example.text_a)) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0 or self.pattern_id == 2: + return ['"', text_a, '" ?'], [[self.mask], ', "', text_b, '"'] + elif self.pattern_id == 1 or self.pattern_id == 3: + return [text_a, 
'?'], [[self.mask], ',', text_b] + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0 or self.pattern_id == 1: + return MnliPVP.VERBALIZER_A[label] + return MnliPVP.VERBALIZER_B[label] + + +class YelpPolarityPVP(PVP): + VERBALIZER = {'1': [' bad'], '2': [' good']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3] + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + + if self.pattern_id == 0: + return ['It was', [self.mask], '.', text], [] + elif self.pattern_id == 1: + return [text, '. All in all, it was', [self.mask], '.'], [] + elif self.pattern_id == 2: + return ['Just', [self.mask], '!'], [text] + elif self.pattern_id == 3: + return [text], [' In summary, the restaurant is', [self.mask], '.'] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return YelpPolarityPVP.VERBALIZER[label] + + +class YelpFullPVP(YelpPolarityPVP): + VERBALIZER = { + '1': [' terrible'], + '2': [' bad'], + '3': [' okay'], + '4': [' good'], + '5': [' great'] + } + + def verbalize(self, label) -> List[str]: + return YelpFullPVP.VERBALIZER[label] + + +class XStancePVP(PVP): + VERBALIZERS = { + 'en': { + 'FAVOR': ['Yes'], + 'AGAINST': ['No'] + }, + 'de': { + 'FAVOR': ['Ja'], + 'AGAINST': ['Nein'] + }, + 'fr': { + 'FAVOR': ['Oui'], + 'AGAINST': ['Non'] + } + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0 or self.pattern_id == 2 or self.pattern_id == 4: + return ['"', text_a, '"'], [[self.mask], '. 
"', text_b, '"'] + elif self.pattern_id == 1 or self.pattern_id == 3 or self.pattern_id == 5: + return [text_a], [[self.mask], '.', text_b] + + def verbalize(self, label) -> List[str]: + lang = 'de' if self.pattern_id < 2 else 'en' if self.pattern_id < 4 else 'fr' + return XStancePVP.VERBALIZERS[lang][label] + + +class Sst2PVP(PVP): + VERBALIZER_A = {'0': [' terrible'], '1': [' great']} + + VERBALIZER_B = {'0': [' bad'], '1': [' good']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + if self.pattern_id == 0 or self.pattern_id == 1: + return [text, ' It was', [self.mask], '.'], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0: + return Sst2PVP.VERBALIZER_A[label] + else: + return Sst2PVP.VERBALIZER_B[label] + + +class ColaPVP(PVP): + VERBALIZER = {'0': [' incorrect'], '1': [' correct']} + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + if self.pattern_id == 0: + return ['"', text, '"', ' This is', [self.mask], '.'], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return ColaPVP.VERBALIZER[label] + + +class MrpcPVP(PVP): + VERBALIZER = {'0': [' No'], '1': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(example.text_a) + if self.pattern_id == 0: + text_b = self.shortenable(self.lowercase_first(example.text_b)) + return [text_a], [[self.mask], ', ', text_b] + elif self.pattern_id == 1: + text_b = self.shortenable( + self.remove_final_punc(self.lowercase_first(example.text_b))) + return [text_a], [' Does it mean that', text_b, '?', [self.mask]] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return MrpcPVP.VERBALIZER[label] + + +class QqpPVP(PVP): + VERBALIZER = {'0': [' No'], '1': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(self.lowercase_first(example.text_b)) + if self.pattern_id == 0: + return [text_a], [' Do you mean ', text_b, [self.mask], '.'] + elif self.pattern_id == 1: + return [text_a], [[self.mask], ', ', text_b] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return QqpPVP.VERBALIZER[label] + + +class QnliPVP(PVP): + VERBALIZER = {'not_entailment': [' No'], 'entailment': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + def get_parts(self, example: InputExample) -> FilledPattern: + question = self.remove_final_punc(example.text_a) + passage = example.text_b + if self.pattern_id == 0: + return [ + self.shortenable(passage), ' Question:', + self.shortenable(' ' + question), '? Do you know the answer?', + [self.mask], '.' + ], [] + elif self.pattern_id == 1: + return [ + self.shortenable(passage), + ' Based on the previous passage, do you know the answer', + self.shortenable(' ' + question), '?', [self.mask], '.' 
+ ], [] + elif self.pattern_id == 2: + return [ + 'Based on the following passage, do you know the answer', + self.shortenable(' ' + question), '?', [self.mask], '.', + self.shortenable(' ' + passage) + ], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return QnliPVP.VERBALIZER[label] + + +class SquadPVP(PVP): + + @property + def is_multi_token(self): + return True + + def get_answers(self, example: InputExample): + target = ' ' + example.meta['answer']['text'] + answers = [target] + return answers + + def get_parts(self, example: InputExample) -> FilledPattern: + context = self.shortenable(example.text_a) + question = example.text_b + return [context, ' ' + question, [self.mask], '.'], [] + + def verbalize(self, label) -> List[str]: + return [] + + +def get_verbalization_ids(word: str, tokenizer, + force_single_token: bool) -> Union[int, List[int]]: + """ + Get the token ids corresponding to a verbalization + + :param word: the verbalization + :param tokenizer: the tokenizer to use + :param force_single_token: whether it should be enforced that the verbalization corresponds to a single token. + If set to true, this method returns a single int instead of a list and throws an error if the word + corresponds to multiple tokens. + :return: either the list of token ids or the single token id corresponding to this word + """ + ids = tokenizer.EncodeAsIds(word).tokenization + if not force_single_token: + return ids + assert len(ids) == 1, \ + f'Verbalization "{word}" does not correspond to a single token, got {tokenizer.DecodeIds(ids)}' + verbalization_id = ids[0] + assert verbalization_id not in tokenizer.command_id_map, \ + f'Verbalization {word} is mapped to a special token {tokenizer.IdToToken(verbalization_id)}' + return verbalization_id + + +PVPS = { + 'agnews': AgnewsPVP, + 'mnli': MnliPVP, + 'yelp-polarity': YelpPolarityPVP, + 'yelp-full': YelpFullPVP, + 'yahoo': YahooPVP, + 'xstance': XStancePVP, + 'xstance-de': XStancePVP, + 'xstance-fr': XStancePVP, + 'rte': RtePVP, + 'wic': WicPVP, + 'cb': CbPVP, + 'wsc': WscPVP, + 'boolq': BoolQPVP, + 'copa': CopaPVP, + 'multirc': MultiRcPVP, + 'record': RecordPVP, + 'ax-b': RtePVP, + 'ax-g': RtePVP, + 'sst2': Sst2PVP, + 'cola': ColaPVP, + 'mrpc': MrpcPVP, + 'qqp': QqpPVP, + 'qnli': QnliPVP, + 'squad': SquadPVP, + 'race': RacePVP, +} diff --git a/modelscope/models/nlp/mglm/test/__init__.py b/modelscope/models/nlp/mglm/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/mglm/test/test_block.py b/modelscope/models/nlp/mglm/test/test_block.py new file mode 100644 index 00000000..ed4225da --- /dev/null +++ b/modelscope/models/nlp/mglm/test/test_block.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 Zhipu.AI + +import random +from argparse import Namespace + +import numpy as np +from blocklm_utils import ConstructBlockStrategy + + +# rng = random.Random() +# span_lengths = [2, 3, 4, 2, 3, 4] +# length = 100 +# +# counts = np.array([0] * length) +# for _ in range(10000): +# rng.shuffle(span_lengths) +# spans = ConstructBlockStrategy.sample_spans(span_lengths, length, rng) +# for start, end in spans: +# counts[start: end] += 1 +# print(counts) +def main(): + args = Namespace() + args.seq_length = 10 + args.eod_token = 0 + + strategy = ConstructBlockStrategy( + args, None, bert_ratio=0.4, max_seq_length=128) + counts = np.array([0] * 10) + for _ in range(10000): + spans = strategy.sample_span_in_document( + np.array([1, 2, 3, 
0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1], + random.Random()) + for start, end in spans: + counts[start:end] += 1 + + print(counts) diff --git a/modelscope/models/nlp/mglm/test/test_rel_shift.py b/modelscope/models/nlp/mglm/test/test_rel_shift.py new file mode 100644 index 00000000..00cbb9fe --- /dev/null +++ b/modelscope/models/nlp/mglm/test/test_rel_shift.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 Zhipu.AI + +import matplotlib.pyplot as plt +import numpy as np +from learning_rates import AnnealingLR +from torch.nn.modules import Linear +from torch.optim import Adam + + +def main(): + model = Linear(10, 10) + optimizer = Adam(model.parameters()) + lr_scheduler = AnnealingLR( + optimizer, + start_lr=0.00015, + warmup_iter=3000, + num_iters=300000, + decay_style='cosine', + decay_ratio=0.1) + steps = np.arange(0, 400000, 10, dtype=np.long) + rates = [] + for step in steps: + lr_scheduler.num_iters = step + rates.append(lr_scheduler.get_lr()) + print(rates) + plt.plot(steps, rates) + plt.savefig('lr.pdf', format='pdf') diff --git a/modelscope/models/nlp/mglm/train_utils.py b/modelscope/models/nlp/mglm/train_utils.py new file mode 100644 index 00000000..c9c0de8e --- /dev/null +++ b/modelscope/models/nlp/mglm/train_utils.py @@ -0,0 +1,472 @@ +# Copyright (c) 2022 Zhipu.AI + +import deepspeed +import torch +from apex.optimizers import FusedAdam as Adam +from torch import distributed as dist + +from . import mpu +from .fp16 import DynamicLossScaler, FP16_Module, FP16_Optimizer +from .model import DistributedDataParallel as LocalDDP +from .model import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, + GLMForSequenceClassification, GLMForSingleTokenCloze, + GLMModel) +from .model import PyTorchDistributedDataParallel as TorchDDP +from .model import glm_get_params_for_weight_decay_optimization +from .utils import get_checkpoint_iteration, get_checkpoint_name, print_rank_0 + + +def load_pretrained(model, checkpoint_path, args, task_tokens=None): + load_dir, tag, release, success = get_checkpoint_iteration(checkpoint_path) + checkpoint_name = get_checkpoint_name(load_dir, tag, release) + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading pretrained model {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + # Load the checkpoint. + sd = torch.load(checkpoint_name, map_location='cpu') + if args.deepspeed: + model = model.module + if isinstance(model, TorchDDP): + model = model.module + if isinstance(model, FP16_Module): + model = model.module + if hasattr(model, 'model'): + model = model.model + + # Model. 
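+    # The helper below widens a pretrained (block) position embedding table
+    # when args.max_position_embeddings exceeds the checkpoint's table size:
+    # the pretrained rows are copied in and the extra rows keep the current
+    # model's freshly initialized values.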
+ def extend_embedding_weights(state_weights, model_weights): + original_length = state_weights.shape[0] + assert original_length <= args.max_position_embeddings + 1 + new_weights = model_weights.clone() + new_weights[:original_length] = state_weights + return new_weights + + if args.block_lm: + if 'transformer.block_position_embeddings.weight' in sd['module']: + position_weights = sd['module'][ + 'transformer.position_embeddings.weight'] + if args.max_position_embeddings + 1 > position_weights.shape[0]: + sd['module'][ + 'transformer.position_embeddings.weight'] = extend_embedding_weights( + position_weights, + model.state_dict() + ['transformer.position_embeddings.weight'].data) + print_rank_0( + f'Extend position embedding to {args.max_position_embeddings + 1}' + ) + if 'transformer.block_position_embeddings.weight' in sd['module']: + block_position_weights = sd['module'][ + 'transformer.block_position_embeddings.weight'] + if args.max_position_embeddings + 1 > block_position_weights.shape[ + 0]: + sd['module'][ + 'transformer.block_position_embeddings.weight'] = extend_embedding_weights( + block_position_weights, + model.state_dict() + ['transformer.block_position_embeddings.weight'].data) + print_rank_0( + f'Extend block position embedding to {args.max_position_embeddings + 1}' + ) + for key in list(model.state_dict().keys()): + print(key) + model.state_dict()[key.replace( + 'mixins.block_position_embedding.block_position_embeddings.weight', + 'transformer.block_position_embeddings.weight').replace( + 'transformer.word_embeddings.weight', + 'word_embeddings.weight')] = model.state_dict().pop(key) + + missing_keys, unexpected_keys = model.load_state_dict( + sd['module'], strict=False) + if missing_keys or unexpected_keys: + print_rank_0( + f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}') + if args.continuous_prompt and args.prompt_init: + model.prompt_spell.init_embedding(model.word_embeddings.weight.data, + task_tokens) + + +def get_model(args, + model_type=None, + multi_token=True, + num_labels=None, + spell_length=None): + """Build the model.""" + print_rank_0('building GPT2 model ...') + if args.pretrained_bert: + if model_type == 'multiple_choice': + model = BertForMultipleChoice.from_pretrained( + args.tokenizer_model_type, + cache_dir=args.cache_dir, + fp32_layernorm=args.fp32_layernorm, + fp32_embedding=args.fp32_embedding, + layernorm_epsilon=args.layernorm_epsilon) + elif model_type == 'classification': + model = BertForSequenceClassification.from_pretrained( + args.tokenizer_model_type, + cache_dir=args.cache_dir, + fp32_layernorm=args.fp32_layernorm, + fp32_embedding=args.fp32_embedding, + layernorm_epsilon=args.layernorm_epsilon, + num_labels=num_labels) + else: + raise NotImplementedError + else: + output_predict, paralle_output = True, True + if (model_type == 'multiple_choice' + or model_type == 'classification') and not args.cloze_eval: + output_predict = False + if model_type is not None: + paralle_output = False + if spell_length is not None: + print_rank_0(f'Continuous spell length {spell_length}') + model = GLMModel( + num_layers=args.num_layers, + vocab_size=args.vocab_size, + hidden_size=args.hidden_size, + num_attention_heads=args.num_attention_heads, + embedding_dropout_prob=args.hidden_dropout, + attention_dropout_prob=args.attention_dropout, + output_dropout_prob=args.hidden_dropout, + max_sequence_length=args.max_position_embeddings, + max_memory_length=args.mem_length, + checkpoint_activations=args.checkpoint_activations, + 
checkpoint_num_layers=args.checkpoint_num_layers, + parallel_output=paralle_output, + relative_encoding=args.transformer_xl, + block_position_encoding=args.block_lm and not args.masked_lm, + output_predict=output_predict, + spell_length=spell_length, + spell_func=args.prompt_func, + attention_scale=args.attention_scale) + if args.freeze_transformer: + model.freeze_transformer( + tune_prefix_layers=args.tune_prefix_layers) + if model_type is not None: + if model_type == 'multiple_choice': + if args.cloze_eval: + if multi_token: + if args.fast_decode: + model = GLMForMultiTokenClozeFast( + model, length_penalty=args.length_penalty) + else: + model = GLMForMultiTokenCloze( + model, length_penalty=args.length_penalty) + else: + model = GLMForSingleTokenCloze( + model, take_softmax=args.adapet) + else: + model = GLMForSequenceClassification( + model, + args.hidden_size, + args.output_dropout, + args.pool_token, + num_class=num_labels) + elif model_type == 'classification': + model = GLMForSequenceClassification( + model, + args.hidden_size, + args.output_dropout, + args.pool_token, + num_class=num_labels) + elif model_type == 'generation': + pass + else: + raise NotImplementedError(model_type) + + if mpu.get_data_parallel_rank() == 0: + print( + ' > number of parameters on model parallel rank {}: {}'.format( + mpu.get_model_parallel_rank(), + sum([p.nelement() for p in model.parameters()])), + flush=True) + + # To prevent OOM for model sizes that cannot fit in GPU memory in full precision + if args.fp16: + model.half() + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if args.fp16: + model = FP16_Module(model) + + # Wrap model for distributed training. + if not args.deepspeed and (args.train_iters or args.epochs): + if args.DDP_impl == 'torch': + i = torch.cuda.current_device() + model = TorchDDP( + model, + device_ids=[i], + output_device=i, + process_group=mpu.get_data_parallel_group()) + elif args.DDP_impl == 'local': + model = LocalDDP(model) + else: + print_rank_0('Skip DDP model') + return model + + +def get_optimizer_param_groups(model): + # Build parameter groups (weight decay and non-decay). + while isinstance(model, (LocalDDP, TorchDDP, FP16_Module)): + model = model.module + param_groups = glm_get_params_for_weight_decay_optimization(model) + + # Add model parallel attribute if it is not set. + for param_group in param_groups: + # print('## param_group', len(param_group['params'])) + for param in param_group['params']: + if not hasattr(param, 'model_parallel'): + param.model_parallel = False + + return param_groups + + +def get_optimizer(param_groups, args): + """Set up the optimizer.""" + if args.cpu_optimizer: + # Apex FusedAdam uses decoupled weight decay so use the same here + if args.cpu_torch_adam: + cpu_adam_optimizer = torch.optim.AdamW + else: + from deepspeed.ops.adam import DeepSpeedCPUAdam + cpu_adam_optimizer = DeepSpeedCPUAdam + optimizer = cpu_adam_optimizer( + param_groups, lr=args.lr, weight_decay=args.weight_decay) + else: + # Use FusedAdam. 
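+        # 'adam' uses Apex FusedAdam (imported above as Adam); 'adafactor'
+        # falls back to the HuggingFace transformers implementation. When
+        # args.fp16 is set, the optimizer is wrapped by FP16_Optimizer below.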
+ if args.optimizer == 'adam': + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps) + elif args.optimizer == 'adafactor': + from transformers import Adafactor + optimizer = Adafactor( + param_groups, + lr=args.lr, + relative_step=False, + warmup_init=False) + else: + raise NotImplementedError + + print(f'Optimizer = {optimizer.__class__.__name__}') + if hasattr(args, 'deepspeed') and args.deepspeed: + raise NotImplementedError + # fp16 wrapper is not required for DeepSpeed. + # return optimizer + + # Wrap into fp16 optimizer. + if args.fp16: + optimizer = FP16_Optimizer( + optimizer, + static_loss_scale=args.loss_scale, + dynamic_loss_scale=args.dynamic_loss_scale, + dynamic_loss_args={ + 'scale_window': args.loss_scale_window, + 'min_scale': args.min_scale, + 'delayed_shift': args.hysteresis + }) + + return optimizer + + +def get_learning_rate_scheduler(optimizer, args): + """Build the learning rate scheduler.""" + + # Add linear learning rate scheduler. + if args.lr_decay_iters is not None: + num_iters = args.lr_decay_iters + else: + num_iters = args.train_iters + if args.finetune: + num_iters = num_iters // args.gradient_accumulation_steps + num_iters = max(1, num_iters) + init_step = -1 + warmup_iter = args.warmup * num_iters + lr_scheduler = AnnealingLR( + optimizer, + start_lr=args.lr, + warmup_iter=warmup_iter, + num_iters=num_iters - warmup_iter, + decay_style=args.lr_decay_style, + last_iter=init_step, + decay_ratio=args.lr_decay_ratio) + + return lr_scheduler + + +def setup_model_and_optimizer(args, + model_type=None, + multi_token=True, + num_labels=None, + spell_length=None): + """Setup model and optimizer.""" + + model = get_model( + args, + model_type=model_type, + multi_token=multi_token, + num_labels=num_labels, + spell_length=spell_length) + param_groups = get_optimizer_param_groups(model) + + if args.train_data is not None or args.data_dir is not None and ( + args.epochs > 0 or args.train_iters > 0): + if args.deepspeed: + print_rank_0('DeepSpeed is enabled.') + + model, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=param_groups, + args=args, + mpu=mpu, + dist_init_required=False) + else: + optimizer = get_optimizer(param_groups, args) + lr_scheduler = get_learning_rate_scheduler(optimizer, args) + else: + optimizer, lr_scheduler = None, None + + return model, optimizer, lr_scheduler + + +def backward_step(optimizer, model, lm_loss, args, timers): + """Backward step.""" + + # Total loss. + loss = lm_loss + + # Backward pass. + if args.deepspeed: + model.backward(loss) + else: + # optimizer.zero_grad() + if args.fp16: + optimizer.backward(loss, update_master_grads=False) + else: + loss.backward() + + if args.deepspeed or args.DDP_impl == 'torch': + # DeepSpeed backward propagation already addressed all reduce communication. + # Reset the timer to avoid breaking timer logs below. + timers('allreduce').reset() + else: + timers('allreduce').start() + model.allreduce_params( + reduce_after=False, fp32_allreduce=args.fp32_allreduce) + timers('allreduce').stop() + + # Update master gradients. + if not args.deepspeed: + if args.fp16: + optimizer.update_master_grads() + + # Clipping gradients helps prevent the exploding gradient. 
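+        # With fp16, clipping is applied to the master (fp32) gradients kept
+        # by the optimizer; otherwise the model gradients are clipped directly
+        # via the model-parallel aware mpu.clip_grad_norm.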
+ if args.clip_grad > 0: + if not args.fp16: + mpu.clip_grad_norm(model.parameters(), args.clip_grad) + else: + optimizer.clip_master_grads(args.clip_grad) + + return lm_loss + + +def see_memory_usage(message, force=False): + if not force: + return + dist.barrier() + if dist.get_rank() == 0: + print(message) + print('Memory Allocated ', + torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Max Memory Allocated ', + torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Cache Allocated ', + torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes') + print('Max cache Allocated ', + torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + 'GigaBytes') + print(' ') + # input("Press Any Key To Continue ..") + + +def train_step(data_iterator, + model, + optimizer, + lr_scheduler, + args, + timers, + forward_step_func, + mems=None, + single_step=False): + """Single training step.""" + lm_loss_total, count = 0.0, 0 + mems = [] if mems is None else mems + if not args.deepspeed: + optimizer.zero_grad() + while True: + skipped_iter, complete = 0, False + # Forward model for one step. + timers('forward').start() + lm_loss, mems, _ = forward_step_func(data_iterator, model, args, + timers, mems) + timers('forward').stop() + # print_rank_0("Forward step") + if not args.deepspeed: + lm_loss /= args.gradient_accumulation_steps + + reduced_loss = lm_loss.detach().clone().view(1) + torch.distributed.all_reduce( + reduced_loss.data, group=mpu.get_data_parallel_group()) + reduced_loss.data = reduced_loss.data / ( + args.world_size / args.model_parallel_size) + + if not DynamicLossScaler._has_inf_or_nan(reduced_loss): + lm_loss_total += reduced_loss + count += 1 + + # Calculate gradients, reduce across processes, and clip. + timers('backward').start() + backward_step(optimizer, model, lm_loss, args, timers) + timers('backward').stop() + # print_rank_0("Backward step") + # Update parameters. + timers('optimizer').start() + if args.deepspeed: + if model.is_gradient_accumulation_boundary(): + model.step() + complete = True + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + else: + skipped_iter = 1 + else: + model.step() + else: + if count == args.gradient_accumulation_steps: + optimizer.step() + complete = True + # Update learning rate. + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + else: + skipped_iter = 1 + # print_rank_0("Optimizer step") + timers('optimizer').stop() + if complete: + break + else: + print_rank_0('Found NaN loss, skip backward') + del lm_loss, reduced_loss + mems = [] + if single_step: + break + if args.deepspeed: + lm_loss_total = lm_loss_total / count + return lm_loss_total, skipped_iter, mems diff --git a/modelscope/models/nlp/mglm/utils.py b/modelscope/models/nlp/mglm/utils.py new file mode 100644 index 00000000..2bfcf8c0 --- /dev/null +++ b/modelscope/models/nlp/mglm/utils.py @@ -0,0 +1,529 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for logging and serialization""" + +import os +import random +import subprocess +import time + +import json +import numpy as np +import torch + +from . import mpu +from .fp16 import FP16_Optimizer + +SUMMARY_WRITER_DIR_NAME = 'runs' + + +def get_log_dir(name, base): + return os.path.join(base, SUMMARY_WRITER_DIR_NAME, name) + + +def print_rank_0(message): + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + + +def get_hostname(): + hostname_cmd = ['hostname -I'] + result = subprocess.check_output(hostname_cmd, shell=True) + master_addr = result.decode('utf-8').split()[0] + return master_addr + + +def get_spare_port(args): + if torch.distributed.get_rank() == 0: + port = subprocess.check_output(['shuf -n 1 -i 10000-65535'], + shell=True) + port = int(port.strip()) + if port == args.master_port: + port = subprocess.check_output(['shuf -n 1 -i 10000-65535'], + shell=True) + port = int(port.strip()) + port = torch.cuda.LongTensor([port]) + else: + port = torch.cuda.LongTensor([0]) + torch.distributed.broadcast(port, 0) + port = port.item() + return port + + +def print_and_save_args(args, verbose=True, log_dir=None): + """Print arguments.""" + if verbose: + print('arguments:', flush=True) + for arg in vars(args): + dots = '.' * (29 - len(arg)) + print( + ' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True) + if log_dir is not None: + json_file = os.path.join(log_dir, 'config.json') + with open(json_file, 'w') as output: + json.dump(vars(args), output, sort_keys=True) + if args.deepspeed and args.deepspeed_config is not None: + with open(args.deepspeed_config) as file: + deepspeed_config = json.load(file) + deepspeed_json_file = os.path.join(log_dir, + 'config_gpt_large.json') + with open(deepspeed_json_file, 'w') as output: + json.dump(deepspeed_config, output) + + +def print_params_min_max_norm(optimizer, iteration): + """Print min, max, and norm of all parameters.""" + index = 0 + rank = torch.distributed.get_rank() + string = 'iteration, rank, index, model-parallel,min, max, norm\n' + optimizer_ = optimizer + if isinstance(optimizer, FP16_Optimizer): + optimizer_ = optimizer.optimizer + for param_group in optimizer_.param_groups: + for param in param_group['params']: + index += 1 + min_ = param.data.min() + max_ = param.data.max() + norm = param.data.norm() + string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( + iteration, rank, index, int(param.model_parallel)) + string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) + print(string, flush=True) + + +class Timers: + """Group of timers.""" + + class Timer: + """Timer.""" + + def __init__(self, name): + self.name_ = name + self.elapsed_ = 0.0 + self.started_ = False + self.start_time = time.time() + + def start(self): + """Start the timer.""" + assert not self.started_, 'timer has already been started' + torch.cuda.synchronize() + self.start_time = time.time() + self.started_ = True + + def stop(self): + """Stop the timer.""" + assert self.started_, 'timer is not started' + torch.cuda.synchronize() + self.elapsed_ += (time.time() - self.start_time) + self.started_ = False + + def reset(self): + """Reset timer.""" + self.elapsed_ = 0.0 + self.started_ = False + + def elapsed(self, reset=True): + """Calculate the elapsed time.""" + started_ = self.started_ + # If the timing in progress, end it first. 
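+            # stop() folds the running interval into elapsed_ so the value
+            # read below includes it; the timer is restarted at the end if it
+            # was running.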
+ if self.started_: + self.stop() + # Get the elapsed time. + elapsed_ = self.elapsed_ + # Reset the elapsed time + if reset: + self.reset() + # If timing was in progress, set it back. + if started_: + self.start() + return elapsed_ + + def __init__(self): + self.timers = {} + + def __call__(self, name): + if name not in self.timers: + self.timers[name] = self.Timer(name) + return self.timers[name] + + def log(self, names, normalizer=1.0, reset=True): + """Log a group of timers.""" + assert normalizer > 0.0 + string = 'time (ms)' + for name in names: + elapsed_time = self.timers[name].elapsed( + reset=reset) * 1000.0 / normalizer + string += ' | {}: {:.2f}'.format(name, elapsed_time) + print_rank_0(string) + + +def report_memory(name): + """Simple GPU memory report.""" + + mega_bytes = 1024.0 * 1024.0 + string = name + ' memory (MB)' + string += ' | allocated: {}'.format(torch.cuda.memory_allocated() + / mega_bytes) + string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated() + / mega_bytes) + string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes) + string += ' | max cached: {}'.format(torch.cuda.memory_reserved() + / mega_bytes) + print_rank_0(string) + + +def get_checkpoint_name(checkpoints_path, + iteration, + release=False, + zero=False): + if release: + d = 'release' + else: + d = '{}'.format(iteration) + if zero: + dp_rank = mpu.get_data_parallel_rank() + d += '_zero_dp_rank_{}'.format(dp_rank) + return os.path.join( + checkpoints_path, d, + 'mp_rank_{:02d}_model_states.pt'.format(mpu.get_model_parallel_rank())) + + +def ensure_directory_exists(filename): + dirname = os.path.dirname(filename) + if not os.path.exists(dirname): + os.makedirs(dirname, exist_ok=True) + + +def get_checkpoint_tracker_filename(checkpoints_path): + return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt') + + +def save_zero_checkpoint(args, iteration, optimizer): + zero_sd = { + 'iteration': iteration, + 'optimizer_state_dict': optimizer.state_dict() + } + zero_checkpoint_name = get_checkpoint_name(args.save, iteration, zero=True) + ensure_directory_exists(zero_checkpoint_name) + torch.save(zero_sd, zero_checkpoint_name) + print(' successfully saved {}'.format(zero_checkpoint_name)) + + +def save_checkpoint(iteration, + model, + optimizer, + lr_scheduler, + args, + tag=None, + barrier=True, + only_changed_parameters=False, + no_deepspeed=False, + no_save_optim=False): + """Save a model checkpoint.""" + if tag is None: + tag = str(iteration) + if args.deepspeed and not no_deepspeed: + save_ds_checkpoint(iteration, model, lr_scheduler, args, tag=tag) + else: + # Only rank zer0 of the data parallel writes to the disk. + + if mpu.get_data_parallel_rank() == 0: + checkpoint_name = get_checkpoint_name(args.save, tag) + print( + 'global rank {} is saving checkpoint at iteration {:7d} to {}'. + format(torch.distributed.get_rank(), iteration, + checkpoint_name)) + sd = {'iteration': iteration} + if args.deepspeed: + model = model.module + state_dict = model.state_dict() + if only_changed_parameters: + requires_grad_dict = {} + for name, parameter in model.named_parameters(): + requires_grad_dict[name] = parameter.requires_grad + state_dict = { + key: value + for key, value in state_dict.items() + if requires_grad_dict[key] + } + sd['module'] = state_dict + + # Optimizer stuff. 
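+            # Optimizer and LR-scheduler states are saved as well, unless
+            # disabled via args.no_save_optim or the no_save_optim argument.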
+ if not args.no_save_optim and not no_save_optim: + if optimizer is not None: + sd['optimizer'] = optimizer.state_dict() + if lr_scheduler is not None: + sd['lr_scheduler'] = lr_scheduler.state_dict() + + # rng states. + if not args.no_save_rng: + sd['random_rng_state'] = random.getstate() + sd['np_rng_state'] = np.random.get_state() + sd['torch_rng_state'] = torch.get_rng_state() + sd['cuda_rng_state'] = torch.cuda.get_rng_state() + sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker( + ).get_states() + + ensure_directory_exists(checkpoint_name) + torch.save(sd, checkpoint_name) + print(' successfully saved {}'.format(checkpoint_name)) + + # Wait so everyone is done (necessary) + if barrier: + torch.distributed.barrier() + # And update the latest iteration + if torch.distributed.get_rank() == 0: + tracker_filename = get_checkpoint_tracker_filename(args.save) + with open(tracker_filename, 'w') as f: + f.write(tag) + + +def save_ds_checkpoint(iteration, model, lr_scheduler, args, tag): + """Save a model checkpoint.""" + + sd = {} + sd['iteration'] = iteration + if lr_scheduler is not None: + sd['client_lr_scheduler'] = lr_scheduler.state_dict() + # rng states. + if not args.no_save_rng: + sd['random_rng_state'] = random.getstate() + sd['np_rng_state'] = np.random.get_state() + sd['torch_rng_state'] = torch.get_rng_state() + sd['cuda_rng_state'] = torch.cuda.get_rng_state() + sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states() + model.save_checkpoint(args.save, tag, client_state=sd) + + +def get_checkpoint_iteration(load_path): + # Read the tracker file and set the iteration. + tracker_filename = get_checkpoint_tracker_filename(load_path) + if not os.path.isfile(tracker_filename): + print_rank_0('WARNING: could not find the metadata file {} '.format( + tracker_filename)) + if os.path.isdir(load_path): + path = os.path.normpath(load_path) + load_dir, tag = os.path.split(path) + print_rank_0( + 'Try to directly load the checkpoint from the directory') + return load_dir, tag, False, True + print_rank_0(' will not load any checkpoints and will start from ' + 'random') + return load_path, 0, False, False + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + release = metastring == 'release' + # try: + # iteration = int(metastring) + # except ValueError: + # release = metastring == 'release' + # if not release: + # print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format( + # tracker_filename)) + # exit() + + # assert iteration > 0 or release, 'error parsing metadata file {}'.format( + # tracker_filename) + + return load_path, metastring, release, True + + +def load_checkpoint(model, + optimizer, + lr_scheduler, + args, + no_deepspeed=False, + no_load_optim=False): + """Load a model checkpoint.""" + + load_dir, tag, release, success = get_checkpoint_iteration(args.load) + + if not success: + return 0 + + if args.deepspeed and not no_deepspeed: + + checkpoint_name, sd = model.load_checkpoint( + load_dir, + tag, + load_optimizer_states=not args.no_load_optim and not no_load_optim, + load_lr_scheduler_states=not args.no_load_lr_scheduler) + if not args.no_load_lr_scheduler and 'client_lr_scheduler' in sd: + lr_scheduler.load_state_dict(sd['client_lr_scheduler']) + print_rank_0('Load lr scheduler state') + if checkpoint_name is None: + if mpu.get_data_parallel_rank() == 0: + print('Unable to load checkpoint.') + return tag + + else: + + # Checkpoint. 
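+        # Loading without the DeepSpeed checkpoint engine: read the raw torch
+        # checkpoint on CPU, then restore module weights, optimizer and
+        # lr-scheduler state, and RNG states below.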
+ checkpoint_name = get_checkpoint_name(load_dir, tag, release) + + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + # Load the checkpoint. + sd = torch.load(checkpoint_name, map_location='cpu') + + # Model. + if args.deepspeed: + model = model.module + missing_keys, unexpected_keys = model.load_state_dict( + sd['module'], strict=False) + if missing_keys or unexpected_keys: + print_rank_0( + f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}' + ) + + # Optimizer. + if not release and not args.finetune and not args.no_load_optim and not no_load_optim: + try: + if optimizer is not None: + optimizer.load_state_dict(sd['optimizer']) + if lr_scheduler is not None: + lr_scheduler.load_state_dict(sd['lr_scheduler']) + except KeyError: + print_rank_0( + 'Unable to load optimizer from checkpoint {}, exiting. ' + 'Specify --no-load-optim or --finetune to prevent ' + 'attempting to load the optimizer ' + 'state.'.format(checkpoint_name)) + + # Iterations. + if args.finetune or release: + iteration = 0 + else: + try: + iteration = sd['iteration'] + except KeyError: + try: # Backward compatible with older checkpoints + iteration = sd['total_iters'] + except KeyError: + print_rank_0( + 'A metadata file exists but Unable to load iteration ' + ' from checkpoint {}, starting from 0 iteration'.format( + checkpoint_name)) + iteration = 0 + + # rng states. + if not release and not args.finetune and not args.no_load_rng: + try: + random.setstate(sd['random_rng_state']) + np.random.set_state(sd['np_rng_state']) + torch.set_rng_state(sd['torch_rng_state']) + torch.cuda.set_rng_state(sd['cuda_rng_state']) + mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states']) + except KeyError: + print_rank_0( + 'Unable to load random state from checkpoint {}, exiting. ' + 'Specify --no-load-rng or --finetune to prevent ' + 'attempting to load the random ' + 'state.'.format(checkpoint_name)) + + if mpu.get_data_parallel_rank() == 0: + print(' successfully loaded {}'.format(checkpoint_name)) + + return iteration + + +def load_weights(src, dst, dst2src=False): + """ + Loads weights from src to dst via in place copy. + src is a huggingface gpt2model, while dst is one of our models. + dst2src=True loads parameters from our models into huggingface's. + ^dst2src is still untested + """ + conv_layer = 'Conv1D' in str(type(src)) + for n, p in src.named_parameters(): + if dst2src: + data = dst._parameters[n].data + load = p.data + else: + data = p.data + load = dst._parameters[n].data + if conv_layer and 'weight' in n: + data = data.t().contiguous() + load.copy_(data) + + +# dst._parameters[n].data.copy_(data) + + +def load_mlp(our, oai, dst2src=False): + load_weights(oai.c_fc, our.dense_h_to_4h, dst2src) + load_weights(oai.c_proj, our.dense_4h_to_h, dst2src) + + +def load_attention(our, oai, dst2src=False): + load_weights(oai.c_attn, our.query_key_value, dst2src) + load_weights(oai.c_proj, our.dense, dst2src) + + +def load_transformer_layer(our, oai, dst2src=False): + load_weights(oai.ln_1, our.input_layernorm, dst2src) + load_weights(oai.ln_2, our.post_attention_layernorm, dst2src) + load_mlp(our.mlp, oai.mlp, dst2src) + load_attention(our.attention, oai.attn, dst2src) + + +def move_weights(our, oai, dst2src=False): + """ + Loads weights from `oai` to `our` via in place copy. + `oai` is a huggingface gpt2model, while `our` is one of our models. + dst2src=True loads parameters from our models into huggingface's. 
+ ^dst2src=True is still untested + """ + # while isinstance(our, (torchDDP, model.distributed.DistributedDataParallel, FP16_Module)): + # our=our.module + transformer_model = oai.transformer + load_weights(transformer_model.ln_f, our.transformer.final_layernorm, + dst2src) + load_weights(transformer_model.wte, our.word_embeddings, dst2src) + load_weights(transformer_model.wpe, our.position_embeddings, dst2src) + + for our_layer, oai_layer in zip(our.transformer.layers, oai.transformer.h): + load_transformer_layer(our_layer, oai_layer, dst2src) + + +def debug_finetune_data(local_vars, batch_id, tokenizer): + tokens, target_ids = local_vars['tokens'], local_vars['target_ids'] + attention_mask, logit_mask, position_ids = local_vars[ + 'attention_mask'], local_vars['logit_mask'], local_vars['position_ids'] + output_tokens = [] + sep = attention_mask[batch_id].item() + for i, token in enumerate(tokens[batch_id][:sep].tolist()): + token = tokenizer.IdToToken(token) + if token == '[MASK]': + token = f'[{position_ids[batch_id][0, i].item()}]' + output_tokens.append(token) + print(' '.join(output_tokens)) + target_positions = [] + for i in range(sep, tokens.size(-1)): + if logit_mask[batch_id][i]: + target_positions.append(i) + print(target_positions) + print(tokenizer.DecodeIds(tokens[batch_id][target_positions].tolist())) + if len(target_ids.shape) > 2: + print( + tokenizer.DecodeIds( + target_ids[batch_id][target_positions].tolist())) + else: + print(tokenizer.DecodeIds(target_ids[batch_id].tolist())) + print(position_ids[batch_id][:, target_positions]) diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index cbdeede4..b983125a 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -516,6 +516,12 @@ TASK_OUTPUTS = { # } Tasks.text_generation: [OutputKeys.TEXT], + # summarization result for single sample + # { + # "text": "this is the text generated by a model." 
+ # } + Tasks.text_summarization: [OutputKeys.TEXT], + # text generation result for single sample # { # "text": "北京" diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 7b726308..1206ae08 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -31,6 +31,7 @@ if TYPE_CHECKING: from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline + from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \ WordSegmentationThaiPipeline @@ -71,6 +72,7 @@ else: 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], + 'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'], 'multilingual_word_segmentation_pipeline': [ 'MultilingualWordSegmentationPipeline', 'WordSegmentationThaiPipeline' diff --git a/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py b/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py new file mode 100644 index 00000000..c6d03077 --- /dev/null +++ b/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 Zhipu.AI + +from typing import Any, Dict, Optional, Union + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.models.nlp import MGLMForTextSummarization +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import (MGLMSummarizationPreprocessor, + Preprocessor) +from modelscope.utils.constant import Tasks + +__all__ = ['MGLMTextSummarizationPipeline'] + + +@PIPELINES.register_module( + group_key=Tasks.text_summarization, + module_name=Pipelines.mglm_text_summarization) +class MGLMTextSummarizationPipeline(Pipeline): + + def __init__(self, + model: Union[MGLMForTextSummarization, str], + preprocessor: [Preprocessor] = None, + *args, + **kwargs): + model = MGLMForTextSummarization(model) if isinstance(model, + str) else model + self.model = model + self.model.eval() + if preprocessor is None: + preprocessor = MGLMSummarizationPreprocessor() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + # define the forward pass + def forward(self, inputs: Union[Dict, str], + **forward_params) -> Dict[str, Any]: + inputs = {'text': inputs} if isinstance(inputs, str) else inputs + return self.model.generate(inputs) + + # format the outputs from pipeline + def postprocess(self, input, **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index e568098f..0db1c7e0 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -18,16 +18,16 @@ if TYPE_CHECKING: from .nlp import ( DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor, FillMaskPoNetPreprocessor, NLPPreprocessor, - NLPTokenizerPreprocessorBase, TextRankingPreprocessor, - RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, TokenClassificationPreprocessor, - TextErrorCorrectionPreprocessor, TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, Tokenize, + NLPTokenizerPreprocessorBase, PassageRankingPreprocessor, + 
TextRankingPreprocessor, RelationExtractionPreprocessor, + SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor, + TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize, WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, - DialogModelingPreprocessor, DialogStateTrackingPreprocessor, - ConversationalTextToSqlPreprocessor, + MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, + DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, + DialogStateTrackingPreprocessor, ConversationalTextToSqlPreprocessor, TableQuestionAnsweringPreprocessor, NERPreprocessorViet, NERPreprocessorThai, WordSegmentationPreprocessorThai) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor @@ -57,6 +57,7 @@ else: 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', + 'MGLMSummarizationPreprocessor', 'ZeroShotClassificationPreprocessor', 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'NERPreprocessorViet', 'NERPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index d9c55fe1..7c48fb3c 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -29,6 +29,7 @@ if TYPE_CHECKING: MultiWOZBPETextField, IntentBPETextField) from .space_T_en import ConversationalTextToSqlPreprocessor from .space_T_cn import TableQuestionAnsweringPreprocessor + from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor else: _import_structure = { 'nlp_base': [ @@ -62,6 +63,7 @@ else: 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], + 'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'], 'token_classification_thai_preprocessor': [ 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py b/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py new file mode 100644 index 00000000..0a68a9fa --- /dev/null +++ b/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 Zhipu.AI + +import os.path as osp +import re +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +from modelscope.metainfo import Models, Preprocessors +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger +from modelscope.utils.nlp import import_external_nltk_data +from modelscope.utils.type_assert import type_assert + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.mglm_summarization) +class MGLMSummarizationPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + """preprocess the data + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[str, 
tuple, Dict]) -> Dict[str, Any]: + return data diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 9a4abd71..80fee546 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,18 +1,25 @@ +boto3 en_core_web_sm>=2.3.5 +fasttext +filelock +ftfy jieba>=0.42.1 -megatron_util +matplotlib +nltk pai-easynlp +pandas # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. protobuf>=3.19.0,<3.21.0 pythainlp pyvi -# rough-score was just recently updated from 0.0.4 to 0.0.7 -# which introduced compatability issues that are being investigated -rouge_score<=0.0.4 +regex sacremoses>=0.0.41 +scikit_learn +sentencepiece seqeval spacy>=2.3.5 subword_nmt>=0.3.8 +termcolor text2sql_lgesql tokenizers transformers>=4.12.0 diff --git a/tests/pipelines/test_mglm_text_summarization.py b/tests/pipelines/test_mglm_text_summarization.py new file mode 100644 index 00000000..47abc741 --- /dev/null +++ b/tests/pipelines/test_mglm_text_summarization.py @@ -0,0 +1,47 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import unittest + +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.preprocessors import MGLMSummarizationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class mGLMTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.output_dir = 'unittest_output' + os.makedirs(self.output_dir, exist_ok=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_mglm_with_name(self): + model = 'ZhipuAI/Multilingual-GLM-Summarization-zh' + preprocessor = MGLMSummarizationPreprocessor() + pipe = pipeline( + task=Tasks.text_summarization, + model=model, + preprocessor=preprocessor, + ) + result = pipe( + '据中国载人航天工程办公室消息,北京时间2022年10月25日,梦天实验舱与长征五号B遥四运载火箭组合体已转运至发射区。后续将按计划开展发射前各项功能检查和联合测试等工作,计划于近日择机实施发射。目前,文昌航天发射场设施设备状态良好,参试各单位正在加紧开展任务准备,全力以赴确保空间站建造任务决战决胜。' # noqa + ) + print(result) + + model = 'ZhipuAI/Multilingual-GLM-Summarization-en' + preprocessor = MGLMSummarizationPreprocessor() + pipe = pipeline( + task=Tasks.text_summarization, + model=model, + preprocessor=preprocessor, + ) + result = pipe( + '据中国载人航天工程办公室消息,北京时间2022年10月25日,梦天实验舱与长征五号B遥四运载火箭组合体已转运至发射区。后续将按计划开展发射前各项功能检查和联合测试等工作,计划于近日择机实施发射。目前,文昌航天发射场设施设备状态良好,参试各单位正在加紧开展任务准备,全力以赴确保空间站建造任务决战决胜。' # noqa + ) + print(result) + + +if __name__ == '__main__': + unittest.main() From 4b7e8e89aade38131e35e05d04fd4aa2dacca0c9 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 28 Oct 2022 21:44:33 +0800 Subject: [PATCH 04/46] [to #42322933] Fix some bugs when downgrade the version of some dependencies 1. Fix bug in model exporting 2. Skip some long trainings in test level 2 3. Refine some comments 4. 
Fix a bug that mode is not correct when saving checkpoints Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10564716 --- modelscope/exporters/torch_model_exporter.py | 86 +++++++++++++++++-- modelscope/models/base/base_model.py | 7 ++ modelscope/models/nlp/bert/text_ranking.py | 1 + .../nlp/structbert/text_classification.py | 1 + modelscope/trainers/trainer.py | 4 +- ...st_export_sbert_sequence_classification.py | 2 +- .../test_finetune_sequence_classification.py | 2 +- tests/trainers/test_trainer_with_nlp.py | 2 +- 8 files changed, 92 insertions(+), 13 deletions(-) diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py index 94ef277a..7bf6c0c0 100644 --- a/modelscope/exporters/torch_model_exporter.py +++ b/modelscope/exporters/torch_model_exporter.py @@ -7,9 +7,9 @@ from typing import Any, Dict, Mapping import torch from torch import nn from torch.onnx import export as onnx_export -from torch.onnx.utils import _decide_input_format from modelscope.models import TorchModel +from modelscope.outputs import ModelOutputBase from modelscope.pipelines.base import collate_fn from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger @@ -102,6 +102,53 @@ class TorchModelExporter(Exporter): """ return None + @staticmethod + def _decide_input_format(model, args): + import inspect + + def _signature(model) -> inspect.Signature: + should_be_callable = getattr(model, 'forward', model) + if callable(should_be_callable): + return inspect.signature(should_be_callable) + raise ValueError('model has no forward method and is not callable') + + try: + sig = _signature(model) + except ValueError as e: + logger.warn('%s, skipping _decide_input_format' % e) + return args + try: + ordered_list_keys = list(sig.parameters.keys()) + if ordered_list_keys[0] == 'self': + ordered_list_keys = ordered_list_keys[1:] + args_dict: Dict = {} + if isinstance(args, list): + args_list = args + elif isinstance(args, tuple): + args_list = list(args) + else: + args_list = [args] + if isinstance(args_list[-1], dict): + args_dict = args_list[-1] + args_list = args_list[:-1] + n_nonkeyword = len(args_list) + for optional_arg in ordered_list_keys[n_nonkeyword:]: + if optional_arg in args_dict: + args_list.append(args_dict[optional_arg]) + # Check if this arg has a default value + else: + param = sig.parameters[optional_arg] + if param.default != param.empty: + args_list.append(param.default) + args = args_list if isinstance(args, list) else tuple(args_list) + # Cases of models with no input args + except IndexError: + logger.warn('No input args, skipping _decide_input_format') + except Exception as e: + logger.warn('Skipping _decide_input_format\n {}'.format(e.args[0])) + + return args + def _torch_export_onnx(self, model: nn.Module, output: str, @@ -179,16 +226,21 @@ class TorchModelExporter(Exporter): with torch.no_grad(): model.eval() outputs_origin = model.forward( - *_decide_input_format(model, dummy_inputs)) - if isinstance(outputs_origin, Mapping): - outputs_origin = numpify_tensor_nested( - list(outputs_origin.values())) + *self._decide_input_format(model, dummy_inputs)) + if isinstance(outputs_origin, (Mapping, ModelOutputBase)): + outputs_origin = list( + numpify_tensor_nested(outputs_origin).values()) elif isinstance(outputs_origin, (tuple, list)): - outputs_origin = numpify_tensor_nested(outputs_origin) + outputs_origin = list(numpify_tensor_nested(outputs_origin)) outputs = ort_session.run( onnx_outputs, 
numpify_tensor_nested(dummy_inputs), ) + outputs = numpify_tensor_nested(outputs) + if isinstance(outputs, dict): + outputs = list(outputs.values()) + elif isinstance(outputs, tuple): + outputs = list(outputs) tols = {} if rtol is not None: @@ -232,12 +284,26 @@ class TorchModelExporter(Exporter): 'Model property dummy_inputs must be set.') dummy_inputs = collate_fn(dummy_inputs, device) if isinstance(dummy_inputs, Mapping): - dummy_inputs = tuple(dummy_inputs.values()) + dummy_inputs = self._decide_input_format(model, dummy_inputs) + dummy_inputs_filter = [] + for _input in dummy_inputs: + if _input is not None: + dummy_inputs_filter.append(_input) + else: + break + + if len(dummy_inputs) != len(dummy_inputs_filter): + logger.warn( + f'Dummy inputs is not continuous in the forward method, ' + f'origin length: {len(dummy_inputs)}, ' + f'the length after filtering: {len(dummy_inputs_filter)}') + dummy_inputs = dummy_inputs_filter + with torch.no_grad(): model.eval() with replace_call(): traced_model = torch.jit.trace( - model, dummy_inputs, strict=strict) + model, tuple(dummy_inputs), strict=strict) torch.jit.save(traced_model, output) if validation: @@ -249,6 +315,10 @@ class TorchModelExporter(Exporter): outputs = numpify_tensor_nested(outputs) outputs_origin = model.forward(*dummy_inputs) outputs_origin = numpify_tensor_nested(outputs_origin) + if isinstance(outputs, dict): + outputs = list(outputs.values()) + if isinstance(outputs_origin, dict): + outputs_origin = list(outputs_origin.values()) tols = {} if rtol is not None: tols['rtol'] = rtol diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index e01d1f05..1ca7e030 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -161,5 +161,12 @@ class Model(ABC): assert config is not None, 'Cannot save the model because the model config is empty.' 
if isinstance(config, Config): config = config.to_dict() + if 'preprocessor' in config and config['preprocessor'] is not None: + if 'mode' in config['preprocessor']: + config['preprocessor']['mode'] = 'inference' + elif 'val' in config['preprocessor'] and 'mode' in config[ + 'preprocessor']['val']: + config['preprocessor']['val']['mode'] = 'inference' + save_pretrained(self, target_folder, save_checkpoint_names, save_function, config, **kwargs) diff --git a/modelscope/models/nlp/bert/text_ranking.py b/modelscope/models/nlp/bert/text_ranking.py index d6bbf277..b5ac8d7e 100644 --- a/modelscope/models/nlp/bert/text_ranking.py +++ b/modelscope/models/nlp/bert/text_ranking.py @@ -36,6 +36,7 @@ class BertForTextRanking(BertForSequenceClassification): output_attentions=None, output_hidden_states=None, return_dict=None, + *args, **kwargs) -> AttentionTextClassificationModelOutput: outputs = self.base_model.forward( input_ids=input_ids, diff --git a/modelscope/models/nlp/structbert/text_classification.py b/modelscope/models/nlp/structbert/text_classification.py index 044cf8d0..8797beb3 100644 --- a/modelscope/models/nlp/structbert/text_classification.py +++ b/modelscope/models/nlp/structbert/text_classification.py @@ -109,6 +109,7 @@ class SbertForSequenceClassification(SbertPreTrainedModel): output_attentions=None, output_hidden_states=None, return_dict=None, + *args, **kwargs): r""" Args: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index aaf24cfa..7478d8e4 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -672,7 +672,7 @@ class EpochBasedTrainer(BaseTrainer): self.model, cfg=cfg, default_args=default_args) except KeyError as e: self.logger.error( - f'Build optimizer error, the optimizer {cfg} is native torch optimizer, ' + f'Build optimizer error, the optimizer {cfg} is a torch native component, ' f'please check if your torch with version: {torch.__version__} matches the config.' ) raise e @@ -682,7 +682,7 @@ class EpochBasedTrainer(BaseTrainer): return build_lr_scheduler(cfg=cfg, default_args=default_args) except KeyError as e: self.logger.error( - f'Build lr_scheduler error, the lr_scheduler {cfg} is native torch lr_scheduler, ' + f'Build lr_scheduler error, the lr_scheduler {cfg} is a torch native component, ' f'please check if your torch with version: {torch.__version__} matches the config.' 
) raise e diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py index 0e4f8349..7533732d 100644 --- a/tests/export/test_export_sbert_sequence_classification.py +++ b/tests/export/test_export_sbert_sequence_classification.py @@ -23,7 +23,7 @@ class TestExportSbertSequenceClassification(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() - @unittest.skip + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_export_sbert_sequence_classification(self): model = Model.from_pretrained(self.model_id) print( diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py index 02dd9d2f..061d37d3 100644 --- a/tests/trainers/test_finetune_sequence_classification.py +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -38,7 +38,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skip def test_trainer_cfg_class(self): dataset = MsDataset.load('clue', subset_name='tnews') train_dataset = dataset['train'] diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index d9d56b60..f1d9e414 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -72,7 +72,7 @@ class TestTrainerWithNlp(unittest.TestCase): output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) pipeline_sentence_similarity(output_dir) - @unittest.skipUnless(test_level() >= 3, 'skip test in current test level') + @unittest.skip def test_trainer_with_backbone_head(self): model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' kwargs = dict( From 3791ee7ad2a1e4cc8f5586c7de138ef58a2db3db Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Sat, 29 Oct 2022 13:44:47 +0800 Subject: [PATCH 05/46] [to #45821936]fix: fix block user specify revision after release_datetime Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10572162 --- modelscope/hub/api.py | 11 ++- tests/hub/test_hub_revision_release_mode.py | 84 ++++++++++++++++++++- 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 5923319d..dca6d099 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -382,10 +382,11 @@ class HubApi: logger.info('Model revision not specified, use default: %s in development mode' % revision) if revision not in branches and revision not in tags: raise NotExistError('The model: %s has no branch or tag : %s .' % revision) + logger.info('Development mode use revision: %s' % revision) else: - revisions = self.list_model_revisions( - model_id, cutoff_timestamp=release_timestamp, use_cookies=False if cookies is None else cookies) - if revision is None: + if revision is None: # user not specified revision, use latest revision before release time + revisions = self.list_model_revisions( + model_id, cutoff_timestamp=release_timestamp, use_cookies=False if cookies is None else cookies) if len(revisions) == 0: raise NoValidRevisionError('The model: %s has no valid revision!' 
% model_id) # tags (revisions) returned from backend are guaranteed to be ordered by create-time @@ -393,9 +394,13 @@ class HubApi: revision = revisions[0] logger.info('Model revision not specified, use the latest revision: %s' % revision) else: + # use user-specified revision + revisions = self.list_model_revisions( + model_id, cutoff_timestamp=current_timestamp, use_cookies=False if cookies is None else cookies) if revision not in revisions: raise NotExistError( 'The model: %s has no revision: %s !' % (model_id, revision)) + logger.info('Use user-specified model revision: %s' % revision) return revision def get_model_branches_and_tags( diff --git a/tests/hub/test_hub_revision_release_mode.py b/tests/hub/test_hub_revision_release_mode.py index 729a1861..73a0625e 100644 --- a/tests/hub/test_hub_revision_release_mode.py +++ b/tests/hub/test_hub_revision_release_mode.py @@ -115,7 +115,7 @@ class HubRevisionTest(unittest.TestCase): time.sleep(10) self.add_new_file_and_tag_to_repo() t2 = datetime.now().isoformat(sep=' ', timespec='seconds') - logger.info('Secnod time: %s' % t2) + logger.info('Second time: %s' % t2) # set release_datetime_backup = version.__release_datetime__ logger.info('Origin __release_datetime__: %s' @@ -142,6 +142,43 @@ class HubRevisionTest(unittest.TestCase): finally: version.__release_datetime__ = release_datetime_backup + def test_snapshot_download_revision_user_set_revision(self): + with mock.patch.dict(os.environ, self.modified_environ, clear=True): + self.prepare_repo_data_and_tag() + t1 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('First time: %s' % t1) + time.sleep(10) + self.add_new_file_and_tag_to_repo() + t2 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('Secnod time: %s' % t2) + # set + release_datetime_backup = version.__release_datetime__ + logger.info('Origin __release_datetime__: %s' + % version.__release_datetime__) + try: + logger.info('Setting __release_datetime__ to: %s' % t1) + version.__release_datetime__ = t1 + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, + revision=self.revision, + cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + assert not os.path.exists( + os.path.join(snapshot_path, download_model_file_name2)) + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, + revision=self.revision2, + cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name2)) + finally: + version.__release_datetime__ = release_datetime_backup + def test_file_download_revision(self): with mock.patch.dict(os.environ, self.modified_environ, clear=True): self.prepare_repo_data_and_tag() @@ -175,7 +212,6 @@ class HubRevisionTest(unittest.TestCase): self.model_id, download_model_file_name, cache_dir=temp_cache_dir) - print('Downloaded file path: %s' % file_path) assert os.path.exists(file_path) file_path = model_file_download( self.model_id, @@ -185,6 +221,50 @@ class HubRevisionTest(unittest.TestCase): finally: version.__release_datetime__ = release_datetime_backup + def test_file_download_revision_user_set_revision(self): + with mock.patch.dict(os.environ, self.modified_environ, clear=True): + self.prepare_repo_data_and_tag() + t1 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('First time stamp: %s' 
% t1) + time.sleep(10) + self.add_new_file_and_tag_to_repo() + t2 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('Second time: %s' % t2) + release_datetime_backup = version.__release_datetime__ + logger.info('Origin __release_datetime__: %s' + % version.__release_datetime__) + try: + version.__release_datetime__ = t1 + logger.info('Setting __release_datetime__ to: %s' % t1) + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, + download_model_file_name, + revision=self.revision, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + with self.assertRaises(NotExistError): + model_file_download( + self.model_id, + download_model_file_name2, + revision=self.revision, + cache_dir=temp_cache_dir) + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, + download_model_file_name, + revision=self.revision2, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + file_path = model_file_download( + self.model_id, + download_model_file_name2, + revision=self.revision2, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + finally: + version.__release_datetime__ = release_datetime_backup + if __name__ == '__main__': unittest.main() From ae55fed2162bae29e7bda5ec821109ae5e7962e0 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sat, 29 Oct 2022 14:37:56 +0800 Subject: [PATCH 06/46] bumpy version to 1.0.0 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index 541dfc57..ca813cc0 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. 
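Note: the new tests pin version.__release_datetime__ between two tags to exercise the rule this patch fixes. A simplified illustration of that rule (not the actual HubApi code; the helper and timestamps are made up): when no revision is given, only tags created before the release datetime are eligible, while a user-specified revision may be newer and only has to exist.

def resolve_revision(user_revision, tags, release_ts, current_ts):
    """tags: list of (tag_name, create_ts), newest first (illustrative helper)."""
    if user_revision is None:
        eligible = [name for name, ts in tags if ts <= release_ts]
        if not eligible:
            raise ValueError('no valid revision before the release datetime')
        return eligible[0]  # latest tag created before the release datetime
    if user_revision not in [name for name, ts in tags if ts <= current_ts]:
        raise ValueError('revision %s does not exist' % user_revision)
    return user_revision  # user-specified revisions are not blocked by release time


tags = [('v1.0.1', 200), ('v1.0.0', 100)]
assert resolve_revision(None, tags, release_ts=150, current_ts=250) == 'v1.0.0'
assert resolve_revision('v1.0.1', tags, release_ts=150, current_ts=250) == 'v1.0.1'
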
-__version__ = '0.5.0'
+__version__ = '1.0.0'
 # default release datetime for branches under active development is set
 # to be a time far-far-away-into-the-future
 __release_datetime__ = '2099-10-13 08:56:12'

From e07f3cdbf5a8a6de91fc19f32be14eda7a6e94c4 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm"
Date: Sat, 29 Oct 2022 15:05:26 +0800
Subject: [PATCH 07/46] remove fasttext

---
 requirements/nlp.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements/nlp.txt b/requirements/nlp.txt
index 80fee546..433f70f7 100644
--- a/requirements/nlp.txt
+++ b/requirements/nlp.txt
@@ -1,6 +1,5 @@
 boto3
 en_core_web_sm>=2.3.5
-fasttext
 filelock
 ftfy
 jieba>=0.42.1

From 29448c0f578757799e16d138d3b1af42db85fde5 Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Sun, 30 Oct 2022 11:15:52 +0800
Subject: [PATCH 08/46] [to #42322933] disable vit

---
 tests/pipelines/test_face_emotion.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pipelines/test_face_emotion.py b/tests/pipelines/test_face_emotion.py
index 907e15ee..96fe51a7 100644
--- a/tests/pipelines/test_face_emotion.py
+++ b/tests/pipelines/test_face_emotion.py
@@ -17,12 +17,12 @@ class FaceEmotionTest(unittest.TestCase):
         result = pipeline(input)
         print(result)

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skip('skip since the model is set to private for now')
     def test_run_modelhub(self):
         face_emotion = pipeline(Tasks.face_emotion, model=self.model)
         self.pipeline_inference(face_emotion, self.img)

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('skip since the model is set to private for now')
     def test_run_modelhub_default_model(self):
         face_emotion = pipeline(Tasks.face_emotion)
         self.pipeline_inference(face_emotion, self.img)

From 902019c2e01c8fa1583f91d2b772872db6ebc75a Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Sun, 30 Oct 2022 13:55:49 +0800
Subject: [PATCH 09/46] [to #42322933] disable vgg19_fer

---
 tests/pipelines/test_facial_expression_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/test_facial_expression_recognition.py b/tests/pipelines/test_facial_expression_recognition.py
index fff83ad6..f5151bef 100644
--- a/tests/pipelines/test_facial_expression_recognition.py
+++ b/tests/pipelines/test_facial_expression_recognition.py
@@ -23,7 +23,7 @@ class FacialExpressionRecognitionTest(unittest.TestCase):
         cv2.imwrite('result.png', img)
         print(f'output written to {osp.abspath("result.png")}')

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skip('skip since the model is set to private for now')
     def test_run_modelhub(self):
         fer = pipeline(
             Tasks.facial_expression_recognition, model=self.model_id)

From 9f7b8b86a33d65d6374b19b355a7ea9d1e572f80 Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Sun, 30 Oct 2022 13:59:12 +0800
Subject: [PATCH 10/46] [to #42322933] disable 2dkeypoints training since face_2d_keypoints_dataset is set to be private

---
 tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py b/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py
index 4dffa998..e4f0c57e 100644
--- a/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py
+++ b/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py
@@ -50,7 +50,8 @@ class EasyCVTrainerTestFace2DKeypoints(unittest.TestCase):
         trainer = 
build_trainer(trainer_name, kwargs) trainer.train() - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skip( + 'skip since face_2d_keypoints_dataset is set to private for now') def test_trainer_single_gpu(self): temp_file_dir = tempfile.TemporaryDirectory() tmp_dir = temp_file_dir.name From e2d35fbb14b342c8ffc214469bca622bf954983c Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Sun, 30 Oct 2022 21:51:11 +0800 Subject: [PATCH 11/46] =?UTF-8?q?[to=20#42322933]clip=E6=94=AF=E6=8C=81fin?= =?UTF-8?q?etune=20=20=20=20=20=20=20=20=20Link:=20https://code.alibaba-in?= =?UTF-8?q?c.com/Ali-MaaS/MaaS-lib/codereview/10572842?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modelscope/metainfo.py | 6 + modelscope/metrics/builder.py | 1 + modelscope/metrics/inbatch_recall_metric.py | 55 +++ modelscope/models/multi_modal/clip/model.py | 156 ++------ .../multi_modal_embedding_pipeline.py | 22 +- modelscope/preprocessors/multi_modal.py | 177 +++++++++ .../hooks/clip_clamp_logit_scale_hook.py | 18 + .../trainers/multi_modal/clip/clip_trainer.py | 345 ++++++++++-------- .../multi_modal/clip/clip_trainer_utils.py | 211 ++++++----- tests/pipelines/test_multi_modal_embedding.py | 6 +- tests/trainers/test_clip_trainer.py | 83 +++++ 11 files changed, 704 insertions(+), 376 deletions(-) create mode 100644 modelscope/metrics/inbatch_recall_metric.py create mode 100644 modelscope/trainers/hooks/clip_clamp_logit_scale_hook.py create mode 100644 tests/trainers/test_clip_trainer.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 3951541c..8c9964b8 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -389,6 +389,7 @@ class Preprocessors(object): # multi-modal preprocessor ofa_tasks_preprocessor = 'ofa-tasks-preprocessor' + clip_preprocessor = 'clip-preprocessor' mplug_tasks_preprocessor = 'mplug-tasks-preprocessor' # science preprocessor @@ -428,6 +429,8 @@ class Metrics(object): image_inpainting_metric = 'image-inpainting-metric' # metric for ocr NED = 'ned' + # metric for cross-modal retrieval + inbatch_recall = 'inbatch_recall' # metric for referring-video-object-segmentation task referring_video_object_segmentation_metric = 'referring-video-object-segmentation-metric' @@ -474,6 +477,9 @@ class Hooks(object): # Compression SparsityHook = 'SparsityHook' + # CLIP logit_scale clamp + ClipClampLogitScaleHook = 'ClipClampLogitScaleHook' + class LR_Schedulers(object): """learning rate scheduler is defined here diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 2b61c1ae..b9e402c5 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -24,6 +24,7 @@ class MetricKeys(object): ROUGE_1 = 'rouge-1' ROUGE_L = 'rouge-l' NED = 'ned' # ocr metric + BatchAcc = 'inbatch_t2i_recall_at_1' task_default_metrics = { diff --git a/modelscope/metrics/inbatch_recall_metric.py b/modelscope/metrics/inbatch_recall_metric.py new file mode 100644 index 00000000..d098a883 --- /dev/null +++ b/modelscope/metrics/inbatch_recall_metric.py @@ -0,0 +1,55 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
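Note: the new Metrics.inbatch_recall and Hooks.ClipClampLogitScaleHook keys are resolved through the registry pattern used throughout ModelScope: a string in the configuration is looked up in a registry populated by class decorators. A stripped-down sketch of that mechanism (the real implementation lives in modelscope.utils.registry; the names below are simplified stand-ins):

METRICS_REGISTRY = {}


def register_metric(name):

    def decorator(cls):
        METRICS_REGISTRY[name] = cls
        return cls

    return decorator


@register_metric('inbatch_recall')
class DummyInbatchRecall:

    def __init__(self, **kwargs):
        self.kwargs = kwargs


def build_metric(cfg):
    cfg = dict(cfg)
    return METRICS_REGISTRY[cfg.pop('type')](**cfg)


# mirrors the evaluation config entry: 'metrics': [{'type': 'inbatch_recall'}]
metric = build_metric({'type': 'inbatch_recall'})
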
+ +from typing import Dict + +import numpy as np +import torch + +from modelscope.metainfo import Metrics +from modelscope.outputs import OutputKeys +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.inbatch_recall) +class InbatchRecallMetric(Metric): + """The metric computation class for in-batch retrieval classes. + + This metric class calculates in-batch image recall@1 for each input batch. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.inbatch_t2i_hitcnts = [] + self.batch_sizes = [] + + def add(self, outputs: Dict, inputs: Dict): + image_features = outputs[OutputKeys.IMG_EMBEDDING] + text_features = outputs[OutputKeys.TEXT_EMBEDDING] + + assert type(image_features) == torch.Tensor and type( + text_features) == torch.Tensor + + with torch.no_grad(): + logits_per_image = image_features @ text_features.t() + logits_per_text = logits_per_image.t() + batch_size = logits_per_image.shape[0] + + ground_truth = torch.arange(batch_size).long() + ground_truth = ground_truth.to(image_features.device) + + inbatch_t2i_hitcnt = (logits_per_text.argmax(-1) == ground_truth + ).sum().float().item() + + self.inbatch_t2i_hitcnts.append(inbatch_t2i_hitcnt) + self.batch_sizes.append(batch_size) + + def evaluate(self): + assert len(self.inbatch_t2i_hitcnts) == len( + self.batch_sizes) and len(self.batch_sizes) > 0 + return { + MetricKeys.BatchAcc: + sum(self.inbatch_t2i_hitcnts) / sum(self.batch_sizes) + } diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py index 92d9e11a..b1c84292 100644 --- a/modelscope/models/multi_modal/clip/model.py +++ b/modelscope/models/multi_modal/clip/model.py @@ -15,15 +15,13 @@ import os from collections import OrderedDict -from typing import Any, Dict, Iterable, List, Tuple, Union +from typing import Any, Dict, Tuple, Union import json import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from PIL import Image -from torchvision.transforms import Compose, Normalize, Resize, ToTensor from modelscope.metainfo import Models from modelscope.models import TorchModel @@ -506,21 +504,6 @@ def convert_weights(model: nn.Module): model.apply(_convert_weights_to_fp16) -def _convert_to_rgb(image): - return image.convert('RGB') - - -def image_transform(image_size=224): - transform = Compose([ - _convert_to_rgb, - Resize((image_size, image_size)), - ToTensor(), - Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)), - ]) - return transform - - @MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) class CLIPForMultiModalEmbedding(TorchModel): @@ -540,72 +523,40 @@ class CLIPForMultiModalEmbedding(TorchModel): with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft: - model_info = json.load(fv) + self.model_info = json.load(fv) for k, v in json.load(ft).items(): - model_info[k] = v - - # image preprocess - self.img_preprocess = image_transform(model_info['image_resolution']) + self.model_info[k] = v - # text tokenizer vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}' self.tokenizer = FullTokenizer(vocab_file=vocab_file) # initialize the model - self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer) + self.clip_model = CLIP(**self.model_info, tokenizer=self.tokenizer) convert_weights(self.clip_model) # restore the pretrained 
weight checkpoint = torch.load( f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu') - sd = checkpoint['state_dict'] + sd = checkpoint[ + 'state_dict'] if 'state_dict' in checkpoint else checkpoint if next(iter(sd.items()))[0].startswith('module'): sd = {k[len('module.'):]: v for k, v in sd.items()} + # support the finetuned model + if next(iter(sd.items()))[0].startswith('clip_model'): + sd = {k[len('clip_model.'):]: v for k, v in sd.items()} self.clip_model.load_state_dict(sd) self.clip_model.eval() # place the model - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - if self.device == 'cuda': + self.device = 'cuda:{}'.format(int(os.environ.get( + 'LOCAL_RANK', 0))) if torch.cuda.is_available() else 'cpu' + if torch.cuda.is_available(): self.clip_model.to(self.device) - logger.info('Use GPU for inference') + logger.info('Use GPU {} for finetuning & inference'.format( + int(os.environ.get('LOCAL_RANK', 0)))) else: self.clip_model.float() - logger.info('Use CPU for inference') - - def tokenize(self, - texts: Union[str, List[str]], - context_length: int = 52) -> torch.LongTensor: - """ - Returns the tokenized representation of given input string(s) - Parameters - ---------- - texts : Union[str, List[str]] - An input string or a list of input strings to tokenize - context_length : int - The context length to use; all baseline models use 24 as the context length - Returns - ------- - A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] - """ - if isinstance(texts, str): - texts = [texts] - - all_tokens = [] - for text in texts: - all_tokens.append( - [self.tokenizer.vocab['[CLS]']] - + self.tokenizer.convert_tokens_to_ids( - self.tokenizer.tokenize(text))[:context_length - 2] - + [self.tokenizer.vocab['[SEP]']]) - - result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) - - for i, tokens in enumerate(all_tokens): - assert len(tokens) <= context_length - result[i, :len(tokens)] = torch.tensor(tokens) - - return result + logger.info('Use CPU for finetuning & inference') def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: from modelscope.outputs import OutputKeys @@ -613,75 +564,36 @@ class CLIPForMultiModalEmbedding(TorchModel): OutputKeys.IMG_EMBEDDING: None, OutputKeys.TEXT_EMBEDDING: None } - if 'img' in input and input['img'] is not None: - image_input = input['img'] - - # single image input - if isinstance(image_input, Image.Image): - image_tensor = self.img_preprocess(image_input).unsqueeze(0) - # multi images input - elif isinstance(image_input, list): - if all([isinstance(elem, Image.Image) - for elem in image_input]): - image_tensor = torch.stack( - [self.img_preprocess(elem) for elem in image_input], - dim=0) - else: - unsupported_elem_type = [ - type(elem) for elem in image_input - if not isinstance(elem, Image.Image) - ][0] - raise TypeError( - f'img should be PIL.Image or List[PIL.Image], \ - but got a List containing one {unsupported_elem_type}' - ) - # others - else: - raise TypeError( - f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}' - ) - - image_tensor = image_tensor.to(self.device) - - with torch.no_grad(): + mode = input.get('mode', ModeKeys.INFERENCE) + + # encode the image + if 'img' in input and isinstance(input['img'], torch.Tensor): + image_tensor = input['img'].to(self.device) + if image_tensor.dim() == 5 and image_tensor.shape[1] == 1: + image_tensor = image_tensor.squeeze(1) + + with torch.autograd.set_grad_enabled(mode == ModeKeys.TRAIN): 
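+                # gradients are tracked only when mode == ModeKeys.TRAIN (finetuning);
+                # for inference this is equivalent to running under torch.no_grad()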
image_features = self.clip_model.encode_image(image_tensor) image_features /= image_features.norm( dim=-1, keepdim=True) # l2-normalize output[OutputKeys.IMG_EMBEDDING] = image_features - if 'text' in input and input['text'] is not None: - text_input = input['text'] - - # single text input - if isinstance(text_input, str): - text_tensor = self.tokenize(text_input) - # multi texts input - elif isinstance(text_input, list): - if all([isinstance(elem, str) for elem in text_input]): - text_tensor = self.tokenize(text_input) - else: - unsupported_elem_type = [ - type(elem) for elem in text_input - if not isinstance(elem, str) - ][0] - raise TypeError( - f'text should be str or List[str], but got a List containing one {unsupported_elem_type}' - ) - # others - else: - raise TypeError( - f'text should be str or List[str], but got {type(text_input)}' - ) - - text_tensor = text_tensor.to(self.device) - - with torch.no_grad(): + if 'text' in input and isinstance(input['text'], torch.Tensor): + text_tensor = input['text'].to(self.device) + if text_tensor.dim() == 3 and text_tensor.shape[1] == 1: + text_tensor = text_tensor.squeeze(1) + + with torch.autograd.set_grad_enabled(mode == ModeKeys.TRAIN): text_features = self.clip_model.encode_text(text_tensor) text_features /= text_features.norm( dim=-1, keepdim=True) # l2-normalize output[OutputKeys.TEXT_EMBEDDING] = text_features + if mode == ModeKeys.TRAIN: + output['logit_scale'] = (self.clip_model.logit_scale + * 1.0).exp().mean() + return output def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py index d3f15c23..18ee1dbf 100644 --- a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py @@ -1,10 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
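Note: because both embeddings above are l2-normalized, downstream ranking reduces to a dot product. A small usage sketch built only on that property (tensor shapes, dimensions and names are illustrative):

import torch
import torch.nn.functional as F


def rank_texts(img_emb: torch.Tensor, txt_emb: torch.Tensor) -> torch.Tensor:
    # img_emb: (1, dim), txt_emb: (num_texts, dim); both l2-normalized,
    # so the matrix product equals the cosine similarity
    scores = (img_emb @ txt_emb.t()).squeeze(0)
    return scores.argsort(descending=True)


img_emb = F.normalize(torch.randn(1, 512), dim=-1)
txt_emb = F.normalize(torch.randn(5, 512), dim=-1)
print(rank_texts(img_emb, txt_emb))  # text indices, best match first
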
-from typing import Any, Dict +from typing import Any, Dict, Optional, Union from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal.clip.model import CLIPForMultiModalEmbedding from modelscope.pipelines.base import Input, Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.multi_modal import CLIPPreprocessor, Preprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -17,7 +19,10 @@ logger = get_logger() Tasks.multi_modal_embedding, module_name=Pipelines.multi_modal_embedding) class MultiModalEmbeddingPipeline(Pipeline): - def __init__(self, model: str, device: str = 'gpu'): + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): """ use `model` and `preprocessor` to create a kws pipeline for prediction Args: @@ -29,14 +34,17 @@ class MultiModalEmbeddingPipeline(Pipeline): pipe_model = model else: raise NotImplementedError('model must be a single str') + pipe_model.eval() + if preprocessor is None: + if isinstance(pipe_model, CLIPForMultiModalEmbedding): + preprocessor = CLIPPreprocessor(pipe_model.model_dir) + else: + raise NotImplementedError - super().__init__(model=pipe_model) - - def preprocess(self, input: Input) -> Dict[str, Any]: - return input + super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - return self.model(input) + return self.model(self.preprocess(input)) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 557b469a..17dffb48 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -3,8 +3,11 @@ import os.path as osp from io import BytesIO from typing import Any, Dict, List, Tuple, Union +import json import torch from PIL import Image +from timm.data import create_transform +from torchvision.transforms import Compose, Normalize, Resize, ToTensor from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Preprocessors @@ -107,6 +110,180 @@ class OfaPreprocessor(Preprocessor): eos_idx=self.tokenizer.eos_token_id) +def _convert_to_rgb(image): + return image.convert('RGB') + + +@PREPROCESSORS.register_module( + Fields.multi_modal, module_name=Preprocessors.clip_preprocessor) +class CLIPPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + mode: preprocessor mode (model mode) + """ + super().__init__(*args, **kwargs) + model_dir = model_dir if osp.exists(model_dir) else snapshot_download( + model_dir) + self.mode = mode + # text tokenizer + from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer + if 'tokenizer' in kwargs and isinstance(kwargs['tokenizer'], + FullTokenizer): + self.tokenizer = kwargs['tokenizer'] + else: + vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}' + self.tokenizer = FullTokenizer(vocab_file=vocab_file) + # image preprocessor + if 'resolution' in kwargs and isinstance(kwargs['resolution'], int): + self.image_resolution = kwargs['resolution'] + else: + self.image_resolution = json.load( + open('{}/vision_model_config.json'.format( + model_dir)))['image_resolution'] + self.img_preprocess = self._build_image_transform() + # key mapping + # specify the 
input keys, compatible with training and inference whose key names may be different + self.input_keys = {'img': 'img', 'text': 'text'} + + def _build_image_transform(self): + + if self.mode == ModeKeys.TRAIN: + transform = create_transform( + input_size=self.image_resolution, + scale=(0.9, 1.0), + is_training=True, + color_jitter=None, + auto_augment='original', + interpolation='bicubic', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711), + ) + transform = Compose(transform.transforms[:-3] + [_convert_to_rgb] + + transform.transforms[-3:]) + else: + transform = Compose([ + Resize((self.image_resolution, self.image_resolution), + interpolation=Image.BICUBIC), + _convert_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + return transform + + def tokenize(self, + texts: Union[str, List[str]], + context_length: int = 52) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + context_length : int + The context length to use; all baseline models use 24 as the context length + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + all_tokens = [] + for text in texts: + all_tokens.append( + [self.tokenizer.vocab['[CLS]']] + + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(text))[:context_length - 2] + + [self.tokenizer.vocab['[SEP]']]) + + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + assert len(tokens) <= context_length + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + + def set_input_img_key(self, new_key: str): + self.input_keys['img'] = new_key + + def set_input_text_key(self, new_key: str): + self.input_keys['text'] = new_key + + def __call__(self, input: Union[str, tuple, Dict[str, Any]], *args, + **kwargs) -> Dict[str, Any]: + output = {} + # preprocess the image input + input_img_key = self.input_keys['img'] + if input_img_key in input and input[input_img_key] is not None: + image_input = input[input_img_key] + + # single image input + if isinstance(image_input, Image.Image): + image_tensor = self.img_preprocess(image_input).unsqueeze(0) + # multi images input + elif isinstance(image_input, list): + if all([isinstance(elem, Image.Image) + for elem in image_input]): + image_tensor = torch.stack( + [self.img_preprocess(elem) + for elem in image_input], # noqa + dim=0) # noqa + else: + unsupported_elem_type = [ + type(elem) for elem in image_input + if not isinstance(elem, Image.Image) + ][0] + raise TypeError( + f'img should be PIL.Image or List[PIL.Image], \ + but got a List containing one {unsupported_elem_type}' + ) + # others + else: + raise TypeError( + f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}' + ) + output['img'] = image_tensor + + # preprocess the text input + input_text_key = self.input_keys['text'] + if input_text_key in input and input[input_text_key] is not None: + text_input = input[input_text_key] + + # single text input + if isinstance(text_input, str): + text_tensor = self.tokenize(text_input) + # multi texts input + elif isinstance(text_input, list): + if all([isinstance(elem, str) for elem in text_input]): + text_tensor = self.tokenize(text_input) + else: 
+ unsupported_elem_type = [ + type(elem) for elem in text_input + if not isinstance(elem, str) + ][0] + raise TypeError( + f'text should be str or List[str], but got a List containing one {unsupported_elem_type}' + ) + # others + else: + raise TypeError( + f'text should be str or List[str], but got {type(text_input)}' + ) + output['text'] = text_tensor + + return output + + @PREPROCESSORS.register_module( Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor) class MPlugPreprocessor(Preprocessor): diff --git a/modelscope/trainers/hooks/clip_clamp_logit_scale_hook.py b/modelscope/trainers/hooks/clip_clamp_logit_scale_hook.py new file mode 100644 index 00000000..ce98e6c9 --- /dev/null +++ b/modelscope/trainers/hooks/clip_clamp_logit_scale_hook.py @@ -0,0 +1,18 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch + +from modelscope.metainfo import Hooks +from modelscope.trainers.multi_modal.clip.clip_trainer import CLIPTrainer +from .builder import HOOKS +from .hook import Hook + + +@HOOKS.register_module(module_name=Hooks.ClipClampLogitScaleHook) +class ClipClampLogitScaleHook(Hook): + """ClipClampLogitScaleHook hook which performs clamp on CLIP logit scale parameter after update""" + + def after_train_iter(self, trainer: CLIPTrainer): + """Called after every training iter to evaluate the results.""" + unwrapped_model = getattr(trainer.model, 'module', trainer.model) + logit_scale = unwrapped_model.clip_model.logit_scale + logit_scale.data = torch.clamp(logit_scale.data, 0, 4.6052) diff --git a/modelscope/trainers/multi_modal/clip/clip_trainer.py b/modelscope/trainers/multi_modal/clip/clip_trainer.py index cbe83417..40c524ac 100644 --- a/modelscope/trainers/multi_modal/clip/clip_trainer.py +++ b/modelscope/trainers/multi_modal/clip/clip_trainer.py @@ -1,169 +1,206 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
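Note: the clamp bound in the new hook is not arbitrary. The CLIP similarity logits are scaled by exp(logit_scale), and 4.6052 is approximately ln(100), so the clamp caps the temperature at 100 as in the original CLIP recipe. A standalone illustration (the parameter value is a toy example):

import math

import torch

logit_scale = torch.nn.Parameter(torch.tensor(5.3))  # drifted above the cap
logit_scale.data = torch.clamp(logit_scale.data, 0, 4.6052)
assert math.isclose(logit_scale.exp().item(), 100.0, rel_tol=1e-3)
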
+import math import os -from typing import Dict, Optional +from typing import Callable, Dict, Optional, Tuple, Union import torch -import torch.distributed as dist -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler +from torch import distributed as dist +from torch import nn +from torch.utils.data import Dataset from modelscope.metainfo import Trainers -from modelscope.models.base import Model -from modelscope.trainers.base import BaseTrainer +from modelscope.models.base import Model, TorchModel +from modelscope.models.multi_modal.clip.model import convert_models_to_fp32 +from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.multi_modal import CLIPPreprocessor +from modelscope.trainers import EpochBasedTrainer from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config -from modelscope.utils.constant import ModeKeys -from modelscope.utils.logger import get_logger -from .clip_trainer_utils import ImageWithCaptionDataset, get_optimizer +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, + ModeKeys) +from .clip_trainer_utils import get_loss, get_optimizer_params, get_schedule -logger = get_logger() + +def exclude(n): + return 'bn' in n or 'ln' in n or 'bias' in n or 'logit_scale' in n + + +def include(n): + return not exclude(n) @TRAINERS.register_module(module_name=Trainers.clip_multi_modal_embedding) -class CLIPTrainer(BaseTrainer): - - def __init__(self, cfg_file: str, model: str, device_id: int, *args, - **kwargs): - super().__init__(cfg_file) - - self.cfg = Config.from_file(cfg_file) - self.model = Model.from_pretrained(model) - self.device_id = device_id - self.total_epoch = self.cfg.train.epoch - self.train_batch_size = self.cfg.train.batch_size - self.val_batch_size = self.cfg.evaluation.batch_size - self.ckpt_dir = self.cfg.train.ckpt_dir - - self.train_dataset = ImageWithCaptionDataset( - json_file='{}/{}'.format(self.cfg.dataset.root_dir, - self.cfg.dataset.train_set), - img_dir=self.cfg.dataset.root_dir, - phase=ModeKeys.TRAIN) - self.val_dataset = ImageWithCaptionDataset( - json_file='{}/{}'.format(self.cfg.dataset.root_dir, - self.cfg.dataset.val_set), - img_dir=self.cfg.dataset.root_dir, - phase=ModeKeys.EVAL) - - def train(self, *args, **kwargs): - assert dist.is_initialized() - - self.model.clip_model.train() - self.model.clip_model.to(self.device_id) - ddp_model = torch.nn.parallel.DistributedDataParallel( - self.model.clip_model, device_ids=[ - self.device_id, - ]) - - optimizer = get_optimizer(ddp_model) - - for epoch in range(self.total_epoch): - train_sampler = DistributedSampler( - dataset=self.train_dataset, shuffle=True) - train_sampler.set_epoch(epoch) - - train_params = { - 'pin_memory': True, - 'collate_fn': None, - 'batch_size': self.train_batch_size, - 'shuffle': False, - 'drop_last': True, - 'sampler': train_sampler, - 'num_workers': 8 +class CLIPTrainer(EpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Union[Callable, Dict[str, + Callable]]] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, 
+ optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 42, + **kwargs): + model = Model.from_pretrained(model, revision=model_revision) + # for training & eval, we convert the model from FP16 back to FP32 + # to compatible with modelscope amp training + convert_models_to_fp32(model) + cfg = Config.from_file(cfg_file) + if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0: + work_dir = cfg.train.work_dir + else: + work_dir = kwargs['work_dir'] + + # fetch the model name of CLIP model (base, large or large-336) + model_name = cfg.pretrained_model.model_name + + # world size + world_size = int(os.environ.get('WORLD_SIZE', 1)) + + # train step, optimizer and lr_scheduler + epoch_steps = math.ceil( + len(train_dataset) / # noqa + (cfg.train.dataloader.batch_size_per_gpu * world_size)) # noqa + cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs + + if optimizers[0] is None: + named_parameters = list(model.named_parameters()) + gain_or_bias_params = [ + p for n, p in named_parameters + if exclude(n) and p.requires_grad + ] + rest_params = [ + p for n, p in named_parameters + if include(n) and p.requires_grad + ] + optimizer_hparams = get_optimizer_params( + model_name, cfg) # lr, wd, beta1, beta2, eps + optimizer_args = { + 'params': [ + { + 'params': gain_or_bias_params, + 'weight_decay': 0. + }, + { + 'params': rest_params, + 'weight_decay': optimizer_hparams['weight_decay'] + }, + ], + 'lr': + optimizer_hparams['lr'], + 'betas': + (optimizer_hparams['beta1'], optimizer_hparams['beta2']), + 'eps': + optimizer_hparams['eps'], + } + optimizer = build_optimizer( + model, cfg=cfg.train.optimizer, default_args=optimizer_args) + else: + optimizer = optimizers[0] + + if optimizers[1] is None: + lr_scheduler = get_schedule(optimizer, cfg.train.lr_scheduler) + else: + lr_scheduler = optimizers[1] + optimizers = (optimizer, lr_scheduler) + + # loss module + loss_img = nn.CrossEntropyLoss() + loss_txt = nn.CrossEntropyLoss() + self.loss_img = loss_img.cuda(int(os.environ.get('LOCAL_RANK', 0))) + self.loss_txt = loss_txt.cuda(int(os.environ.get('LOCAL_RANK', 0))) + self.loss_cfg = cfg.train.loss_cfg + + # launcher and use_fp16 + if 'launcher' not in kwargs and cfg.train.get('launcher', None): + kwargs['launcher'] = cfg.train.launcher + if 'use_fp16' not in kwargs and cfg.train.get('use_fp16', False): + kwargs['use_fp16'] = cfg.train.use_fp16 + + # preprocessor + if preprocessor is None: + preprocessor = { + ConfigKeys.train: + CLIPPreprocessor( + model_dir=work_dir, + mode=ModeKeys.TRAIN, + tokenizer=model.tokenizer, + resolution=model.model_info['image_resolution']), + ConfigKeys.val: + CLIPPreprocessor( + model_dir=work_dir, + mode=ModeKeys.EVAL, + tokenizer=model.tokenizer, + resolution=model.model_info['image_resolution']), } - train_loader = DataLoader(self.train_dataset, **train_params) - - for batch_idx, (img_tensor, text_str_list, - img_id_list) in enumerate(train_loader): - text_info_list = [ - self.model.tokenize_text(tmp) for tmp in text_str_list - ] - text_ids_tensor = torch.cat([tmp[0] for tmp in text_info_list], - dim=0) - text_masks_tensor = torch.cat( - [tmp[1] for tmp in text_info_list], dim=0) - - img_tensor = img_tensor.to(self.device_id, non_blocking=True) - img_id_list = img_id_list.to(self.device_id, non_blocking=True) - text_ids_tensor = text_ids_tensor.to( - self.device_id, non_blocking=True) - text_masks_tensor = 
text_masks_tensor.to( - self.device_id, non_blocking=True) - - loss = ddp_model((img_tensor, text_ids_tensor, - text_masks_tensor, img_id_list), - ModeKeys.TRAIN) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if batch_idx % 10 == 0: - logger.info( - 'epoch: {}, train batch {}/{}, loss={:.5f}, logit_scale={:.5f}' - .format(epoch, batch_idx, len(train_loader), - loss.item(), - ddp_model.module.logit_scale.exp().item())) - if dist.get_rank() == 0: - os.makedirs(self.ckpt_dir, exist_ok=True) - torch.save(ddp_model.module.state_dict(), - '{}/epoch{}.pth'.format(self.ckpt_dir, epoch)) - - def evaluate(self, - checkpoint_path: Optional[str] = None, - *args, - **kwargs) -> Dict[str, float]: - if checkpoint_path is not None: - checkpoint_params = torch.load(checkpoint_path, 'cpu') - self.model.clip_model.load_state_dict(checkpoint_params) - self.model.clip_model.eval() - self.model.clip_model.to(self.device_id) - - val_params = { - 'collate_fn': None, - 'batch_size': self.val_batch_size, - 'shuffle': False, - 'drop_last': False, - 'num_workers': 8 - } - val_loader = DataLoader(self.val_dataset, **val_params) - - tp_cnt_per_batch = [] - processed_cnt = 0 - with torch.no_grad(): - for batch_idx, (img_tensor, text_str_list, - img_id_list) in enumerate(val_loader): - text_info_list = [ - self.model.tokenize_text(tmp) for tmp in text_str_list - ] - text_ids_tensor = torch.cat([tmp[0] for tmp in text_info_list], - dim=0) - text_masks_tensor = torch.cat( - [tmp[1] for tmp in text_info_list], dim=0) - - img_tensor = img_tensor.to(self.device_id, non_blocking=True) - img_id_list = img_id_list.to(self.device_id, non_blocking=True) - text_ids_tensor = text_ids_tensor.to( - self.device_id, non_blocking=True) - text_masks_tensor = text_masks_tensor.to( - self.device_id, non_blocking=True) - - img_feat = self.model.clip_model(img_tensor, input_type='img') - text_feat = self.model.clip_model( - (text_ids_tensor, text_masks_tensor), input_type='text') - - sim_mat = text_feat @ img_feat.t() - text_cnt, img_cnt = sim_mat.shape - top1_scores, match_ids = torch.max(sim_mat, dim=1) - - match_ids = match_ids.int() - gt_ids = torch.tensor(range(0, text_cnt)).to( - self.device_id, non_blocking=True).int() - error_cnt = torch.nonzero(match_ids - gt_ids) - processed_cnt += text_cnt - - tp_cnt_per_batch.append(text_cnt - 1.0 * error_cnt.numel()) - logger.info('current acc: {:.3f}'.format( - sum(tp_cnt_per_batch) / processed_cnt)) + # dataset related + self.dataset_cfg = cfg.dataset + if hasattr(self.dataset_cfg, 'column_map'): + # cases where dataset key names are not "img" and "text" + img_key_name = getattr(self.dataset_cfg.column_map, 'img', 'img') + preprocessor[ConfigKeys.train].set_input_img_key(img_key_name) + preprocessor[ConfigKeys.val].set_input_img_key(img_key_name) + text_key_name = getattr(self.dataset_cfg.column_map, 'text', + 'text') + preprocessor[ConfigKeys.train].set_input_text_key(text_key_name) + preprocessor[ConfigKeys.val].set_input_text_key(text_key_name) + self.global_batch_size = cfg.train.dataloader.batch_size_per_gpu * world_size + + super().__init__( + model=model, + cfg_file=cfg_file, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + preprocessor=preprocessor, + optimizers=optimizers, + seed=seed, + **kwargs, + ) + + def train_step(self, model, inputs): + model.train() + inputs['mode'] = ModeKeys.TRAIN + model_outputs = model.forward( + inputs + ) # {OutputKeys.IMG_EMBEDDING: Tensor(batch_size, dim), 
OutputKeys.TEXT_EMBEDDING: Tensor(batch_size, dim)} + loss = get_loss(model_outputs, self.loss_img, self.loss_txt, + self.loss_cfg) + train_outputs = {'loss': loss} + # add model output info to log + if 'log_vars' not in train_outputs: + default_keys_pattern = ['loss'] + match_keys = set([]) + for key_p in default_keys_pattern: + match_keys.update( + [key for key in train_outputs.keys() if key_p in key]) + log_vars = {} + for key in match_keys: + value = train_outputs.get(key, None) + if value is not None: + if dist.is_available() and dist.is_initialized(): + value = value.data.clone() + dist.all_reduce(value.div_(dist.get_world_size())) + log_vars.update({key: value.item()}) + unwrapped_model = getattr(model, 'module', model) + log_vars[ + 'logit_scale'] = unwrapped_model.clip_model.logit_scale.data.clone( + ).item() # noqa + log_vars['global_batch_size'] = int(self.global_batch_size) + self.log_buffer.update(log_vars) + else: + self.log_buffer.update(train_outputs['log_vars']) + self.train_outputs = train_outputs diff --git a/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py b/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py index 4e150fe7..fed255de 100644 --- a/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py +++ b/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py @@ -1,94 +1,125 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2022 The OFA-Sys Team. +# All rights reserved. +# This source code is licensed under the Apache 2.0 license +# found in the LICENSE file in the root directory. +import math import os -import random +from functools import partial +from inspect import unwrap -import json import torch -import torch.nn.functional as F -from PIL import Image -from torch.utils.data import Dataset -from torchvision import transforms - -from modelscope.utils.constant import ModeKeys - -train_transform = transforms.Compose([ - transforms.RandomResizedCrop( - 224, scale=(0.5, 1.0), interpolation=Image.BICUBIC), - transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], - p=0.8), - transforms.RandomGrayscale(p=0.2), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)) -]) - -val_transform = transforms.Compose([ - transforms.Resize((224, 224), interpolation=Image.BICUBIC), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)) -]) - - -class ImageWithCaptionDataset(Dataset): - - def __init__(self, json_file, img_dir, phase): - self.annotations = json.load(open(json_file)) - self.img_dir = img_dir - if phase == ModeKeys.TRAIN: - self.transform = train_transform - elif phase == ModeKeys.EVAL: - self.transform = val_transform - - self.img_name2img_id = {} - for anno_dict in self.annotations: - img_name = anno_dict['image'] - if img_name not in self.img_name2img_id: - self.img_name2img_id[img_name] = len(self.img_name2img_id) - - def __len__(self): - return len(self.annotations) - - def __getitem__(self, index): - anno_dict = self.annotations[index] - - img_path = os.path.join(self.img_dir, anno_dict['image']) - img_pil = Image.open(img_path).convert('RGB') - img_th = self.transform(img_pil) - img_id = self.img_name2img_id[anno_dict['image']] - - text_str = random.choice(anno_dict['caption']) - - return img_th, text_str, img_id - - -def get_params_groups(ddp_model, weight_decay): - decay = [] - no_decay = [] - for name, param in 
ddp_model.named_parameters(): - if not param.requires_grad: - continue - if len(param.shape) == 1 or name.endswith('.bias'): - no_decay.append(param) - else: - decay.append(param) - params_groups = [{ - 'params': no_decay, - 'weight_decay': 0. - }, { - 'params': decay, - 'weight_decay': weight_decay - }] - return params_groups - - -def get_optimizer(ddp_model): - from torch.optim import AdamW - lr_init = 1e-5 - betas = [0.9, 0.999] - weight_decay = 0.02 - params_groups = get_params_groups(ddp_model, weight_decay=weight_decay) - return AdamW( - params_groups, lr=lr_init, betas=betas, weight_decay=weight_decay) +import torch.distributed as dist +from torch.optim.lr_scheduler import LambdaLR + +from modelscope.outputs import OutputKeys + + +def get_optimizer_params(model_name, cfg): + # get default params + # Params from paper (https://arxiv.org/pdf/2103.00020.pdf) + # base model + if model_name in ['damo/multi-modal_clip-vit-base-patch16_zh']: + params = { + 'lr': 5.0e-4, + 'beta1': 0.9, + 'beta2': 0.98, + 'eps': 1.0e-6, + 'weight_decay': 0.0 + } + # large models + elif model_name in [ + 'damo/multi-modal_clip-vit-large-patch14_zh', + 'damo/multi-modal_clip-vit-large-patch14_336_zh' + ]: + params = { + 'lr': 4.0e-4, + 'beta1': 0.9, + 'beta2': 0.98, + 'eps': 1.0e-6, + 'weight_decay': 0.0 + } + else: + params = { + 'lr': 5.0e-4, + 'beta1': 0.9, + 'beta2': 0.999, + 'eps': 1.0e-8, + 'weight_decay': 0.0 + } + # override with config params + for key in ['lr', 'beta1', 'beta2', 'eps', 'weight_decay']: + if hasattr(cfg.train, 'optimizer_hparams'): + params[key] = getattr(cfg.train.optimizer_hparams, key, + params[key]) + return params + + +def get_loss(model_outputs, loss_img, loss_txt, loss_cfg): + image_features = model_outputs[OutputKeys.IMG_EMBEDDING] + text_features = model_outputs[OutputKeys.TEXT_EMBEDDING] + logit_scale = model_outputs['logit_scale'] + logit_scale = logit_scale.mean() + if loss_cfg.aggregate and int(os.environ.get('WORLD_SIZE', 1)) > 1: + world_size = dist.get_world_size() + rank = dist.get_rank() + + # We gather tensors from all gpus to get more negatives to contrast with. + gathered_image_features = [ + torch.zeros_like(image_features) for _ in range(world_size) + ] + gathered_text_features = [ + torch.zeros_like(text_features) for _ in range(world_size) + ] + dist.all_gather(gathered_image_features, image_features) + dist.all_gather(gathered_text_features, text_features) + + all_image_features = torch.cat([image_features] + + gathered_image_features[:rank] + + gathered_image_features[rank + 1:]) + all_text_features = torch.cat([text_features] + + gathered_text_features[:rank] + + gathered_text_features[rank + 1:]) + + # this is needed to send gradients back everywhere. 
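+        # dist.all_gather returns tensors detached from the autograd graph, so the
+        # rank-local image/text features (which do carry gradients) are kept in the
+        # concatenation; each rank then back-propagates only through its own samples.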
+ logits_per_image = logit_scale * all_image_features @ all_text_features.t( + ) + logits_per_text = logits_per_image.t() + + else: + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logit_scale * text_features @ image_features.t() + + ground_truth = torch.arange(len(logits_per_image)).long() + ground_truth = ground_truth.cuda( + int(os.environ.get('LOCAL_RANK', 0)), non_blocking=True) + + total_loss = (loss_img(logits_per_image, ground_truth) + + loss_txt(logits_per_text, ground_truth)) / 2 + + return total_loss + + +def lr_lambda(num_warmup_steps, num_training_steps, num_cycles, current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float( + max(1, num_training_steps - num_warmup_steps)) + return max( + 0.0, + 0.5 * # noqa + (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) # noqa + + +def get_schedule(optimizer, + scheduler, + num_cycles: float = 0.5, + last_epoch: int = -1): + num_warmup_steps = int(scheduler.warmup_proportion + * scheduler.num_train_steps) + num_training_steps = scheduler.num_train_steps + + return LambdaLR( + optimizer, + partial(lr_lambda, num_warmup_steps, num_training_steps, num_cycles), + last_epoch) diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py index ee9cdb1f..7eddc690 100644 --- a/tests/pipelines/test_multi_modal_embedding.py +++ b/tests/pipelines/test_multi_modal_embedding.py @@ -24,7 +24,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): def test_run(self): pipeline_multi_modal_embedding = pipeline( Tasks.multi_modal_embedding, model=self.model_id) - text_embedding = pipeline_multi_modal_embedding( + text_embedding = pipeline_multi_modal_embedding.forward( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( torch.norm(text_embedding, p=1, dim=-1).item())) @@ -36,7 +36,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): model = Model.from_pretrained(self.model_id) pipeline_multi_modal_embedding = pipeline( task=Tasks.multi_modal_embedding, model=model) - text_embedding = pipeline_multi_modal_embedding( + text_embedding = pipeline_multi_modal_embedding.forward( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( torch.norm(text_embedding, p=1, dim=-1).item())) @@ -47,7 +47,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): def test_run_with_default_model(self): pipeline_multi_modal_embedding = pipeline( task=Tasks.multi_modal_embedding) - text_embedding = pipeline_multi_modal_embedding( + text_embedding = pipeline_multi_modal_embedding.forward( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( torch.norm(text_embedding, p=1, dim=-1).item())) diff --git a/tests/trainers/test_clip_trainer.py b/tests/trainers/test_clip_trainer.py new file mode 100644 index 00000000..e460f1ac --- /dev/null +++ b/tests/trainers/test_clip_trainer.py @@ -0,0 +1,83 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
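Note: get_schedule above wires lr_lambda into a LambdaLR with warmup_proportion * num_train_steps warmup steps followed by cosine decay. A quick standalone check of the resulting shape, assuming a toy optimizer and step counts (values are illustrative only):

import math
from functools import partial

import torch
from torch.optim.lr_scheduler import LambdaLR


def lr_lambda(num_warmup_steps, num_training_steps, num_cycles, current_step):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    progress = float(current_step - num_warmup_steps) / float(
        max(1, num_training_steps - num_warmup_steps))
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)))


param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=5e-4)
scheduler = LambdaLR(optimizer, partial(lr_lambda, 10, 100, 0.5))
for step in range(100):
    optimizer.step()
    scheduler.step()
# lr ramps linearly to 5e-4 over the first 10 steps, then decays along a half cosine
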
+import os +import shutil +import unittest + +import json + +from modelscope.metainfo import Metrics, Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import test_level + + +class TestClipTrainer(unittest.TestCase): + + def setUp(self) -> None: + self.finetune_cfg = \ + {'framework': 'pytorch', + 'task': 'multi-modal-embedding', + 'pipeline': {'type': 'multi-modal-embedding'}, + 'pretrained_model': {'model_name': 'damo/multi-modal_clip-vit-base-patch16_zh'}, + 'dataset': {'column_map': {'img': 'image', 'text': 'query'}}, + 'train': {'work_dir': './workspace/ckpts/clip', + # 'launcher': 'pytorch', + 'max_epochs': 1, + 'use_fp16': True, + 'dataloader': {'batch_size_per_gpu': 8, + 'workers_per_gpu': 0, + 'shuffle': True, + 'drop_last': True}, + 'lr_scheduler': {'name': 'cosine', + 'warmup_proportion': 0.01}, + 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, + 'optimizer': {'type': 'AdamW'}, + 'optimizer_hparams': {'lr': 5e-05, 'weight_decay': 0.01}, + 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', + 'cumulative_iters': 1, + 'loss_keys': 'loss'}, + 'loss_cfg': {'aggregate': True}, + 'hooks': [{'type': 'BestCkptSaverHook', + 'metric_key': 'inbatch_t2i_recall_at_1', + 'interval': 100}, + {'type': 'TextLoggerHook', 'interval': 1}, + {'type': 'IterTimerHook'}, + {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}, + {'type': 'ClipClampLogitScaleHook'}]}, + 'evaluation': {'dataloader': {'batch_size_per_gpu': 8, + 'workers_per_gpu': 0, + 'shuffle': True, + 'drop_last': True}, + 'metrics': [{'type': 'inbatch_recall'}]}, + 'preprocessor': []} + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_std(self): + WORKSPACE = './workspace/ckpts/clip' + os.makedirs(WORKSPACE, exist_ok=True) + config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) + with open(config_file, 'w') as writer: + json.dump(self.finetune_cfg, writer) + + pretrained_model = 'damo/multi-modal_clip-vit-base-patch16_zh' + args = dict( + model=pretrained_model, + work_dir=WORKSPACE, + train_dataset=MsDataset.load( + 'muge', namespace='modelscope', split='train[:200]'), + eval_dataset=MsDataset.load( + 'muge', namespace='modelscope', split='validation[:100]'), + metrics=[Metrics.inbatch_recall], + cfg_file=config_file) + trainer = build_trainer( + name=Trainers.clip_multi_modal_embedding, default_args=args) + trainer.train() + + self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, + os.listdir(os.path.join(WORKSPACE, 'output'))) + shutil.rmtree(WORKSPACE) + + +if __name__ == '__main__': + unittest.main() From ce08cfbea862fe097c07d9646ba3bf380eef4467 Mon Sep 17 00:00:00 2001 From: "yuanzheng.yuanzhen" Date: Mon, 31 Oct 2022 18:47:06 +0800 Subject: [PATCH 12/46] [to #42322933]Add licenses Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10580553 * Add licenses --- modelscope/models/science/unifold/dataset.py | 3 +++ modelscope/models/science/unifold/model.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/modelscope/models/science/unifold/dataset.py b/modelscope/models/science/unifold/dataset.py index 05803f2c..29e1a8b0 100644 --- a/modelscope/models/science/unifold/dataset.py +++ b/modelscope/models/science/unifold/dataset.py @@ -1,3 +1,6 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
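Note on the CLIP trainer test above: the BestCkptSaverHook's metric_key ('inbatch_t2i_recall_at_1') must match the key returned by the metric's evaluate(). A minimal consistency check against the metric added in this series (the identity-matrix batch is a contrived perfect-retrieval case; treat this as a sketch, not part of the test suite):

import torch

from modelscope.metrics.inbatch_recall_metric import InbatchRecallMetric
from modelscope.outputs import OutputKeys

metric = InbatchRecallMetric()
feats = torch.eye(4)  # image i matches text i exactly
metric.add({OutputKeys.IMG_EMBEDDING: feats, OutputKeys.TEXT_EMBEDDING: feats}, {})
print(metric.evaluate())  # expected: {'inbatch_t2i_recall_at_1': 1.0}
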
+ import copy import logging import os diff --git a/modelscope/models/science/unifold/model.py b/modelscope/models/science/unifold/model.py index 6632751a..7f28f18d 100644 --- a/modelscope/models/science/unifold/model.py +++ b/modelscope/models/science/unifold/model.py @@ -1,3 +1,6 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + import argparse import os from typing import Any From 64868bf2ad65308be1372e2c88f0133daf39d6a9 Mon Sep 17 00:00:00 2001 From: "xiaodongdeng.dxd" Date: Mon, 31 Oct 2022 20:42:56 +0800 Subject: [PATCH 13/46] =?UTF-8?q?[to=20#42322933]=E5=A4=9A=E6=A8=A1?= =?UTF-8?q?=E6=80=81=E9=A2=84=E8=AE=AD=E7=BB=83=E6=A8=A1=E5=9E=8BOFA?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=94=AF=E6=8C=816b=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E7=9A=84feature?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 多模态预训练模型OFA增加支持6b模型的feature Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10574571 --- .../multi_modal/ofa/configuration_ofa.py | 13 + .../models/multi_modal/ofa/modeling_ofa.py | 344 +++++++++++------- .../models/multi_modal/ofa/utils/utils.py | 40 ++ modelscope/models/multi_modal/ofa/vit.py | 155 ++++++++ .../models/multi_modal/ofa_for_all_tasks.py | 7 +- 5 files changed, 416 insertions(+), 143 deletions(-) mode change 100755 => 100644 modelscope/models/multi_modal/ofa/modeling_ofa.py create mode 100644 modelscope/models/multi_modal/ofa/vit.py diff --git a/modelscope/models/multi_modal/ofa/configuration_ofa.py b/modelscope/models/multi_modal/ofa/configuration_ofa.py index 4899f416..2edc651e 100644 --- a/modelscope/models/multi_modal/ofa/configuration_ofa.py +++ b/modelscope/models/multi_modal/ofa/configuration_ofa.py @@ -136,6 +136,12 @@ class OFAConfig(PretrainedConfig): entangle_position_embedding=False, interpolate_position=False, orig_patch_image_size=224, + share_attn_bias=False, + use_image_feature=True, + disable_entangle=False, + use_ofasys=False, + vit_type='vit_base', + vit_drop_path_rate=0.0, **kwargs): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -178,6 +184,13 @@ class OFAConfig(PretrainedConfig): self.interpolate_position = interpolate_position self.orig_patch_image_size = orig_patch_image_size + self.share_attn_bias = share_attn_bias + self.use_image_feature = use_image_feature + self.disable_entangle = disable_entangle + self.use_ofasys = use_ofasys + self.vit_type = vit_type + self.vit_drop_path_rate = vit_drop_path_rate + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/modelscope/models/multi_modal/ofa/modeling_ofa.py b/modelscope/models/multi_modal/ofa/modeling_ofa.py old mode 100755 new mode 100644 index 0a7a2ce6..69005ef0 --- a/modelscope/models/multi_modal/ofa/modeling_ofa.py +++ b/modelscope/models/multi_modal/ofa/modeling_ofa.py @@ -35,6 +35,8 @@ from transformers.utils import logging from .configuration_ofa import OFAConfig from .generate import utils from .resnet import ResNet +from .utils.utils import DropPath +from .vit import vit_base, vit_huge, vit_large, vit_large_336 logger = logging.get_logger(__name__) @@ -249,45 +251,6 @@ class LayerDropModuleList(nn.ModuleList): yield m -def drop_path(x, drop_prob: float = 0.0, training: bool = False): - r""" - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - Args: - x (`nn.Modules`): input nn layers. 
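# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a minimal sketch of how the new
# OFAConfig switches introduced here might be combined for an OFASys-style
# checkpoint with a ViT visual backbone. The particular values are assumptions
# for demonstration, not a released model configuration.
from modelscope.models.multi_modal.ofa.configuration_ofa import OFAConfig

config = OFAConfig(
    share_attn_bias=True,      # share one relative-position bias table across layers
    use_image_feature=True,    # keep the visual branch enabled
    use_ofasys=True,           # take the OFASys code path added in this patch
    vit_type='vit_huge',       # one of vit_base / vit_large / vit_large_336 / vit_huge
    vit_drop_path_rate=0.1,    # stochastic-depth rate for the ViT backbone
)
print(config.vit_type, config.share_attn_bias, config.use_ofasys)
# ---------------------------------------------------------------------------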
- drop_prob (`float`): drop path ratio. - training (`bool`): whether is training or inference. - """ - if drop_prob == 0.0 or not training: - return x - keep_prob = 1 - drop_prob - shape = (1, x.shape[1], 1) - random_tensor = keep_prob + torch.rand( - shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - output = x.div(keep_prob) * random_tensor - return output - - -class DropPath(nn.Module): - r""" - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - Args: - drop_prob: drop path ratio. - """ - - def __init__(self, drop_prob=None): - super().__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - def extra_repr(self) -> str: - return 'p={}'.format(self.drop_prob) - - class OFAAttention(nn.Module): r""" Multi-headed attention, with additional implementation for NormFormer. @@ -898,31 +861,49 @@ class OFAEncoder(OFAPreTrainedModel): self.padding_idx) if config.add_type_embedding: - self.type_embedding = Embedding(2, embed_dim, padding_idx=None) + if config.use_image_feature: + self.type_embedding = Embedding(2, embed_dim, padding_idx=None) + else: + self.type_embedding = Embedding(1, embed_dim, padding_idx=None) else: self.type_embedding = None - if config.resnet_type == 'resnet18': - self.embed_images = ResNet( - [2, 2, 2], drop_path_rate=config.resnet_drop_path_rate) - elif config.resnet_type == 'resnet34': - self.embed_images = ResNet( - [3, 4, 6], drop_path_rate=config.resnet_drop_path_rate) - elif config.resnet_type == 'resnet50': - self.embed_images = ResNet( - [3, 4, 6], drop_path_rate=config.resnet_drop_path_rate) - elif config.resnet_type == 'resnet101': - self.embed_images = ResNet( - [3, 4, 23], drop_path_rate=config.resnet_drop_path_rate) - elif config.resnet_type == 'resnet152': - self.embed_images = ResNet( - [3, 8, 36], drop_path_rate=config.resnet_drop_path_rate) - else: - raise NotImplementedError + if config.use_image_feature: + if config.use_ofasys: + vit_backbone = { + 'vit_base': vit_base, + 'vit_large': vit_large, + 'vit_large_336': vit_large_336, + 'vit_huge': vit_huge, + }[config.vit_type] + self.embed_images = vit_backbone(config.vit_drop_path_rate) - self.image_proj = Linear(1024, embed_dim) + self.image_proj = Linear(self.embed_images.width, embed_dim) - if config.resnet_model_path: + else: + if config.resnet_type == 'resnet18': + self.embed_images = ResNet( + [2, 2, 2], drop_path_rate=config.resnet_drop_path_rate) + elif config.resnet_type == 'resnet34': + self.embed_images = ResNet( + [3, 4, 6], drop_path_rate=config.resnet_drop_path_rate) + elif config.resnet_type == 'resnet50': + self.embed_images = ResNet( + [3, 4, 6], drop_path_rate=config.resnet_drop_path_rate) + elif config.resnet_type == 'resnet101': + self.embed_images = ResNet( + [3, 4, 23], + drop_path_rate=config.resnet_drop_path_rate) + elif config.resnet_type == 'resnet152': + self.embed_images = ResNet( + [3, 8, 36], + drop_path_rate=config.resnet_drop_path_rate) + else: + raise NotImplementedError + + self.image_proj = Linear(1024, embed_dim) + + if not config.use_ofasys and config.resnet_model_path: print('load resnet {}'.format(config.resnet_model_path)) resnet_state_dict = torch.load(config.resnet_model_path) self.embed_images.load_state_dict(resnet_state_dict) @@ -933,14 +914,21 @@ class OFAEncoder(OFAPreTrainedModel): self.embed_positions = Embedding(self.max_source_positions + 2, embed_dim) - self.embed_image_positions = Embedding(config.image_bucket_size**2 + 1, 
- embed_dim) - self.pos_ln = LayerNorm(embed_dim) - self.image_pos_ln = LayerNorm(embed_dim) + + if config.use_image_feature: + self.embed_image_positions = Embedding( + config.image_bucket_size**2 + 1, embed_dim) + if not config.use_ofasys: + self.pos_ln = LayerNorm(embed_dim) + + if config.use_image_feature: + self.image_pos_ln = LayerNorm(embed_dim) self.pos_scaling = float(embed_dim / self.num_attention_heads * config.attn_scale_factor)**-0.5 - self.pos_q_linear = nn.Linear(embed_dim, embed_dim) - self.pos_k_linear = nn.Linear(embed_dim, embed_dim) + + if not (config.use_ofasys and config.entangle_position_embedding): + self.pos_q_linear = nn.Linear(embed_dim, embed_dim) + self.pos_k_linear = nn.Linear(embed_dim, embed_dim) if self.encoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.encoder_layerdrop) @@ -965,22 +953,28 @@ class OFAEncoder(OFAPreTrainedModel): self.token_bucket_size = config.token_bucket_size token_num_rel_dis = 2 * config.token_bucket_size - 1 token_rp_bucket = make_token_bucket_position(config.token_bucket_size) + self.share_attn_bias = config.share_attn_bias + num_rel_pos_tables = 1 if config.share_attn_bias else config.encoder_layers self.token_rel_pos_table_list = nn.ModuleList([ Embedding( token_num_rel_dis, self.num_attention_heads, zero_init=True) - for _ in range(config.encoder_layers) + for _ in range(num_rel_pos_tables) ]) - self.image_bucket_size = config.image_bucket_size - image_num_rel_dis = (2 * config.image_bucket_size - - 1) * (2 * config.image_bucket_size - 1) + 3 - image_rp_bucket = make_image_bucket_position(config.image_bucket_size, - image_num_rel_dis) - self.image_rel_pos_table_list = nn.ModuleList([ - Embedding( - image_num_rel_dis, self.num_attention_heads, zero_init=True) - for _ in range(config.encoder_layers) - ]) + if config.use_image_feature: + self.image_bucket_size = config.image_bucket_size + image_num_rel_dis = (2 * config.image_bucket_size + - 1) * (2 * config.image_bucket_size - 1) + 3 + image_rp_bucket = make_image_bucket_position( + config.image_bucket_size, image_num_rel_dis) + self.image_rel_pos_table_list = nn.ModuleList([ + Embedding( + image_num_rel_dis, + self.num_attention_heads, + zero_init=True) for _ in range(num_rel_pos_tables) + ]) + + self.register_buffer('image_rp_bucket', image_rp_bucket) if config.layernorm_embedding: self.layernorm_embedding = LayerNorm(embed_dim) @@ -988,12 +982,12 @@ class OFAEncoder(OFAPreTrainedModel): self.layernorm_embedding = None self.register_buffer('token_rp_bucket', token_rp_bucket) - self.register_buffer('image_rp_bucket', image_rp_bucket) self.entangle_position_embedding = config.entangle_position_embedding self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() + self.use_ofasys = config.use_ofasys def get_input_embeddings(self): r""" @@ -1305,21 +1299,41 @@ class OFAEncoder(OFAPreTrainedModel): if has_pads: x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x)) - pos_embed = self.pos_ln(pos_embed) - if patch_images is not None: - image_pos_embed = self.image_pos_ln(image_pos_embed) - pos_embed = torch.cat([image_pos_embed, pos_embed], dim=1) - if patch_images_2 is not None: - image_pos_embed_2 = self.image_pos_ln(image_pos_embed_2) - pos_embed = torch.cat([image_pos_embed_2, pos_embed], dim=1) + if self.use_ofasys: + if patch_images is not None: + pos_embed = torch.cat([image_pos_embed, pos_embed], dim=1) + if patch_images_2 is not None: + pos_embed = torch.cat([image_pos_embed_2, pos_embed], dim=1) + else: + pos_embed 
= self.pos_ln(pos_embed) + if patch_images is not None: + image_pos_embed = self.image_pos_ln(image_pos_embed) + pos_embed = torch.cat([image_pos_embed, pos_embed], dim=1) + if patch_images_2 is not None: + image_pos_embed_2 = self.image_pos_ln(image_pos_embed_2) + pos_embed = torch.cat([image_pos_embed_2, pos_embed], dim=1) + + def build_abs_pos_bias(pos_embed): + batch_size, seq_length = pos_embed.size(0), pos_embed.size(1) + if not (self.use_ofasys and self.entangle_position_embedding): + pos_q = self.pos_q_linear(pos_embed).view( + batch_size, seq_length, self.num_attention_heads, + -1).transpose(1, 2) * self.pos_scaling + pos_k = self.pos_k_linear(pos_embed).view( + batch_size, seq_length, self.num_attention_heads, + -1).transpose(1, 2) + abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + else: + abs_pos_bias = torch.zeros( + batch_size, + self.num_attention_heads, + seq_length, + seq_length, + dtype=pos_embed.dtype, + device=pos_embed.device) + return abs_pos_bias - pos_q = self.pos_q_linear(pos_embed).view( - x.size(0), x.size(1), self.num_attention_heads, -1).transpose( - 1, 2) * self.pos_scaling - pos_k = self.pos_k_linear(pos_embed).view( - x.size(0), x.size(1), self.num_attention_heads, - -1).transpose(1, 2) - abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + abs_pos_bias = build_abs_pos_bias(pos_embed) # expand attention_mask if has_pads: @@ -1334,19 +1348,22 @@ class OFAEncoder(OFAPreTrainedModel): if output_hidden_states: encoder_states += (x, ) self_attn_bias = abs_pos_bias.clone() + + real_idx = 0 if self.share_attn_bias else idx + self_attn_bias[:, :, -input_ids.size(1):, -input_ids.size(1):] += self.get_rel_pos_bias( - input_ids, idx) + input_ids, real_idx) if patch_images_2 is not None: self_attn_bias[:, :, :image_num_patches_2, :image_num_patches_2] += \ - self.get_image_rel_pos_bias(image_position_ids_2, idx) + self.get_image_rel_pos_bias(image_position_ids_2, real_idx) self_attn_bias[:, :, image_num_patches_2:image_num_patches_2 + image_num_patches, # noqa image_num_patches_2:image_num_patches_2 + image_num_patches] += \ - self.get_image_rel_pos_bias(image_position_ids, idx) # noqa + self.get_image_rel_pos_bias(image_position_ids, real_idx) # noqa elif patch_images is not None: self_attn_bias[:, :, :x.size(1) - input_ids.size(1), :x.size(1) - input_ids.size(1)] += \ - self.get_image_rel_pos_bias(image_position_ids, idx) + self.get_image_rel_pos_bias(image_position_ids, real_idx) self_attn_bias = self_attn_bias.reshape(-1, x.size(1), x.size(1)) hidden_outputs = layer( @@ -1398,6 +1415,8 @@ class OFADecoder(OFAPreTrainedModel): self._future_mask = torch.empty(0) self.share_input_output_embed = config.share_decoder_input_output_embed self.num_attention_heads = config.decoder_attention_heads + self.use_ofasys = config.use_ofasys + self.disable_entangle = config.disable_entangle if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -1415,18 +1434,31 @@ class OFADecoder(OFAPreTrainedModel): else: self.layernorm_embedding = None + if config.use_ofasys: + if config.add_type_embedding: + self.type_embedding = Embedding( + 1, self.embed_dim, padding_idx=None) + else: + self.type_embedding = None + self.window_size = config.code_image_size // 8 self.embed_positions = Embedding(self.max_target_positions + 2, self.embed_dim) - self.embed_image_positions = Embedding(config.image_bucket_size**2 + 1, - self.embed_dim) - self.pos_ln = LayerNorm(self.embed_dim) - self.image_pos_ln = LayerNorm(self.embed_dim) + + if not config.use_ofasys: + 
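# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a shape sketch of the absolute
# position bias built by build_abs_pos_bias. With positional q/k projections it
# is a [batch, heads, seq, seq] attention bias; under use_ofasys with entangled
# position embeddings it degenerates to zeros of the same shape. Sizes below
# are arbitrary.
import torch

batch, seq, heads, head_dim = 2, 16, 8, 32
pos_embed = torch.randn(batch, seq, heads * head_dim)
pos_q = pos_embed.view(batch, seq, heads, head_dim).transpose(1, 2)
pos_k = pos_embed.view(batch, seq, heads, head_dim).transpose(1, 2)
abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3))
assert abs_pos_bias.shape == (batch, heads, seq, seq)
# ---------------------------------------------------------------------------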
self.embed_image_positions = Embedding( + config.image_bucket_size**2 + 1, self.embed_dim) + if not config.use_ofasys: + self.pos_ln = LayerNorm(self.embed_dim) + self.image_pos_ln = LayerNorm(self.embed_dim) self.pos_scaling = float(self.embed_dim / self.num_attention_heads * config.attn_scale_factor)**-0.5 - self.self_pos_q_linear = nn.Linear(self.embed_dim, self.embed_dim) - self.self_pos_k_linear = nn.Linear(self.embed_dim, self.embed_dim) + + if not (config.use_ofasys and config.entangle_position_embedding): + self.self_pos_q_linear = nn.Linear(self.embed_dim, self.embed_dim) + self.self_pos_k_linear = nn.Linear(self.embed_dim, self.embed_dim) + self.cross_pos_q_linear = nn.Linear(self.embed_dim, self.embed_dim) self.cross_pos_k_linear = nn.Linear(self.embed_dim, self.embed_dim) @@ -1463,33 +1495,41 @@ class OFADecoder(OFAPreTrainedModel): self.token_bucket_size = config.token_bucket_size token_num_rel_dis = 2 * config.token_bucket_size - 1 token_rp_bucket = make_token_bucket_position(config.token_bucket_size) + + self.share_attn_bias = config.share_attn_bias + num_rel_pos_tables = 1 if config.share_attn_bias else config.decoder_layers self.token_rel_pos_table_list = nn.ModuleList([ Embedding( token_num_rel_dis, self.num_attention_heads, zero_init=True) - for _ in range(config.decoder_layers) + for _ in range(num_rel_pos_tables) ]) - self.image_bucket_size = config.image_bucket_size - image_num_rel_dis = (2 * config.image_bucket_size - - 1) * (2 * config.image_bucket_size - 1) + 3 - image_rp_bucket = make_image_bucket_position(config.image_bucket_size, - image_num_rel_dis) - image_position_idx = torch.arange(self.window_size).unsqueeze(0).expand(self.window_size, self.window_size) + \ - torch.arange(self.window_size).unsqueeze(1) * config.image_bucket_size + 1 # noqa - image_position_idx = torch.cat( - [torch.tensor([0]), image_position_idx.view(-1)]) - image_position_idx = torch.cat( - [image_position_idx, - torch.tensor([1024] * 768)]) - self.image_rel_pos_table_list = nn.ModuleList([ - Embedding( - image_num_rel_dis, self.num_attention_heads, zero_init=True) - for _ in range(config.decoder_layers) - ]) + if config.use_image_feature: + if not config.use_ofasys: + self.image_bucket_size = config.image_bucket_size + image_num_rel_dis = (2 * config.image_bucket_size - 1) * ( + 2 * config.image_bucket_size - 1) + 3 + image_rp_bucket = make_image_bucket_position( + config.image_bucket_size, image_num_rel_dis) + image_position_idx = torch.arange(self.window_size).unsqueeze(0).expand(self.window_size, self.window_size) + \ + torch.arange(self.window_size).unsqueeze(1) * config.image_bucket_size + 1 # noqa + image_position_idx = torch.cat( + [torch.tensor([0]), + image_position_idx.view(-1)]) + image_position_idx = torch.cat( + [image_position_idx, + torch.tensor([1024] * 768)]) + self.register_buffer('image_position_idx', image_position_idx) + + self.image_rel_pos_table_list = nn.ModuleList([ + Embedding( + image_num_rel_dis, + self.num_attention_heads, + zero_init=True) for _ in range(num_rel_pos_tables) + ]) + self.register_buffer('image_rp_bucket', image_rp_bucket) self.register_buffer('token_rp_bucket', token_rp_bucket) - self.register_buffer('image_rp_bucket', image_rp_bucket) - self.register_buffer('image_position_idx', image_position_idx) self.entangle_position_embedding = config.entangle_position_embedding self.gradient_checkpointing = False @@ -1556,26 +1596,46 @@ class OFADecoder(OFAPreTrainedModel): batch_size = tgt_pos_embed.size(0) tgt_len = tgt_pos_embed.size(1) - 
tgt_pos_embed = self.image_pos_ln( - tgt_pos_embed) if use_image else self.pos_ln(tgt_pos_embed) + if not self.use_ofasys: + tgt_pos_embed = self.image_pos_ln( + tgt_pos_embed) if use_image else self.pos_ln(tgt_pos_embed) if src_pos_embed is not None: src_len = src_pos_embed.size(1) - pos_q = self.cross_pos_q_linear(tgt_pos_embed).view( - batch_size, tgt_len, self.num_attention_heads, -1).transpose( - 1, 2) * self.pos_scaling - pos_k = self.cross_pos_k_linear(src_pos_embed).view( - batch_size, src_len, self.num_attention_heads, - -1).transpose(1, 2) + if not (self.entangle_position_embedding and self.use_ofasys): + pos_q = self.cross_pos_q_linear(tgt_pos_embed).view( + batch_size, tgt_len, self.num_attention_heads, + -1).transpose(1, 2) * self.pos_scaling + pos_k = self.cross_pos_k_linear(src_pos_embed).view( + batch_size, src_len, self.num_attention_heads, + -1).transpose(1, 2) + abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + else: + abs_pos_bias = torch.zeros( + batch_size, + self.num_attention_heads, + tgt_len, + src_len, + dtype=tgt_pos_embed.dtype, + device=tgt_pos_embed.device) else: - src_len = tgt_pos_embed.size(1) - pos_q = self.self_pos_q_linear(tgt_pos_embed).view( - batch_size, tgt_len, self.num_attention_heads, -1).transpose( - 1, 2) * self.pos_scaling - pos_k = self.self_pos_k_linear(tgt_pos_embed).view( - batch_size, src_len, self.num_attention_heads, - -1).transpose(1, 2) - abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + # batch_size, seq_length = tgt_pos_embed.size(0), tgt_pos_embed.size(1) + if not (self.entangle_position_embedding and self.use_ofasys): + pos_q = self.self_pos_q_linear(tgt_pos_embed).view( + batch_size, tgt_len, self.num_attention_heads, + -1).transpose(1, 2) * self.pos_scaling + pos_k = self.self_pos_k_linear(tgt_pos_embed).view( + batch_size, tgt_len, self.num_attention_heads, + -1).transpose(1, 2) + abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + else: + abs_pos_bias = torch.zeros( + batch_size, + self.num_attention_heads, + tgt_len, + tgt_len, + dtype=tgt_pos_embed.dtype, + device=tgt_pos_embed.device) return abs_pos_bias @@ -1809,17 +1869,18 @@ class OFADecoder(OFAPreTrainedModel): past_key_values) > 0 else None self_attn_bias = self_abs_pos_bias.clone() + real_idx = 0 if self.share_attn_bias else idx if code_masks is None or not code_masks.any(): self_attn_bias += self.get_rel_pos_bias( - all_prev_output_tokens, idx).unsqueeze(0) + all_prev_output_tokens, real_idx).unsqueeze(0) elif code_masks is not None and code_masks.all(): self_attn_bias += self.get_image_rel_pos_bias( - all_prev_output_tokens, idx).unsqueeze(0) + all_prev_output_tokens, real_idx).unsqueeze(0) else: self_attn_bias[~code_masks] += self.get_rel_pos_bias( - all_prev_output_tokens, idx).unsqueeze(0) + all_prev_output_tokens, real_idx).unsqueeze(0) self_attn_bias[code_masks] += self.get_image_rel_pos_bias( - all_prev_output_tokens, idx).unsqueeze(0) + all_prev_output_tokens, real_idx).unsqueeze(0) self_attn_bias = self_attn_bias.reshape( -1, *self_attn_bias.size()[-2:]) @@ -1892,6 +1953,7 @@ class OFAModel(OFAPreTrainedModel): self.encoder = OFAEncoder(config, shared) self.decoder = OFADecoder(config, shared) + self.use_ofasys = config.use_ofasys # Initialize weights and apply final processing self.post_init() diff --git a/modelscope/models/multi_modal/ofa/utils/utils.py b/modelscope/models/multi_modal/ofa/utils/utils.py index 6d8943a1..c5aa8483 100644 --- a/modelscope/models/multi_modal/ofa/utils/utils.py +++ 
b/modelscope/models/multi_modal/ofa/utils/utils.py @@ -2,6 +2,7 @@ from typing import Optional import torch +import torch.nn as nn def expand_mask(mask: torch.Tensor, @@ -17,3 +18,42 @@ def expand_mask(mask: torch.Tensor, src_len).to(dtype) return expanded_mask.masked_fill(expanded_mask.bool(), torch.finfo(dtype).min) + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + r""" + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Args: + x (`nn.Modules`): input nn layers. + drop_prob (`float`): drop path ratio. + training (`bool`): whether is training or inference. + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (1, x.shape[1], 1) + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + r""" + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Args: + drop_prob: drop path ratio. + """ + + def __init__(self, drop_prob=None): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) diff --git a/modelscope/models/multi_modal/ofa/vit.py b/modelscope/models/multi_modal/ofa/vit.py new file mode 100644 index 00000000..b6bba7ee --- /dev/null +++ b/modelscope/models/multi_modal/ofa/vit.py @@ -0,0 +1,155 @@ +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from fairseq.modules import LayerNorm +from torch import nn + +from .utils.utils import DropPath + +__all__ = [ + 'vit_base', + 'vit_large', + 'vit_large_336', + 'vit_huge', +] + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None, + drop_path_rate=0.0): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([ + ('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model)), + ])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + self.drop_path = DropPath(drop_path_rate) + + def attention(self, x: torch.Tensor): + self.attn_mask = ( + self.attn_mask.to(dtype=x.dtype, device=x.device) + if self.attn_mask is not None else None) + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.drop_path(self.attention(self.ln_1(x))) + x = x + self.drop_path(self.mlp(self.ln_2(x))) + return x + + +class Transformer(nn.Module): + + def __init__( + self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask, drop_path_rate) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__( + self, + input_resolution: int, + patch_size: int, + width: int, + layers: int, + heads: int, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.input_resolution = 
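# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: the DropPath module relocated to
# utils/utils.py implements stochastic depth -- an identity at inference time,
# and randomly zeroed (then rescaled) residual branches during training.
# Assumes the import path created by this patch.
import torch
from modelscope.models.multi_modal.ofa.utils.utils import DropPath

dp = DropPath(drop_prob=0.2)
x = torch.ones(4, 10, 8)      # [seq_len, batch, dim], the layout used in the ViT blocks

dp.eval()
assert torch.equal(dp(x), x)  # no-op at inference time

dp.train()
y = dp(x)                     # per-sample paths dropped; kept entries scaled by 1 / 0.8
# ---------------------------------------------------------------------------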
input_resolution + self.patch_size = patch_size + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False, + ) + + scale = width**-0.5 + self.width = width + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + self.transformer = Transformer( + width, layers, heads, drop_path_rate=drop_path_rate) + + def forward(self, x: torch.Tensor): + resolution = x.shape[-2] + height, width = x.shape[-2] // self.patch_size, x.shape[ + -1] // self.patch_size + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + + if resolution != self.input_resolution: + old_pe = self.positional_embedding[1:] + patch_num = self.input_resolution // self.patch_size + old_pe = old_pe.reshape(1, patch_num, patch_num, + -1).permute(0, 3, 1, 2) + new_pe = F.interpolate( + old_pe, size=(height, width), mode='bilinear') + new_pe = new_pe.permute(0, 2, 3, 1).reshape(height * width, -1) + x = x + new_pe.to(x.dtype) + else: + x = x + self.positional_embedding[1:].to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + bz, seq, hidden = x.shape + x = x.transpose(1, 2).reshape(bz, hidden, height, width) + + return x + + +def vit_base(drop_path_rate: float = 0.0): + return VisionTransformer(224, 16, 768, 9, 12, drop_path_rate) + + +def vit_large(drop_path_rate: float = 0.0): + return VisionTransformer(224, 14, 1024, 18, 16, drop_path_rate) + + +def vit_large_336(drop_path_rate: float = 0.0): + return VisionTransformer(336, 14, 1024, 18, 16, drop_path_rate) + + +def vit_huge(drop_path_rate: float = 0.0): + return VisionTransformer(224, 14, 1280, 24, 16, drop_path_rate) diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 56d19ad8..2c6034e8 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -53,8 +53,11 @@ class OfaForAllTasks(TorchModel): raise NotImplementedError # there is some diff between here and our ofa code, # there will be no need to use param: use_bpe - self.tokenizer.add_tokens([''.format(i) for i in range(8192)]) - self.tokenizer.add_tokens([''.format(i) for i in range(1000)]) + if not model.use_ofasys: + self.tokenizer.add_tokens( + [''.format(i) for i in range(8192)]) + self.tokenizer.add_tokens( + [''.format(i) for i in range(1000)]) self.cfg.update({'num_bins': 1000, 'num_codes': 8192}) self.batch_size = self.cfg.model.get('batch_size', 1) self.patch_image_size = self.cfg.model.get('patch_image_size', 480) From e72988c2bae19c9c7bc7ea08bc940515a766bac7 Mon Sep 17 00:00:00 2001 From: "shouzhou.bx" Date: Mon, 31 Oct 2022 20:46:49 +0800 Subject: [PATCH 14/46] add face detection to face_2d_keypoints_pipeline --- modelscope/outputs/outputs.py | 23 +- .../face_2d_keypoints_pipeline.py | 254 +++++++++++++++++- modelscope/utils/cv/image_utils.py | 65 +++++ tests/pipelines/test_face_2d_keypoints.py | 29 +- 4 files changed, 347 insertions(+), 24 deletions(-) diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index b983125a..b7003809 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -69,11 +69,23 @@ TASK_OUTPUTS = { # face 2d keypoint result for single sample # 
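# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a minimal sketch of what the new
# ViT backbones in vit.py return -- a feature map of shape
# [batch, width, H // patch_size, W // patch_size], which OFAEncoder then maps
# to the embedding size via image_proj. Running it requires the fairseq
# dependency imported by vit.py.
import torch
from modelscope.models.multi_modal.ofa.vit import vit_base

backbone = vit_base(drop_path_rate=0.0)
images = torch.randn(1, 3, 224, 224)
features = backbone(images)
assert features.shape == (1, backbone.width, 14, 14)   # 224 / 16 = 14
# ---------------------------------------------------------------------------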
{ # "keypoints": [ - # [x1, y1]*106 + # [[x, y]*106], + # [[x, y]*106], + # [[x, y]*106], # ], - # "poses": [pitch, roll, yaw] + # "poses": [ + # [pitch, roll, yaw], + # [pitch, roll, yaw], + # [pitch, roll, yaw], + # ], + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] # } - Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES], + Tasks.face_2d_keypoints: + [OutputKeys.KEYPOINTS, OutputKeys.POSES, OutputKeys.BOXES], # face detection result for single sample # { @@ -699,8 +711,9 @@ TASK_OUTPUTS = { # "text_embedding": np.array with shape [1, D], # "caption": "this is an image caption text." # } - Tasks.generative_multi_modal_embedding: - [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION], + Tasks.generative_multi_modal_embedding: [ + OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION + ], # multi-modal similarity result for single sample # { diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index b48d013e..4de5a4f2 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -1,9 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import copy +import math from typing import Any +import cv2 +import numpy as np + from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from .base import EasyCVPipeline @@ -29,18 +36,251 @@ class Face2DKeypointsPipeline(EasyCVPipeline): *args, **kwargs) + # face detect pipeline + det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + self.face_detection = pipeline( + Tasks.face_detection, model=det_model_id) + def show_result(self, img, points, scale=2, save_path=None): return self.predict_op.show_result(img, points, scale, save_path) + def _choose_face(self, det_result, min_face=10): + """ + choose face with maximum area + Args: + det_result: output of face detection pipeline + min_face: minimum size of valid face w/h + """ + bboxes = np.array(det_result[OutputKeys.BOXES]) + landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) + if bboxes.shape[0] == 0: + logger.warn('No face detected!') + return None + # face idx with enough size + face_idx = [] + for i in range(bboxes.shape[0]): + box = bboxes[i] + if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: + face_idx += [i] + if len(face_idx) == 0: + logger.warn( + f'Face size not enough, less than {min_face}x{min_face}!') + return None + bboxes = bboxes[face_idx] + landmarks = landmarks[face_idx] + + return bboxes, landmarks + + def expend_box(self, box, w, h, scalex=0.3, scaley=0.5): + x1 = box[0] + y1 = box[1] + wb = box[2] - x1 + hb = box[3] - y1 + deltax = int(wb * scalex) + deltay1 = int(hb * scaley) + deltay2 = int(hb * scalex) + x1 = x1 - deltax + y1 = y1 - deltay1 + if x1 < 0: + deltax = deltax + x1 + x1 = 0 + if y1 < 0: + deltay1 = deltay1 + y1 + y1 = 0 + x2 = x1 + wb + 2 * deltax + y2 = y1 + hb + deltay1 + deltay2 + x2 = np.clip(x2, 0, w - 1) + y2 = np.clip(y2, 0, h - 1) + return [x1, y1, x2, y2] + + def rotate_point(self, angle, center, landmark): + rad = angle * np.pi / 180.0 + alpha = np.cos(rad) + beta = np.sin(rad) + M = np.zeros((2, 3), 
dtype=np.float32) + M[0, 0] = alpha + M[0, 1] = beta + M[0, 2] = (1 - alpha) * center[0] - beta * center[1] + M[1, 0] = -beta + M[1, 1] = alpha + M[1, 2] = beta * center[0] + (1 - alpha) * center[1] + + landmark_ = np.asarray([(M[0, 0] * x + M[0, 1] * y + M[0, 2], + M[1, 0] * x + M[1, 1] * y + M[1, 2]) + for (x, y) in landmark]) + return M, landmark_ + + def random_normal(self): + """ + 3-sigma rule + return: (-1, +1) + """ + mu, sigma = 0, 1 + while True: + s = np.random.normal(mu, sigma) + if s < mu - 3 * sigma or s > mu + 3 * sigma: + continue + return s / 3 * sigma + + def rotate_crop_img(self, img, pts, M): + image_size = 256 + enlarge_ratio = 1.1 + + imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0]))) + + x1 = pts[5][0] + y1 = pts[5][1] + x2 = pts[6][0] + y2 = pts[6][1] + w = x2 - x1 + 1 + h = y2 - y1 + 1 + x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w) + y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h) + + new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w) + new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h) + new_x1 = x1 + int(self.random_normal() * image_size * 0.05) + new_y1 = y1 + int(self.random_normal() * image_size * 0.05) + new_x2 = new_x1 + new_w + new_y2 = new_y1 + new_h + + height, width, _ = imgT.shape + dx = max(0, -new_x1) + dy = max(0, -new_y1) + new_x1 = max(0, new_x1) + new_y1 = max(0, new_y1) + + edx = max(0, new_x2 - width) + edy = max(0, new_y2 - height) + new_x2 = min(width, new_x2) + new_y2 = min(height, new_y2) + + sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2] + if dx > 0 or dy > 0 or edx > 0 or edy > 0: + sub_imgT = cv2.copyMakeBorder( + sub_imgT, + dy, + edy, + dx, + edx, + cv2.BORDER_CONSTANT, + value=(103.94, 116.78, 123.68)) + + return sub_imgT, imgT, [new_x1, new_y1, new_x2, + new_y2], [dx, dy, edx, edy] + + def crop_img(self, imgT, pts, angle): + image_size = 256 + enlarge_ratio = 1.1 + + x1 = np.min(pts[:, 0]) + x2 = np.max(pts[:, 0]) + y1 = np.min(pts[:, 1]) + y2 = np.max(pts[:, 1]) + w = x2 - x1 + 1 + h = y2 - y1 + 1 + x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w) + y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h) + + new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w) + new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h) + new_x1 = x1 + int(self.random_normal() * image_size * 0.05) + new_y1 = y1 + int(self.random_normal() * image_size * 0.05) + new_x2 = new_x1 + new_w + new_y2 = new_y1 + new_h + + new_xy = new_x1, new_y1 + pts = pts - new_xy + + height, width, _ = imgT.shape + dx = max(0, -new_x1) + dy = max(0, -new_y1) + new_x1 = max(0, new_x1) + new_y1 = max(0, new_y1) + + edx = max(0, new_x2 - width) + edy = max(0, new_y2 - height) + new_x2 = min(width, new_x2) + new_y2 = min(height, new_y2) + + sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2] + if dx > 0 or dy > 0 or edx > 0 or edy > 0: + sub_imgT = cv2.copyMakeBorder( + sub_imgT, + dy, + edy, + dx, + edx, + cv2.BORDER_CONSTANT, + value=(103.94, 116.78, 123.68)) + + return sub_imgT, [new_x1, new_y1, new_x2, new_y2], [dx, dy, edx, edy] + def __call__(self, inputs) -> Any: - outputs = self.predict_op(inputs) + image_size = 256 + + img = LoadImage.convert_to_ndarray(inputs) + h, w, c = img.shape + img_rgb = copy.deepcopy(img) + img_rgb = img_rgb[:, :, ::-1] + det_result = self.face_detection(img_rgb) + boxes, keypoints = self._choose_face(det_result) + + output_boxes = [] + output_keypoints = [] + output_poses = [] + for idx, box_ori in enumerate(boxes): + box = self.expend_box(box_ori, w, h, scalex=0.15, scaley=0.15) + y0 = 
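# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a pure-numpy sketch of what
# rotate_point computes -- the 2x3 affine matrix for a rotation of `angle`
# degrees about `center` (the cv2.getRotationMatrix2D convention) and the
# landmarks mapped through it. The sample point is arbitrary.
import numpy as np

def rotate_point(angle, center, landmark):
    rad = angle * np.pi / 180.0
    alpha, beta = np.cos(rad), np.sin(rad)
    M = np.array([[alpha, beta, (1 - alpha) * center[0] - beta * center[1]],
                  [-beta, alpha, beta * center[0] + (1 - alpha) * center[1]]],
                 dtype=np.float32)
    pts = np.asarray([(M[0, 0] * x + M[0, 1] * y + M[0, 2],
                       M[1, 0] * x + M[1, 1] * y + M[1, 2]) for x, y in landmark])
    return M, pts

_, rotated = rotate_point(90, (0, 0), [(1.0, 0.0)])
print(np.round(rotated, 3))   # a 90-degree rotation about the origin maps (1, 0) to (0, -1)
# ---------------------------------------------------------------------------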
int(box[1]) + y1 = int(box[3]) + x0 = int(box[0]) + x1 = int(box[2]) + sub_img = img[y0:y1, x0:x1] + + keypoint = keypoints[idx] + pts = [[keypoint[0], keypoint[1]], [keypoint[2], keypoint[3]], + [keypoint[4], keypoint[5]], [keypoint[6], keypoint[7]], + [keypoint[8], keypoint[9]], [box[0], box[1]], + [box[2], box[3]]] + # radian + angle = math.atan2((pts[1][1] - pts[0][1]), + (pts[1][0] - pts[0][0])) + # angle + theta = angle * (180 / np.pi) + + center = [image_size // 2, image_size // 2] + cx, cy = center + M, landmark_ = self.rotate_point(theta, (cx, cy), pts) + sub_img, imgT, bbox, delta_border = self.rotate_crop_img( + img, pts, M) + + outputs = self.predict_op([sub_img])[0] + tmp_keypoints = outputs['point'] + + for idx in range(0, len(tmp_keypoints)): + tmp_keypoints[idx][0] += (delta_border[0] + bbox[0]) + tmp_keypoints[idx][1] += (delta_border[1] + bbox[1]) + + for idx in range(0, 3): + sub_img, bbox, delta_border = self.crop_img( + imgT, tmp_keypoints, 0) + outputs = self.predict_op([sub_img])[0] + tmp_keypoints = outputs['point'] + for idx in range(0, len(tmp_keypoints)): + tmp_keypoints[idx][0] += (delta_border[0] + bbox[0]) + tmp_keypoints[idx][1] += (delta_border[1] + bbox[1]) + + M2, tmp_keypoints = self.rotate_point(-theta, (cx, cy), + tmp_keypoints) - results = [{ - OutputKeys.KEYPOINTS: output['point'], - OutputKeys.POSES: output['pose'] - } for output in outputs] + output_keypoints.append(np.array(tmp_keypoints)) + output_poses.append(np.array(outputs['pose'])) + output_boxes.append(np.array(box_ori)) - if self._is_single_inputs(inputs): - results = results[0] + results = { + OutputKeys.KEYPOINTS: output_keypoints, + OutputKeys.POSES: output_poses, + OutputKeys.BOXES: output_boxes + } return results diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 34dc2348..095c36ec 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -91,6 +91,71 @@ def draw_keypoints(output, original_image): return image +def draw_106face_keypoints(in_path, + keypoints, + boxes, + scale=4.0, + save_path=None): + face_contour_point_index = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 + ] + left_eye_brow_point_index = [33, 34, 35, 36, 37, 38, 39, 40, 41, 33] + right_eye_brow_point_index = [42, 43, 44, 45, 46, 47, 48, 49, 50, 42] + left_eye_point_index = [66, 67, 68, 69, 70, 71, 72, 73, 66] + right_eye_point_index = [75, 76, 77, 78, 79, 80, 81, 82, 75] + nose_bridge_point_index = [51, 52, 53, 54] + nose_contour_point_index = [55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65] + mouth_outer_point_index = [ + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 84 + ] + mouth_inter_point_index = [96, 97, 98, 99, 100, 101, 102, 103, 96] + + img = cv2.imread(in_path) + + for i in range(len(boxes)): + draw_box(img, np.array(boxes[i])) + + image = cv2.resize(img, dsize=None, fx=scale, fy=scale) + + def draw_line(point_index, image, point): + for i in range(len(point_index) - 1): + cur_index = point_index[i] + next_index = point_index[i + 1] + cur_pt = (int(point[cur_index][0] * scale), + int(point[cur_index][1] * scale)) + next_pt = (int(point[next_index][0] * scale), + int(point[next_index][1] * scale)) + cv2.line(image, cur_pt, next_pt, (0, 0, 255), thickness=2) + + for i in range(len(keypoints)): + points = keypoints[i] + + draw_line(face_contour_point_index, image, points) + draw_line(left_eye_brow_point_index, image, points) + 
draw_line(right_eye_brow_point_index, image, points) + draw_line(left_eye_point_index, image, points) + draw_line(right_eye_point_index, image, points) + draw_line(nose_bridge_point_index, image, points) + draw_line(nose_contour_point_index, image, points) + draw_line(mouth_outer_point_index, image, points) + draw_line(mouth_inter_point_index, image, points) + + size = len(points) + for i in range(size): + x = int(points[i][0]) + y = int(points[i][1]) + cv2.putText(image, str(i), (int(x * scale), int(y * scale)), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) + cv2.circle(image, (int(x * scale), int(y * scale)), 2, (0, 255, 0), + cv2.FILLED) + + if save_path is not None: + cv2.imwrite(save_path, image) + + return image + + def draw_face_detection_no_lm_result(img_path, detection_result): bboxes = np.array(detection_result[OutputKeys.BOXES]) scores = np.array(detection_result[OutputKeys.SCORES]) diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py index a5e347e8..7ccc8a59 100644 --- a/tests/pipelines/test_face_2d_keypoints.py +++ b/tests/pipelines/test_face_2d_keypoints.py @@ -1,11 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import unittest -import cv2 - from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_106face_keypoints from modelscope.utils.test_utils import test_level @@ -13,7 +12,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_face_2d_keypoints(self): - img_path = 'data/test/images/keypoints_detect/test_img_face_2d_keypoints.png' + img_path = 'data/test/images/face_detection.png' model_id = 'damo/cv_mobilenet_face-2d-keypoints_alignment' face_2d_keypoints_align = pipeline( @@ -21,15 +20,21 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): output = face_2d_keypoints_align(img_path) output_keypoints = output[OutputKeys.KEYPOINTS] - output_pose = output[OutputKeys.POSES] - - img = cv2.imread(img_path) - img = face_2d_keypoints_align.show_result( - img, output_keypoints, scale=2, save_path='face_keypoints.jpg') - - self.assertEqual(output_keypoints.shape[0], 106) - self.assertEqual(output_keypoints.shape[1], 2) - self.assertEqual(output_pose.shape[0], 3) + output_poses = output[OutputKeys.POSES] + output_boxes = output[OutputKeys.BOXES] + + draw_106face_keypoints( + img_path, + output_keypoints, + output_boxes, + scale=2, + save_path='face_keypoints.jpg') + + for idx in range(len(output_keypoints)): + self.assertEqual(output_keypoints[idx].shape[0], 106) + self.assertEqual(output_keypoints[idx].shape[1], 2) + self.assertEqual(output_poses[idx].shape[0], 3) + self.assertEqual(output_boxes[idx].shape[0], 4) if __name__ == '__main__': From 0d3b7b0df210418326295c4cbe1c07152e540af0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 31 Oct 2022 20:52:27 +0800 Subject: [PATCH 15/46] [to #42322933]fix bugs relate to token cls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.修复token classification preprocessor finetune结果错误问题 2.修复word segmentation output 无用属性 3. 修复nlp preprocessor传use_fast错误 4. 修复torch model exporter bug 5. 
修复文档撰写过程中发现trainer相关bug Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10573269 --- modelscope/exporters/torch_model_exporter.py | 5 +- modelscope/outputs/outputs.py | 11 +- .../nlp/token_classification_pipeline.py | 4 +- .../nlp/word_segmentation_pipeline.py | 6 +- modelscope/preprocessors/nlp/nlp_base.py | 17 +- .../nlp/token_classification_preprocessor.py | 148 ++++++++++-------- .../trainers/nlp/text_generation_trainer.py | 2 +- modelscope/trainers/nlp_trainer.py | 6 +- modelscope/trainers/trainer.py | 2 +- tests/outputs/test_model_outputs.py | 3 +- .../test_finetune_token_classificatin.py | 2 +- 11 files changed, 110 insertions(+), 96 deletions(-) diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py index 7bf6c0c0..1d332591 100644 --- a/modelscope/exporters/torch_model_exporter.py +++ b/modelscope/exporters/torch_model_exporter.py @@ -128,7 +128,7 @@ class TorchModelExporter(Exporter): args_list = list(args) else: args_list = [args] - if isinstance(args_list[-1], dict): + if isinstance(args_list[-1], Mapping): args_dict = args_list[-1] args_list = args_list[:-1] n_nonkeyword = len(args_list) @@ -284,9 +284,8 @@ class TorchModelExporter(Exporter): 'Model property dummy_inputs must be set.') dummy_inputs = collate_fn(dummy_inputs, device) if isinstance(dummy_inputs, Mapping): - dummy_inputs = self._decide_input_format(model, dummy_inputs) dummy_inputs_filter = [] - for _input in dummy_inputs: + for _input in self._decide_input_format(model, dummy_inputs): if _input is not None: dummy_inputs_filter.append(_input) else: diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index b7003809..2c6dd85a 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -491,17 +491,8 @@ TASK_OUTPUTS = { # word segmentation result for single sample # { # "output": "今天 天气 不错 , 适合 出去 游玩" - # "labels": [ - # {'word': '今天', 'label': 'PROPN'}, - # {'word': '天气', 'label': 'PROPN'}, - # {'word': '不错', 'label': 'VERB'}, - # {'word': ',', 'label': 'NUM'}, - # {'word': '适合', 'label': 'NOUN'}, - # {'word': '出去', 'label': 'PART'}, - # {'word': '游玩', 'label': 'ADV'}, - # ] # } - Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS], + Tasks.word_segmentation: [OutputKeys.OUTPUT], # TODO @wenmeng.zwm support list of result check # named entity recognition result for single sample diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 75bc538d..4af187ee 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -109,13 +109,13 @@ class TokenClassificationPipeline(Pipeline): chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - # for cws output + # for cws outputs if len(chunks) > 0 and chunks[0]['type'] == 'cws': spans = [ chunk['span'] for chunk in chunks if chunk['span'].strip() ] seg_result = ' '.join(spans) - outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + outputs = {OutputKeys.OUTPUT: seg_result} # for ner outputs else: diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 0df8f1ad..c57f6b93 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -115,15 +115,15 @@ class WordSegmentationPipeline(Pipeline): chunk['span'] = 
text[chunk['start']:chunk['end']] chunks.append(chunk) - # for cws output + # for cws outputs if len(chunks) > 0 and chunks[0]['type'] == 'cws': spans = [ chunk['span'] for chunk in chunks if chunk['span'].strip() ] seg_result = ' '.join(spans) - outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + outputs = {OutputKeys.OUTPUT: seg_result} - # for ner outpus + # for ner output else: outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 48a04d7a..45efc6e7 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -34,6 +34,7 @@ class NLPBasePreprocessor(Preprocessor, ABC): label=None, label2id=None, mode=ModeKeys.INFERENCE, + use_fast=None, **kwargs): """The NLP preprocessor base class. @@ -45,14 +46,18 @@ class NLPBasePreprocessor(Preprocessor, ABC): label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode + use_fast: use the fast version of tokenizer + """ self.model_dir = model_dir self.first_sequence = first_sequence self.second_sequence = second_sequence self.label = label - self.use_fast = kwargs.pop('use_fast', None) - if self.use_fast is None and os.path.isfile( + self.use_fast = use_fast + if self.use_fast is None and model_dir is None: + self.use_fast = False + elif self.use_fast is None and os.path.isfile( os.path.join(model_dir, 'tokenizer_config.json')): with open(os.path.join(model_dir, 'tokenizer_config.json'), 'r') as f: @@ -61,8 +66,8 @@ class NLPBasePreprocessor(Preprocessor, ABC): self.use_fast = False if self.use_fast is None else self.use_fast self.label2id = label2id - if self.label2id is None: - self.label2id = parse_label_mapping(self.model_dir) + if self.label2id is None and model_dir is not None: + self.label2id = parse_label_mapping(model_dir) super().__init__(mode, **kwargs) @property @@ -106,6 +111,7 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): label: str = 'label', label2id: dict = None, mode: str = ModeKeys.INFERENCE, + use_fast: bool = None, **kwargs): """The NLP tokenizer preprocessor base class. @@ -122,11 +128,12 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): - config.json label2id/id2label - label_mapping.json mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. + use_fast: use the fast version of tokenizer kwargs: These kwargs will be directly fed into the tokenizer. 
""" super().__init__(model_dir, first_sequence, second_sequence, label, - label2id, mode) + label2id, mode, use_fast, **kwargs) self.model_dir = model_dir self.tokenize_kwargs = kwargs self.tokenizer = self.build_tokenizer(model_dir) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 2de0c806..5069048b 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Tuple, Union +import numpy as np import torch from modelscope.metainfo import Preprocessors @@ -20,9 +21,7 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): """ def __init__(self, **kwargs): - super().__init__(**kwargs) - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') + self.first_sequence: str = kwargs.pop('first_sequence', 'tokens') self.label = kwargs.pop('label', OutputKeys.LABELS) def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: @@ -80,10 +79,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): 'is_split_into_words', False) if 'label2id' in kwargs: kwargs.pop('label2id') - self.tokenize_kwargs = kwargs - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[dict, str]) -> Dict[str, Any]: """process the raw input data Args: @@ -99,18 +97,24 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): text = None labels_list = None if isinstance(data, str): + # for inference inputs without label text = data + self.tokenize_kwargs['add_special_tokens'] = False elif isinstance(data, dict): + # for finetune inputs with label text = data.get(self.first_sequence) labels_list = data.get(self.label) + if isinstance(text, list): + self.tokenize_kwargs['is_split_into_words'] = True input_ids = [] label_mask = [] offset_mapping = [] - if self.is_split_into_words: - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) + token_type_ids = [] + if self.is_split_into_words and self._mode == ModeKeys.INFERENCE: + for offset, token in enumerate(list(text)): + subtoken_ids = self.tokenizer.encode(token, + **self.tokenize_kwargs) if len(subtoken_ids) == 0: subtoken_ids = [self.tokenizer.unk_token_id] input_ids.extend(subtoken_ids) @@ -119,10 +123,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): else: if self.tokenizer.is_fast: encodings = self.tokenizer( - text, - add_special_tokens=False, - return_offsets_mapping=True, - **self.tokenize_kwargs) + text, return_offsets_mapping=True, **self.tokenize_kwargs) + attention_mask = encodings['attention_mask'] + token_type_ids = encodings['token_type_ids'] input_ids = encodings['input_ids'] word_ids = encodings.word_ids() for i in range(len(word_ids)): @@ -143,69 +146,80 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( text) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - offset_mapping = offset_mapping[:sum(label_mask)] + if 
self._mode == ModeKeys.INFERENCE: + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] - if self._mode == ModeKeys.INFERENCE: input_ids = torch.tensor(input_ids).unsqueeze(0) attention_mask = torch.tensor(attention_mask).unsqueeze(0) label_mask = torch.tensor( label_mask, dtype=torch.bool).unsqueeze(0) - # the token classification - output = { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - # align the labels with tokenized text - if labels_list is not None: - assert self.label2id is not None - # Map that sends B-Xxx label to its I-Xxx counterpart - b_to_i_label = [] - label_enumerate_values = [ - k for k, v in sorted( - self.label2id.items(), key=lambda item: item[1]) - ] - for idx, label in enumerate(label_enumerate_values): - if label.startswith('B-') and label.replace( - 'B-', 'I-') in label_enumerate_values: - b_to_i_label.append( - label_enumerate_values.index( - label.replace('B-', 'I-'))) - else: - b_to_i_label.append(idx) + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + else: + output = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + } - label_row = [self.label2id[lb] for lb in labels_list] - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - if word_idx is None: - label_ids.append(-100) - elif word_idx != previous_word_idx: - label_ids.append(label_row[word_idx]) - else: - if self.label_all_tokens: - label_ids.append(b_to_i_label[label_row[word_idx]]) + # align the labels with tokenized text + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: label_ids.append(-100) - previous_word_idx = word_idx - labels = label_ids - output['labels'] = labels + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + output['labels'] = labels + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } return output def 
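# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a standalone sketch of the label
# alignment performed for finetune inputs -- labels come per word, the fast
# tokenizer may split a word into several sub-tokens, and with
# label_all_tokens=False only the first sub-token keeps the word label while
# the rest (and special tokens) receive -100. All inputs below are made up.
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
labels_list = ['B-LOC', 'O']              # one label per word
word_ids = [None, 0, 0, 1, None]          # [CLS], word0 split in two, word1, [SEP]

label_row = [label2id[lb] for lb in labels_list]
label_ids, previous_word_idx = [], None
for word_idx in word_ids:
    if word_idx is None:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(label_row[word_idx])
    else:
        label_ids.append(-100)            # label_all_tokens=False branch
    previous_word_idx = word_idx

print(label_ids)                          # [-100, 1, -100, 0, -100]
# ---------------------------------------------------------------------------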
get_tokenizer_class(self): diff --git a/modelscope/trainers/nlp/text_generation_trainer.py b/modelscope/trainers/nlp/text_generation_trainer.py index 0e26f153..f02faf71 100644 --- a/modelscope/trainers/nlp/text_generation_trainer.py +++ b/modelscope/trainers/nlp/text_generation_trainer.py @@ -18,7 +18,7 @@ class TextGenerationTrainer(NlpEpochBasedTrainer): return tokenizer.decode(tokens.tolist(), skip_special_tokens=True) def evaluation_step(self, data): - model = self.model + model = self.model.module if self._dist else self.model model.eval() with torch.no_grad(): diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index a92a3706..5ff6f62f 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -586,14 +586,16 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): preprocessor_mode=ModeKeys.TRAIN, **model_args, **self.train_keys, - mode=ModeKeys.TRAIN) + mode=ModeKeys.TRAIN, + use_fast=True) eval_preprocessor = Preprocessor.from_pretrained( self.model_dir, cfg_dict=self.cfg, preprocessor_mode=ModeKeys.EVAL, **model_args, **self.eval_keys, - mode=ModeKeys.EVAL) + mode=ModeKeys.EVAL, + use_fast=True) return train_preprocessor, eval_preprocessor diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 7478d8e4..3556badf 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -876,7 +876,7 @@ class EpochBasedTrainer(BaseTrainer): Subclass and override to inject custom behavior. """ - model = self.model + model = self.model.module if self._dist else self.model model.eval() if is_parallel(model): diff --git a/tests/outputs/test_model_outputs.py b/tests/outputs/test_model_outputs.py index 31271869..311ce201 100644 --- a/tests/outputs/test_model_outputs.py +++ b/tests/outputs/test_model_outputs.py @@ -21,9 +21,10 @@ class TestModelOutput(unittest.TestCase): self.assertEqual(outputs['logits'], torch.Tensor([1])) self.assertEqual(outputs[0], torch.Tensor([1])) self.assertEqual(outputs.logits, torch.Tensor([1])) + outputs.loss = torch.Tensor([2]) logits, loss = outputs self.assertEqual(logits, torch.Tensor([1])) - self.assertTrue(loss is None) + self.assertTrue(loss is not None) if __name__ == '__main__': diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py index 9bdab9b7..a92cee7b 100644 --- a/tests/trainers/test_finetune_token_classificatin.py +++ b/tests/trainers/test_finetune_token_classificatin.py @@ -87,7 +87,7 @@ class TestFinetuneTokenClassification(unittest.TestCase): cfg['dataset'] = { 'train': { 'labels': label_enumerate_values, - 'first_sequence': 'first_sequence', + 'first_sequence': 'tokens', 'label': 'labels', } } From 3464324f6b5d9d0ef975cd0b0e76870e95b5fa22 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Mon, 31 Oct 2022 22:15:25 +0800 Subject: [PATCH 16/46] [to #42322933] limit datasets version for now --- requirements/framework.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index 2408cda6..17fbd8a3 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,7 @@ addict attrs -datasets +# version beyond 2.6.0 introduces compatbility issue and is being resolved +datasets<=2.6.0 easydict einops filelock>=3.3.0 From 5302259a0a3fb7cafdce473aa78990e7dc84e676 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 31 Oct 2022 22:46:17 +0800 Subject: [PATCH 17/46] [to #45854437]fix: add user name to 
user-agent Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10584797 --- modelscope/hub/api.py | 9 +++++++-- modelscope/hub/constants.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index dca6d099..7468e5e3 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -23,7 +23,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_MESSAGE, API_RESPONSE_FIELD_USERNAME, DEFAULT_CREDENTIALS_PATH, - MODELSCOPE_ENVIRONMENT, ONE_YEAR_SECONDS, + MODELSCOPE_ENVIRONMENT, + MODELSCOPE_USERNAME, ONE_YEAR_SECONDS, Licenses, ModelVisibility) from modelscope.hub.errors import (InvalidParameter, NotExistError, NotLoginException, NoValidRevisionError, @@ -760,14 +761,18 @@ class ModelScopeConfig: env = 'custom' if MODELSCOPE_ENVIRONMENT in os.environ: env = os.environ[MODELSCOPE_ENVIRONMENT] + user_name = 'unknown' + if MODELSCOPE_USERNAME in os.environ: + user_name = os.environ[MODELSCOPE_USERNAME] - ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s' % ( + ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % ( __version__, platform.python_version(), ModelScopeConfig.get_user_session_id(), platform.platform(), platform.processor(), env, + user_name, ) if isinstance(user_agent, dict): ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items()) diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 730702c1..373a0cf4 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -18,6 +18,7 @@ API_RESPONSE_FIELD_EMAIL = 'Email' API_RESPONSE_FIELD_MESSAGE = 'Message' MODELSCOPE_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' +MODELSCOPE_USERNAME = 'MODELSCOPE_USERNAME' ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 From 06abae4dc6d68e99cba56608c857de5cdabd16b0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 1 Nov 2022 09:56:15 +0800 Subject: [PATCH 18/46] [to #42322933]add token-cls test cases and bug fix Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10585502 --- .../nlp/token_classification_preprocessor.py | 3 +-- tests/pipelines/test_named_entity_recognition.py | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 5069048b..92b7c46b 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -140,8 +140,7 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_mask.append(1) offset_mapping.append(encodings['offset_mapping'][i]) else: - encodings = self.tokenizer( - text, add_special_tokens=False, **self.tokenize_kwargs) + encodings = self.tokenizer(text, **self.tokenize_kwargs) input_ids = encodings['input_ids'] label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( text) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 3658cf3f..aef4aaed 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -19,9 +19,11 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.named_entity_recognition self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' + 
english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom' tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' sentence = '这与温岭市新河镇的一个神秘的传说有关。' + sentence_en = 'pizza shovel' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): @@ -89,6 +91,12 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.lcrf_model_id) print(pipeline_ins(input=self.sentence)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_english_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.english_model_id) + print(pipeline_ins(input='pizza shovel')) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.named_entity_recognition) From 9187103e3a32d4048e79e57d23fa596b2d1bffd5 Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Tue, 1 Nov 2022 09:57:31 +0800 Subject: [PATCH 19/46] =?UTF-8?q?[to=20#42322933]=E5=85=BC=E5=AE=B9?= =?UTF-8?q?=E6=96=B0=E5=A2=9Eclip=20huge=E6=A8=A1=E5=9E=8B=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-?= =?UTF-8?q?lib/codereview/10585552?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * compatiable with vit huge, and set clip base default mm-ebed pipeline --- modelscope/models/multi_modal/clip/model.py | 6 ++++-- modelscope/pipelines/builder.py | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py index b1c84292..9b82e4a1 100644 --- a/modelscope/models/multi_modal/clip/model.py +++ b/modelscope/models/multi_modal/clip/model.py @@ -349,11 +349,13 @@ class CLIP(nn.Module): text_num_hidden_layers: int, text_type_vocab_size: int, tokenizer: FullTokenizer, + # vision_head_width, added this param for ViT-H + vision_head_width: int = 64, ): super().__init__() if isinstance(vision_layers, (tuple, list)): - vision_heads = vision_width * 32 // 64 + vision_heads = vision_width * 32 // vision_head_width self.visual = ModifiedResNet( layers=vision_layers, output_dim=embed_dim, @@ -361,7 +363,7 @@ class CLIP(nn.Module): input_resolution=image_resolution, width=vision_width) else: - vision_heads = vision_width // 64 + vision_heads = vision_width // vision_head_width self.visual = VisualTransformer( input_resolution=image_resolution, patch_size=vision_patch_size, diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 498c9ed8..70f8f11c 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -93,9 +93,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet50_live-category'), Tasks.video_category: (Pipelines.video_category, 'damo/cv_resnet50_video-category'), - Tasks.multi_modal_embedding: - (Pipelines.multi_modal_embedding, - 'damo/multi-modal_clip-vit-large-patch14_zh'), + Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding, + 'damo/multi-modal_clip-vit-base-patch16_zh'), Tasks.generative_multi_modal_embedding: (Pipelines.generative_multi_modal_embedding, 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding' From 40b677095605594d426b9c731687fb834d04b4fc Mon Sep 17 00:00:00 2001 From: "liugao.lg" 
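The vision_head_width argument added to CLIP above exists because ViT-H checkpoints use 80-dimensional attention heads rather than the 64 the old code hard-coded; dividing a 1280-wide ViT-H transformer by 64 would give 20 heads instead of the 16 that ViT-H/14 is usually configured with. A one-line sketch with commonly used configurations (the concrete numbers are illustrative, not read from this repo's model configs):

def vision_heads(vision_width: int, vision_head_width: int = 64) -> int:
    # attention head count = transformer width / per-head dimension
    return vision_width // vision_head_width

assert vision_heads(1024) == 16       # ViT-L/14: width 1024, 64-dim heads
assert vision_heads(1280, 80) == 16   # ViT-H/14: width 1280, 80-dim heads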
Date: Tue, 1 Nov 2022 10:22:11 +0800 Subject: [PATCH 20/46] [to #42322933]fix ocr prepreocess & conflict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复ocr预处理逻辑不一致问题 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10581697 --- modelscope/preprocessors/multi_modal.py | 1 - modelscope/preprocessors/ofa/ocr_recognition.py | 11 ++++++----- requirements/multi-modal.txt | 2 ++ tests/trainers/test_ofa_trainer.py | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 17dffb48..13876058 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -96,7 +96,6 @@ class OfaPreprocessor(Preprocessor): data = input else: data = self._build_dict(input) - data = self._ofa_input_compatibility_conversion(data) sample = self.preprocess(data) str_data = dict() for k, v in data.items(): diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 26fff9d2..a0342c14 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -2,12 +2,12 @@ from typing import Any, Dict import torch -from PIL import Image +import unicodedata2 from torchvision import transforms from torchvision.transforms import InterpolationMode from torchvision.transforms import functional as F +from zhconv import convert -from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -98,8 +98,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = data[self.column_map['text']] - target = target.translate(self.transtab).strip() + target = sample['label'] target_token_list = target.strip().split() target = ' '.join(target_token_list[:self.max_tgt_length]) sample['target'] = self.tokenize_text(target, add_bos=False) @@ -119,5 +118,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]) } if 'text' in self.column_map and self.column_map['text'] in data: - sample['label'] = data[self.column_map['text']] + target = data[self.column_map['text']] + target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans')) + sample['label'] = target return sample diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 255f6155..578f0b54 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -11,3 +11,5 @@ timm tokenizers torchvision transformers>=4.12.0 +unicodedata2 +zhconv diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 3f68a9fb..85c21881 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -85,7 +85,7 @@ class TestOfaTrainer(unittest.TestCase): 'ocr_fudanvi_zh', subset_name='scene', namespace='modelscope', - split='train[:200]', + split='train[800:900]', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), eval_dataset=MsDataset.load( 'ocr_fudanvi_zh', From f451ff8905e1615ec3adb3110fac89d8fe9bb492 Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 11:22:46 +0800 Subject: [PATCH 21/46] api tagging for pipeline/train/evaluate --- modelscope/hub/api.py | 24 ++++++++++++++++++++++++ modelscope/pipelines/base.py | 5 ++++- modelscope/trainers/trainer.py | 7 
+++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 7468e5e3..36c246f1 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -646,6 +646,30 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) + def create_library_statistics(self, + method: str, + name: str, + cn_name: Optional[str]): + """ + create library statistics. called by train()/evaluate()/pipeline() + + Args: + method (str): called methed name,i.e train/evaluate/pipeline + name (str): model name, for example: damo/cv_unet_person-image-cartoon_compound-models + cn_name (str): model name in chinese, for example: 达摩卡通化模型 + Raises: + ValueError: If user_cookies is True, but no local cookie. + + Returns: + None + """ + path = f'{self.endpoint}/api/v1/statistics/library' + headers = {'user-agent': ModelScopeConfig.get_user_agent()} + params = {"Method": method, "Name": name, "CnName": cn_name} + r = requests.post(path, params=params, headers=headers) + r.raise_for_status() + return + class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index bca80502..b8856dea 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -23,6 +23,7 @@ from modelscope.utils.hub import read_config, snapshot_download from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import _find_free_port, _is_free_port +from modelscope.hub.api import HubApi from .util import is_model, is_official_hub_path if is_torch_available(): @@ -151,7 +152,9 @@ class Pipeline(ABC): **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function - + _api = HubApi() + model_name = self.cfg.task + _api.create_library_statistics("pipeline", model_name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): if not self._model_prepare: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 3556badf..6e5f4180 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -39,6 +39,7 @@ from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, init_dist, set_random_seed) +from modelscope.hub.api import HubApi from .base import BaseTrainer from .builder import TRAINERS from .default_config import merge_cfg @@ -436,6 +437,9 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN + _api = HubApi() + model_name = self.cfg.task + _api.create_library_statistics("train", model_name, None) if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -456,6 +460,9 @@ class EpochBasedTrainer(BaseTrainer): self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): + _api = HubApi() + model_name = self.cfg.task + _api.create_library_statistics("evaluate", model_name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook CheckpointHook.load_checkpoint(checkpoint_path, self) From a79a900e94d2bff8fd4e3d8843ff065f35ca6096 Mon Sep 17 00:00:00 2001 From: 
"jiangyu.xzy" Date: Tue, 1 Nov 2022 11:35:28 +0800 Subject: [PATCH 22/46] change api to utils --- modelscope/hub/api.py | 23 ----------------------- modelscope/hub/utils/utils.py | 13 +++++++++++++ modelscope/pipelines/base.py | 5 ++--- modelscope/trainers/trainer.py | 8 +++----- 4 files changed, 18 insertions(+), 31 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 36c246f1..224c55ff 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -646,29 +646,6 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) - def create_library_statistics(self, - method: str, - name: str, - cn_name: Optional[str]): - """ - create library statistics. called by train()/evaluate()/pipeline() - - Args: - method (str): called methed name,i.e train/evaluate/pipeline - name (str): model name, for example: damo/cv_unet_person-image-cartoon_compound-models - cn_name (str): model name in chinese, for example: 达摩卡通化模型 - Raises: - ValueError: If user_cookies is True, but no local cookie. - - Returns: - None - """ - path = f'{self.endpoint}/api/v1/statistics/library' - headers = {'user-agent': ModelScopeConfig.get_user_agent()} - params = {"Method": method, "Name": name, "CnName": cn_name} - r = requests.post(path, params=params, headers=headers) - r.raise_for_status() - return class ModelScopeConfig: diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index a54f3413..8d5db579 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -4,6 +4,7 @@ import hashlib import os from datetime import datetime from typing import Optional +import requests from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, @@ -12,6 +13,7 @@ from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, from modelscope.hub.errors import FileIntegrityError from modelscope.utils.file_utils import get_default_cache_dir from modelscope.utils.logger import get_logger +from modelscope.hub.api import ModelScopeConfig logger = get_logger() @@ -85,3 +87,14 @@ def file_integrity_validation(file_path, expected_sha256): msg = 'File %s integrity check failed, the download may be incomplete, please try again.' 
% file_path logger.error(msg) raise FileIntegrityError(msg) + + +def create_library_statistics(method: str, + name: str, + cn_name: Optional[str]): + path = f'{get_endpoint()}/api/v1/statistics/library' + headers = {'user-agent': ModelScopeConfig.get_user_agent()} + params = {"Method": method, "Name": name, "CnName": cn_name} + r = requests.post(path, params=params, headers=headers) + r.raise_for_status() + return diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index b8856dea..a56ee934 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -23,7 +23,7 @@ from modelscope.utils.hub import read_config, snapshot_download from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import _find_free_port, _is_free_port -from modelscope.hub.api import HubApi +from modelscope.hub.utils.utils import create_library_statistics from .util import is_model, is_official_hub_path if is_torch_available(): @@ -152,9 +152,8 @@ class Pipeline(ABC): **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function - _api = HubApi() model_name = self.cfg.task - _api.create_library_statistics("pipeline", model_name, None) + create_library_statistics("pipeline", model_name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): if not self._model_prepare: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 6e5f4180..92541252 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -39,7 +39,7 @@ from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, init_dist, set_random_seed) -from modelscope.hub.api import HubApi +from modelscope.hub.utils.utils import create_library_statistics from .base import BaseTrainer from .builder import TRAINERS from .default_config import merge_cfg @@ -437,9 +437,8 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN - _api = HubApi() model_name = self.cfg.task - _api.create_library_statistics("train", model_name, None) + create_library_statistics("train", model_name, None) if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -460,9 +459,8 @@ class EpochBasedTrainer(BaseTrainer): self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): - _api = HubApi() model_name = self.cfg.task - _api.create_library_statistics("evaluate", model_name, None) + create_library_statistics("evaluate", model_name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook CheckpointHook.load_checkpoint(checkpoint_path, self) From 60af6b701b453fdb09cf1f326f8cfac35fcfa27f Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 11:59:59 +0800 Subject: [PATCH 23/46] fix task to model; handle exception --- modelscope/hub/utils/utils.py | 13 ++++++++----- modelscope/pipelines/base.py | 2 +- modelscope/trainers/trainer.py | 4 ++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 8d5db579..5c915998 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -92,9 
+92,12 @@ def file_integrity_validation(file_path, expected_sha256): def create_library_statistics(method: str, name: str, cn_name: Optional[str]): - path = f'{get_endpoint()}/api/v1/statistics/library' - headers = {'user-agent': ModelScopeConfig.get_user_agent()} - params = {"Method": method, "Name": name, "CnName": cn_name} - r = requests.post(path, params=params, headers=headers) - r.raise_for_status() + try: + path = f'{get_endpoint()}/api/v1/statistics/library' + headers = {'user-agent': ModelScopeConfig.get_user_agent()} + params = {"Method": method, "Name": name, "CnName": cn_name} + r = requests.post(path, params=params, headers=headers) + r.raise_for_status() + except Exception: + pass return diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index a56ee934..9280cc09 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -152,7 +152,7 @@ class Pipeline(ABC): **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function - model_name = self.cfg.task + model_name = self.cfg.model.type create_library_statistics("pipeline", model_name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 92541252..522405ff 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -437,7 +437,7 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN - model_name = self.cfg.task + model_name = self.cfg.model.type create_library_statistics("train", model_name, None) if self.train_dataset is None: @@ -459,7 +459,7 @@ class EpochBasedTrainer(BaseTrainer): self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): - model_name = self.cfg.task + model_name = self.cfg.model.type create_library_statistics("evaluate", model_name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook From 4080f8071e96d4dbcc5ae8af10b051e14fea30ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 12:57:04 +0800 Subject: [PATCH 24/46] temp --- modelscope/hub/api.py | 11 +++++++++++ modelscope/msdatasets/ms_dataset.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 7468e5e3..0262fc1d 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -646,6 +646,17 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) + def count_uv_by_channel(self, dataset_name: str, namespace: str, channel: str): + # todo: 1. check args 2. 
+ + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}' + cookies = ModelScopeConfig.get_cookies() + r = requests.post(url, cookies=cookies, headers=self.headers) + resp = r.json() + raise_on_error(resp) + print(resp) + return resp['Message'] + class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 0c537df7..a7d29990 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -727,3 +727,17 @@ class MsDataset: resp_msg = _delete_manager.delete(object_name=object_name) logger.info(f'Object {object_name} successfully removed!') return resp_msg + + +if __name__ == '__main__': + from modelscope.hub.api import HubApi + api = HubApi() + # api.login('c252d64a-ce7b-4c0c-b583-7bedf628c7da') # online + # api.login('aa14716f-e2de-4f26-bf49-254d81eb8ac6') # test + + channel = 'local' # dsw + dataset_name = 'small_coco_for_test' + namespace = 'wangxingjun778test' + resp = api.count_uv_by_channel( + dataset_name=dataset_name, namespace=namespace, channel=channel) + print(resp) From f5c31b33198288405f209773cd41a5efa1991e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 13:31:25 +0800 Subject: [PATCH 25/46] Add miss init --- .../models/science/unifold/modules/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 modelscope/models/science/unifold/modules/__init__.py diff --git a/modelscope/models/science/unifold/modules/__init__.py b/modelscope/models/science/unifold/modules/__init__.py new file mode 100644 index 00000000..9821d212 --- /dev/null +++ b/modelscope/models/science/unifold/modules/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Data pipeline for model features.""" From 943478de635393e957bb0bf6ad677fdd189ac5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 13:32:57 +0800 Subject: [PATCH 26/46] Update --- .../models/science/unifold/modules/__init__.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/modelscope/models/science/unifold/modules/__init__.py b/modelscope/models/science/unifold/modules/__init__.py index 9821d212..63aa84ed 100644 --- a/modelscope/models/science/unifold/modules/__init__.py +++ b/modelscope/models/science/unifold/modules/__init__.py @@ -1,14 +1,3 @@ -# Copyright 2021 DeepMind Technologies Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Data pipeline for model features.""" +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. +"""Unifold Modules.""" From 2759d538bb30c8c82d0dd32ea3b4bcd7606d41d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 14:59:45 +0800 Subject: [PATCH 27/46] fix ut level for unifold --- tests/pipelines/test_unifold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_unifold.py b/tests/pipelines/test_unifold.py index df35dc5e..47bb7874 100644 --- a/tests/pipelines/test_unifold.py +++ b/tests/pipelines/test_unifold.py @@ -19,7 +19,7 @@ class UnifoldProteinStructureTest(unittest.TestCase, DemoCompatibilityCheck): self.protein_multimer = 'GAMGLPEEPSSPQESTLKALSLYEAHLSSYIMYLQTFLVKTKQKVNNKNYPEFTLFDTSKLKKDQTLKSIKT' + \ 'NIAALKNHIDKIKPIAMQIYKKYSKNIP' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id) mono_pipeline_ins = pipeline(task=self.task, model=model_dir) From cc76d900bcf2a7aae0a41d02d861f1865aba4b2c Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 15:31:08 +0800 Subject: [PATCH 28/46] add model name to baseModel. use model name as tag --- modelscope/hub/t_jy.py | 16 ++++++++++++++++ modelscope/models/base/base_model.py | 2 ++ modelscope/pipelines/base.py | 5 +++-- modelscope/trainers/trainer.py | 8 ++++---- 4 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 modelscope/hub/t_jy.py diff --git a/modelscope/hub/t_jy.py b/modelscope/hub/t_jy.py new file mode 100644 index 00000000..baf84f46 --- /dev/null +++ b/modelscope/hub/t_jy.py @@ -0,0 +1,16 @@ +def dec(param1): + print(param1) + + def in_dec(func): + def in_func(name): + return func(name) + return in_func + return in_dec + + +@dec("dec1") +def aa(param): + print(param) + return + +aa("heell") \ No newline at end of file diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index 1ca7e030..721478c3 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -131,6 +131,8 @@ class Model(ABC): if not hasattr(model, 'cfg'): model.cfg = cfg + + model.name = model_name_or_path return model def save_pretrained(self, diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 9280cc09..b9a4a25c 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -152,8 +152,9 @@ class Pipeline(ABC): **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function - model_name = self.cfg.model.type - create_library_statistics("pipeline", model_name, None) + for single_model in self.models: + if hasattr(single_model, 'name'): + create_library_statistics("pipeline", single_model.name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): if not self._model_prepare: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 522405ff..2e79667f 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -437,8 +437,8 @@ class EpochBasedTrainer(BaseTrainer): def train(self, 
checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN - model_name = self.cfg.model.type - create_library_statistics("train", model_name, None) + if hasattr(self.model, 'name'): + create_library_statistics("train", self.model.name, None) if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -459,8 +459,8 @@ class EpochBasedTrainer(BaseTrainer): self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): - model_name = self.cfg.model.type - create_library_statistics("evaluate", model_name, None) + if hasattr(self.model, 'name'): + create_library_statistics("evaluate", self.model.name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook CheckpointHook.load_checkpoint(checkpoint_path, self) From 184c35f80031574d53019124d56637ddfca4aa66 Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 15:32:04 +0800 Subject: [PATCH 29/46] rm useless --- modelscope/hub/t_jy.py | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 modelscope/hub/t_jy.py diff --git a/modelscope/hub/t_jy.py b/modelscope/hub/t_jy.py deleted file mode 100644 index baf84f46..00000000 --- a/modelscope/hub/t_jy.py +++ /dev/null @@ -1,16 +0,0 @@ -def dec(param1): - print(param1) - - def in_dec(func): - def in_func(name): - return func(name) - return in_func - return in_dec - - -@dec("dec1") -def aa(param): - print(param) - return - -aa("heell") \ No newline at end of file From 84032f90e3f2b4a183725ceda16a4b1dc204c2f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:34:58 +0800 Subject: [PATCH 30/46] add event tracking --- modelscope/hub/api.py | 20 ++++++++++++++------ modelscope/msdatasets/ms_dataset.py | 16 ++-------------- modelscope/utils/constant.py | 8 ++++++++ requirements/framework.txt | 2 +- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 0262fc1d..f2ff822d 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -39,8 +39,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, DEFAULT_REPOSITORY_REVISION, MASTER_MODEL_BRANCH, DatasetFormations, - DatasetMetaFormats, DownloadMode, - ModelFile) + DatasetMetaFormats, DownloadChannel, + DownloadMode, ModelFile) from modelscope.utils.logger import get_logger from .utils.utils import (get_endpoint, get_release_datetime, model_id_to_group_owner_name) @@ -646,15 +646,23 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) - def count_uv_by_channel(self, dataset_name: str, namespace: str, channel: str): - # todo: 1. check args 2. 
+ def dataset_download_uv(self, dataset_name: str, namespace: str): + if not dataset_name or not namespace: + raise ValueError('dataset_name or namespace cannot be empty!') - url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}' + # get channel and user_name + channel = DownloadChannel.LOCAL.value + user_name = '' + if MODELSCOPE_ENVIRONMENT in os.environ: + channel = os.environ[MODELSCOPE_ENVIRONMENT] + if MODELSCOPE_USERNAME in os.environ: + user_name = os.environ[MODELSCOPE_USERNAME] + + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}' cookies = ModelScopeConfig.get_cookies() r = requests.post(url, cookies=cookies, headers=self.headers) resp = r.json() raise_on_error(resp) - print(resp) return resp['Message'] diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index a7d29990..5c8ea59f 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -274,6 +274,8 @@ class MsDataset: try: api.on_dataset_download( dataset_name=download_dataset, namespace=namespace) + api.dataset_download_uv( + dataset_name=download_dataset, namespace=namespace) except Exception as e: logger.error(e) @@ -727,17 +729,3 @@ class MsDataset: resp_msg = _delete_manager.delete(object_name=object_name) logger.info(f'Object {object_name} successfully removed!') return resp_msg - - -if __name__ == '__main__': - from modelscope.hub.api import HubApi - api = HubApi() - # api.login('c252d64a-ce7b-4c0c-b583-7bedf628c7da') # online - # api.login('aa14716f-e2de-4f26-bf49-254d81eb8ac6') # test - - channel = 'local' # dsw - dataset_name = 'small_coco_for_test' - namespace = 'wangxingjun778test' - resp = api.count_uv_by_channel( - dataset_name=dataset_name, namespace=namespace, channel=channel) - print(resp) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2729b75a..f0a97dbd 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -238,6 +238,14 @@ class DownloadMode(enum.Enum): FORCE_REDOWNLOAD = 'force_redownload' +class DownloadChannel(enum.Enum): + """ Channels of datasets downloading for uv/pv counting. + """ + LOCAL = 'local' + DSW = 'dsw' + EAIS = 'eais' + + class UploadMode(enum.Enum): """ How to upload object to remote. 
""" diff --git a/requirements/framework.txt b/requirements/framework.txt index 17fbd8a3..e78bc9a9 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,7 +1,7 @@ addict attrs # version beyond 2.6.0 introduces compatbility issue and is being resolved -datasets<=2.6.0 +datasets<=2.5.2 easydict einops filelock>=3.3.0 From 79c44a68102e182b3194e3b9e6244d4891859274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:41:01 +0800 Subject: [PATCH 31/46] add event tracking --- requirements/framework.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index e78bc9a9..a86c0cc5 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,6 @@ addict attrs -# version beyond 2.6.0 introduces compatbility issue and is being resolved +# version beyond 2.5.2 introduces compatbility issue and is being resolved datasets<=2.5.2 easydict einops From 63a08e7be68bce218eb6ca755ecbc821017d83b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:49:21 +0800 Subject: [PATCH 32/46] add event tracking --- tests/msdatasets/test_dataset_upload.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index 3d35d480..b67c2ebb 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -104,7 +104,11 @@ class DatasetUploadTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_download_dir(self): - test_ds = MsDataset.load(self.dataset_name, self.namespace) + from modelscope.utils.constant import DownloadMode + test_ds = MsDataset.load( + self.dataset_name, + namespace=self.namespace, + download_mode=DownloadMode.FORCE_REDOWNLOAD) assert test_ds.config_kwargs['split_config'].values() @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') From e45ab2c32d66a3ae8014be045d773719b82cb0cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:51:00 +0800 Subject: [PATCH 33/46] add event tracking --- tests/msdatasets/test_dataset_upload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index b67c2ebb..d91f24d7 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -8,7 +8,8 @@ import zipfile from modelscope.msdatasets import MsDataset from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects from modelscope.utils import logger as logging -from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile +from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode, + ModelFile) from modelscope.utils.test_utils import test_level logger = logging.get_logger(__name__) @@ -104,7 +105,6 @@ class DatasetUploadTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_download_dir(self): - from modelscope.utils.constant import DownloadMode test_ds = MsDataset.load( self.dataset_name, namespace=self.namespace, From 5f3c9433fc83bc13fb00d552270e5dc8d6933854 Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 16:35:46 +0800 Subject: [PATCH 34/46] fix format --- modelscope/hub/api.py | 1 - modelscope/hub/utils/utils.py | 2 +- 2 files changed, 1 
insertion(+), 2 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 224c55ff..7468e5e3 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -647,7 +647,6 @@ class HubApi: return self._check_cookie(use_cookies=use_cookies) - class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) COOKIES_FILE_NAME = 'cookies' diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 5c915998..312647c2 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -95,7 +95,7 @@ def create_library_statistics(method: str, try: path = f'{get_endpoint()}/api/v1/statistics/library' headers = {'user-agent': ModelScopeConfig.get_user_agent()} - params = {"Method": method, "Name": name, "CnName": cn_name} + params = {'Method': method, 'Name': name, 'CnName': cn_name} r = requests.post(path, params=params, headers=headers) r.raise_for_status() except Exception: From 76bb518d75818ce8e19afa0f0b775b00ac9a72cd Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 16:59:47 +0800 Subject: [PATCH 35/46] fix format --- modelscope/hub/utils/utils.py | 8 +++----- modelscope/trainers/trainer.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 312647c2..f9a75cce 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -2,10 +2,11 @@ import hashlib import os +import requests from datetime import datetime from typing import Optional -import requests +from modelscope.hub.api import ModelScopeConfig from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_SDK_DEBUG, @@ -13,7 +14,6 @@ from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, from modelscope.hub.errors import FileIntegrityError from modelscope.utils.file_utils import get_default_cache_dir from modelscope.utils.logger import get_logger -from modelscope.hub.api import ModelScopeConfig logger = get_logger() @@ -89,9 +89,7 @@ def file_integrity_validation(file_path, expected_sha256): raise FileIntegrityError(msg) -def create_library_statistics(method: str, - name: str, - cn_name: Optional[str]): +def create_library_statistics(method: str, name: str, cn_name: Optional[str]): try: path = f'{get_endpoint()}/api/v1/statistics/library' headers = {'user-agent': ModelScopeConfig.get_user_agent()} diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 2e79667f..d59c3dfc 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -14,6 +14,7 @@ from torch.utils.data import DataLoader, Dataset from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler +from modelscope.hub.utils.utils import create_library_statistics from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics @@ -39,7 +40,6 @@ from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, init_dist, set_random_seed) -from modelscope.hub.utils.utils import create_library_statistics from .base import BaseTrainer from .builder import TRAINERS from .default_config import merge_cfg From 30c8c27145261a3e5c7606976e11faef733d3f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 
Nov 2022 17:06:30 +0800 Subject: [PATCH 36/46] up requirements --- requirements/science.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/science.txt b/requirements/science.txt index 72994f72..c345da99 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -4,3 +4,5 @@ ml_collections scipy tensorboardX tokenizers +biopython +ipdb From 853e5235d56bf35922cde0db843cb62353e19a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 17:32:04 +0800 Subject: [PATCH 37/46] fix requirements --- requirements/science.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index c345da99..636f98f4 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ -iopath +biopython lmdb ml_collections scipy tensorboardX tokenizers -biopython -ipdb +iopath +ipdb \ No newline at end of file From 9ae5b67204e5648eb54e1ea43ca741623c87e1da Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 1 Nov 2022 17:40:28 +0800 Subject: [PATCH 38/46] fix style issues --- modelscope/hub/utils/utils.py | 3 ++- modelscope/pipelines/base.py | 4 ++-- modelscope/trainers/trainer.py | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index f9a75cce..d0a87cbd 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -2,10 +2,11 @@ import hashlib import os -import requests from datetime import datetime from typing import Optional +import requests + from modelscope.hub.api import ModelScopeConfig from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index b9a4a25c..68010012 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -10,6 +10,7 @@ from typing import Any, Dict, Generator, List, Mapping, Union import numpy as np +from modelscope.hub.utils.utils import create_library_statistics from modelscope.models.base import Model from modelscope.msdatasets import MsDataset from modelscope.outputs import TASK_OUTPUTS @@ -23,7 +24,6 @@ from modelscope.utils.hub import read_config, snapshot_download from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import _find_free_port, _is_free_port -from modelscope.hub.utils.utils import create_library_statistics from .util import is_model, is_official_hub_path if is_torch_available(): @@ -154,7 +154,7 @@ class Pipeline(ABC): # modelscope library developer will handle this function for single_model in self.models: if hasattr(single_model, 'name'): - create_library_statistics("pipeline", single_model.name, None) + create_library_statistics('pipeline', single_model.name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): if not self._model_prepare: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index d59c3dfc..12c25f30 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -14,8 +14,8 @@ from torch.utils.data import DataLoader, Dataset from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler -from modelscope.hub.utils.utils import create_library_statistics from modelscope.hub.snapshot_download import snapshot_download +from 
modelscope.hub.utils.utils import create_library_statistics from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics from modelscope.models.base import Model, TorchModel @@ -438,7 +438,7 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN if hasattr(self.model, 'name'): - create_library_statistics("train", self.model.name, None) + create_library_statistics('train', self.model.name, None) if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -460,7 +460,7 @@ class EpochBasedTrainer(BaseTrainer): def evaluate(self, checkpoint_path=None): if hasattr(self.model, 'name'): - create_library_statistics("evaluate", self.model.name, None) + create_library_statistics('evaluate', self.model.name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook CheckpointHook.load_checkpoint(checkpoint_path, self) From 420b63f03b55d5c2a591fd69cd060ed3a8141ef4 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 1 Nov 2022 17:44:18 +0800 Subject: [PATCH 39/46] fix style issues --- requirements/science.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index 636f98f4..c30ff644 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ biopython +iopath +ipdb lmdb ml_collections scipy tensorboardX tokenizers -iopath -ipdb \ No newline at end of file From aecb88044eba1789a675f22a32cc6f2eed71b91a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 17:44:37 +0800 Subject: [PATCH 40/46] up --- requirements/science.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index 636f98f4..c30ff644 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ biopython +iopath +ipdb lmdb ml_collections scipy tensorboardX tokenizers -iopath -ipdb \ No newline at end of file From f2faf3acb38e3ccb6e62379e4314f00c844db36f Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 18:04:48 +0800 Subject: [PATCH 41/46] fix import bug --- modelscope/hub/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index d0a87cbd..61d560fa 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -7,7 +7,6 @@ from typing import Optional import requests -from modelscope.hub.api import ModelScopeConfig from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_SDK_DEBUG, @@ -92,6 +91,7 @@ def file_integrity_validation(file_path, expected_sha256): def create_library_statistics(method: str, name: str, cn_name: Optional[str]): try: + from modelscope.hub.api import ModelScopeConfig path = f'{get_endpoint()}/api/v1/statistics/library' headers = {'user-agent': ModelScopeConfig.get_user_agent()} params = {'Method': method, 'Name': name, 'CnName': cn_name} From e870d55e28b97732686849a22084ed7dca4c2182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AF=BF=E5=B7=9E?= Date: Tue, 1 Nov 2022 20:31:16 +0800 Subject: [PATCH 42/46] fix no face bug and adaptive for 360 degree of head --- .../face_2d_keypoints_pipeline.py | 136 +++++++----------- 1 file changed, 53 insertions(+), 83 deletions(-) diff --git 
a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index 4de5a4f2..94cbb74e 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -12,8 +12,11 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger from .base import EasyCVPipeline +logger = get_logger() + @PIPELINES.register_module( Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints) @@ -123,54 +126,28 @@ class Face2DKeypointsPipeline(EasyCVPipeline): return s / 3 * sigma def rotate_crop_img(self, img, pts, M): - image_size = 256 - enlarge_ratio = 1.1 - imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0]))) x1 = pts[5][0] + x2 = pts[5][0] y1 = pts[5][1] - x2 = pts[6][0] - y2 = pts[6][1] - w = x2 - x1 + 1 - h = y2 - y1 + 1 - x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w) - y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h) - - new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w) - new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h) - new_x1 = x1 + int(self.random_normal() * image_size * 0.05) - new_y1 = y1 + int(self.random_normal() * image_size * 0.05) - new_x2 = new_x1 + new_w - new_y2 = new_y1 + new_h + y2 = pts[5][1] + for i in range(0, 9): + x1 = min(x1, pts[i][0]) + x2 = max(x2, pts[i][0]) + y1 = min(y1, pts[i][1]) + y2 = max(y2, pts[i][1]) height, width, _ = imgT.shape - dx = max(0, -new_x1) - dy = max(0, -new_y1) - new_x1 = max(0, new_x1) - new_y1 = max(0, new_y1) + x1 = min(max(0, int(x1)), width) + y1 = min(max(0, int(y1)), height) + x2 = min(max(0, int(x2)), width) + y2 = min(max(0, int(y2)), height) + sub_imgT = imgT[y1:y2, x1:x2] - edx = max(0, new_x2 - width) - edy = max(0, new_y2 - height) - new_x2 = min(width, new_x2) - new_y2 = min(height, new_y2) + return sub_imgT, imgT, [x1, y1, x2, y2] - sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2] - if dx > 0 or dy > 0 or edx > 0 or edy > 0: - sub_imgT = cv2.copyMakeBorder( - sub_imgT, - dy, - edy, - dx, - edx, - cv2.BORDER_CONSTANT, - value=(103.94, 116.78, 123.68)) - - return sub_imgT, imgT, [new_x1, new_y1, new_x2, - new_y2], [dx, dy, edx, edy] - - def crop_img(self, imgT, pts, angle): - image_size = 256 + def crop_img(self, imgT, pts): enlarge_ratio = 1.1 x1 = np.min(pts[:, 0]) @@ -181,94 +158,87 @@ class Face2DKeypointsPipeline(EasyCVPipeline): h = y2 - y1 + 1 x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w) y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h) + x1 = max(0, x1) + y1 = max(0, y1) - new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w) - new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h) - new_x1 = x1 + int(self.random_normal() * image_size * 0.05) - new_y1 = y1 + int(self.random_normal() * image_size * 0.05) + new_w = int(enlarge_ratio * w) + new_h = int(enlarge_ratio * h) + new_x1 = x1 + new_y1 = y1 new_x2 = new_x1 + new_w new_y2 = new_y1 + new_h - new_xy = new_x1, new_y1 - pts = pts - new_xy - height, width, _ = imgT.shape - dx = max(0, -new_x1) - dy = max(0, -new_y1) - new_x1 = max(0, new_x1) - new_y1 = max(0, new_y1) - edx = max(0, new_x2 - width) - edy = max(0, new_y2 - height) - new_x2 = min(width, new_x2) - new_y2 = min(height, new_y2) + new_x1 = min(max(0, new_x1), width) + new_y1 = min(max(0, 
new_y1), height) + new_x2 = max(min(width, new_x2), 0) + new_y2 = max(min(height, new_y2), 0) sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2] - if dx > 0 or dy > 0 or edx > 0 or edy > 0: - sub_imgT = cv2.copyMakeBorder( - sub_imgT, - dy, - edy, - dx, - edx, - cv2.BORDER_CONSTANT, - value=(103.94, 116.78, 123.68)) - - return sub_imgT, [new_x1, new_y1, new_x2, new_y2], [dx, dy, edx, edy] - def __call__(self, inputs) -> Any: - image_size = 256 + return sub_imgT, [new_x1, new_y1, new_x2, new_y2] + def __call__(self, inputs) -> Any: img = LoadImage.convert_to_ndarray(inputs) h, w, c = img.shape img_rgb = copy.deepcopy(img) img_rgb = img_rgb[:, :, ::-1] det_result = self.face_detection(img_rgb) + + bboxes = np.array(det_result[OutputKeys.BOXES]) + if bboxes.shape[0] == 0: + logger.warn('No face detected!') + results = { + OutputKeys.KEYPOINTS: [], + OutputKeys.POSES: [], + OutputKeys.BOXES: [] + } + return results + boxes, keypoints = self._choose_face(det_result) output_boxes = [] output_keypoints = [] output_poses = [] - for idx, box_ori in enumerate(boxes): - box = self.expend_box(box_ori, w, h, scalex=0.15, scaley=0.15) + for index, box_ori in enumerate(boxes): + box = self.expend_box(box_ori, w, h, scalex=0.1, scaley=0.1) y0 = int(box[1]) y1 = int(box[3]) x0 = int(box[0]) x1 = int(box[2]) sub_img = img[y0:y1, x0:x1] - keypoint = keypoints[idx] + keypoint = keypoints[index] pts = [[keypoint[0], keypoint[1]], [keypoint[2], keypoint[3]], [keypoint[4], keypoint[5]], [keypoint[6], keypoint[7]], [keypoint[8], keypoint[9]], [box[0], box[1]], - [box[2], box[3]]] + [box[2], box[1]], [box[0], box[3]], [box[2], box[3]]] # radian angle = math.atan2((pts[1][1] - pts[0][1]), (pts[1][0] - pts[0][0])) # angle theta = angle * (180 / np.pi) - center = [image_size // 2, image_size // 2] + center = [w // 2, h // 2] cx, cy = center M, landmark_ = self.rotate_point(theta, (cx, cy), pts) - sub_img, imgT, bbox, delta_border = self.rotate_crop_img( - img, pts, M) + sub_imgT, imgT, bbox = self.rotate_crop_img(img, landmark_, M) - outputs = self.predict_op([sub_img])[0] + outputs = self.predict_op([sub_imgT])[0] tmp_keypoints = outputs['point'] for idx in range(0, len(tmp_keypoints)): - tmp_keypoints[idx][0] += (delta_border[0] + bbox[0]) - tmp_keypoints[idx][1] += (delta_border[1] + bbox[1]) + tmp_keypoints[idx][0] += bbox[0] + tmp_keypoints[idx][1] += bbox[1] - for idx in range(0, 3): - sub_img, bbox, delta_border = self.crop_img( - imgT, tmp_keypoints, 0) + for idx in range(0, 6): + sub_img, bbox = self.crop_img(imgT, tmp_keypoints) outputs = self.predict_op([sub_img])[0] tmp_keypoints = outputs['point'] for idx in range(0, len(tmp_keypoints)): - tmp_keypoints[idx][0] += (delta_border[0] + bbox[0]) - tmp_keypoints[idx][1] += (delta_border[1] + bbox[1]) + tmp_keypoints[idx][0] += bbox[0] + tmp_keypoints[idx][1] += bbox[1] M2, tmp_keypoints = self.rotate_point(-theta, (cx, cy), tmp_keypoints) From 30128b698916c526d4ee4d3d77e09c58f5612621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AF=BF=E5=B7=9E?= Date: Tue, 1 Nov 2022 20:42:58 +0800 Subject: [PATCH 43/46] update --- .../easycv_pipelines/face_2d_keypoints_pipeline.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index 94cbb74e..29a96a5f 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -113,18 
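The face-keypoint pipeline above rotates the whole image by theta around the image centre and pushes the detector's landmarks and box corners through the same transform. A small sketch of that idea (cv2.getRotationMatrix2D stands in for whatever rotate_point builds internally, which this diff does not show):

import cv2
import numpy as np

def rotate_image_and_points(img, points, theta_degrees):
    # Rotate an image and its landmarks with one shared 2x3 affine matrix.
    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), theta_degrees, 1.0)
    rotated_img = cv2.warpAffine(img, M, (w, h))
    pts = np.hstack([np.asarray(points, np.float32),
                     np.ones((len(points), 1), np.float32)])  # homogeneous (x, y, 1)
    rotated_pts = pts @ M.T                                    # N x 2 rotated (x, y)
    return rotated_img, rotated_pts

Clamping the resulting coordinates to [0, w] and [0, h], as the rewritten rotate_crop_img and crop_img now do, also removes the need for the old copyMakeBorder padding.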
+113,6 @@ class Face2DKeypointsPipeline(EasyCVPipeline): for (x, y) in landmark]) return M, landmark_ - def random_normal(self): - """ - 3-sigma rule - return: (-1, +1) - """ - mu, sigma = 0, 1 - while True: - s = np.random.normal(mu, sigma) - if s < mu - 3 * sigma or s > mu + 3 * sigma: - continue - return s / 3 * sigma - def rotate_crop_img(self, img, pts, M): imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0]))) From 1ca24299da877b92387c40403e2bb420489acff9 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Wed, 2 Nov 2022 13:51:59 +0800 Subject: [PATCH 44/46] [to #45892407]fix: fix pytorch_lighting incompatible with taming-transformers-rom1504 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10604329 * [to #45892407]fix: fix pytorch_lighting incompatible with taming-transformers-rom1504 --- requirements/multi-modal.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 578f0b54..31e9601d 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -2,6 +2,8 @@ ftfy>=6.0.3 ofa>=0.0.2 pycocoevalcap>=1.2 pycocotools>=2.0.4 +# compatible with taming-transformers-rom1504 +pytorch_lightning<=1.7.7 # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 From 93a52ec42d7fe5c683257f650d9449ac0f45c2cb Mon Sep 17 00:00:00 2001 From: "yingda.chen" Date: Wed, 2 Nov 2022 14:07:48 +0800 Subject: [PATCH 45/46] update README Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10601974 --- README.md | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 944c1f07..1da48ef2 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,26 @@ # Introduction -ModelScope library is targeted to support training, evaluation and inference for the state of the art models provided by Mind and further support third-party models provided by users outside alibaba. +[ModelScope]( https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bringing together most advanced machine learning models from the AI community, and to streamlining the process of leveraging and applying AI models . The core ModelScope library enables developers to perform model inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains. -# Design doc +The Python library offers the layered-APIs necessary for model contributors to integrate models from CV, NLP, Speech, Multi-Modality, as well as Scientific-computation, into the ModelScope ecosystem. Implementations for all these different models are encapsulated within the library in a way that allows easy and unified access. With such integration, model inference, finetuning, and evaluations can be done within only a few lines of codes. In the meantime, flexibilities are provided so that different components in the model applications can be customized as well, where necessary. -Please refer to alidoc [link](https://alidocs.dingtalk.com/i/nodes/OBldywvrKxo89xmAO05yJQk2ngpNbLz4?nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA&iframeQuery=utm_source%3Dportal%26utm_medium%3Dportal_space_file_tree) +Apart from harboring implementations of various models, ModelScope library also enables the necessary interactions with the backend services of ModelScope, particularly with the Model-Hub and Dataset-Hub. 
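To make the "few lines of code" claim concrete, inference with one of the NER models exercised earlier in this patch series looks roughly like this (assuming network access to the ModelScope hub so the checkpoint can be downloaded):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# The model id and sample sentence below are the ones used by
# tests/pipelines/test_named_entity_recognition.py in this repo.
ner = pipeline(
    task=Tasks.named_entity_recognition,
    model='damo/nlp_raner_named-entity-recognition_chinese-base-news')
print(ner(input='这与温岭市新河镇的一个神秘的传说有关。'))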
Such interactions allow the management of the various entities (models and datasets) to be performed seamlessly under the hood, including entity lookup, version control, and cache management. -# Development doc +# Installation -Please refer to [develop.md](docs/source/develop.md) +Please refer to [installation](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85). -# ChangeLog -* 20/05/2022 First release version +# Get Started -Refer to [change_log.md](docs/source/change_log.md) for more details +You can refer to [quick_start](https://modelscope.cn/docs/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B) for a quick start. + +We also provide other documentation, including: +* [Introduction to tasks](https://modelscope.cn/docs/%E4%BB%BB%E5%8A%A1%E7%9A%84%E4%BB%8B%E7%BB%8D) +* [Use pipeline for model inference](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) +* [Finetune example](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) +* [Preprocessing of data](https://modelscope.cn/docs/%E6%95%B0%E6%8D%AE%E7%9A%84%E9%A2%84%E5%A4%84%E7%90%86) +* [Evaluation metrics](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AF%84%E4%BC%B0) + +# License + +This project is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). From 5f1b9a621871f310ee44138c62b588bbc7d83c73 Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Wed, 2 Nov 2022 14:23:26 +0800 Subject: [PATCH 46/46] add default config and fix preprocess detokenizer Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10603232 --- .../models/multi_modal/ofa_for_all_tasks.py | 18 ++++++++++++++- modelscope/preprocessors/multi_modal.py | 2 +- .../preprocessors/ofa/ocr_recognition.py | 13 +++-------- .../multi_modal/ofa/ofa_trainer_utils.py | 22 +++++++++---------- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 2c6034e8..fc578b25 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates.
import math import os +import re import string from functools import partial from os import path as osp @@ -110,6 +111,8 @@ class OfaForAllTasks(TorchModel): Tasks.text_classification: inference_d[self.gen_type], Tasks.image_classification: inference_d[self.gen_type], } + pattern_str = '((?<=[^ a-zA-Z0-9.,:!?]) +| +(?=[^ a-zA-Z0-9.,:!?]))' + self.pattern = re.compile(pattern_str) def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: input = move_to_device(input, self.model.device) @@ -135,8 +138,18 @@ class OfaForAllTasks(TorchModel): caption = input[OutputKeys.CAPTION] result_l = list() for cap in caption: - result_l.append(cap.translate(self.transtab).strip()) + if self.language == 'en': + result_l.append(cap.translate(self.transtab).strip()) + else: + result_l.append(cap) input[OutputKeys.CAPTION] = result_l + if self.gen_type == 'generation' and self.language in [ + 'zh', 'cn' + ] and self.cfg.task != Tasks.visual_grounding: + ret_l = list() + for text in input[OFA_TASK_KEY_MAPPING[self.cfg.task]]: + ret_l.append(self.detokenizer(text)) + input[OFA_TASK_KEY_MAPPING[self.cfg.task]] = ret_l return input def _text_gen_inference(self, input): @@ -314,3 +327,6 @@ class OfaForAllTasks(TorchModel): save_function=partial(save_function, with_meta=False), config=config, **kwargs) + + def detokenizer(self, text): + return self.pattern.sub('', text) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 13876058..3a3ae820 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -77,7 +77,7 @@ class OfaPreprocessor(Preprocessor): data[key] = item return data - def _ofa_input_compatibility_conversion(self, data): + def _ofa_input_compatibility_conversion(self, data): # fake if 'image' in data and self.cfg.model.get('type', None) == 'ofa': if isinstance(data['image'], str): image = load_image(data['image']) diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index a0342c14..58e3ea6e 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -73,21 +73,14 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): """ super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir, mode, *args, **kwargs) - # Initialize transform - if self.cfg.model.imagenet_default_mean_and_std: - mean = IMAGENET_DEFAULT_MEAN - std = IMAGENET_DEFAULT_STD - else: - mean = [0.5, 0.5, 0.5] - std = [0.5, 0.5, 0.5] self.patch_resize_transform = transforms.Compose([ lambda image: ocr_resize( image, - self.cfg.model.patch_image_size, - is_document=self.cfg.model.is_document), + self.patch_image_size, + is_document=self.cfg.model.get('is_document', False)), transforms.ToTensor(), - transforms.Normalize(mean=mean, std=std), + transforms.Normalize(mean=self.mean, std=self.std), ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index 3c38884c..3930febb 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -103,20 +103,20 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): def __init__(self, args): super().__init__() - self.sentence_avg = args.sentence_avg - self.eps = args.label_smoothing - self.ignore_prefix_size = args.ignore_prefix_size - self.ignore_eos = args.ignore_eos - 
self.report_accuracy = args.report_accuracy - self.drop_worst_ratio = args.drop_worst_ratio - self.drop_worst_after = args.drop_worst_after - self.use_rdrop = args.use_rdrop - self.reg_alpha = args.reg_alpha - self.sample_patch_num = args.sample_patch_num + self.sentence_avg = args.get('sentence_avg', False) + self.eps = args.get('label_smoothing', 0.1) + self.ignore_prefix_size = args.get('ignore_prefix_size', 0) + self.ignore_eos = args.get('ignore_eos', False) + self.report_accuracy = args.get('report_accuracy', False) + self.drop_worst_ratio = args.get('drop_worst_ratio', 0.0) + self.drop_worst_after = args.get('drop_worst_after', 0) + self.use_rdrop = args.get('use_rdrop', False) + self.reg_alpha = args.get('reg_alpha', 1.0) + self.sample_patch_num = args.get('sample_patch_num', 196) self.constraint_start = None self.constraint_end = None - if args.constraint_range: + if args.get('constraint_range', None): constraint_start, constraint_end = args.constraint_range.split(',') self.constraint_start = int(constraint_start) self.constraint_end = int(constraint_end)
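Note on the detokenizer added in [PATCH 46/46]: the regular expression compiled in OfaForAllTasks.__init__ strips the spaces that the OFA tokenizer leaves around Chinese (non-ASCII) tokens, while keeping the spaces between plain ASCII words. Below is a minimal standalone sketch of that behaviour; only the pattern string is taken from the diff above, and the helper name detokenize and the sample strings are illustrative, not part of the patch.

import re

# Same pattern string as the one compiled in OfaForAllTasks.__init__ above.
# It matches runs of spaces that touch a character outside " a-zA-Z0-9.,:!?",
# i.e. spaces adjacent to CJK text; spaces between ASCII words are left alone.
pattern = re.compile('((?<=[^ a-zA-Z0-9.,:!?]) +| +(?=[^ a-zA-Z0-9.,:!?]))')

def detokenize(text):
    # Mirrors OfaForAllTasks.detokenizer: drop the matched space runs entirely.
    return pattern.sub('', text)

print(detokenize('你 好 , world'))  # -> '你好, world'
print(detokenize('hello world'))    # -> 'hello world'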