Co-contributed with 夕陌 & 雨泓.

* add torch epoch-based trainer and dist utils
* add hooks, including optimizer, lr scheduler, logging, checkpoint, evaluation, and time profiling
* add torch model base and test
* add optimizer and lr scheduler modules
* add an sbert text classification example
* add task_dataset for dataset-level processing

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9338412
@@ -0,0 +1,175 @@
{
    "framework": "pytorch",
    "task": "image_classification",
    "work_dir": "./work_dir",
    "model": {
        "type": "classification",
        "pretrained": null,
        "backbone": {
            "type": "ResNet",
            "depth": 50,
            "out_indices": [
                4
            ],
            "norm_cfg": {
                "type": "BN"
            }
        },
        "head": {
            "type": "ClsHead",
            "with_avg_pool": true,
            "in_channels": 2048,
            "loss_config": {
                "type": "CrossEntropyLossWithLabelSmooth",
                "label_smooth": 0
            },
            "num_classes": 1000
        }
    },
    "dataset": {
        "train": {
            "type": "ClsDataset",
            "data_source": {
                "list_file": "data/imagenet_raw/meta/train_labeled.txt",
                "root": "data/imagenet_raw/train/",
                "type": "ClsSourceImageList"
            }
        },
        "val": {
            "type": "ClsDataset",
            "data_source": {
                "list_file": "data/imagenet_raw/meta/val_labeled.txt",
                "root": "data/imagenet_raw/validation/",
                "type": "ClsSourceImageList"
            }
        },
        "test": {}
    },
    "preprocessor": {
        "train": [
            {
                "type": "RandomResizedCrop",
                "size": 224
            },
            {
                "type": "RandomHorizontalFlip"
            },
            {
                "type": "ToTensor"
            },
            {
                "type": "Normalize",
                "mean": [
                    0.485,
                    0.456,
                    0.406
                ],
                "std": [
                    0.229,
                    0.224,
                    0.225
                ]
            },
            {
                "type": "Collect",
                "keys": [
                    "img",
                    "gt_labels"
                ]
            }
        ],
        "val": [
            {
                "type": "Resize",
                "size": 256
            },
            {
                "type": "CenterCrop",
                "size": 224
            },
            {
                "type": "ToTensor"
            },
            {
                "type": "Normalize",
                "mean": [
                    0.485,
                    0.456,
                    0.406
                ],
                "std": [
                    0.229,
                    0.224,
                    0.225
                ]
            },
            {
                "type": "Collect",
                "keys": [
                    "img",
                    "gt_labels"
                ]
            }
        ]
    },
    "train": {
        "dataloader": {
            "batch_size_per_gpu": 2,
            "workers_per_gpu": 1
        },
        "optimizer": {
            "type": "SGD",
            "lr": 0.01,
            "options": {
                "grad_clip": {
                    "max_norm": 2.0
                }
            }
        },
        "lr_scheduler": {
            "type": "StepLR",
            "step_size": 2,
            "options": {
                "warmup": {
                    "type": "LinearWarmup",
                    "warmup_iters": 2
                }
            }
        },
        "hooks": [
            {
                "type": "CheckpointHook",
                "interval": 2
            },
            {
                "type": "TextLoggerHook",
                "interval": 1
            },
            {
                "type": "IterTimerHook"
            },
            {
                "type": "EvaluationHook",
                "interval": 1
            }
        ]
    },
    "evaluation": {
        "dataloader": {
            "batch_size_per_gpu": 2,
            "workers_per_gpu": 1,
            "shuffle": false
        },
        "metrics": ["accuracy", "precision", "recall"]
    }
}
@@ -0,0 +1,77 @@
{
    "task": "sentence-similarity",
    "preprocessor": {
        "type": "bert-seq-cls-tokenizer-finetune",
        "first_sequence": "sentence1",
        "second_sequence": "sentence2"
    },
    "model": {
        "type": "structbert",
        "attention_probs_dropout_prob": 0.1,
        "easynlp_version": "0.0.3",
        "gradient_checkpointing": false,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "layer_norm_eps": 1e-12,
        "max_position_embeddings": 512,
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "pad_token_id": 0,
        "position_embedding_type": "absolute",
        "transformers_version": "4.6.0.dev0",
        "type_vocab_size": 2,
        "use_cache": true,
        "vocab_size": 30522
    },
    "pipeline": {
        "type": "sentence-similarity"
    },
    "work_dir": "/tmp",
    "train": {
        "dataloader": {
            "batch_size_per_gpu": 2,
            "workers_per_gpu": 1
        },
        "optimizer": {
            "type": "SGD",
            "lr": 0.01,
            "options": {
                "grad_clip": {
                    "max_norm": 2.0
                }
            }
        },
        "lr_scheduler": {
            "type": "StepLR",
            "step_size": 2,
            "options": {
                "warmup": {
                    "type": "LinearWarmup",
                    "warmup_iters": 2
                }
            }
        },
        "hooks": [{
            "type": "CheckpointHook",
            "interval": 1
        }, {
            "type": "TextLoggerHook",
            "interval": 1
        }, {
            "type": "IterTimerHook"
        }, {
            "type": "EvaluationHook",
            "interval": 1
        }]
    },
    "evaluation": {
        "dataloader": {
            "batch_size_per_gpu": 2,
            "workers_per_gpu": 1,
            "shuffle": false
        }
    }
}
@@ -36,10 +36,10 @@ modelscope.pipelines.base module
    :undoc-members:
    :show-inheritance:

-modelscope.pipelines.outputs module
+modelscope.outputs module
 -----------------------------------

-.. automodule:: modelscope.pipelines.outputs
+.. automodule:: modelscope.outputs
    :members:
    :undoc-members:
    :show-inheritance:
@@ -127,3 +127,16 @@ class Preprocessors(object):
    # multi-modal
    ofa_image_caption = 'ofa-image-caption'
    mplug_visual_question_answering = 'mplug-visual-question-answering'


class Metrics(object):
    """ Names for different metrics.
    """

    # accuracy
    accuracy = 'accuracy'

    # metrics for sequence classification task
    seq_cls_metric = 'seq_cls_metric'

    # metrics for token-classification task
    token_cls_metric = 'token-cls-metric'
@@ -0,0 +1,3 @@
from .base import Metric
from .builder import METRICS, build_metric, task_default_metrics
from .sequence_classification_metric import SequenceClassificationMetric
@@ -0,0 +1,37 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from abc import ABC, abstractmethod
from typing import Dict


class Metric(ABC):
    """The metric base class for computing metrics.

    A subclass can either compute a single metric like 'accuracy', or compute the
    complex metrics for a specific task, with or without other Metric subclasses.
    """

    @abstractmethod
    def add(self, outputs: Dict, inputs: Dict):
        """Append logits and labels within an eval loop.

        Will be called after every batch finishes, to gather the model predictions and the labels.

        Args:
            outputs: The model prediction outputs.
            inputs: The mini-batch inputs from the dataloader.

        Returns: None
        """
        pass

    @abstractmethod
    def evaluate(self):
        """Compute the metrics after evaluation finishes.

        Will be called after the whole validation run finishes.

        Returns: The actual metric dict with standard names.
        """
        pass
@@ -0,0 +1,35 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from ..metainfo import Metrics
from ..utils.config import ConfigDict
from ..utils.constant import Tasks
from ..utils.registry import Registry, build_from_cfg, default_group

METRICS = Registry('metrics')


class MetricKeys(object):
    ACCURACY = 'accuracy'
    F1 = 'f1'
    PRECISION = 'precision'
    RECALL = 'recall'


task_default_metrics = {
    Tasks.sentence_similarity: [Metrics.seq_cls_metric],
}


def build_metric(metric_name: str,
                 field: str = default_group,
                 default_args: dict = None):
    """Build a metric given metric_name and field.

    Args:
        metric_name (:obj:`str`): The metric name.
        field (str, optional): The field of this metric, default value: 'default' for all fields.
        default_args (dict, optional): Default initialization arguments.
    """
    cfg = ConfigDict({'type': metric_name})
    return build_from_cfg(
        cfg, METRICS, group_key=field, default_args=default_args)
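A minimal usage sketch of the flow above (the eval loop, `model`, and `dataloader` are illustrative; only `build_metric` and the registered `seq_cls_metric` name come from this patch):

```python
from modelscope.metrics import build_metric  # assumed package path for this new module

metric = build_metric('seq_cls_metric')  # resolved from the METRICS registry
for batch in dataloader:                 # assumed: a DataLoader yielding dicts with 'labels'
    outputs = model(**batch)             # assumed: a model returning a dict with 'logits'
    metric.add(outputs, batch)           # gathers logits and labels per batch
print(metric.evaluate())                 # e.g. {'accuracy': 0.91}
```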
@@ -0,0 +1,40 @@
from typing import Dict, List, Union

import numpy as np

from modelscope.outputs import OutputKeys
from ..metainfo import Metrics
from ..utils.registry import default_group
from ..utils.tensor_utils import torch_nested_detach, torch_nested_numpify
from .base import Metric
from .builder import METRICS, MetricKeys


@METRICS.register_module(
    group_key=default_group, module_name=Metrics.seq_cls_metric)
class SequenceClassificationMetric(Metric):
    """The metric computation class for sequence classification tasks.
    """

    label_name = 'labels'

    def __init__(self):
        self.preds = []
        self.labels = []

    def add(self, outputs: Dict, inputs: Dict):
        ground_truths = inputs[SequenceClassificationMetric.label_name]
        eval_results = outputs[OutputKeys.LOGITS]
        self.preds.append(
            torch_nested_numpify(torch_nested_detach(eval_results)))
        self.labels.append(
            torch_nested_numpify(torch_nested_detach(ground_truths)))

    def evaluate(self):
        preds = np.concatenate(self.preds, axis=0)
        labels = np.concatenate(self.labels, axis=0)
        preds = np.argmax(preds, axis=1)
        return {
            MetricKeys.ACCURACY:
            (preds == labels).astype(np.float32).mean().item()
        }
@@ -0,0 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Dict

import torch

from .base import Model


class TorchModel(Model, torch.nn.Module):
    """ Base model interface for pytorch
    """

    def __init__(self, model_dir=None, *args, **kwargs):
        # init reference: https://stackoverflow.com/questions\
        # /9575409/calling-parent-class-init-with-multiple-inheritance-whats-the-right-way
        super().__init__(model_dir)
        super(Model, self).__init__()

    def forward(self,
                inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        raise NotImplementedError
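A minimal sketch of a concrete subclass (the export path and layer sizes are assumptions; the double `super` call above runs both `Model.__init__` and `torch.nn.Module.__init__`, so submodules can be registered right after):

```python
from typing import Dict

import torch

from modelscope.models import TorchModel  # assumed export location


class ToyClassifier(TorchModel):

    def __init__(self, model_dir=None, num_classes=2):
        super().__init__(model_dir)  # runs TorchModel.__init__, which initializes both bases
        self.linear = torch.nn.Linear(8, num_classes)

    def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        return {'logits': self.linear(inputs['features'])}
```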
@@ -108,7 +108,7 @@ class CLIPForMultiModalEmbedding(Model):
         return text_ids_tensor, text_mask_tensor

     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        from modelscope.pipelines.outputs import OutputKeys
+        from modelscope.outputs import OutputKeys
         output = {
             OutputKeys.IMG_EMBEDDING: None,
             OutputKeys.TEXT_EMBEDDING: None
@@ -134,7 +134,7 @@ class CLIPForMultiModalEmbedding(Model):
             img_embedding = self.clip_model(
                 input_data=img_tensor, input_type='img')
-            from modelscope.pipelines.outputs import OutputKeys
+            from modelscope.outputs import OutputKeys
             output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy()

         if 'text' in input and input['text'] is not None:
@@ -76,7 +76,7 @@ class OfaForImageCaptioning(Model):
         input = fairseq.utils.move_to_cuda(input, device=self._device)
         results, _ = self.eval_caption(self.task, self.generator, self.models,
                                        input)
-        from ...pipelines.outputs import OutputKeys
+        from modelscope.outputs import OutputKeys
         return {
             'image_id': results[0]['image_id'],
             OutputKeys.CAPTION: results[0][OutputKeys.CAPTION]
@@ -7,7 +7,10 @@ import torch
from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel
from torch import nn

from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from ..base import Model
from ..builder import MODELS


class SbertTextClassfier(SbertPreTrainedModel):
@@ -20,7 +23,11 @@ class SbertTextClassfier(SbertPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-    def forward(self, input_ids=None, token_type_ids=None):
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                labels=None,
+                **kwargs):
         outputs = self.encoder(
             input_ids,
             token_type_ids=token_type_ids,
@@ -29,6 +36,10 @@ class SbertTextClassfier(SbertPreTrainedModel):
         pooled_output = outputs[1]
         pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            return {'logits': logits, 'loss': loss}
         return {'logits': logits}
@@ -4,6 +4,7 @@ from modelscope.utils.constant import Tasks

class OutputKeys(object):
    LOGITS = 'logits'
    SCORES = 'scores'
    LABEL = 'label'
    LABELS = 'labels'
@@ -7,10 +7,10 @@ import soundfile as sf
 import torch

 from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
 from modelscope.utils.constant import Tasks
 from ..base import Input, Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys


 def audio_norm(x):
@@ -9,12 +9,12 @@ import yaml

 from modelscope.fileio import File
 from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
 from modelscope.preprocessors.audio import LinearAECAndFbank
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 logger = get_logger()
@@ -5,9 +5,9 @@ import numpy as np

 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
 from modelscope.models.audio.tts import SambertHifigan
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, InputModel, Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.pipelines.outputs import OutputKeys
 from modelscope.utils.constant import Fields, Tasks

 __all__ = ['TextToSpeechSambertHifiganPipeline']
@@ -7,10 +7,10 @@ from typing import Any, Dict, Generator, List, Union

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.base import Model
 from modelscope.msdatasets import MsDataset
+from modelscope.outputs import TASK_OUTPUTS
 from modelscope.preprocessors import Preprocessor
 from modelscope.utils.config import Config
 from modelscope.utils.logger import get_logger
-from .outputs import TASK_OUTPUTS
 from .util import is_model, is_official_hub_path

 Tensor = Union['torch.Tensor', 'tf.Tensor']
@@ -6,6 +6,7 @@ import torch

 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.action_recognition.models import BaseVideoModel
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input
 from modelscope.preprocessors.video import ReadVideoData
 from modelscope.utils.config import Config
@@ -13,7 +14,6 @@ from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 logger = get_logger()
@@ -10,13 +10,13 @@ from torchvision import transforms

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.animal_recognition import resnet
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input
 from modelscope.preprocessors import load_image
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 logger = get_logger()
@@ -11,8 +11,8 @@ from PIL import Image

 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.cmdssl_video_embedding.resnet2p1d import \
     resnet26_2p1d
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input
-from modelscope.pipelines.outputs import OutputKeys
 from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
@@ -11,13 +11,13 @@ from modelscope.models.cv.cartoon.facelib.facer import FaceAna

 from modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans import (
     get_reference_facial_points, warp_and_crop_face)
 from modelscope.models.cv.cartoon.utils import get_f5p, padTo16x, resize_size
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input
 from modelscope.preprocessors import load_image
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
@@ -6,13 +6,13 @@ import numpy as np
 import PIL

 from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input
 from modelscope.preprocessors import load_image
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 logger = get_logger()
@@ -7,13 +7,13 @@ import PIL
 import tensorflow as tf

 from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input
 from modelscope.preprocessors import load_image
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys
 from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils

 if tf.__version__ >= '2.0':
@@ -3,12 +3,12 @@ from typing import Any, Dict

 import torch

 from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Model, Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 logger = get_logger()
@@ -2,6 +2,7 @@
 from typing import Any, Dict, Union

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SpaceForDialogIntent
@@ -9,7 +10,6 @@ from ...preprocessors import DialogIntentPredictionPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['DialogIntentPredictionPipeline']
@@ -2,6 +2,7 @@
 from typing import Dict, Union

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SpaceForDialogModeling
@@ -9,7 +10,6 @@ from ...preprocessors import DialogModelingPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['DialogModelingPipeline']
@@ -1,12 +1,12 @@
 from typing import Any, Dict, Union

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model, SpaceForDialogStateTracking
 from ...preprocessors import DialogStateTrackingPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['DialogStateTrackingPipeline']
@@ -3,6 +3,7 @@ from typing import Any, Dict, Optional, Union

 import torch

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp.masked_language import MaskedLanguageModelBase
@@ -11,7 +12,6 @@ from ...utils.config import Config
 from ...utils.constant import ModelFile, Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['FillMaskPipeline']

 _type_map = {'veco': 'roberta', 'sbert': 'bert'}
@@ -4,6 +4,7 @@ from typing import Any, Dict, Union

 import numpy as np
 import torch

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SbertForNLI
@@ -11,7 +12,6 @@ from ...preprocessors import NLIPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['NLIPipeline']
@@ -3,6 +3,7 @@ from typing import Any, Dict, Union

 import numpy as np
 import torch

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SbertForSentenceSimilarity
@@ -10,7 +11,6 @@ from ...preprocessors import SentenceSimilarityPreprocessor
 from ...utils.constant import Tasks
 from ..base import Input, Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['SentenceSimilarityPipeline']
@@ -3,6 +3,7 @@ from typing import Any, Dict, Union

 import numpy as np
 import torch

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SbertForSentimentClassification
@@ -10,7 +11,6 @@ from ...preprocessors import SentimentClassificationPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['SentimentClassificationPipeline']
@@ -4,12 +4,12 @@ import numpy as np

 from modelscope.metainfo import Pipelines
 from modelscope.models.nlp import BertForSequenceClassification
+from modelscope.outputs import OutputKeys
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from ...models import Model
 from ..base import Input, Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['SequenceClassificationPipeline']
@@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union

 import torch

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import PalmForTextGeneration
@@ -9,7 +10,6 @@ from ...preprocessors import TextGenerationPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['TextGenerationPipeline']
@@ -4,6 +4,7 @@ from typing import Any, Dict

 import numpy as np
 import tensorflow as tf

+from modelscope.outputs import OutputKeys
 from ...hub.snapshot_download import snapshot_download
 from ...metainfo import Pipelines
 from ...models.nlp import CsanmtForTranslation
@@ -11,7 +12,6 @@ from ...utils.constant import ModelFile, Tasks
 from ...utils.logger import get_logger
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
@@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union

 import torch

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SbertForTokenClassification
@@ -9,7 +10,6 @@ from ...preprocessors import TokenClassificationPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['WordSegmentationPipeline']
@@ -3,6 +3,7 @@ from typing import Any, Dict, Union

 import torch
 from scipy.special import softmax

+from modelscope.outputs import OutputKeys
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SbertForZeroShotClassification
@@ -10,7 +11,6 @@ from ...preprocessors import ZeroShotClassificationPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline
 from ..builder import PIPELINES
-from ..outputs import OutputKeys

 __all__ = ['ZeroShotClassificationPipeline']
@@ -8,6 +8,7 @@ from transformers import AutoTokenizer
 from ..metainfo import Preprocessors
 from ..models import Model
 from ..utils.constant import Fields, InputFields
+from ..utils.hub import parse_label_mapping
 from ..utils.type_assert import type_assert
 from .base import Preprocessor
 from .builder import PREPROCESSORS
@@ -115,7 +116,8 @@ class SentenceSimilarityPreprocessor(NLPPreprocessorBase):

     def __init__(self, model_dir: str, *args, **kwargs):
         kwargs['truncation'] = True
-        kwargs['padding'] = False
+        kwargs['padding'] = False if 'padding' not in kwargs else kwargs[
+            'padding']
         kwargs['return_tensors'] = 'pt'
         kwargs['max_length'] = kwargs.pop('sequence_length', 128)
         super().__init__(model_dir, *args, **kwargs)
@@ -143,6 +145,7 @@ class SequenceClassificationPreprocessor(Preprocessor):

         self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
         print(f'this is the tokenzier {self.tokenizer}')
+        self.label2id = parse_label_mapping(self.model_dir)

     @type_assert(object, (str, tuple, Dict))
     def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
@@ -164,7 +167,7 @@ class SequenceClassificationPreprocessor(Preprocessor):
             'id': [],
             'input_ids': [],
             'attention_mask': [],
-            'token_type_ids': []
+            'token_type_ids': [],
         }

         max_seq_length = self.sequence_length
@@ -186,6 +189,29 @@ class SequenceClassificationPreprocessor(Preprocessor):
         return rst


+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name='bert-seq-cls-tokenizer-finetune')
+class SentenceSimilarityFinetunePreprocessor(SentenceSimilarityPreprocessor):
+    """Sentence similarity preprocessor in the finetune scenario.
+
+    Mainly adds the label mapping procedure.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        kwargs['padding'] = 'max_length'
+        super().__init__(model_dir, *args, **kwargs)
+        self.label2id = parse_label_mapping(self.model_dir)
+
+    @type_assert(object, (str, tuple, Dict))
+    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
+        rst = super().__call__(data)
+        rst = {k: v.squeeze() for k, v in rst.items()}
+        if self.label2id is not None and 'label' in data:
+            rst['labels'] = []
+            rst['labels'].append(self.label2id[str(data['label'])])
+        return rst
+
+
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer)
 class TextGenerationPreprocessor(NLPPreprocessorBase):
@@ -0,0 +1,3 @@
from .base import TaskDataset
from .builder import build_task_dataset
from .torch_base_dataset import TorchTaskDataset
@@ -0,0 +1,48 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from abc import ABC, abstractmethod
from typing import Any, List, Tuple


class TaskDataset(ABC):
    """The base class for all task specific dataset processors.
    """

    def __init__(self,
                 datasets: Tuple[Any, List[Any]],
                 mode,
                 preprocessor=None,
                 **kwargs):
        super().__init__()
        self.mode = mode
        self.preprocessor = preprocessor
        self._inner_dataset = self.compose_dataset(datasets)

    @abstractmethod
    def compose_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any:
        """Prepare a dataset.

        Users can process the input datasets from a whole-dataset perspective.
        This method also helps to merge several datasets into one.

        Args:
            datasets: The original dataset(s).

        Returns: A single dataset, which may be created by merging.
        """
        pass

    @abstractmethod
    def preprocess_dataset(self, data):
        """Preprocess the data fetched from the inner_dataset.

        If the preprocessor is None, the original data will be returned; otherwise the preprocessor will be called.
        Users can override this method to implement custom logic.

        Args:
            data: The data fetched from the dataset.

        Returns: The processed data.
        """
        pass
@@ -0,0 +1,21 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.utils.config import ConfigDict
from modelscope.utils.registry import Registry, build_from_cfg

TASK_DATASETS = Registry('task_datasets')


def build_task_dataset(cfg: ConfigDict,
                       task_name: str = None,
                       default_args: dict = None):
    """Build a task specific dataset processor given a model config dict and the task name.

    Args:
        cfg (:obj:`ConfigDict`): config dict for the model object.
        task_name (str, optional): task name, refer to
            :obj:`Tasks` for more details.
        default_args (dict, optional): Default initialization arguments.
    """
    return build_from_cfg(
        cfg, TASK_DATASETS, group_key=task_name, default_args=default_args)
@@ -0,0 +1,63 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, List, Tuple

from torch.utils.data import ConcatDataset, Dataset

from .base import TaskDataset


class TorchTaskDataset(TaskDataset, Dataset):
    """The base class for all torch-based task processors.

    This base class is enough for most cases, except when there are procedures that cannot be executed in
    preprocessors and Datasets, like dataset merging.
    """

    def __init__(self,
                 datasets: Tuple[Any, List[Any]],
                 mode,
                 preprocessor=None,
                 **kwargs):
        TaskDataset.__init__(self, datasets, mode, preprocessor, **kwargs)

    def __getitem__(self, index) -> Any:
        return self.preprocess_dataset(self._inner_dataset[index])

    def __len__(self):
        return len(self._inner_dataset)

    def compose_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any:
        """Prepare a dataset.

        Users can process the input datasets from a whole-dataset perspective.
        This method gives a default implementation of dataset merging; users can override this
        method to write custom logic.

        Args:
            datasets: The original dataset(s).

        Returns: A single dataset, which may be created by merging.
        """
        if isinstance(datasets, List):
            if len(datasets) == 1:
                return datasets[0]
            elif len(datasets) > 1:
                return ConcatDataset(datasets)
        else:
            return datasets

    def preprocess_dataset(self, data):
        """Preprocess the data fetched from the inner_dataset.

        If the preprocessor is None, the original data will be returned; otherwise the preprocessor will be called.
        Users can override this method to implement custom logic.

        Args:
            data: The data fetched from the dataset.

        Returns: The processed data.
        """
        return self.preprocessor(
            data) if self.preprocessor is not None else data
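A minimal sketch of the default merging behavior (the toy datasets and the export path are assumptions):

```python
import torch
from torch.utils.data import TensorDataset

from modelscope.task_datasets import TorchTaskDataset  # assumed export, per the __init__ above

ds_a = TensorDataset(torch.zeros(4, 2))
ds_b = TensorDataset(torch.ones(6, 2))

# A list of two datasets is merged through ConcatDataset; a single dataset is returned as-is.
merged = TorchTaskDataset([ds_a, ds_b], mode='train', preprocessor=None)
assert len(merged) == 10
```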
@@ -1,3 +1,4 @@
 from .base import DummyTrainer
 from .builder import build_trainer
 from .nlp import SequenceClassificationTrainer
+from .trainer import EpochBasedTrainer
@@ -1,10 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import time
 from abc import ABC, abstractmethod
 from typing import Callable, Dict, List, Optional, Tuple, Union

 from modelscope.trainers.builder import TRAINERS
 from modelscope.utils.config import Config
+from .utils.log_buffer import LogBuffer


 class BaseTrainer(ABC):
@@ -27,6 +29,8 @@ class BaseTrainer(ABC):
             self.args = self.cfg.to_args(arg_parse_fn)
         else:
             self.args = None
+        self.log_buffer = LogBuffer()
+        self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())

     @abstractmethod
     def train(self, *args, **kwargs):
@@ -5,9 +5,10 @@ from modelscope.utils.constant import Tasks
 from modelscope.utils.registry import Registry, build_from_cfg

 TRAINERS = Registry('trainers')
+HOOKS = Registry('hooks')


-def build_trainer(name: str = None, default_args: dict = None):
+def build_trainer(name: str = 'EpochBasedTrainer', default_args: dict = None):
     """ build trainer given a trainer name

     Args:
@@ -15,7 +16,5 @@ def build_trainer(name: str = None, default_args: dict = None):
             will be used.
         default_args (dict, optional): Default initialization arguments.
     """
-    if name is None:
-        name = 'Trainer'
     cfg = dict(type=name)
     return build_from_cfg(cfg, TRAINERS, default_args=default_args)
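A minimal launch sketch under the new default (the `default_args` keys are assumptions; only `build_trainer` and the `EpochBasedTrainer` default come from this patch):

```python
from modelscope.trainers import build_trainer

trainer = build_trainer(default_args=dict(
    model=my_model,                 # assumed: a model instance or model id
    cfg_file='configuration.json',  # assumed: a config like the JSON files above
))
trainer.train()
```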
@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .builder import HOOKS, build_hook
from .checkpoint_hook import CheckpointHook
from .evaluation_hook import EvaluationHook
from .hook import Hook
from .iter_timer_hook import IterTimerHook
from .logger.text_logger_hook import TextLoggerHook
from .lr_scheduler_hook import LrSchedulerHook
from .optimizer_hook import OptimizerHook
from .priority import Priority

__all__ = [
    'Hook', 'HOOKS', 'CheckpointHook', 'EvaluationHook', 'LrSchedulerHook',
    'OptimizerHook', 'Priority', 'build_hook', 'TextLoggerHook',
    'IterTimerHook'
]
@@ -0,0 +1,9 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.utils.registry import Registry, build_from_cfg, default_group

HOOKS = Registry('hooks')


def build_hook(cfg, default_args=None):
    return build_from_cfg(
        cfg, HOOKS, group_key=default_group, default_args=default_args)
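A minimal sketch mirroring the `hooks` entries in the configs above:

```python
from modelscope.trainers.hooks import build_hook

hook = build_hook(dict(type='CheckpointHook', interval=2))
```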
@@ -0,0 +1,92 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

from modelscope import __version__
from modelscope.utils.checkpoint import save_checkpoint
from modelscope.utils.logger import get_logger
from modelscope.utils.torch_utils import get_dist_info
from .builder import HOOKS
from .hook import Hook
from .priority import Priority


@HOOKS.register_module()
class CheckpointHook(Hook):
    """Save checkpoints periodically.

    Args:
        interval (int): The frequency at which to save the model. If `by_epoch=True`,
            it means the number of epochs, otherwise the number of iterations.
        by_epoch (bool): Save checkpoints by epoch or by iteration.
        save_optimizer (bool): Whether to save the optimizer state dict. Default: True.
        save_dir (str): The directory to save checkpoints. If None, use `trainer.work_dir`.
        save_last (bool): Whether to save the last checkpoint. Default: True.
    """

    PRIORITY = Priority.NORMAL

    def __init__(self,
                 interval=0,
                 by_epoch=True,
                 save_optimizer=True,
                 save_dir=None,
                 save_last=True):
        self.interval = interval
        self.by_epoch = by_epoch
        self.save_optimizer = save_optimizer
        self.save_dir = save_dir
        self.save_last = save_last

    def before_run(self, trainer):
        if not self.save_dir:
            self.save_dir = trainer.work_dir

        if not hasattr(trainer, 'logger'):
            self.logger = get_logger(__name__)
        else:
            self.logger = trainer.logger

        self.logger.info(f'Checkpoints will be saved to {self.save_dir}')

    def after_train_epoch(self, trainer):
        if not self.by_epoch:
            return

        if self._should_save(trainer):
            self.logger.info(f'Saving checkpoint at {trainer.epoch + 1} epoch')
            self._save_checkpoint(trainer)

    def _save_checkpoint(self, trainer):
        if self.by_epoch:
            cur_save_name = os.path.join(self.save_dir,
                                         f'epoch_{trainer.epoch + 1}.pth')
        else:
            # use the iteration counter for iteration-based naming
            cur_save_name = os.path.join(self.save_dir,
                                         f'iter_{trainer.iter + 1}.pth')

        rank, _ = get_dist_info()
        if rank == 0:
            save_checkpoint(trainer.model, cur_save_name, trainer.optimizer)

    def after_train_iter(self, trainer):
        if self.by_epoch:
            return

        if self._should_save(trainer):
            self.logger.info(
                f'Saving checkpoint at {trainer.iter + 1} iterations')
            self._save_checkpoint(trainer)

    def _should_save(self, trainer):
        if self.by_epoch:
            check_last = self.is_last_epoch
            check_frequency = self.every_n_epochs
        else:
            check_last = self.is_last_iter
            check_frequency = self.every_n_iters

        if check_frequency(trainer,
                           self.interval) or (self.save_last
                                              and check_last(trainer)):
            return True
        return False
@@ -0,0 +1,74 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .builder import HOOKS
from .hook import Hook
from .priority import Priority


@HOOKS.register_module()
class EvaluationHook(Hook):
    """Evaluation hook.

    Args:
        interval (int): Evaluation interval.
        by_epoch (bool): Evaluate by epoch or by iteration.
        start_idx (int | None, optional): The epoch/iteration at which validation begins.
            Default: None, validate every `interval` epochs/iterations from scratch.
    """

    PRIORITY = Priority.NORMAL

    def __init__(self, interval=1, by_epoch=True, start_idx=None):
        assert interval > 0, 'interval must be a positive number'
        self.interval = interval
        self.start_idx = start_idx
        self.by_epoch = by_epoch

    def after_train_iter(self, trainer):
        """Called after every training iteration to evaluate the results."""
        if not self.by_epoch and self._should_evaluate(trainer):
            self.do_evaluate(trainer)

    def after_train_epoch(self, trainer):
        """Called after every training epoch to evaluate the results."""
        if self.by_epoch and self._should_evaluate(trainer):
            self.do_evaluate(trainer)

    def do_evaluate(self, trainer):
        """Evaluate the results."""
        eval_res = trainer.evaluate()
        for name, val in eval_res.items():
            trainer.log_buffer.output[name] = val

        trainer.log_buffer.ready = True

    def _should_evaluate(self, trainer):
        """Judge whether to perform evaluation.

        Here is the rule to judge whether to perform evaluation:
        1. It will not perform evaluation during the epoch/iteration interval,
           which is determined by ``self.interval``.
        2. It will not perform evaluation if ``start_idx`` is larger than the
           current epoch/iteration.
        3. It will not perform evaluation when the current epoch/iteration is
           larger than ``start_idx`` but falls within the interval.

        Returns:
            bool: The flag indicating whether to perform evaluation.
        """
        if self.by_epoch:
            current = trainer.epoch
            check_time = self.every_n_epochs
        else:
            current = trainer.iter
            check_time = self.every_n_iters

        if self.start_idx is None:
            if not check_time(trainer, self.interval):
                return False
        elif (current + 1) < self.start_idx:
            return False
        else:
            if (current + 1 - self.start_idx) % self.interval:
                return False
        return True
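A standalone sketch of the `_should_evaluate` schedule (the values are illustrative, not trainer code):

```python
# by_epoch=True, interval=2, start_idx=5; epoch indices are 0-based
for epoch in range(8):
    will_eval = (epoch + 1) >= 5 and (epoch + 1 - 5) % 2 == 0
    print(epoch, will_eval)  # True only at epoch indices 4 and 6
```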
@@ -0,0 +1,208 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.utils.import_utils import is_method_overridden
from .priority import Priority


class Hook:
    """
    The Hook base class for any modelscope trainer. You can build your own hook by inheriting from this class.
    """

    # TODO @jiangnana.jnn use constant variables for stages
    stages = ('before_run', 'before_train_epoch', 'before_train_iter',
              'after_train_iter', 'after_train_epoch', 'before_val_epoch',
              'before_val_iter', 'after_val_iter', 'after_val_epoch',
              'after_run')

    PRIORITY = Priority.NORMAL

    def before_run(self, trainer):
        """
        Will be called before any loop begins.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        pass

    def after_run(self, trainer):
        """
        Will be called after all loops end.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        pass

    def before_epoch(self, trainer):
        """
        Will be called before every epoch begins.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        pass

    def after_epoch(self, trainer):
        """
        Will be called after every epoch ends.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        pass

    def before_iter(self, trainer):
        """
        Will be called before every iteration begins.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        pass

    def after_iter(self, trainer):
        """
        Will be called after every iteration ends.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        pass

    def before_train_epoch(self, trainer):
        """
        Will be called before every train epoch begins. Calls ``self.before_epoch`` by default.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        self.before_epoch(trainer)

    def before_val_epoch(self, trainer):
        """
        Will be called before every validation epoch begins. Calls ``self.before_epoch`` by default.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        self.before_epoch(trainer)

    def after_train_epoch(self, trainer):
        """
        Will be called after every train epoch ends. Calls ``self.after_epoch`` by default.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        self.after_epoch(trainer)

    def after_val_epoch(self, trainer):
        """
        Will be called after every validation epoch ends. Calls ``self.after_epoch`` by default.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        self.after_epoch(trainer)

    def before_train_iter(self, trainer):
        """
        Will be called before every train iteration begins. Calls ``self.before_iter`` by default.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        self.before_iter(trainer)

    def before_val_iter(self, trainer):
        """
        Will be called before every validation iteration begins. Calls ``self.before_iter`` by default.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        self.before_iter(trainer)

    def after_train_iter(self, trainer):
        """
        Will be called after every train iteration ends. Calls ``self.after_iter`` by default.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        self.after_iter(trainer)

    def after_val_iter(self, trainer):
        """
        Will be called after every validation iteration ends. Calls ``self.after_iter`` by default.
        Args:
            trainer: The trainer instance.

        Returns: None
        """
        self.after_iter(trainer)

    def every_n_epochs(self, trainer, n):
        """
        Whether the current epoch index hits every ``n`` epochs.
        Returns: bool
        """
        return (trainer.epoch + 1) % n == 0 if n > 0 else False

    def every_n_iters(self, trainer, n):
        """
        Whether the current iteration index hits every ``n`` iterations.
        Returns: bool
        """
        return (trainer.iter + 1) % n == 0 if n > 0 else False

    def end_of_epoch(self, trainer):
        """
        Whether the current iteration is the last one of the current epoch.
        Returns: bool
        """
        return trainer.inner_iter + 1 == len(trainer.data_loader)

    def is_last_epoch(self, trainer):
        """
        Whether the current epoch is the last one of the entire training process.
        Returns: bool
        """
        return trainer.epoch + 1 == trainer._max_epochs

    def is_last_iter(self, trainer):
        """
        Whether the current iteration is the last one of the entire training process.
        Returns: bool
        """
        return trainer.iter + 1 == trainer._max_iters

    def get_triggered_stages(self):
        trigger_stages = set()
        for stage in Hook.stages:
            if is_method_overridden(stage, Hook, self):
                trigger_stages.add(stage)

        return [stage for stage in Hook.stages if stage in trigger_stages]
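A minimal sketch of a custom hook on top of this base (the 'loss' key in the log buffer is an assumption; registration mirrors the built-in hooks in this patch):

```python
from modelscope.trainers.hooks import HOOKS, Hook


@HOOKS.register_module()
class NanLossWarningHook(Hook):

    def after_train_iter(self, trainer):
        loss = trainer.log_buffer.output.get('loss')  # assumed: loss is logged under 'loss'
        if loss is not None and loss != loss:  # NaN is the only value unequal to itself
            print(f'NaN loss at iter {trainer.iter + 1}')
```

Since only ``after_train_iter`` is overridden, ``get_triggered_stages()`` reports just that stage.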
@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import time

from .builder import HOOKS
from .hook import Hook
from .priority import Priority


@HOOKS.register_module()
class IterTimerHook(Hook):
    PRIORITY = Priority.LOW

    def before_epoch(self, trainer):
        self.start_time = time.time()

    def before_iter(self, trainer):
        trainer.log_buffer.update(
            {'data_load_time': time.time() - self.start_time})

    def after_iter(self, trainer):
        trainer.log_buffer.update({'time': time.time() - self.start_time})
        self.start_time = time.time()
@@ -0,0 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.trainers.utils.log_buffer import LogBuffer
from .base import LoggerHook
from .text_logger_hook import TextLoggerHook

__all__ = ['TextLoggerHook', 'LoggerHook', 'LogBuffer']
@@ -0,0 +1,115 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) Alibaba, Inc. and its affiliates.
import numbers
from abc import ABCMeta, abstractmethod

import numpy as np
import torch

from modelscope.trainers.hooks.hook import Hook
from ..priority import Priority


class LoggerHook(Hook):
    """Base class for logger hooks.

    Args:
        interval (int): Logging interval (every k iterations).
        ignore_last (bool): Ignore the log of the last iterations in each epoch
            if fewer than `interval`.
        reset_flag (bool): Whether to clear the output buffer after logging.
        by_epoch (bool): Whether EpochBasedTrainer is used.
    """

    __metaclass__ = ABCMeta
    PRIORITY = Priority.VERY_LOW

    def __init__(self,
                 interval=10,
                 ignore_last=True,
                 reset_flag=False,
                 by_epoch=True):
        self.interval = interval
        self.ignore_last = ignore_last
        self.reset_flag = reset_flag
        self.by_epoch = by_epoch

    @abstractmethod
    def log(self, trainer):
        pass

    @staticmethod
    def is_scalar(val, include_np=True, include_torch=True):
        """Tell whether the input variable is a scalar or not.

        Args:
            val: Input variable.
            include_np (bool): Whether to treat 0-d np.ndarray as a scalar.
            include_torch (bool): Whether to treat 0-d torch.Tensor as a scalar.

        Returns:
            bool: True or False.
        """
        if isinstance(val, numbers.Number):
            return True
        elif include_np and isinstance(val, np.ndarray) and val.ndim == 0:
            return True
        elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1:
            return True
        else:
            return False

    def get_epoch(self, trainer):
        if trainer.mode == 'train':
            epoch = trainer.epoch + 1
        elif trainer.mode == 'val':
            # normal val mode
            # trainer.epoch += 1 has been done before val workflow
            epoch = trainer.epoch
        else:
            raise ValueError(f"trainer mode should be 'train' or 'val', "
                             f'but got {trainer.mode}')
        return epoch

    def get_iter(self, trainer, inner_iter=False):
        """Get the current training iteration step."""
        if self.by_epoch and inner_iter:
            current_iter = trainer.inner_iter + 1
        else:
            current_iter = trainer.iter + 1
        return current_iter

    def before_run(self, trainer):
        for hook in trainer.hooks[::-1]:
            if isinstance(hook, LoggerHook):
                hook.reset_flag = True
                break

    def before_epoch(self, trainer):
        trainer.log_buffer.clear()  # clear logs of last epoch

    def after_train_iter(self, trainer):
        # average the buffered values every `interval` inner iterations when training by epoch
        if self.by_epoch and (trainer.inner_iter + 1) % self.interval == 0:
            trainer.log_buffer.average(self.interval)
        elif not self.by_epoch and self.every_n_iters(trainer, self.interval):
            trainer.log_buffer.average(self.interval)
        elif self.end_of_epoch(trainer) and not self.ignore_last:
            # not precise but more stable
            trainer.log_buffer.average(self.interval)

        if trainer.log_buffer.ready:
            self.log(trainer)
            if self.reset_flag:
                trainer.log_buffer.clear_output()

    def after_train_epoch(self, trainer):
        if trainer.log_buffer.ready:
            self.log(trainer)
            if self.reset_flag:
                trainer.log_buffer.clear_output()

    def after_val_epoch(self, trainer):
        trainer.log_buffer.average()
        self.log(trainer)
        if self.reset_flag:
            trainer.log_buffer.clear_output()
@@ -0,0 +1,159 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import datetime
import os
import os.path as osp
from collections import OrderedDict

import json
import torch
from torch import distributed as dist

from modelscope.utils.torch_utils import get_dist_info
from ..builder import HOOKS
from .base import LoggerHook


@HOOKS.register_module()
class TextLoggerHook(LoggerHook):
    """Text logger hook, which outputs logs to both the console and a local json file.

    Args:
        by_epoch (bool, optional): Whether EpochBasedTrainer is used.
            Default: True.
        interval (int, optional): Logging interval (every k iterations).
            Default: 10.
        ignore_last (bool, optional): Ignore the log of the last iterations in each
            epoch if fewer than :attr:`interval`. Default: True.
        reset_flag (bool, optional): Whether to clear the output buffer after
            logging. Default: False.
        out_dir (str): The directory to save logs. If None, use `trainer.work_dir`.
    """

    def __init__(self,
                 by_epoch=True,
                 interval=10,
                 ignore_last=True,
                 reset_flag=False,
                 out_dir=None):
        super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag,
                                             by_epoch)
        self.by_epoch = by_epoch
        self.time_sec_tot = 0
        self.out_dir = out_dir
        self._logged_keys = []  # store the keys that have been logged

    def before_run(self, trainer):
        super(TextLoggerHook, self).before_run(trainer)

        if self.out_dir is None:
            self.out_dir = trainer.work_dir

        if not osp.exists(self.out_dir):
            os.makedirs(self.out_dir)

        trainer.logger.info('Text logs will be saved to {}'.format(
            self.out_dir))

        self.start_iter = trainer.iter
        self.json_log_path = osp.join(self.out_dir,
                                      '{}.log.json'.format(trainer.timestamp))
        if hasattr(trainer, 'meta') and trainer.meta is not None:
            self._dump_log(trainer.meta)

    def _get_max_memory(self, trainer):
        device = getattr(trainer.model, 'output_device', None)
        mem = torch.cuda.max_memory_allocated(device=device)
        mem_mb = torch.tensor([mem / (1024 * 1024)],
                              dtype=torch.int,
                              device=device)
        _, world_size = get_dist_info()
        if world_size > 1:
            dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX)
        return mem_mb.item()

    def _log_info(self, log_dict, trainer):
        if log_dict['mode'] == 'train':
            if isinstance(log_dict['lr'], dict):
                lr_str = []
                for k, val in log_dict['lr'].items():
                    lr_str.append(f'lr_{k}: {val:.3e}')
                lr_str = ' '.join(lr_str)
            else:
                lr_str = f'lr: {log_dict["lr"]:.3e}'

            if self.by_epoch:
                log_str = f'Epoch [{log_dict["epoch"]}][{log_dict["iter"]}/{len(trainer.data_loader)}]\t'
            else:
                log_str = f'Iter [{log_dict["iter"]}/{trainer.max_iters}]\t'
            log_str += f'{lr_str}, '
            self._logged_keys.extend(['lr', 'mode', 'iter', 'epoch'])

            if 'time' in log_dict.keys():
                self.time_sec_tot += (log_dict['time'] * self.interval)
                time_sec_avg = self.time_sec_tot / (
                    trainer.iter - self.start_iter + 1)
                eta_sec = time_sec_avg * (trainer.max_iters - trainer.iter - 1)
                eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                log_str += f'eta: {eta_str}, '
                log_str += f'time: {log_dict["time"]:.3f}, data_load_time: {log_dict["data_load_time"]:.3f}, '
                self._logged_keys.extend([
                    'time',
                    'data_load_time',
                ])
        else:
            # val/test time
            # here 1000 is the length of the val dataloader
            # by epoch: Epoch[val] [4][1000]
            # by iter: Iter[val] [1000]
            if self.by_epoch:
                log_str = f'Epoch({log_dict["mode"]}) [{log_dict["epoch"]}][{log_dict["iter"]}]\t'
            else:
                log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t'
            self._logged_keys.extend(['mode', 'iter', 'epoch'])

        log_items = []
        for name, val in log_dict.items():
            if name in self._logged_keys:
                continue
            if isinstance(val, float):
                val = f'{val:.4f}'
            log_items.append(f'{name}: {val}')
        log_str += ', '.join(log_items)

        trainer.logger.info(log_str)

    def _dump_log(self, log_dict):
        # dump log in json format
        json_log = OrderedDict()
        for k, v in log_dict.items():
            json_log[k] = self._round_float(v)

        rank, _ = get_dist_info()
        if rank == 0:
            with open(self.json_log_path, 'a+') as f:
                json.dump(json_log, f)
                f.write('\n')

    def _round_float(self, items, ndigits=5):
        if isinstance(items, list):
            return [self._round_float(item) for item in items]
        elif isinstance(items, float):
            return round(items, ndigits)
        else:
            return items

    def log(self, trainer):
        cur_iter = self.get_iter(trainer, inner_iter=True)

        log_dict = OrderedDict(
            mode=trainer.mode, epoch=self.get_epoch(trainer), iter=cur_iter)

        # statistic memory
        if torch.cuda.is_available():
            log_dict['memory'] = self._get_max_memory(trainer)

        log_dict = dict(log_dict, **trainer.log_buffer.output)

        self._log_info(log_dict, trainer)
        self._dump_log(log_dict)
        return log_dict
@@ -0,0 +1,71 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||
from .builder import HOOKS | |||
from .hook import Hook | |||
from .priority import Priority | |||
@HOOKS.register_module() | |||
class LrSchedulerHook(Hook): | |||
"""Lr scheduler. | |||
Args: | |||
by_epoch (bool): Whether lr changes by epoch | |||
warmup (dict): warm up config | |||
""" | |||
PRIORITY = Priority.VERY_HIGH | |||
def __init__(self, by_epoch=True, warmup=None) -> None: | |||
super().__init__() | |||
self.by_epoch = by_epoch | |||
if not self.by_epoch: | |||
raise ValueError('We only support ``by_epoch=True`` now!') | |||
self.warmup = warmup | |||
self.warmup_lr_scheduler = None | |||
def before_run(self, trainer): | |||
if self.warmup is not None: | |||
assert isinstance(self.warmup, dict) and 'type' in self.warmup | |||
self.warmup_lr_scheduler = build_lr_scheduler( | |||
cfg=self.warmup, | |||
default_args={'base_scheduler': trainer.lr_scheduler}) | |||
def get_current_lr(self, trainer): | |||
import torch | |||
if isinstance(trainer.optimizer, torch.optim.Optimizer): | |||
lr = [group['lr'] for group in trainer.optimizer.param_groups] | |||
elif isinstance(trainer.optimizer, dict): | |||
lr = dict() | |||
for name, optim in trainer.optimizer.items(): | |||
lr[name] = [group['lr'] for group in optim.param_groups] | |||
else: | |||
raise RuntimeError( | |||
'lr is not applicable because optimizer does not exist.') | |||
return lr | |||
def before_train_iter(self, trainer): | |||
trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) | |||
def before_train_epoch(self, trainer): | |||
if self.by_epoch: | |||
if self.warmup_lr_scheduler is not None: | |||
self.warmup_lr_scheduler.step() | |||
else: | |||
trainer.lr_scheduler.step() | |||
trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) | |||
def _get_log_lr(self, trainer): | |||
cur_lr = self.get_current_lr(trainer) | |||
# only record lr of the first param group | |||
if isinstance(cur_lr, list): | |||
lr = cur_lr[0] | |||
else: | |||
assert isinstance(cur_lr, dict) | |||
lr = {} | |||
for k, lr_ in cur_lr.items(): | |||
assert isinstance(lr_, list) | |||
lr.update({k: lr_[0]}) | |||
return lr |
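For orientation, a minimal sketch of driving this hook by hand; the toy model, optimizer and SimpleNamespace trainer below are stand-ins for the real trainer, which builds this hook from the `options` of the `lr_scheduler` config (see `register_optimizers_hook` in the trainer further down):

import torch
from types import SimpleNamespace

toy_model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.01)
trainer = SimpleNamespace(
    optimizer=optimizer,
    lr_scheduler=torch.optim.lr_scheduler.StepLR(optimizer, step_size=2),
    log_buffer=SimpleNamespace(output={}))

hook = LrSchedulerHook(
    by_epoch=True, warmup=dict(type='LinearWarmup', warmup_iters=2))
hook.before_run(trainer)          # wraps trainer.lr_scheduler with the warmup scheduler
hook.before_train_epoch(trainer)  # steps the scheduler and records the current lr
print(trainer.log_buffer.output['lr'])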
@@ -0,0 +1,37 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from torch.nn.utils import clip_grad | |||
from .builder import HOOKS | |||
from .hook import Hook | |||
from .priority import Priority | |||
@HOOKS.register_module() | |||
class OptimizerHook(Hook): | |||
"""Optimizer hook which back-propagates the configured loss keys, optionally | |||
clips gradients, and steps the optimizer after every train iter. | |||
Args: | |||
grad_clip (dict): If not None, kwargs for ``clip_grad.clip_grad_norm_``, e.g. ``dict(max_norm=2.0)``. | |||
loss_keys (str | list): Key(s) of the losses in ``trainer.train_outputs``. | |||
""" | |||
PRIORITY = Priority.ABOVE_NORMAL | |||
def __init__(self, grad_clip=None, loss_keys='loss') -> None: | |||
if isinstance(loss_keys, str): | |||
loss_keys = [loss_keys] | |||
assert isinstance(loss_keys, (tuple, list)) | |||
self.loss_keys = loss_keys | |||
self.grad_clip = grad_clip | |||
def clip_grads(self, params, **clip_args): | |||
params = list( | |||
filter(lambda p: p.requires_grad and p.grad is not None, params)) | |||
if len(params) > 0: | |||
return clip_grad.clip_grad_norm_(params, **clip_args) | |||
def after_train_iter(self, trainer): | |||
trainer.optimizer.zero_grad() | |||
for k in self.loss_keys: | |||
trainer.train_outputs[k].backward() | |||
clip_args = self.grad_clip | |||
if clip_args is not None: | |||
self.clip_grads(trainer.model.parameters(), **clip_args) | |||
trainer.optimizer.step() |
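A minimal sketch of what one call to `after_train_iter` does for a single 'loss' key; the SimpleNamespace trainer is a stand-in for the real one:

import torch
from types import SimpleNamespace

toy_model = torch.nn.Linear(4, 1)
trainer = SimpleNamespace(
    model=toy_model,
    optimizer=torch.optim.SGD(toy_model.parameters(), lr=0.1),
    train_outputs={'loss': toy_model(torch.randn(8, 4)).mean()})

hook = OptimizerHook(grad_clip=dict(max_norm=2.0), loss_keys='loss')
hook.after_train_iter(trainer)  # zero_grad -> backward -> clip -> step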
@@ -0,0 +1,62 @@ | |||
# Copyright (c) OpenMMLab. All rights reserved. | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from enum import Enum | |||
from typing import Union | |||
class Priority(Enum): | |||
"""Hook priority levels. | |||
+--------------+------------+ | |||
| Level | Value | | |||
+==============+============+ | |||
| HIGHEST | 0 | | |||
+--------------+------------+ | |||
| VERY_HIGH | 10 | | |||
+--------------+------------+ | |||
| HIGH | 30 | | |||
+--------------+------------+ | |||
| ABOVE_NORMAL | 40 | | |||
+--------------+------------+ | |||
| NORMAL | 50 | | |||
+--------------+------------+ | |||
| BELOW_NORMAL | 60 | | |||
+--------------+------------+ | |||
| LOW | 70 | | |||
+--------------+------------+ | |||
| VERY_LOW | 90 | | |||
+--------------+------------+ | |||
| LOWEST | 100 | | |||
+--------------+------------+ | |||
""" | |||
HIGHEST = 0 | |||
VERY_HIGH = 10 | |||
HIGH = 30 | |||
ABOVE_NORMAL = 40 | |||
NORMAL = 50 | |||
BELOW_NORMAL = 60 | |||
LOW = 70 | |||
VERY_LOW = 90 | |||
LOWEST = 100 | |||
def get_priority(priority: Union[int, str, Priority]) -> int: | |||
"""Get priority value. | |||
Args: | |||
priority (int or str or :obj:`Priority`): Priority. | |||
Returns: | |||
int: The priority value. | |||
""" | |||
if isinstance(priority, int): | |||
if priority < 0 or priority > 100: | |||
raise ValueError('priority must be between 0 and 100') | |||
return priority | |||
elif isinstance(priority, Priority): | |||
return priority.value | |||
elif isinstance(priority, str): | |||
return Priority[priority.upper()].value | |||
else: | |||
raise TypeError('priority must be an integer or Priority enum value') |
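All three accepted forms resolve to the same integer scale:

assert get_priority(30) == 30                   # plain int, range-checked
assert get_priority('above_normal') == 40       # case-insensitive name lookup
assert get_priority(Priority.NORMAL) == 50      # enum value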
@@ -0,0 +1,8 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .builder import LR_SCHEDULER, build_lr_scheduler | |||
from .warmup import BaseWarmup, ConstantWarmup, ExponentialWarmup, LinearWarmup | |||
__all__ = [ | |||
'LR_SCHEDULER', 'build_lr_scheduler', 'BaseWarmup', 'ConstantWarmup', | |||
'LinearWarmup', 'ExponentialWarmup' | |||
] |
@@ -0,0 +1,47 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import inspect | |||
from modelscope.utils.config import ConfigDict | |||
from modelscope.utils.registry import Registry, build_from_cfg, default_group | |||
LR_SCHEDULER = Registry('lr scheduler') | |||
def build_lr_scheduler(cfg: ConfigDict, default_args: dict = None): | |||
""" build lr scheduler from given lr scheduler config dict | |||
Args: | |||
cfg (:obj:`ConfigDict`): config dict for lr scheduler object. | |||
default_args (dict, optional): Default initialization arguments. | |||
""" | |||
if cfg['type'].lower().endswith('warmup'): | |||
# build warmup lr scheduler | |||
if not hasattr(cfg, 'base_scheduler'): | |||
if default_args is None or ('base_scheduler' not in default_args): | |||
raise ValueError( | |||
'Must provide ``base_scheduler`` which is an instance of ``torch.optim.lr_scheduler._LRScheduler`` ' | |||
'to build a warmup lr scheduler.') | |||
else: | |||
# build lr scheduler without warmup | |||
if not hasattr(cfg, 'optimizer'): | |||
if default_args is None or ('optimizer' not in default_args): | |||
raise ValueError( | |||
'Must provide ``optimizer`` which is an instance of ``torch.optim.Optimizer`` ' | |||
'to build an lr scheduler.') | |||
return build_from_cfg( | |||
cfg, LR_SCHEDULER, group_key=default_group, default_args=default_args) | |||
def register_torch_lr_scheduler(): | |||
from torch.optim import lr_scheduler | |||
from torch.optim.lr_scheduler import _LRScheduler | |||
members = inspect.getmembers(lr_scheduler) | |||
for name, obj in members: | |||
if inspect.isclass(obj) and issubclass(obj, _LRScheduler): | |||
LR_SCHEDULER.register_module(module_name=name, module_cls=obj) | |||
register_torch_lr_scheduler() |
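A quick sketch of both branches (the toy model and optimizer are stand-ins): a plain scheduler type is built against an optimizer, while a type ending in 'warmup' wraps an already-built base scheduler:

import torch
from modelscope.utils.config import ConfigDict

toy_model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.01)

# torch schedulers are auto-registered above, so 'StepLR' resolves by name
scheduler = build_lr_scheduler(
    ConfigDict(type='StepLR', step_size=2),
    default_args={'optimizer': optimizer})

warmup = build_lr_scheduler(
    ConfigDict(type='LinearWarmup', warmup_iters=2),
    default_args={'base_scheduler': scheduler})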
@@ -0,0 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .base import BaseWarmup | |||
from .warmup import ConstantWarmup, ExponentialWarmup, LinearWarmup | |||
__all__ = ['BaseWarmup', 'ConstantWarmup', 'LinearWarmup', 'ExponentialWarmup'] |
@@ -0,0 +1,75 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from torch.optim.lr_scheduler import _LRScheduler | |||
class BaseWarmup(_LRScheduler): | |||
"""Base warmup scheduler | |||
Args: | |||
base_scheduler (torch.optim.lr_scheduler._LRScheduler): an instance of torch.optim.lr_scheduler._LRScheduler | |||
warmup_iters (int | list): Warmup iterations | |||
last_epoch (int): The index of last epoch. | |||
""" | |||
def __init__(self, | |||
base_scheduler, | |||
warmup_iters, | |||
last_epoch=-1, | |||
verbose=False): | |||
self.base_scheduler = base_scheduler | |||
self.warmup_iters = warmup_iters | |||
optimizer = self.base_scheduler.optimizer | |||
self._is_init_step = True | |||
super(BaseWarmup, self).__init__( | |||
optimizer, last_epoch=last_epoch, verbose=verbose) | |||
def get_lr(self): | |||
return self.base_scheduler.get_lr() | |||
def state_dict(self): | |||
return self.base_scheduler.state_dict() | |||
def load_state_dict(self, state_dict): | |||
self.base_scheduler.load_state_dict(state_dict) | |||
def scale(self): | |||
"""Scale the learning rates. | |||
""" | |||
scale_value = self.get_warmup_scale(self.base_scheduler._step_count | |||
- 1) | |||
if isinstance(scale_value, (int, float)): | |||
scale_value = [ | |||
scale_value for _ in range(len(self.optimizer.param_groups)) | |||
] | |||
else: | |||
assert isinstance( | |||
scale_value, (list, tuple)), 'Only support list or tuple type!' | |||
assert len(scale_value) == len( | |||
self.optimizer.param_groups), ('Size mismatch {} != {}'.format( | |||
len(scale_value), len(self.optimizer.param_groups))) | |||
for i, group in enumerate(self.optimizer.param_groups): | |||
group['lr'] *= scale_value[i] | |||
def step(self, epoch=None): | |||
""" | |||
While ``self.base_scheduler._step_count`` has not exceeded ``self.warmup_iters``, scale the lr by the warmup factor | |||
""" | |||
if self.base_scheduler._step_count > self.warmup_iters: | |||
return self.base_scheduler.step(epoch=epoch) | |||
for group, lr in zip(self.optimizer.param_groups, self.base_lrs): | |||
group['lr'] = lr | |||
# `base_scheduler` has done step() at init when build | |||
if self._is_init_step: | |||
self._is_init_step = False | |||
else: | |||
self.base_scheduler.step(epoch=epoch) | |||
self.scale() | |||
@classmethod | |||
def get_warmup_scale(cls, cur_iter): | |||
raise NotImplementedError | |||
@@ -0,0 +1,79 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from modelscope.trainers.lrscheduler.builder import LR_SCHEDULER | |||
from .base import BaseWarmup | |||
@LR_SCHEDULER.register_module() | |||
class ConstantWarmup(BaseWarmup): | |||
"""Linear warmup scheduler. | |||
Args: | |||
base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||
warmup_ratio (float): Lr used at warmup stage equals to warmup_ratio * initial_lr | |||
warmup_iters (int | list): Warmup iterations | |||
last_epoch (int): The index of last epoch. | |||
""" | |||
def __init__(self, | |||
base_scheduler, | |||
warmup_iters, | |||
warmup_ratio=0.1, | |||
last_epoch=-1): | |||
self.warmup_ratio = warmup_ratio | |||
super(ConstantWarmup, self).__init__( | |||
base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||
def get_warmup_scale(self, cur_iter): | |||
if cur_iter >= self.warmup_iters: | |||
return 1.0 | |||
return self.warmup_ratio | |||
@LR_SCHEDULER.register_module() | |||
class LinearWarmup(BaseWarmup): | |||
"""Linear warmup scheduler. | |||
Args: | |||
base_scheduler (torch.optim.lr_scheduler._LRScheduler): an instance of torch.optim.lr_scheduler._LRScheduler | |||
warmup_iters (int | list): Warmup iterations | |||
warmup_ratio (float): Lr used at the beginning of warmup equals warmup_ratio * initial_lr | |||
last_epoch (int): The index of last epoch. | |||
""" | |||
def __init__(self, | |||
base_scheduler, | |||
warmup_iters, | |||
warmup_ratio=0.1, | |||
last_epoch=-1): | |||
self.warmup_ratio = warmup_ratio | |||
super(LinearWarmup, self).__init__( | |||
base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||
def get_warmup_scale(self, cur_iter): | |||
k = (1 - cur_iter / self.warmup_iters) * (1 - self.warmup_ratio) | |||
return 1 - k | |||
@LR_SCHEDULER.register_module() | |||
class ExponentialWarmup(BaseWarmup): | |||
"""Exponential warmup scheduler. | |||
Args: | |||
base_scheduler (torch.optim.lr_scheduler._LRScheduler): an instance of torch.optim.lr_scheduler._LRScheduler | |||
warmup_iters (int | list): Warmup iterations | |||
warmup_ratio (float): Lr used at the beginning of warmup equals warmup_ratio * initial_lr | |||
last_epoch (int): The index of last epoch. | |||
""" | |||
def __init__(self, | |||
base_scheduler, | |||
warmup_iters, | |||
warmup_ratio=0.1, | |||
last_epoch=-1): | |||
self.warmup_ratio = warmup_ratio | |||
super(ExponentialWarmup, self).__init__( | |||
base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||
def get_warmup_scale(self, cur_iter): | |||
k = self.warmup_ratio**(1 - cur_iter / self.warmup_iters) | |||
return k |
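To make the three policies concrete, here are the scale factors the `get_warmup_scale` formulas above produce for `warmup_iters=4`, `warmup_ratio=0.1`:

for cur_iter in range(5):
    constant = 1.0 if cur_iter >= 4 else 0.1
    linear = 1 - (1 - cur_iter / 4) * (1 - 0.1)
    exponential = 0.1 ** (1 - cur_iter / 4)
    print(cur_iter, constant, round(linear, 3), round(exponential, 3))
# iter 0: 0.1, 0.1,   0.1
# iter 1: 0.1, 0.325, 0.178
# iter 2: 0.1, 0.55,  0.316
# iter 3: 0.1, 0.775, 0.562
# iter 4: 1.0, 1.0,   1.0   (all policies reach the full lr)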
@@ -0,0 +1,4 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .builder import OPTIMIZERS, build_optimizer | |||
__all__ = ['OPTIMIZERS', 'build_optimizer'] |
@@ -0,0 +1,39 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import inspect | |||
import torch | |||
from modelscope.utils.config import ConfigDict | |||
from modelscope.utils.registry import Registry, build_from_cfg, default_group | |||
OPTIMIZERS = Registry('optimizer') | |||
def build_optimizer(model: torch.nn.Module, | |||
cfg: ConfigDict, | |||
default_args: dict = None): | |||
""" build optimizer from optimizer config dict | |||
Args: | |||
cfg (:obj:`ConfigDict`): config dict for optimizer object. | |||
default_args (dict, optional): Default initialization arguments. | |||
""" | |||
if hasattr(model, 'module'): | |||
model = model.module | |||
cfg.params = model.parameters() | |||
return build_from_cfg( | |||
cfg, OPTIMIZERS, group_key=default_group, default_args=default_args) | |||
def register_torch_optimizers(): | |||
for name, module in inspect.getmembers(torch.optim): | |||
if name.startswith('__'): | |||
continue | |||
if inspect.isclass(module) and issubclass(module, | |||
torch.optim.Optimizer): | |||
OPTIMIZERS.register_module( | |||
default_group, module_name=name, module_cls=module) | |||
register_torch_optimizers() |
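A minimal sketch of the builder in action on a toy model; note that `build_optimizer` injects `model.parameters()` as `cfg.params` before dispatching to the registry populated by `register_torch_optimizers`:

import torch
from modelscope.utils.config import ConfigDict

toy_model = torch.nn.Linear(4, 2)
optimizer = build_optimizer(
    toy_model, cfg=ConfigDict(type='SGD', lr=0.01, momentum=0.9))
assert isinstance(optimizer, torch.optim.SGD)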
@@ -0,0 +1,703 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path | |||
import random | |||
import time | |||
from distutils.version import LooseVersion | |||
from functools import partial | |||
from typing import Callable, List, Optional, Tuple, Union | |||
import numpy as np | |||
import torch | |||
from addict import Dict | |||
from torch import distributed as dist | |||
from torch import nn | |||
from torch.utils.data import DataLoader, Dataset | |||
from torch.utils.data.distributed import DistributedSampler | |||
from modelscope.hub.snapshot_download import snapshot_download | |||
from modelscope.metrics import build_metric, task_default_metrics | |||
from modelscope.models.base import Model | |||
from modelscope.models.base_torch import TorchModel | |||
from modelscope.msdatasets.ms_dataset import MsDataset | |||
from modelscope.preprocessors import build_preprocessor | |||
from modelscope.preprocessors.base import Preprocessor | |||
from modelscope.task_datasets import TorchTaskDataset, build_task_dataset | |||
from modelscope.trainers.hooks.builder import HOOKS | |||
from modelscope.trainers.hooks.priority import Priority, get_priority | |||
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||
from modelscope.trainers.optimizer.builder import build_optimizer | |||
from modelscope.utils.config import ConfigDict | |||
from modelscope.utils.constant import Hubs, ModelFile, Tasks | |||
from modelscope.utils.logger import get_logger | |||
from modelscope.utils.registry import build_from_cfg | |||
from modelscope.utils.tensor_utils import torch_default_data_collator | |||
from modelscope.utils.torch_utils import get_dist_info | |||
from .base import BaseTrainer | |||
from .builder import TRAINERS | |||
from .hooks.hook import Hook | |||
@TRAINERS.register_module() | |||
class EpochBasedTrainer(BaseTrainer): | |||
"""Epoch based Trainer, a training helper for PyTorch. | |||
Args: | |||
cfg_file(str): The local config file. | |||
model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir | |||
or a model id. If model is None, build_model method will be called. | |||
data_collator (`Callable`, *optional*): | |||
The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. | |||
train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*): | |||
The dataset to use for training. | |||
Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a | |||
distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a | |||
`torch.Generator` for the randomization that must be identical on all processes (and the Trainer will | |||
manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally | |||
sets the seed of the RNGs used. | |||
eval_dataset (`torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation. | |||
preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor. | |||
NOTE: If the preprocessor has been called before the dataset is fed into this trainer by user's custom code, | |||
this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file. | |||
Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and | |||
this preprocessing action will be executed every time the dataset's __getitem__ is called. | |||
optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple | |||
containing the optimizer and the scheduler to use. | |||
max_epochs (int, optional): Total training epochs. | |||
""" | |||
def __init__( | |||
self, | |||
model: Optional[Union[TorchModel, nn.Module, str]] = None, | |||
cfg_file: Optional[str] = None, | |||
arg_parse_fn: Optional[Callable] = None, | |||
data_collator: Optional[Callable] = None, | |||
train_dataset: Optional[Dataset] = None, | |||
eval_dataset: Optional[Dataset] = None, | |||
preprocessor: Optional[Preprocessor] = None, | |||
optimizers: Tuple[torch.optim.Optimizer, | |||
torch.optim.lr_scheduler._LRScheduler] = (None, | |||
None), | |||
**kwargs): | |||
if isinstance(model, str): | |||
if os.path.exists(model): | |||
self.model_dir = model if os.path.isdir( | |||
model) else os.path.dirname(model) | |||
else: | |||
self.model_dir = snapshot_download(model) | |||
cfg_file = os.path.join(self.model_dir, ModelFile.CONFIGURATION) | |||
self.model = self.build_model() | |||
else: | |||
assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' | |||
assert isinstance( | |||
model, | |||
(TorchModel, nn.Module | |||
)), 'model should be either a str, a TorchModel or an nn.Module.' | |||
self.model_dir = os.path.dirname(cfg_file) | |||
self.model = model | |||
super().__init__(cfg_file, arg_parse_fn) | |||
if 'work_dir' in kwargs: | |||
self.work_dir = kwargs['work_dir'] | |||
else: | |||
self.work_dir = self.cfg.train.get('work_dir', './work_dir') | |||
self.preprocessor = None | |||
if isinstance(preprocessor, Preprocessor): | |||
self.preprocessor = preprocessor | |||
elif hasattr(self.cfg, 'preprocessor'): | |||
self.preprocessor = self.build_preprocessor() | |||
# TODO @wenmeng.zwm add data collator option | |||
# TODO how to fill device option? | |||
self.device = int( | |||
os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None | |||
self.train_dataset = self.to_task_dataset( | |||
train_dataset, mode='train', preprocessor=self.preprocessor) | |||
self.eval_dataset = self.to_task_dataset( | |||
eval_dataset, mode='eval', preprocessor=self.preprocessor) | |||
self.data_collator = data_collator if data_collator is not None else torch_default_data_collator | |||
self.metrics = self.get_metrics() | |||
self.optimizers = optimizers | |||
self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) | |||
self._mode = 'train' | |||
self._hooks: List[Hook] = [] | |||
self._epoch = 0 | |||
self._iter = 0 | |||
self._inner_iter = 0 | |||
if 'max_epochs' not in kwargs: | |||
assert hasattr( | |||
self.cfg.train, | |||
'max_epochs'), 'max_epochs is missing in configuration file' | |||
self._max_epochs = self.cfg.train.max_epochs | |||
else: | |||
self._max_epochs = kwargs['max_epochs'] | |||
# TODO @wenmeng.zwm add seed init fn | |||
self._seed = 0 | |||
self._dist = get_dist_info()[1] > 1 | |||
@property | |||
def mode(self): | |||
return self._mode | |||
@property | |||
def hooks(self) -> List[Hook]: | |||
"""list[:obj:`Hook`]: A list of registered hooks.""" | |||
return self._hooks | |||
@property | |||
def epoch(self) -> int: | |||
"""int: Current epoch.""" | |||
return self._epoch | |||
@property | |||
def iter(self) -> int: | |||
"""int: Current iteration.""" | |||
return self._iter | |||
@property | |||
def inner_iter(self) -> int: | |||
"""int: Iteration in an epoch.""" | |||
return self._inner_iter | |||
@property | |||
def max_epochs(self): | |||
"""int: Maximum training epochs.""" | |||
return self._max_epochs | |||
@property | |||
def max_iters(self): | |||
"""int: Maximum training iterations.""" | |||
return self._max_epochs * len(self.data_loader) | |||
def to_task_dataset(self, | |||
datasets: Tuple[Dataset, List[Dataset]], | |||
mode: str, | |||
preprocessor: Optional[Preprocessor] = None): | |||
"""Build the task specific dataset processor for this trainer. | |||
Returns: The task dataset processor for the task. If no processor is registered for the current | |||
model type and task, the default TorchTaskDataset will be returned. | |||
""" | |||
try: | |||
if not datasets: | |||
return datasets | |||
if isinstance(datasets, TorchTaskDataset): | |||
return datasets | |||
task_dataset = build_task_dataset( | |||
ConfigDict({ | |||
**self.cfg.model, | |||
'mode': mode, | |||
'preprocessor': preprocessor, | |||
'datasets': datasets, | |||
}), getattr(self.cfg, 'task', None)) | |||
return task_dataset | |||
except Exception: | |||
if isinstance(datasets, (List, Tuple)) or preprocessor is not None: | |||
return TorchTaskDataset( | |||
datasets, | |||
mode=mode, | |||
preprocessor=preprocessor, | |||
**(self.cfg.model if hasattr(self.cfg, 'model') else {})) | |||
else: | |||
return datasets | |||
def build_preprocessor(self) -> Preprocessor: | |||
"""Build the preprocessor. | |||
Users can override this method to implement custom logic. | |||
Returns: The preprocessor instance. | |||
""" | |||
# TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor | |||
# when they are different ones in training and evaluation | |||
cfg = ConfigDict({ | |||
**getattr(self.cfg, 'preprocessor'), 'model_dir': | |||
self.model_dir | |||
}) | |||
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) | |||
def get_metrics(self) -> List[str]: | |||
"""Get the metric class types. | |||
The first choice will be the metrics configured in the config file, if not found, the default metrics will be | |||
used. | |||
If no metrics are found and the eval dataset exists, the method will raise an error. | |||
Returns: The metric types. | |||
""" | |||
metrics = self.cfg.evaluation.metrics if hasattr( | |||
self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, | |||
'metrics') else None | |||
metrics = metrics if metrics is not None else task_default_metrics.get( | |||
self.cfg.task) | |||
if metrics is None and self.eval_dataset is not None: | |||
raise ValueError( | |||
f'Metrics are needed in evaluation, please try to either ' | |||
f'add metrics in configuration.json or add the default metric for {self.cfg.task}.' | |||
) | |||
if isinstance(metrics, str): | |||
metrics = [metrics] | |||
return metrics | |||
def train(self, *args, **kwargs): | |||
self.model.train() | |||
self._mode = 'train' | |||
if self.train_dataset is None: | |||
self.train_dataloader = self.get_train_dataloader() | |||
else: | |||
self.train_dataloader = self._build_dataloader_with_dataset( | |||
self.train_dataset, **self.cfg.train.get('dataloader', {})) | |||
self.data_loader = self.train_dataloader | |||
self.register_optimizers_hook() | |||
self.register_hook_from_cfg(self.cfg.train.hooks) | |||
self.train_loop(self.train_dataloader) | |||
def evaluate(self, checkpoint_path=None): | |||
self.model.eval() | |||
self._mode = 'val' | |||
if self.eval_dataset is None: | |||
self.eval_dataloader = self.get_eval_data_loader() | |||
else: | |||
self.eval_dataloader = self._build_dataloader_with_dataset( | |||
self.eval_dataset, **self.cfg.evaluation.get('dataloader', {})) | |||
self.data_loader = self.eval_dataloader | |||
metric_classes = [build_metric(metric) for metric in self.metrics] | |||
self.evaluation_loop(self.eval_dataloader, checkpoint_path, | |||
metric_classes) | |||
metric_values = {} | |||
for metric_cls in metric_classes: | |||
metric_values.update(metric_cls.evaluate()) | |||
return metric_values | |||
def build_model(self) -> Union[nn.Module, TorchModel]: | |||
""" Instantiate a pytorch model and return. | |||
By default, we will create a model using config from configuration file. You can | |||
subclass and override this method in a subclass. | |||
""" | |||
# TODO temp implementation, waiting for @zhangzhicheng | |||
model = Model.from_pretrained(self.model_dir) | |||
if not isinstance(model, nn.Module) and hasattr(model, 'model'): | |||
return model.model | |||
return model | |||
def collate_fn(self, data): | |||
"""Prepare the input just before the forward function. | |||
This method will move the tensors to the right device. | |||
Usually this method does not need to be overridden. | |||
Args: | |||
data: The data out of the dataloader. | |||
Returns: The processed data. | |||
""" | |||
if isinstance(data, dict): | |||
return type(data)({k: self.collate_fn(v) for k, v in data.items()}) | |||
elif isinstance(data, (tuple, np.ndarray, list)): | |||
return type(data)(self.collate_fn(v) for v in data) | |||
elif isinstance(data, torch.Tensor) and self.device is not None: | |||
kwargs = dict(device=self.device) | |||
return data.to(**kwargs) | |||
return data | |||
def train_step(self, model, inputs): | |||
""" Perform a training step on a batch of inputs. | |||
Subclass and override to inject custom behavior. | |||
Args: | |||
model (`TorchModel`): The model to train. | |||
inputs (`Dict[str, Union[torch.Tensor, Any]]`): | |||
The inputs and targets of the model. | |||
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the | |||
argument `labels`. Check your model's documentation for all accepted arguments. | |||
Return: | |||
`torch.Tensor`: The tensor with training loss on this batch. | |||
""" | |||
# EvaluationHook will do evaluate and change mode to val, return to train mode | |||
# TODO: find more pretty way to change mode | |||
model.train() | |||
self._mode = 'train' | |||
inputs = self.collate_fn(inputs) | |||
if isinstance(inputs, dict): | |||
train_outputs = model.forward(**inputs) | |||
else: | |||
train_outputs = model.forward(inputs) | |||
if not isinstance(train_outputs, dict): | |||
raise TypeError( | |||
'"model.train_step()" and "model.val_step()" must return a dict' | |||
) | |||
# add model output info to log | |||
if 'log_vars' not in train_outputs: | |||
default_keys_pattern = ['loss'] | |||
match_keys = set([]) | |||
for key_p in default_keys_pattern: | |||
match_keys.update( | |||
[key for key in train_outputs.keys() if key_p in key]) | |||
log_vars = {} | |||
for key in match_keys: | |||
value = train_outputs.get(key, None) | |||
if value is not None: | |||
if dist.is_available() and dist.is_initialized(): | |||
value = value.data.clone() | |||
dist.all_reduce(value.div_(dist.get_world_size())) | |||
log_vars.update({key: value.item()}) | |||
self.log_buffer.update(log_vars) | |||
else: | |||
self.log_buffer.update(train_outputs['log_vars']) | |||
self.train_outputs = train_outputs | |||
def prediction_step(self, model, inputs): | |||
""" Perform forward step by `model` using `inputs`. | |||
Args: | |||
model (`TorchModel`): The model to evaluate. | |||
inputs (`Dict[str, Union[torch.Tensor, Any]]`): | |||
The inputs and targets of the model. | |||
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the | |||
argument `labels`. Check your model's documentation for all accepted arguments. | |||
Return: | |||
Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, | |||
logits and labels (each being optional). | |||
""" | |||
raise NotImplementedError | |||
def get_train_dataloader(self): | |||
""" Builder torch dataloader for training. | |||
We provide a reasonable default that works well. If you want to use something else, you can change | |||
the config for data.train in configuration file, or subclass and override this method | |||
(or `get_train_dataloader` in a subclass. | |||
""" | |||
train_data = self.cfg.dataset.train | |||
if self.train_dataset is None: | |||
self.train_dataset = self.build_dataset(train_data, mode='train') | |||
data_loader = self._build_dataloader_with_dataset( | |||
self.train_dataset, **self.cfg.train.get('dataloader', {})) | |||
return data_loader | |||
def get_eval_data_loader(self): | |||
""" Builder torch dataloader for evaluation. | |||
We provide a reasonable default that works well. If you want to use something else, you can change | |||
the config for dataset.eval in configuration file, or subclass and override this method in a subclass. | |||
pass | |||
""" | |||
val_data = self.cfg.dataset.val | |||
if self.eval_dataset is None: | |||
self.eval_dataset = self.build_dataset(val_data, mode='eval') | |||
batch_size = self.cfg.evaluation.batch_size | |||
workers = self.cfg.evaluation.workers | |||
shuffle = self.cfg.evaluation.get('shuffle', False) | |||
data_loader = self._build_dataloader_with_dataset( | |||
self.eval_dataset, | |||
batch_size_per_gpu=batch_size, | |||
workers_per_gpu=workers, | |||
shuffle=shuffle, | |||
dist=self._dist, | |||
seed=self._seed, | |||
persistent_workers=True, | |||
) | |||
return data_loader | |||
def build_dataset(self, data_cfg, mode): | |||
""" Build torch dataset object using data config | |||
""" | |||
dataset = MsDataset.load( | |||
dataset_name=data_cfg.name, | |||
split=data_cfg.split, | |||
subset_name=data_cfg.subset_name if hasattr( | |||
data_cfg, 'subset_name') else None, | |||
hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | |||
) | |||
torch_dataset = dataset.to_torch_dataset( | |||
preprocessors=self.preprocessor, ) | |||
dataset = self.to_task_dataset( | |||
torch_dataset, mode, preprocessor=self.preprocessor) | |||
return dataset | |||
def create_optimizer_and_scheduler(self): | |||
""" Create optimizer and lr scheduler | |||
We provide a default implementation; if you want to customize your own optimizer | |||
and lr scheduler, you can either pass a tuple through the trainer init function or | |||
subclass this class and override this method. | |||
""" | |||
optimizer, lr_scheduler = self.optimizers | |||
if optimizer is None: | |||
optimizer_cfg = self.cfg.train.get('optimizer', None) | |||
else: | |||
optimizer_cfg = None | |||
optim_options = {} | |||
if optimizer_cfg is not None: | |||
optim_options = optimizer_cfg.pop('options', {}) | |||
optimizer = build_optimizer(self.model, cfg=optimizer_cfg) | |||
if lr_scheduler is None: | |||
lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None) | |||
else: | |||
lr_scheduler_cfg = None | |||
lr_options = {} | |||
if lr_scheduler_cfg is not None: | |||
assert optimizer is not None | |||
lr_options = lr_scheduler_cfg.pop('options', {}) | |||
lr_scheduler = build_lr_scheduler( | |||
cfg=lr_scheduler_cfg, default_args={'optimizer': optimizer}) | |||
self.optimizer = optimizer | |||
self.lr_scheduler = lr_scheduler | |||
return self.optimizer, self.lr_scheduler, optim_options, lr_options | |||
def register_optimizers_hook(self): | |||
""" Register optimizer hook and lr scheduler hook. | |||
""" | |||
optimizer, lr_scheduler = self.optimizers | |||
opti_error_msg = 'optimizers should be a tuple of `torch.optim.Optimizer`'\ | |||
' and `torch.optim.lr_scheduler._LRScheduler`' | |||
if optimizer is not None: | |||
assert isinstance(optimizer, torch.optim.Optimizer), opti_error_msg | |||
if lr_scheduler is not None: | |||
assert isinstance( | |||
lr_scheduler, | |||
torch.optim.lr_scheduler._LRScheduler), opti_error_msg | |||
_, _, optim_options, lr_options = self.create_optimizer_and_scheduler() | |||
lr_hook = dict(type='LrSchedulerHook', **lr_options) | |||
optim_hook = dict(type='OptimizerHook', **optim_options) | |||
self.register_hook_from_cfg([lr_hook, optim_hook]) | |||
def _build_dataloader_with_dataset(self, | |||
dataset: Dataset, | |||
batch_size_per_gpu: int, | |||
workers_per_gpu: int, | |||
dist: bool = False, | |||
shuffle: bool = True, | |||
seed: int = 0, | |||
persistent_workers=False, | |||
**kwargs) -> DataLoader: | |||
"""Build dataloader using input dataset and cfg. Used by `EpochBasedTrainer.train()` | |||
and `EpochBasedTrainer.evaluate()`. | |||
In distributed training, each GPU/process has a dataloader. | |||
In non-distributed training, there is only one dataloader for all GPUs. | |||
Args: | |||
dataset (Dataset): A PyTorch dataset. | |||
batch_size_per_gpu (int): Number of training samples on each GPU, i.e., | |||
batch size of each GPU. | |||
workers_per_gpu (int): How many subprocesses to use for data loading | |||
for each GPU. | |||
dist (bool): Distributed training/test or not. Default: False. | |||
shuffle (bool): Whether to shuffle the data at every epoch. | |||
Default: True. | |||
seed (int, Optional): Seed to be used. Default: 0. | |||
persistent_workers (bool): If True, the data loader will not shutdown | |||
the worker processes after a dataset has been consumed once. | |||
This allows to maintain the workers `Dataset` instances alive. | |||
This argument is only valid when PyTorch>=1.7.0. Default: False. | |||
kwargs: any keyword argument to be used to initialize DataLoader | |||
Returns: | |||
DataLoader: A PyTorch dataloader. | |||
""" | |||
rank, world_size = get_dist_info() | |||
# Whether distributed or not, `batch_size` of the dataloader is the | |||
# number of training samples on each GPU, and workers are counted per GPU. | |||
batch_size = batch_size_per_gpu | |||
num_workers = workers_per_gpu | |||
if dist: | |||
sampler = DistributedSampler( | |||
dataset, world_size, rank, shuffle=shuffle, seed=seed) | |||
else: | |||
sampler = None | |||
batch_sampler = None | |||
init_fn = partial( | |||
worker_init_fn, num_workers=num_workers, rank=rank, | |||
seed=seed) if seed is not None else None | |||
if LooseVersion(torch.__version__) >= LooseVersion('1.7.0'): | |||
kwargs['persistent_workers'] = persistent_workers | |||
elif persistent_workers is True: | |||
self.logger.warning( | |||
'persistent_workers is invalid because your pytorch ' | |||
'version is lower than 1.7.0') | |||
data_loader = DataLoader( | |||
dataset, | |||
batch_size=batch_size, | |||
sampler=sampler, | |||
num_workers=num_workers, | |||
batch_sampler=batch_sampler, | |||
collate_fn=self.data_collator, | |||
pin_memory=kwargs.pop('pin_memory', False), | |||
worker_init_fn=init_fn, | |||
**kwargs) | |||
return data_loader | |||
def train_loop(self, data_loader): | |||
""" Training loop used by `EpochBasedTrainer.train()` | |||
""" | |||
self.invoke_hook('before_run') | |||
self._epoch = 0 | |||
kwargs = {} | |||
for _ in range(self._epoch, self._max_epochs): | |||
self.invoke_hook('before_train_epoch') | |||
time.sleep(2) # Prevent possible deadlock during epoch transition | |||
for i, data_batch in enumerate(data_loader): | |||
self.data_batch = data_batch | |||
self._inner_iter = i | |||
self.invoke_hook('before_train_iter') | |||
self.train_step(self.model, data_batch, **kwargs) | |||
self.invoke_hook('after_train_iter') | |||
del self.data_batch | |||
self._iter += 1 | |||
self.invoke_hook('after_train_epoch') | |||
self._epoch += 1 | |||
time.sleep(1) # wait for some hooks like loggers to finish | |||
self.invoke_hook('after_run') | |||
def evaluation_loop(self, data_loader, checkpoint_path, metric_classes): | |||
""" Evaluation loop used by `EpochBasedTrainer.evaluate()`. | |||
""" | |||
if self._dist: | |||
from modelscope.trainers.utils.inference import multi_gpu_test | |||
multi_gpu_test( | |||
self.model, | |||
data_loader, | |||
tmpdir=None, | |||
gpu_collect=False, | |||
data_collate_fn=self.collate_fn, | |||
metric_classes=metric_classes) | |||
else: | |||
from modelscope.trainers.utils.inference import single_gpu_test | |||
single_gpu_test( | |||
self.model, | |||
data_loader, | |||
data_collate_fn=self.collate_fn, | |||
metric_classes=metric_classes) | |||
def register_hook(self, hook: Hook) -> None: | |||
"""Register a hook into the hook list. | |||
The hook will be inserted into a priority queue, with the specified | |||
priority (See :class:`Priority` for details of priorities). | |||
For hooks with the same priority, they will be triggered in the same | |||
order as they are registered. | |||
Args: | |||
hook (:obj:`Hook`): The hook to be registered. | |||
""" | |||
assert isinstance(hook, Hook) | |||
# insert the hook to a sorted list | |||
inserted = False | |||
for i in range(len(self._hooks) - 1, -1, -1): | |||
if get_priority(hook.PRIORITY) > get_priority( | |||
self._hooks[i].PRIORITY): | |||
self._hooks.insert(i + 1, hook) | |||
inserted = True | |||
break | |||
if not inserted: | |||
self._hooks.insert(0, hook) | |||
def register_hook_from_cfg(self, hook_cfg: List) -> None: | |||
"""Register hooks from a list of hook configs. | |||
Args: | |||
hook_cfg (list): A list of hook config dicts. Each dict should have at | |||
least the key 'type' indicating the hook type to build. | |||
Note: | |||
The specific hook class to register should not use 'type' and | |||
'priority' arguments during initialization. | |||
""" | |||
hook_cfg = hook_cfg.copy() | |||
assert isinstance(hook_cfg, list) | |||
for cfg_i in hook_cfg: | |||
hook = build_from_cfg(cfg_i, HOOKS) | |||
self.register_hook(hook) | |||
def invoke_hook(self, fn_name: str) -> None: | |||
"""Call all hooks. | |||
Args: | |||
fn_name (str): The function name in each hook to be called, such as | |||
"before_train_epoch". | |||
""" | |||
for hook in self._hooks: | |||
getattr(hook, fn_name)(self) | |||
def get_hook_info(self) -> str: | |||
# Get hooks info in each stage | |||
stage_hook_map: Dict[str, list] = {stage: [] for stage in Hook.stages} | |||
for hook in self.hooks: | |||
try: | |||
priority = Priority(get_priority(hook.PRIORITY)).name  # type: ignore | |||
except ValueError: | |||
priority = hook.PRIORITY  # type: ignore | |||
classname = hook.__class__.__name__ | |||
hook_info = f'({priority:<12}) {classname:<35}' | |||
for trigger_stage in hook.get_triggered_stages(): | |||
stage_hook_map[trigger_stage].append(hook_info) | |||
stage_hook_infos = [] | |||
for stage in Hook.stages: | |||
hook_infos = stage_hook_map[stage] | |||
if len(hook_infos) > 0: | |||
info = f'{stage}:\n' | |||
info += '\n'.join(hook_infos) | |||
info += '\n -------------------- ' | |||
stage_hook_infos.append(info) | |||
return '\n'.join(stage_hook_infos) | |||
def worker_init_fn(worker_id, num_workers, rank, seed): | |||
# The seed of each worker equals | |||
# num_workers * rank + worker_id + user_seed | |||
worker_seed = num_workers * rank + worker_id + seed | |||
np.random.seed(worker_seed) | |||
random.seed(worker_seed) | |||
torch.manual_seed(worker_seed) |
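A quick sanity check of the seeding scheme above: with 4 workers per process and 2 ranks, the derived seeds are pairwise distinct, so no two dataloader workers share an RNG stream:

num_workers, user_seed = 4, 0
seeds = [num_workers * rank + worker_id + user_seed
         for rank in range(2) for worker_id in range(num_workers)]
assert sorted(seeds) == list(range(8))  # 0..7, all distinct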
@@ -0,0 +1,208 @@ | |||
# Copyright (c) OpenMMLab. All rights reserved. | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import pickle | |||
import shutil | |||
import tempfile | |||
import time | |||
import torch | |||
from torch import distributed as dist | |||
from tqdm import tqdm | |||
from modelscope.utils.torch_utils import get_dist_info | |||
def single_gpu_test(model, | |||
data_loader, | |||
data_collate_fn=None, | |||
metric_classes=None): | |||
"""Test model with a single gpu. | |||
Args: | |||
model (nn.Module): Model to be tested. | |||
data_loader (nn.Dataloader): Pytorch data loader. | |||
data_collate_fn: An optional collate function applied to each batch before it is fed into the model. | |||
metric_classes (List): List of Metric instances used to collect metrics. | |||
""" | |||
model.eval() | |||
dataset = data_loader.dataset | |||
with tqdm(total=len(dataset), desc='test samples') as pbar: | |||
for data in data_loader: | |||
if data_collate_fn is not None: | |||
data = data_collate_fn(data) | |||
with torch.no_grad(): | |||
result = model(**data) | |||
if metric_classes is not None: | |||
for metric_cls in metric_classes: | |||
metric_cls.add(result, data) | |||
batch_size = len(result) | |||
for _ in range(batch_size): | |||
pbar.update() | |||
def multi_gpu_test(model, | |||
data_loader, | |||
tmpdir=None, | |||
gpu_collect=False, | |||
data_collate_fn=None, | |||
metric_classes=None): | |||
"""Test model with multiple gpus. | |||
This method tests model with multiple gpus and collects the results | |||
under two different modes: gpu and cpu modes. By setting | |||
``gpu_collect=True``, it encodes results to gpu tensors and use gpu | |||
communication for results collection. On cpu mode it saves the results on | |||
different gpus to ``tmpdir`` and collects them by the rank 0 worker. | |||
Args: | |||
model (nn.Module): Model to be tested. | |||
data_loader (nn.Dataloader): Pytorch data loader. | |||
data_collate_fn: An optional collate function applied to each batch before it is fed into the model. | |||
tmpdir (str): Path of directory to save the temporary results from | |||
different gpus under cpu mode. | |||
gpu_collect (bool): Option to use either gpu or cpu to collect results. | |||
metric_classes (List): List of Metric instances used to collect metrics. | |||
""" | |||
model.eval() | |||
results = [] | |||
dataset = data_loader.dataset | |||
time.sleep(2) # This line can prevent deadlock problem in some cases. | |||
count = 0 | |||
with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: | |||
for _, data in enumerate(data_loader): | |||
if data_collate_fn is not None: | |||
data = data_collate_fn(data) | |||
with torch.no_grad(): | |||
result = model(**data) | |||
results.extend(result) | |||
rank, world_size = get_dist_info() | |||
if rank == 0: | |||
batch_size = len(result) | |||
batch_size_all = batch_size * world_size | |||
count += batch_size_all | |||
if count > len(dataset): | |||
batch_size_all = len(dataset) - (count - batch_size_all) | |||
for _ in range(batch_size_all): | |||
pbar.update() | |||
# collect results from all ranks | |||
if gpu_collect: | |||
results = collect_results_gpu(results, len(dataset)) | |||
else: | |||
results = collect_results_cpu(results, len(dataset), tmpdir) | |||
ground_truths = [dataset[i] for i in range(len(dataset))] | |||
if metric_classes is not None: | |||
for metric_cls in metric_classes: | |||
metric_cls.add(results, ground_truths) | |||
def collect_results_cpu(result_part, size, tmpdir=None): | |||
"""Collect results under cpu mode. | |||
On cpu mode, this function will save the results on different gpus to | |||
``tmpdir`` and collect them by the rank 0 worker. | |||
Args: | |||
result_part (list): Result list containing result parts | |||
to be collected. | |||
size (int): Size of the results, commonly equal to length of | |||
the results. | |||
tmpdir (str | None): temporal directory for collected results to | |||
store. If set to None, it will create a random temporal directory | |||
for it. | |||
Returns: | |||
list: The collected results. | |||
""" | |||
rank, world_size = get_dist_info() | |||
if tmpdir is None: | |||
# use a fixed sub-directory of the system tmp dir so that all ranks on a | |||
# node share it and it can be safely removed after collection | |||
tmpdir = os.path.join(tempfile.gettempdir(), 'modelscope_test_parts') | |||
if not os.path.exists(tmpdir): | |||
os.makedirs(tmpdir) | |||
# dump the part result to the dir | |||
with open(os.path.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f: | |||
pickle.dump(result_part, f) | |||
dist.barrier() | |||
# collect all parts | |||
if rank != 0: | |||
return None | |||
else: | |||
# load results of all parts from tmp dir | |||
part_list = [] | |||
for i in range(world_size): | |||
part_file = os.path.join(tmpdir, f'part_{i}.pkl') | |||
with open(part_file, 'rb') as f: | |||
part_result = pickle.load(f) | |||
# When data is severely insufficient, an empty part_result | |||
# on a certain gpu could make the overall outputs empty. | |||
if part_result: | |||
part_list.append(part_result) | |||
# sort the results | |||
ordered_results = [] | |||
for res in zip(*part_list): | |||
ordered_results.extend(list(res)) | |||
# the dataloader may pad some samples | |||
ordered_results = ordered_results[:size] | |||
# remove tmp dir | |||
shutil.rmtree(tmpdir) | |||
return ordered_results | |||
def collect_results_gpu(result_part, size): | |||
"""Collect results under gpu mode. | |||
On gpu mode, this function will encode results to gpu tensors and use gpu | |||
communication for results collection. | |||
Args: | |||
result_part (list): Result list containing result parts | |||
to be collected. | |||
size (int): Size of the results, commonly equal to length of | |||
the results. | |||
Returns: | |||
list: The collected results. | |||
""" | |||
rank, world_size = get_dist_info() | |||
# dump result part to tensor with pickle | |||
part_tensor = torch.tensor( | |||
bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') | |||
# gather all result part tensor shape | |||
shape_tensor = torch.tensor(part_tensor.shape, device='cuda') | |||
shape_list = [shape_tensor.clone() for _ in range(world_size)] | |||
dist.all_gather(shape_list, shape_tensor) | |||
# padding result part tensor to max length | |||
shape_max = torch.tensor(shape_list).max() | |||
part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') | |||
part_send[:shape_tensor[0]] = part_tensor | |||
part_recv_list = [ | |||
part_tensor.new_zeros(shape_max) for _ in range(world_size) | |||
] | |||
# gather all result part | |||
dist.all_gather(part_recv_list, part_send) | |||
if rank == 0: | |||
part_list = [] | |||
for recv, shape in zip(part_recv_list, shape_list): | |||
part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) | |||
# When data is severely insufficient, an empty part_result | |||
# on a certain gpu could make the overall outputs empty. | |||
if part_result: | |||
part_list.append(part_result) | |||
# sort the results | |||
ordered_results = [] | |||
for res in zip(*part_list): | |||
ordered_results.extend(list(res)) | |||
# the dataloader may pad some samples | |||
ordered_results = ordered_results[:size] | |||
return ordered_results |
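Both collectors reassemble rank-sharded results the same way: under a `DistributedSampler`, rank `r` holds samples `r, r + world_size, r + 2 * world_size, ...`, so interleaving the parts with `zip(*part_list)` restores dataset order, and the trailing `[:size]` slice drops any padded samples:

part_list = [[0, 2, 4], [1, 3, 5]]  # results held by rank 0 and rank 1
ordered_results = []
for res in zip(*part_list):
    ordered_results.extend(list(res))
assert ordered_results == [0, 1, 2, 3, 4, 5]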
@@ -0,0 +1,42 @@ | |||
# Copyright (c) OpenMMLab. All rights reserved. | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from collections import OrderedDict | |||
import numpy as np | |||
class LogBuffer: | |||
def __init__(self): | |||
self.val_history = OrderedDict() | |||
self.n_history = OrderedDict() | |||
self.output = OrderedDict() | |||
self.ready = False | |||
def clear(self) -> None: | |||
self.val_history.clear() | |||
self.n_history.clear() | |||
self.clear_output() | |||
def clear_output(self) -> None: | |||
self.output.clear() | |||
self.ready = False | |||
def update(self, vars: dict, count: int = 1) -> None: | |||
assert isinstance(vars, dict) | |||
for key, var in vars.items(): | |||
if key not in self.val_history: | |||
self.val_history[key] = [] | |||
self.n_history[key] = [] | |||
self.val_history[key].append(var) | |||
self.n_history[key].append(count) | |||
def average(self, n: int = 0) -> None: | |||
"""Average latest n values or all values.""" | |||
assert n >= 0 | |||
for key in self.val_history: | |||
values = np.array(self.val_history[key][-n:]) | |||
nums = np.array(self.n_history[key][-n:]) | |||
avg = np.sum(values * nums) / np.sum(nums) | |||
self.output[key] = avg | |||
self.ready = True |
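`average()` is a count-weighted mean over the recorded history, so per-batch values logged with their batch sizes come out weighted correctly:

buf = LogBuffer()
buf.update({'loss': 2.0}, count=4)  # a batch of 4 samples
buf.update({'loss': 1.0}, count=4)
buf.average()
assert buf.output['loss'] == 1.5 and buf.ready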
@@ -0,0 +1,74 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import io | |||
import time | |||
from collections import OrderedDict | |||
from typing import Optional | |||
import torch | |||
from torch.optim import Optimizer | |||
from modelscope import __version__ | |||
from modelscope.fileio import File | |||
def weights_to_cpu(state_dict): | |||
"""Copy a model state_dict to cpu. | |||
Args: | |||
state_dict (OrderedDict): Model weights on GPU. | |||
Returns: | |||
OrderedDict: Model weights on CPU. | |||
""" | |||
state_dict_cpu = OrderedDict() | |||
for key, val in state_dict.items(): | |||
state_dict_cpu[key] = val.cpu() | |||
# Keep metadata in state_dict | |||
state_dict_cpu._metadata = getattr(state_dict, '_metadata', OrderedDict()) | |||
return state_dict_cpu | |||
def save_checkpoint(model: torch.nn.Module, | |||
filename: str, | |||
optimizer: Optional[Optimizer] = None, | |||
meta: Optional[dict] = None) -> None: | |||
"""Save checkpoint to file. | |||
The checkpoint will have 3 fields: ``meta``, ``state_dict`` and | |||
``optimizer``. By default ``meta`` will contain version and time info. | |||
Args: | |||
model (Module): Module whose params are to be saved. | |||
filename (str): Checkpoint filename. | |||
optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. | |||
meta (dict, optional): Metadata to be saved in checkpoint. | |||
""" | |||
if meta is None: | |||
meta = {} | |||
elif not isinstance(meta, dict): | |||
raise TypeError(f'meta must be a dict or None, but got {type(meta)}') | |||
meta.update(modelscope=__version__, time=time.asctime()) | |||
if isinstance(model, torch.nn.parallel.DistributedDataParallel): | |||
model = model.module | |||
if hasattr(model, 'CLASSES') and model.CLASSES is not None: | |||
# save class name to the meta | |||
meta.update(CLASSES=model.CLASSES) | |||
checkpoint = { | |||
'meta': meta, | |||
'state_dict': weights_to_cpu(model.state_dict()) | |||
} | |||
# save optimizer state dict in the checkpoint | |||
if isinstance(optimizer, Optimizer): | |||
checkpoint['optimizer'] = optimizer.state_dict() | |||
elif isinstance(optimizer, dict): | |||
checkpoint['optimizer'] = {} | |||
for name, optim in optimizer.items(): | |||
checkpoint['optimizer'][name] = optim.state_dict() | |||
with io.BytesIO() as f: | |||
torch.save(checkpoint, f) | |||
File.write(f.getvalue(), filename) |
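A minimal sketch of a save and reload round trip on a toy model; since the serialized bytes go through `modelscope.fileio.File`, `filename` is not restricted to the local filesystem:

import os
import tempfile
import torch

toy_model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.1)
filename = os.path.join(tempfile.mkdtemp(), 'demo_checkpoint.pth')
save_checkpoint(toy_model, filename, optimizer=optimizer, meta=dict(epoch=1))

ckpt = torch.load(filename)
assert set(ckpt) == {'meta', 'state_dict', 'optimizer'}  # the three documented fields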
@@ -13,12 +13,7 @@ class Fields(object): | |||
multi_modal = 'multi-modal' | |||
class Tasks(object): | |||
""" Names for tasks supported by modelscope. | |||
Holds the standard task name to use for identifying different tasks. | |||
This should be used to register models, pipelines, trainers. | |||
""" | |||
class CVTasks(object): | |||
# vision tasks | |||
image_to_text = 'image-to-text' | |||
pose_estimation = 'pose-estimation' | |||
@@ -33,6 +28,8 @@ class Tasks(object): | |||
action_recognition = 'action-recognition' | |||
video_embedding = 'video-embedding' | |||
class NLPTasks(object): | |||
# nlp tasks | |||
word_segmentation = 'word-segmentation' | |||
nli = 'nli' | |||
@@ -56,11 +53,15 @@ class Tasks(object): | |||
question_answering = 'question-answering' | |||
zero_shot_classification = 'zero-shot-classification' | |||
class AudioTasks(object): | |||
# audio tasks | |||
auto_speech_recognition = 'auto-speech-recognition' | |||
text_to_speech = 'text-to-speech' | |||
speech_signal_process = 'speech-signal-process' | |||
class MultiModalTasks(object): | |||
# multi-modal tasks | |||
image_captioning = 'image-captioning' | |||
visual_grounding = 'visual-grounding' | |||
@@ -69,6 +70,47 @@ class Tasks(object): | |||
visual_question_answering = 'visual-question-answering' | |||
class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks): | |||
""" Names for tasks supported by modelscope. | |||
Holds the standard task name to use for identifying different tasks. | |||
This should be used to register models, pipelines, trainers. | |||
""" | |||
reverse_field_index = {} | |||
@staticmethod | |||
def find_field_by_task(task_name): | |||
if len(Tasks.reverse_field_index) == 0: | |||
# Lazy init, not thread safe | |||
field_dict = { | |||
Fields.cv: [ | |||
getattr(Tasks, attr) for attr in dir(CVTasks) | |||
if not attr.startswith('__') | |||
], | |||
Fields.nlp: [ | |||
getattr(Tasks, attr) for attr in dir(NLPTasks) | |||
if not attr.startswith('__') | |||
], | |||
Fields.audio: [ | |||
getattr(Tasks, attr) for attr in dir(AudioTasks) | |||
if not attr.startswith('__') | |||
], | |||
Fields.multi_modal: [ | |||
getattr(Tasks, attr) for attr in dir(MultiModalTasks) | |||
if not attr.startswith('__') | |||
], | |||
} | |||
for field, tasks in field_dict.items(): | |||
for task in tasks: | |||
if task in Tasks.reverse_field_index: | |||
raise ValueError(f'Duplicate task: {task}') | |||
Tasks.reverse_field_index[task] = field | |||
return Tasks.reverse_field_index.get(task_name) | |||
class InputFields(object): | |||
""" Names for input data fields in the input data for pipelines | |||
""" | |||
@@ -100,6 +142,7 @@ class ModelFile(object): | |||
TF_CKPT_PREFIX = 'ckpt-' | |||
TORCH_MODEL_FILE = 'pytorch_model.pt' | |||
TORCH_MODEL_BIN_FILE = 'pytorch_model.bin' | |||
LABEL_MAPPING = 'label_mapping.json' | |||
class Requirements(object): | |||
@@ -86,3 +86,16 @@ def get_model_type(model_dir): | |||
return cfg.model_type if hasattr(cfg, 'model_type') else None | |||
except Exception as e: | |||
logger.error(f'parse config file failed with error: {e}') | |||
def parse_label_mapping(model_dir): | |||
"""Parse ``label_mapping.json`` under ``model_dir`` and return a label-to-id dict, or None if the file is absent.""" | |||
import os | |||
import json | |||
label2id = None | |||
label_path = os.path.join(model_dir, ModelFile.LABEL_MAPPING) | |||
if os.path.exists(label_path): | |||
with open(label_path) as f: | |||
label_mapping = json.load(f) | |||
label2id = {name: idx for name, idx in label_mapping.items()} | |||
return label2id |
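The expected file is a flat JSON object mapping label names to integer ids; an illustrative round trip (the labels are made up):

import json
import os
import tempfile

model_dir = tempfile.mkdtemp()
with open(os.path.join(model_dir, 'label_mapping.json'), 'w') as f:
    json.dump({'negative': 0, 'positive': 1}, f)
assert parse_label_mapping(model_dir) == {'negative': 0, 'positive': 1}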
@@ -54,6 +54,38 @@ def import_modules_from_file(py_file: str): | |||
return module_name, mod | |||
def is_method_overridden(method, base_class, derived_class): | |||
"""Check if a method of base class is overridden in derived class. | |||
Args: | |||
method (str): the method name to check. | |||
base_class (type): the class of the base class. | |||
derived_class (type | Any): the class or instance of the derived class. | |||
""" | |||
assert isinstance(base_class, type), \ | |||
"base_class doesn't accept instance, Please pass class instead." | |||
if not isinstance(derived_class, type): | |||
derived_class = derived_class.__class__ | |||
base_method = getattr(base_class, method) | |||
derived_method = getattr(derived_class, method) | |||
return derived_method != base_method | |||
def has_method(obj: object, method: str) -> bool: | |||
"""Check whether the object has a method. | |||
Args: | |||
method (str): The method name to check. | |||
obj (object): The object to check. | |||
Returns: | |||
bool: True if the object has the method else False. | |||
""" | |||
return hasattr(obj, method) and callable(getattr(obj, method)) | |||
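Both helpers in action; note that `is_method_overridden` accepts either a class or an instance on the derived side:

class Base:
    def run(self):
        pass

class Child(Base):
    def run(self):
        return 1

class GrandChild(Child):
    pass

assert is_method_overridden('run', Base, Child)
assert is_method_overridden('run', Base, Child())          # instance also works
assert not is_method_overridden('run', Child, GrandChild)  # inherited, not overridden
assert has_method(Child(), 'run') and not has_method(Child(), 'stop')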
def import_modules(imports, allow_failed_imports=False): | |||
"""Import modules from the given list of strings. | |||
@@ -0,0 +1,77 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# Part of the implementation is borrowed from huggingface/transformers. | |||
def torch_nested_numpify(tensors): | |||
"""Numpify `tensors` (even if it's a nested list/tuple of tensors).""" | |||
import torch | |||
if isinstance(tensors, (list, tuple)): | |||
return type(tensors)(torch_nested_numpify(t) for t in tensors) | |||
if isinstance(tensors, torch.Tensor): | |||
t = tensors.cpu() | |||
return t.numpy() | |||
return tensors | |||
def torch_nested_detach(tensors): | |||
"""Detach `tensors` (even if it's a nested list/tuple of tensors).""" | |||
import torch | |||
if isinstance(tensors, (list, tuple)): | |||
return type(tensors)(torch_nested_detach(t) for t in tensors) | |||
if isinstance(tensors, torch.Tensor): | |||
return tensors.detach() | |||
return tensors | |||
def torch_default_data_collator(features): | |||
# TODO @jiangnana.jnn refine this default data collator | |||
import torch | |||
# if not isinstance(features[0], (dict, BatchEncoding)): | |||
# features = [vars(f) for f in features] | |||
first = features[0] | |||
if isinstance(first, dict): | |||
batch = {} | |||
# Special handling for labels. | |||
# Ensure that tensor is created with the correct type | |||
# (it should be automatically the case, but let's make sure of it.) | |||
if 'label' in first and first['label'] is not None: | |||
label = first['label'].item() if isinstance( | |||
first['label'], torch.Tensor) else first['label'] | |||
dtype = torch.long if isinstance(label, int) else torch.float | |||
batch['labels'] = torch.tensor([f['label'] for f in features], | |||
dtype=dtype) | |||
elif 'label_ids' in first and first['label_ids'] is not None: | |||
if isinstance(first['label_ids'], torch.Tensor): | |||
batch['labels'] = torch.stack( | |||
[f['label_ids'] for f in features]) | |||
else: | |||
dtype = torch.long if type( | |||
first['label_ids'][0]) is int else torch.float | |||
batch['labels'] = torch.tensor( | |||
[f['label_ids'] for f in features], dtype=dtype) | |||
# Handling of all other possible keys. | |||
# Again, we will use the first element to figure out which key/values are not None for this model. | |||
for k, v in first.items(): | |||
if k not in ('label', 'label_ids' | |||
) and v is not None and not isinstance(v, str): | |||
if isinstance(v, torch.Tensor): | |||
batch[k] = torch.stack([f[k] for f in features]) | |||
else: | |||
batch[k] = torch.tensor([f[k] for f in features]) | |||
elif isinstance(first, tuple): | |||
batch = [] | |||
for idx in range(len(first)): | |||
if isinstance(first[idx], torch.Tensor): | |||
batch.append(torch.stack([f[idx] for f in features])) | |||
else: | |||
batch.append(torch.tensor([f[idx] for f in features])) | |||
else: | |||
if isinstance(first, torch.Tensor): | |||
batch = torch.stack(features) | |||
else: | |||
batch = torch.tensor(features) | |||
return batch |
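For instance, dict features with an int `label` are collated into a long `labels` tensor, while tensor-valued keys are stacked along a new batch dimension:

import torch

features = [{'input_ids': torch.tensor([1, 2]), 'label': 0},
            {'input_ids': torch.tensor([3, 4]), 'label': 1}]
batch = torch_default_data_collator(features)
assert batch['labels'].dtype == torch.long   # int labels -> long tensor
assert batch['input_ids'].shape == (2, 2)    # per-feature tensors get stacked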
@@ -0,0 +1,127 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The following code is partially borrowed from openmmlab/mmcv | |||
import functools | |||
import os | |||
import socket | |||
import subprocess | |||
from collections import OrderedDict | |||
from typing import Callable, List, Optional, Tuple | |||
import torch | |||
import torch.multiprocessing as mp | |||
from torch import distributed as dist | |||
from torch._utils import (_flatten_dense_tensors, _take_tensors, | |||
_unflatten_dense_tensors) | |||
def _find_free_port() -> int: | |||
# Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 | |||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) | |||
# Binding to port 0 will cause the OS to find an available port for us | |||
sock.bind(('', 0)) | |||
port = sock.getsockname()[1] | |||
sock.close() | |||
# NOTE: there is still a chance the port could be taken by other processes. | |||
return port | |||
def _is_free_port(port: int) -> bool: | |||
ips = socket.gethostbyname_ex(socket.gethostname())[-1] | |||
ips.append('localhost') | |||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: | |||
return all(s.connect_ex((ip, port)) != 0 for ip in ips) | |||
def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None: | |||
if mp.get_start_method(allow_none=True) is None: | |||
mp.set_start_method('spawn') | |||
if launcher == 'pytorch': | |||
_init_dist_pytorch(backend, **kwargs) | |||
elif launcher == 'mpi': | |||
_init_dist_mpi(backend, **kwargs) | |||
elif launcher == 'slurm': | |||
_init_dist_slurm(backend, **kwargs) | |||
else: | |||
raise ValueError(f'Invalid launcher type: {launcher}') | |||
def _init_dist_pytorch(backend: str, **kwargs) -> None: | |||
local_rank = int(os.environ['LOCAL_RANK']) | |||
torch.cuda.set_device(local_rank) | |||
dist.init_process_group(backend=backend, **kwargs) | |||
def _init_dist_mpi(backend: str, **kwargs) -> None: | |||
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) | |||
torch.cuda.set_device(local_rank) | |||
if 'MASTER_PORT' not in os.environ: | |||
# 29500 is torch.distributed default port | |||
os.environ['MASTER_PORT'] = '29500' | |||
if 'MASTER_ADDR' not in os.environ: | |||
raise KeyError('The environment variable MASTER_ADDR is not set') | |||
os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] | |||
os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] | |||
dist.init_process_group(backend=backend, **kwargs) | |||
def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None: | |||
"""Initialize slurm distributed training environment. | |||
    If argument ``port`` is not specified, the master port will be taken from | |||
    the system environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not | |||
    set, the default port ``29500`` will be used. | |||
Args: | |||
backend (str): Backend of torch.distributed. | |||
port (int, optional): Master port. Defaults to None. | |||
""" | |||
proc_id = int(os.environ['SLURM_PROCID']) | |||
ntasks = int(os.environ['SLURM_NTASKS']) | |||
node_list = os.environ['SLURM_NODELIST'] | |||
num_gpus = torch.cuda.device_count() | |||
torch.cuda.set_device(proc_id % num_gpus) | |||
addr = subprocess.getoutput( | |||
f'scontrol show hostname {node_list} | head -n1') | |||
# specify master port | |||
if port is not None: | |||
os.environ['MASTER_PORT'] = str(port) | |||
elif 'MASTER_PORT' in os.environ: | |||
pass # use MASTER_PORT in the environment variable | |||
else: | |||
# if torch.distributed default port(29500) is available | |||
# then use it, else find a free port | |||
if _is_free_port(29500): | |||
os.environ['MASTER_PORT'] = '29500' | |||
else: | |||
os.environ['MASTER_PORT'] = str(_find_free_port()) | |||
# use MASTER_ADDR in the environment variable if it already exists | |||
if 'MASTER_ADDR' not in os.environ: | |||
os.environ['MASTER_ADDR'] = addr | |||
os.environ['WORLD_SIZE'] = str(ntasks) | |||
os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) | |||
os.environ['RANK'] = str(proc_id) | |||
dist.init_process_group(backend=backend) | |||
def get_dist_info() -> Tuple[int, int]: | |||
if dist.is_available() and dist.is_initialized(): | |||
rank = dist.get_rank() | |||
world_size = dist.get_world_size() | |||
else: | |||
rank = 0 | |||
world_size = 1 | |||
return rank, world_size | |||
def master_only(func: Callable) -> Callable: | |||
@functools.wraps(func) | |||
def wrapper(*args, **kwargs): | |||
rank, _ = get_dist_info() | |||
if rank == 0: | |||
return func(*args, **kwargs) | |||
return wrapper |
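A quick sketch of how these helpers compose (assuming the module above is importable; the function name below is illustrative, not part of this diff):

@master_only
def log_on_master(msg):
    # Runs on rank 0 only; on all other ranks the wrapper returns None.
    print(msg)

rank, world_size = get_dist_info()  # falls back to (0, 1) when no process group is initialized
log_on_master(f'training with world_size={world_size}')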
@@ -1,3 +0,0 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:b4153d9ffc0b72eeaf162b5c9f4426f95dcea2bb0da9e7b5e1b72fd2643b1915 | |||
size 50444 |
@@ -0,0 +1,60 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import unittest | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from modelscope.models.base_torch import TorchModel | |||
class TorchBaseTest(unittest.TestCase): | |||
def test_custom_model(self): | |||
class MyTorchModel(TorchModel): | |||
def __init__(self): | |||
super().__init__() | |||
self.conv1 = nn.Conv2d(1, 20, 5) | |||
self.conv2 = nn.Conv2d(20, 20, 5) | |||
def forward(self, x): | |||
x = F.relu(self.conv1(x)) | |||
return F.relu(self.conv2(x)) | |||
model = MyTorchModel() | |||
model.train() | |||
model.eval() | |||
out = model.forward(torch.rand(1, 1, 10, 10)) | |||
self.assertEqual((1, 20, 2, 2), out.shape) | |||
def test_custom_model_with_postprocess(self): | |||
add_bias = 200 | |||
class MyTorchModel(TorchModel): | |||
def __init__(self): | |||
super().__init__() | |||
self.conv1 = nn.Conv2d(1, 20, 5) | |||
self.conv2 = nn.Conv2d(20, 20, 5) | |||
def forward(self, x): | |||
x = F.relu(self.conv1(x)) | |||
return F.relu(self.conv2(x)) | |||
def postprocess(self, x): | |||
return x + add_bias | |||
model = MyTorchModel() | |||
model.train() | |||
model.eval() | |||
out = model(torch.rand(1, 1, 10, 10)) | |||
self.assertEqual((1, 20, 2, 2), out.shape) | |||
self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10))) | |||
if __name__ == '__main__': | |||
unittest.main() |
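Note that the first test invokes model.forward(...) directly, while the postprocess test goes through model(...): the bias assertion only holds if TorchModel.__call__ routes the forward output through postprocess, which is exactly the behavior this test pins down.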
@@ -6,9 +6,9 @@ from typing import Any, Dict, List, Tuple, Union | |||
import numpy as np | |||
import PIL | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.pipelines import Pipeline, pipeline | |||
from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info | |||
from modelscope.pipelines.outputs import OutputKeys | |||
from modelscope.utils.constant import Tasks | |||
from modelscope.utils.logger import get_logger | |||
from modelscope.utils.registry import default_group | |||
@@ -2,8 +2,8 @@ | |||
import unittest | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.pipelines import pipeline | |||
from modelscope.pipelines.outputs import OutputKeys | |||
from modelscope.utils.constant import Tasks | |||
from modelscope.utils.test_utils import test_level | |||
@@ -7,8 +7,8 @@ import cv2 | |||
from modelscope.fileio import File | |||
from modelscope.msdatasets import MsDataset | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.pipelines import pipeline | |||
from modelscope.pipelines.outputs import OutputKeys | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.test_utils import test_level | |||
@@ -5,9 +5,9 @@ import unittest | |||
import cv2 | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.pipelines import pipeline | |||
from modelscope.pipelines.base import Pipeline | |||
from modelscope.pipelines.outputs import OutputKeys | |||
from modelscope.utils.constant import Tasks | |||
from modelscope.utils.test_utils import test_level | |||
@@ -5,8 +5,8 @@ import unittest | |||
import numpy as np | |||
from modelscope.models import Model | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.pipelines import pipeline | |||
from modelscope.pipelines.outputs import OutputKeys | |||
from modelscope.utils.constant import Tasks | |||
from modelscope.utils.test_utils import test_level | |||
@@ -9,8 +9,8 @@ from scipy.io.wavfile import write | |||
from modelscope.metainfo import Pipelines | |||
from modelscope.models import Model | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.pipelines import pipeline | |||
from modelscope.pipelines.outputs import OutputKeys | |||
from modelscope.utils.constant import Fields, Tasks | |||
from modelscope.utils.logger import get_logger | |||
from modelscope.utils.test_utils import test_level | |||
@@ -0,0 +1,108 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import shutil | |||
import tempfile | |||
import unittest | |||
from abc import ABCMeta | |||
import json | |||
import torch | |||
from torch import nn | |||
from torch.utils.data import Dataset | |||
from modelscope.trainers import build_trainer | |||
from modelscope.utils.constant import ModelFile | |||
class DummyDataset(Dataset, metaclass=ABCMeta): | |||
def __len__(self): | |||
return 20 | |||
def __getitem__(self, idx): | |||
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||
class DummyModel(nn.Module): | |||
def __init__(self): | |||
super().__init__() | |||
self.linear = nn.Linear(5, 4) | |||
self.bn = nn.BatchNorm1d(4) | |||
def forward(self, feat, labels): | |||
x = self.linear(feat) | |||
x = self.bn(x) | |||
loss = torch.sum(x) | |||
return dict(logits=x, loss=loss) | |||
class CheckpointHookTest(unittest.TestCase): | |||
def setUp(self): | |||
        print('Testing %s.%s' % (type(self).__name__, self._testMethodName)) | |||
self.tmp_dir = tempfile.TemporaryDirectory().name | |||
if not os.path.exists(self.tmp_dir): | |||
os.makedirs(self.tmp_dir) | |||
def tearDown(self): | |||
super().tearDown() | |||
shutil.rmtree(self.tmp_dir) | |||
def test_checkpoint_hook(self): | |||
json_cfg = { | |||
'task': 'image_classification', | |||
'train': { | |||
'work_dir': self.tmp_dir, | |||
'dataloader': { | |||
'batch_size_per_gpu': 2, | |||
'workers_per_gpu': 1 | |||
}, | |||
'optimizer': { | |||
'type': 'SGD', | |||
'lr': 0.01, | |||
'options': { | |||
'grad_clip': { | |||
'max_norm': 2.0 | |||
} | |||
} | |||
}, | |||
'lr_scheduler': { | |||
'type': 'StepLR', | |||
'step_size': 2, | |||
'options': { | |||
'warmup': { | |||
'type': 'LinearWarmup', | |||
'warmup_iters': 2 | |||
} | |||
} | |||
}, | |||
'hooks': [{ | |||
'type': 'CheckpointHook', | |||
'interval': 1 | |||
}] | |||
} | |||
} | |||
config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||
with open(config_path, 'w') as f: | |||
json.dump(json_cfg, f) | |||
trainer_name = 'EpochBasedTrainer' | |||
kwargs = dict( | |||
cfg_file=config_path, | |||
model=DummyModel(), | |||
data_collator=None, | |||
train_dataset=DummyDataset(), | |||
max_epochs=2) | |||
trainer = build_trainer(trainer_name, kwargs) | |||
trainer.train() | |||
results_files = os.listdir(self.tmp_dir) | |||
self.assertIn('epoch_1.pth', results_files) | |||
self.assertIn('epoch_2.pth', results_files) | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -0,0 +1,188 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import shutil | |||
import tempfile | |||
import unittest | |||
from abc import ABCMeta | |||
import json | |||
import torch | |||
from torch import nn | |||
from torch.optim import SGD | |||
from torch.optim.lr_scheduler import MultiStepLR | |||
from torch.utils.data import Dataset | |||
from modelscope.trainers import build_trainer | |||
from modelscope.utils.constant import ModelFile | |||
class DummyDataset(Dataset, metaclass=ABCMeta): | |||
"""Base Dataset | |||
""" | |||
def __len__(self): | |||
return 10 | |||
def __getitem__(self, idx): | |||
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||
class DummyModel(nn.Module): | |||
def __init__(self): | |||
super().__init__() | |||
self.linear = nn.Linear(5, 4) | |||
self.bn = nn.BatchNorm1d(4) | |||
def forward(self, feat, labels): | |||
x = self.linear(feat) | |||
x = self.bn(x) | |||
loss = torch.sum(x) | |||
return dict(logits=x, loss=loss) | |||
class LrSchedulerHookTest(unittest.TestCase): | |||
def setUp(self): | |||
        print('Testing %s.%s' % (type(self).__name__, self._testMethodName)) | |||
self.tmp_dir = tempfile.TemporaryDirectory().name | |||
if not os.path.exists(self.tmp_dir): | |||
os.makedirs(self.tmp_dir) | |||
def tearDown(self): | |||
super().tearDown() | |||
shutil.rmtree(self.tmp_dir) | |||
def test_lr_scheduler_hook(self): | |||
json_cfg = { | |||
'task': 'image_classification', | |||
'train': { | |||
'work_dir': self.tmp_dir, | |||
'dataloader': { | |||
'batch_size_per_gpu': 2, | |||
'workers_per_gpu': 1 | |||
} | |||
} | |||
} | |||
config_path = os.path.join(self.tmp_dir, 'config.json') | |||
with open(config_path, 'w') as f: | |||
json.dump(json_cfg, f) | |||
model = DummyModel() | |||
optimizer = SGD(model.parameters(), lr=0.01) | |||
lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) | |||
trainer_name = 'EpochBasedTrainer' | |||
kwargs = dict( | |||
cfg_file=config_path, | |||
model=model, | |||
train_dataset=DummyDataset(), | |||
optimizers=(optimizer, lr_scheduler), | |||
max_epochs=5) | |||
trainer = build_trainer(trainer_name, kwargs) | |||
train_dataloader = trainer._build_dataloader_with_dataset( | |||
trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||
trainer.register_optimizers_hook() | |||
trainer.invoke_hook('before_run') | |||
log_lrs = [] | |||
optim_lrs = [] | |||
for _ in range(trainer._epoch, trainer._max_epochs): | |||
trainer.invoke_hook('before_train_epoch') | |||
for _, data_batch in enumerate(train_dataloader): | |||
trainer.invoke_hook('before_train_iter') | |||
log_lrs.append(trainer.log_buffer.output['lr']) | |||
optim_lrs.append(optimizer.param_groups[0]['lr']) | |||
trainer.train_step(trainer.model, data_batch) | |||
trainer.invoke_hook('after_train_iter') | |||
trainer.invoke_hook('after_train_epoch') | |||
trainer._epoch += 1 | |||
trainer.invoke_hook('after_run') | |||
iters = 5 | |||
        target_lrs = [0.01] * iters + [0.001] * iters * 2 + [0.0001] * iters * 2 | |||
self.assertListEqual(log_lrs, target_lrs) | |||
self.assertListEqual(optim_lrs, target_lrs) | |||
def test_warmup_lr_scheduler_hook(self): | |||
json_cfg = { | |||
'task': 'image_classification', | |||
'train': { | |||
'work_dir': self.tmp_dir, | |||
'dataloader': { | |||
'batch_size_per_gpu': 2, | |||
'workers_per_gpu': 1 | |||
}, | |||
'optimizer': { | |||
'type': 'SGD', | |||
'lr': 0.01 | |||
}, | |||
'lr_scheduler': { | |||
'type': 'MultiStepLR', | |||
'milestones': [4, 6], | |||
'options': { | |||
'warmup': { | |||
'type': 'LinearWarmup', | |||
'warmup_iters': 3 | |||
} | |||
} | |||
} | |||
} | |||
} | |||
config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||
with open(config_path, 'w') as f: | |||
json.dump(json_cfg, f) | |||
model = DummyModel() | |||
trainer_name = 'EpochBasedTrainer' | |||
kwargs = dict( | |||
cfg_file=config_path, | |||
model=model, | |||
train_dataset=DummyDataset(), | |||
max_epochs=7) | |||
trainer = build_trainer(trainer_name, kwargs) | |||
train_dataloader = trainer._build_dataloader_with_dataset( | |||
trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||
trainer.register_optimizers_hook() | |||
trainer.invoke_hook('before_run') | |||
log_lrs = [] | |||
optim_lrs = [] | |||
for _ in range(trainer._epoch, trainer._max_epochs): | |||
trainer.invoke_hook('before_train_epoch') | |||
for _, data_batch in enumerate(train_dataloader): | |||
trainer.invoke_hook('before_train_iter') | |||
log_lrs.append(round(trainer.log_buffer.output['lr'], 5)) | |||
optim_lrs.append( | |||
round(trainer.optimizer.param_groups[0]['lr'], 5)) | |||
trainer.train_step(trainer.model, data_batch) | |||
trainer.invoke_hook('after_train_iter') | |||
trainer.invoke_hook('after_train_epoch') | |||
trainer.invoke_hook('after_run') | |||
iters = 5 | |||
        target_lrs = ([0.004] * iters + [0.007] * iters + [0.01] * iters | |||
                      + [0.001] * iters * 2 + [0.0001] * iters * 2) | |||
self.assertListEqual(log_lrs, target_lrs) | |||
self.assertListEqual(optim_lrs, target_lrs) | |||
if __name__ == '__main__': | |||
unittest.main() |
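A note on the expected schedules above: DummyDataset has 10 samples and batch_size_per_gpu is 2, so every epoch contributes exactly 5 logged learning rates, which is why iters = 5 and target_lrs is assembled in blocks of five.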
@@ -0,0 +1,128 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import shutil | |||
import tempfile | |||
import unittest | |||
from abc import ABCMeta | |||
import json | |||
import torch | |||
from torch import nn | |||
from torch.optim import SGD | |||
from torch.optim.lr_scheduler import MultiStepLR | |||
from torch.utils.data import Dataset | |||
from modelscope.trainers import build_trainer | |||
from modelscope.utils.constant import ModelFile | |||
class DummyDataset(Dataset, metaclass=ABCMeta): | |||
"""Base Dataset | |||
""" | |||
def __len__(self): | |||
return 10 | |||
def __getitem__(self, idx): | |||
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||
class DummyModel(nn.Module): | |||
def __init__(self): | |||
super().__init__() | |||
self.linear = nn.Linear(5, 4) | |||
self.bn = nn.BatchNorm1d(4) | |||
def forward(self, feat, labels): | |||
x = self.linear(feat) | |||
x = self.bn(x) | |||
loss = torch.sum(x) | |||
return dict(logits=x, loss=loss) | |||
class IterTimerHookTest(unittest.TestCase): | |||
def setUp(self): | |||
        print('Testing %s.%s' % (type(self).__name__, self._testMethodName)) | |||
self.tmp_dir = tempfile.TemporaryDirectory().name | |||
if not os.path.exists(self.tmp_dir): | |||
os.makedirs(self.tmp_dir) | |||
def tearDown(self): | |||
super().tearDown() | |||
shutil.rmtree(self.tmp_dir) | |||
def test_iter_time_hook(self): | |||
json_cfg = { | |||
'task': 'image_classification', | |||
'train': { | |||
'work_dir': self.tmp_dir, | |||
'dataloader': { | |||
'batch_size_per_gpu': 2, | |||
'workers_per_gpu': 1 | |||
}, | |||
'hooks': [{ | |||
'type': 'IterTimerHook', | |||
}] | |||
} | |||
} | |||
config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||
with open(config_path, 'w') as f: | |||
json.dump(json_cfg, f) | |||
model = DummyModel() | |||
optimizer = SGD(model.parameters(), lr=0.01) | |||
lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) | |||
trainer_name = 'EpochBasedTrainer' | |||
kwargs = dict( | |||
cfg_file=config_path, | |||
model=model, | |||
train_dataset=DummyDataset(), | |||
optimizers=(optimizer, lr_scheduler), | |||
max_epochs=5) | |||
trainer = build_trainer(trainer_name, kwargs) | |||
train_dataloader = trainer._build_dataloader_with_dataset( | |||
trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||
trainer.register_optimizers_hook() | |||
trainer.register_hook_from_cfg(trainer.cfg.train.hooks) | |||
trainer.invoke_hook('before_run') | |||
for i in range(trainer._epoch, trainer._max_epochs): | |||
trainer.invoke_hook('before_train_epoch') | |||
for _, data_batch in enumerate(train_dataloader): | |||
trainer.invoke_hook('before_train_iter') | |||
trainer.train_step(trainer.model, data_batch) | |||
trainer.invoke_hook('after_train_iter') | |||
self.assertIn('data_load_time', trainer.log_buffer.val_history) | |||
self.assertIn('time', trainer.log_buffer.val_history) | |||
self.assertIn('loss', trainer.log_buffer.val_history) | |||
trainer.invoke_hook('after_train_epoch') | |||
target_len = 5 * (i + 1) | |||
self.assertEqual( | |||
len(trainer.log_buffer.val_history['data_load_time']), | |||
target_len) | |||
self.assertEqual( | |||
len(trainer.log_buffer.val_history['time']), target_len) | |||
self.assertEqual( | |||
len(trainer.log_buffer.val_history['loss']), target_len) | |||
self.assertEqual( | |||
len(trainer.log_buffer.n_history['data_load_time']), | |||
target_len) | |||
self.assertEqual( | |||
len(trainer.log_buffer.n_history['time']), target_len) | |||
self.assertEqual( | |||
len(trainer.log_buffer.n_history['loss']), target_len) | |||
trainer.invoke_hook('after_run') | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -0,0 +1,79 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import unittest | |||
import torch | |||
from torch import nn | |||
from torch.optim.lr_scheduler import MultiStepLR | |||
class WarmupTest(unittest.TestCase): | |||
def setUp(self): | |||
        print('Testing %s.%s' % (type(self).__name__, self._testMethodName)) | |||
def test_constant_warmup(self): | |||
from modelscope.trainers.lrscheduler.warmup import ConstantWarmup | |||
net = nn.Linear(2, 2) | |||
base_lr = 0.02 | |||
warmup_iters = 3 | |||
warmup_ratio = 0.2 | |||
optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||
lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||
lr_scheduler_with_warmup = ConstantWarmup( | |||
lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||
res = [] | |||
for _ in range(10): | |||
lr_scheduler_with_warmup.step() | |||
for _, group in enumerate(optimizer.param_groups): | |||
res.append(group['lr']) | |||
base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||
self.assertListEqual(res, [0.004, 0.004, 0.02] + base_lrs) | |||
def test_linear_warmup(self): | |||
from modelscope.trainers.lrscheduler.warmup import LinearWarmup | |||
net = nn.Linear(2, 2) | |||
base_lr = 0.02 | |||
warmup_iters = 3 | |||
warmup_ratio = 0.1 | |||
optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||
lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||
lr_scheduler_with_warmup = LinearWarmup( | |||
lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||
res = [] | |||
for _ in range(10): | |||
lr_scheduler_with_warmup.step() | |||
for _, group in enumerate(optimizer.param_groups): | |||
res.append(round(group['lr'], 5)) | |||
base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||
self.assertListEqual(res, [0.0080, 0.0140, 0.02] + base_lrs) | |||
def test_exp_warmup(self): | |||
from modelscope.trainers.lrscheduler.warmup import ExponentialWarmup | |||
net = nn.Linear(2, 2) | |||
base_lr = 0.02 | |||
warmup_iters = 3 | |||
warmup_ratio = 0.1 | |||
optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||
lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||
lr_scheduler_with_warmup = ExponentialWarmup( | |||
lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||
res = [] | |||
for _ in range(10): | |||
lr_scheduler_with_warmup.step() | |||
for _, group in enumerate(optimizer.param_groups): | |||
res.append(round(group['lr'], 5)) | |||
base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||
self.assertListEqual(res, [0.00431, 0.00928, 0.02] + base_lrs) | |||
if __name__ == '__main__': | |||
unittest.main() |
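The expected values in these warmup tests follow from the usual warmup formulas. As a hedged sketch, the definitions below are the common mmcv-style ones, which reproduce the asserted numbers; they are not code quoted from this diff:

base_lr, warmup_iters = 0.02, 3

def constant_warmup(i, ratio=0.2):
    # flat at base_lr * ratio while i < warmup_iters, i.e. 0.004 for the first two steps
    return base_lr * ratio

def linear_warmup(i, ratio=0.1):
    return base_lr * (1 - (1 - i / warmup_iters) * (1 - ratio))

def exponential_warmup(i, ratio=0.1):
    return base_lr * ratio ** (1 - i / warmup_iters)

print([round(linear_warmup(i), 5) for i in (1, 2, 3)])       # [0.008, 0.014, 0.02]
print([round(exponential_warmup(i), 5) for i in (1, 2, 3)])  # [0.00431, 0.00928, 0.02]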
@@ -0,0 +1,209 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import shutil | |||
import tempfile | |||
import unittest | |||
from abc import ABCMeta | |||
import json | |||
import torch | |||
from torch import nn | |||
from torch.optim import SGD | |||
from torch.optim.lr_scheduler import StepLR | |||
from torch.utils.data import Dataset | |||
from modelscope.trainers import build_trainer | |||
from modelscope.utils.constant import ModelFile | |||
from modelscope.utils.test_utils import test_level | |||
class DummyMetric: | |||
def __call__(self, ground_truth, predict_results): | |||
return {'accuracy': 0.5} | |||
class DummyDataset(Dataset, metaclass=ABCMeta): | |||
"""Base Dataset | |||
""" | |||
def __len__(self): | |||
return 20 | |||
def __getitem__(self, idx): | |||
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||
class DummyModel(nn.Module): | |||
def __init__(self): | |||
super().__init__() | |||
self.linear = nn.Linear(5, 4) | |||
self.bn = nn.BatchNorm1d(4) | |||
def forward(self, feat, labels): | |||
x = self.linear(feat) | |||
x = self.bn(x) | |||
loss = torch.sum(x) | |||
return dict(logits=x, loss=loss) | |||
class TrainerTest(unittest.TestCase): | |||
def setUp(self): | |||
        print('Testing %s.%s' % (type(self).__name__, self._testMethodName)) | |||
self.tmp_dir = tempfile.TemporaryDirectory().name | |||
if not os.path.exists(self.tmp_dir): | |||
os.makedirs(self.tmp_dir) | |||
def tearDown(self): | |||
super().tearDown() | |||
shutil.rmtree(self.tmp_dir) | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_train_0(self): | |||
json_cfg = { | |||
'train': { | |||
                'work_dir': self.tmp_dir, | |||
'dataloader': { | |||
'batch_size_per_gpu': 2, | |||
'workers_per_gpu': 1 | |||
}, | |||
'optimizer': { | |||
'type': 'SGD', | |||
'lr': 0.01, | |||
'options': { | |||
'grad_clip': { | |||
'max_norm': 2.0 | |||
} | |||
} | |||
}, | |||
'lr_scheduler': { | |||
'type': 'StepLR', | |||
'step_size': 2, | |||
'options': { | |||
'warmup': { | |||
'type': 'LinearWarmup', | |||
'warmup_iters': 2 | |||
} | |||
} | |||
}, | |||
'hooks': [{ | |||
'type': 'CheckpointHook', | |||
'interval': 1 | |||
}, { | |||
'type': 'TextLoggerHook', | |||
'interval': 1 | |||
}, { | |||
'type': 'IterTimerHook' | |||
}, { | |||
'type': 'EvaluationHook', | |||
'interval': 1 | |||
}] | |||
}, | |||
'evaluation': { | |||
'dataloader': { | |||
'batch_size_per_gpu': 2, | |||
'workers_per_gpu': 1, | |||
'shuffle': False | |||
}, | |||
'metrics': ['seq_cls_metric'] | |||
} | |||
} | |||
config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||
with open(config_path, 'w') as f: | |||
json.dump(json_cfg, f) | |||
trainer_name = 'EpochBasedTrainer' | |||
kwargs = dict( | |||
cfg_file=config_path, | |||
model=DummyModel(), | |||
data_collator=None, | |||
train_dataset=DummyDataset(), | |||
eval_dataset=DummyDataset(), | |||
max_epochs=3) | |||
trainer = build_trainer(trainer_name, kwargs) | |||
trainer.train() | |||
results_files = os.listdir(self.tmp_dir) | |||
self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||
self.assertIn('epoch_1.pth', results_files) | |||
self.assertIn('epoch_2.pth', results_files) | |||
self.assertIn('epoch_3.pth', results_files) | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_train_1(self): | |||
json_cfg = { | |||
'train': { | |||
                'work_dir': self.tmp_dir, | |||
'dataloader': { | |||
'batch_size_per_gpu': 2, | |||
'workers_per_gpu': 1 | |||
}, | |||
'hooks': [{ | |||
'type': 'CheckpointHook', | |||
'interval': 1 | |||
}, { | |||
'type': 'TextLoggerHook', | |||
'interval': 1 | |||
}, { | |||
'type': 'IterTimerHook' | |||
}, { | |||
'type': 'EvaluationHook', | |||
'interval': 1 | |||
}] | |||
}, | |||
'evaluation': { | |||
'dataloader': { | |||
'batch_size_per_gpu': 2, | |||
'workers_per_gpu': 1, | |||
'shuffle': False | |||
}, | |||
'metrics': ['seq_cls_metric'] | |||
} | |||
} | |||
config_path = os.path.join(self.tmp_dir, 'config.json') | |||
with open(config_path, 'w') as f: | |||
json.dump(json_cfg, f) | |||
model = DummyModel() | |||
        optimizer = SGD(model.parameters(), lr=0.01) | |||
        lr_scheduler = StepLR(optimizer, 2) | |||
trainer_name = 'EpochBasedTrainer' | |||
kwargs = dict( | |||
cfg_file=config_path, | |||
model=model, | |||
data_collator=None, | |||
train_dataset=DummyDataset(), | |||
eval_dataset=DummyDataset(), | |||
            optimizers=(optimizer, lr_scheduler), | |||
max_epochs=3) | |||
trainer = build_trainer(trainer_name, kwargs) | |||
trainer.train() | |||
results_files = os.listdir(self.tmp_dir) | |||
self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||
self.assertIn('epoch_1.pth', results_files) | |||
self.assertIn('epoch_2.pth', results_files) | |||
self.assertIn('epoch_3.pth', results_files) | |||
class DummyTrainerTest(unittest.TestCase): | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_dummy(self): | |||
default_args = dict(cfg_file='configs/examples/train.json') | |||
trainer = build_trainer('dummy', default_args) | |||
trainer.train() | |||
trainer.evaluate() | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -1,19 +0,0 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import unittest | |||
from modelscope.trainers import build_trainer | |||
class DummyTrainerTest(unittest.TestCase): | |||
def test_dummy(self): | |||
default_args = dict(cfg_file='configs/examples/train.json') | |||
trainer = build_trainer('dummy', default_args) | |||
trainer.train() | |||
trainer.evaluate() | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -0,0 +1,91 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import shutil | |||
import tempfile | |||
import unittest | |||
from modelscope.hub.snapshot_download import snapshot_download | |||
from modelscope.models.nlp.sbert_for_sequence_classification import \ | |||
SbertTextClassfier | |||
from modelscope.msdatasets import MsDataset | |||
from modelscope.trainers import build_trainer | |||
from modelscope.utils.constant import ModelFile | |||
from modelscope.utils.test_utils import test_level | |||
class TestTrainerWithNlp(unittest.TestCase): | |||
def setUp(self): | |||
        print('Testing %s.%s' % (type(self).__name__, self._testMethodName)) | |||
self.tmp_dir = tempfile.TemporaryDirectory().name | |||
if not os.path.exists(self.tmp_dir): | |||
os.makedirs(self.tmp_dir) | |||
from datasets import Dataset | |||
dataset_dict = { | |||
'sentence1': [ | |||
'This is test sentence1-1', 'This is test sentence2-1', | |||
'This is test sentence3-1' | |||
], | |||
'sentence2': [ | |||
'This is test sentence1-2', 'This is test sentence2-2', | |||
'This is test sentence3-2' | |||
], | |||
'label': [0, 1, 1] | |||
} | |||
dataset = Dataset.from_dict(dataset_dict) | |||
class MsDatasetDummy(MsDataset): | |||
def __len__(self): | |||
return len(self._hf_ds) | |||
self.dataset = MsDatasetDummy(dataset) | |||
def tearDown(self): | |||
shutil.rmtree(self.tmp_dir) | |||
super().tearDown() | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_trainer(self): | |||
model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | |||
kwargs = dict( | |||
model=model_id, | |||
train_dataset=self.dataset, | |||
eval_dataset=self.dataset, | |||
work_dir=self.tmp_dir) | |||
trainer = build_trainer(default_args=kwargs) | |||
trainer.train() | |||
results_files = os.listdir(self.tmp_dir) | |||
self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||
for i in range(10): | |||
self.assertIn(f'epoch_{i+1}.pth', results_files) | |||
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
def test_trainer_with_model_and_args(self): | |||
tmp_dir = tempfile.TemporaryDirectory().name | |||
if not os.path.exists(tmp_dir): | |||
os.makedirs(tmp_dir) | |||
model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | |||
cache_path = snapshot_download(model_id) | |||
model = SbertTextClassfier.from_pretrained(cache_path) | |||
kwargs = dict( | |||
cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), | |||
model=model, | |||
train_dataset=self.dataset, | |||
eval_dataset=self.dataset, | |||
max_epochs=2, | |||
work_dir=self.tmp_dir) | |||
trainer = build_trainer(default_args=kwargs) | |||
trainer.train() | |||
results_files = os.listdir(self.tmp_dir) | |||
self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||
for i in range(2): | |||
self.assertIn(f'epoch_{i+1}.pth', results_files) | |||
if __name__ == '__main__': | |||
unittest.main() |