# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest

from modelscope.metainfo import Preprocessors, Trainers
from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.trainers import build_trainer
from modelscope.utils.constant import ModelFile, Tasks


class TestFinetuneSequenceClassification(unittest.TestCase):
    epoch_num = 1

    sentence1 = '今天气温比昨天高么?'
    sentence2 = '今天湿度比昨天高么?'

    def setUp(self):
        print(f'Testing {type(self).__name__}.{self._testMethodName}')
        # mkdtemp creates a unique directory and leaves cleanup to tearDown;
        # tempfile.TemporaryDirectory().name would schedule its own removal
        # behind our back.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)
        super().tearDown()

    def finetune(self,
                 model_id,
                 train_dataset,
                 eval_dataset,
                 name=Trainers.nlp_base_trainer,
                 cfg_modify_fn=None,
                 **kwargs):
        kwargs = dict(
            model=model_id,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            work_dir=self.tmp_dir,
            cfg_modify_fn=cfg_modify_fn,
            **kwargs)

        os.environ['LOCAL_RANK'] = '0'
        trainer = build_trainer(name=name, default_args=kwargs)
        trainer.train()

        # The work dir should contain the training log plus one checkpoint
        # per epoch.
        results_files = os.listdir(self.tmp_dir)
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
        for i in range(self.epoch_num):
            self.assertIn(f'epoch_{i+1}.pth', results_files)

        # The output dir should contain the configuration, the model binary
        # and every non-hidden file copied from the source model dir.
        output_files = os.listdir(
            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
        self.assertIn(ModelFile.CONFIGURATION, output_files)
        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
        copy_src_files = os.listdir(trainer.model_dir)

        print(f'copy_src_files are {copy_src_files}')
        print(f'output_files are {output_files}')
        for item in copy_src_files:
            if not item.startswith('.'):
                self.assertIn(item, output_files)

    def pipeline_sentence_similarity(self, model_dir):
        model = Model.from_pretrained(model_dir)
        pipeline_ins = pipeline(task=Tasks.sentence_similarity, model=model)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))

    @unittest.skip('requires a locally cached CLUE afqmc dataset')
    def test_finetune_afqmc(self):

        def cfg_modify_fn(cfg):
            cfg.task = Tasks.sentence_similarity
            cfg['preprocessor'] = {'type': Preprocessors.sen_sim_tokenizer}
            cfg.train.optimizer.lr = 2e-5
            cfg['dataset'] = {
                'train': {
                    'labels': ['0', '1'],
                    'first_sequence': 'sentence1',
                    'second_sequence': 'sentence2',
                    'label': 'label',
                }
            }
            cfg.train.max_epochs = self.epoch_num
            # `dataset` is bound later in this method; the closure is only
            # evaluated when the trainer applies cfg_modify_fn.
            cfg.train.lr_scheduler = {
                'type': 'LinearLR',
                'start_factor': 1.0,
                'end_factor': 0.0,
                'total_iters':
                int(len(dataset['train']) / 32) * cfg.train.max_epochs,
                'options': {
                    'by_epoch': False
                }
            }
            cfg.train.hooks = [{
                'type': 'CheckpointHook',
                'interval': 1
            }, {
                'type': 'TextLoggerHook',
                'interval': 1
            }, {
                'type': 'IterTimerHook'
            }, {
                'type': 'EvaluationHook',
                'by_epoch': False,
                'interval': 100
            }]
            return cfg

        from datasets import DownloadConfig, load_dataset
        dc = DownloadConfig()
        dc.local_files_only = True
        dataset = load_dataset('clue', 'afqmc', download_config=dc)

        self.finetune(
            model_id='damo/nlp_structbert_backbone_tiny_std',
            train_dataset=dataset['train'],
            eval_dataset=dataset['validation'],
            cfg_modify_fn=cfg_modify_fn)

        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
        self.pipeline_sentence_similarity(output_dir)

    @unittest.skip('requires a locally cached CLUE tnews dataset')
    def test_finetune_tnews(self):

        def cfg_modify_fn(cfg):
            # TODO: no proper task for tnews, reuse nli for now
            cfg.task = 'nli'
            cfg['preprocessor'] = {'type': 'nli-tokenizer'}
            cfg.train.optimizer.lr = 2e-5
            cfg['dataset'] = {
                'train': {
                    'labels': [
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        '10', '11', '12', '13', '14'
                    ],
                    'first_sequence': 'sentence',
                    'label': 'label',
                }
            }
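            # tnews is a 15-way news-topic classification task; the
            # `dataset.train` block above maps the raw columns ('sentence',
            # 'label') to the fields the preprocessor reads.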
            cfg.train.max_epochs = 5
            cfg.train.lr_scheduler = {
                'type': 'LinearLR',
                'start_factor': 1.0,
                'end_factor': 0.0,
                'total_iters':
                int(len(dataset['train']) / 32) * cfg.train.max_epochs,
                'options': {
                    'by_epoch': False
                }
            }
            cfg.train.hooks = [{
                'type': 'CheckpointHook',
                'interval': 1
            }, {
                'type': 'TextLoggerHook',
                'interval': 1
            }, {
                'type': 'IterTimerHook'
            }, {
                'type': 'EvaluationHook',
                'by_epoch': False,
                'interval': 100
            }]
            return cfg

        from datasets import DownloadConfig, load_dataset
        dc = DownloadConfig()
        dc.local_files_only = True
        dataset = load_dataset('clue', 'tnews', download_config=dc)

        self.finetune(
            model_id='damo/nlp_structbert_backbone_tiny_std',
            train_dataset=dataset['train'],
            eval_dataset=dataset['validation'],
            cfg_modify_fn=cfg_modify_fn)

    @unittest.skip('requires a locally cached xnli dataset')
    def test_veco_xnli(self):
        from datasets import DownloadConfig, load_dataset

        # The veco trainer accepts one dataset per language; only English is
        # used here to keep the test small.
        langs = ['en']
        langs_eval = ['en']
        dc = DownloadConfig()
        dc.local_files_only = True

        train_datasets = []
        for lang in langs:
            train_datasets.append(
                load_dataset('xnli', lang, split='train', download_config=dc))
        eval_datasets = []
        for lang in langs_eval:
            eval_datasets.append(
                load_dataset(
                    'xnli', lang, split='validation', download_config=dc))
        train_len = sum(len(ds) for ds in train_datasets)
        labels = ['0', '1', '2']

        def cfg_modify_fn(cfg):
            cfg.task = 'nli'
            cfg['preprocessor'] = {'type': 'nli-tokenizer'}
            cfg['dataset'] = {
                'train': {
                    'first_sequence': 'premise',
                    'second_sequence': 'hypothesis',
                    'labels': labels,
                    'label': 'label',
                }
            }
            cfg['train'] = {
                'work_dir': '/tmp',
                'max_epochs': 2,
                'dataloader': {
                    'batch_size_per_gpu': 16,
                    'workers_per_gpu': 1
                },
                'optimizer': {
                    'type': 'AdamW',
                    'lr': 2e-5,
                    'options': {
                        'cumulative_iters': 8,
                    }
                },
                'lr_scheduler': {
                    'type': 'LinearLR',
                    'start_factor': 1.0,
                    'end_factor': 0.0,
                    'total_iters': int(train_len / 16) * 2,
                    'options': {
                        'by_epoch': False
                    }
                },
                'hooks': [{
                    'type': 'CheckpointHook',
                    'interval': 1,
                    'save_dir': '/root'
                }, {
                    'type': 'TextLoggerHook',
                    'interval': 1
                }, {
                    'type': 'IterTimerHook'
                }, {
                    'type': 'EvaluationHook',
                    'by_epoch': False,
                    'interval': 500
                }]
            }
            cfg['evaluation'] = {
                'dataloader': {
                    'batch_size_per_gpu': 128,
                    'workers_per_gpu': 1,
                    'shuffle': False
                }
            }
            return cfg

        self.finetune(
            'damo/nlp_veco_fill-mask-large',
            train_datasets,
            eval_datasets,
            name=Trainers.nlp_veco_trainer,
            cfg_modify_fn=cfg_modify_fn)


if __name__ == '__main__':
    unittest.main()
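
# A minimal sketch of the same finetuning flow outside the unittest harness,
# assuming a locally cached CLUE afqmc dataset; the work_dir path below is
# illustrative, and the cfg_modify_fn is a trimmed-down version of the one in
# test_finetune_afqmc above:
#
#     from datasets import DownloadConfig, load_dataset
#     from modelscope.metainfo import Preprocessors, Trainers
#     from modelscope.trainers import build_trainer
#     from modelscope.utils.constant import Tasks
#
#     def cfg_modify_fn(cfg):
#         cfg.task = Tasks.sentence_similarity
#         cfg['preprocessor'] = {'type': Preprocessors.sen_sim_tokenizer}
#         cfg['dataset'] = {
#             'train': {
#                 'labels': ['0', '1'],
#                 'first_sequence': 'sentence1',
#                 'second_sequence': 'sentence2',
#                 'label': 'label',
#             }
#         }
#         return cfg
#
#     dc = DownloadConfig()
#     dc.local_files_only = True
#     dataset = load_dataset('clue', 'afqmc', download_config=dc)
#     trainer = build_trainer(
#         name=Trainers.nlp_base_trainer,
#         default_args=dict(
#             model='damo/nlp_structbert_backbone_tiny_std',
#             train_dataset=dataset['train'],
#             eval_dataset=dataset['validation'],
#             work_dir='/tmp/afqmc_work_dir',
#             cfg_modify_fn=cfg_modify_fn))
#     trainer.train()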