# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest
from functools import reduce

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer


class TestFinetuneTokenClassification(unittest.TestCase):

    def setUp(self):
        print('Testing %s.%s' % (type(self).__name__, self._testMethodName))
        # mkdtemp creates the directory and returns its path directly,
        # avoiding the fragile TemporaryDirectory().name pattern, which
        # relies on the object being garbage-collected immediately.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)
        super().tearDown()

    def finetune(self,
                 model_id,
                 train_dataset,
                 eval_dataset,
                 name=Trainers.nlp_base_trainer,
                 cfg_modify_fn=None,
                 **kwargs):
        kwargs = dict(
            model=model_id,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            work_dir=self.tmp_dir,
            cfg_modify_fn=cfg_modify_fn,
            **kwargs)

        os.environ['LOCAL_RANK'] = '0'
        trainer = build_trainer(name=name, default_args=kwargs)
        trainer.train()
        results_files = os.listdir(self.tmp_dir)
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
        # One checkpoint is saved per epoch, so the expected file count must
        # follow the configured max_epochs rather than a hard-coded 10
        # (which would fail once cfg_modify_fn sets max_epochs to 2).
        for i in range(trainer.cfg.train.max_epochs):
            self.assertIn(f'epoch_{i + 1}.pth', results_files)

    @unittest.skip
    def test_word_segmentation(self):
        """Reproduce the icwb2:pku dataset + structbert model training results.

        To train on a custom dataset, adapt this code and remove the
        @unittest.skip decorator.
        """
        # Download and unpack the SIGHAN 2005 (icwb2) word segmentation data.
        os.system(
            f'curl http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip > {self.tmp_dir}/icwb2-data.zip'
        )
        shutil.unpack_archive(f'{self.tmp_dir}/icwb2-data.zip', self.tmp_dir)
        from datasets import load_dataset
        from modelscope.preprocessors.nlp import WordSegmentationBlankSetToLabelPreprocessor

        # Convert each blank-separated line into per-character tokens and
        # segmentation labels.
        preprocessor = WordSegmentationBlankSetToLabelPreprocessor()
        dataset = load_dataset(
            'text',
            data_files=f'{self.tmp_dir}/icwb2-data/training/pku_training.utf8')

        def split_to_dict(examples):
            return preprocessor(examples['text'])

        dataset = dataset.map(split_to_dict, batched=False)

        def reducer(x, y):
            # Labels may arrive as space-joined strings; flatten everything
            # into one list so the full label set can be collected below.
            x = x.split(' ') if isinstance(x, str) else x
            y = y.split(' ') if isinstance(y, str) else y
            return x + y

        # Enumerate the label set from the first 1000 samples.
        label_enumerate_values = list(
            set(reduce(reducer, dataset['train'][:1000]['labels'])))
        label_enumerate_values.sort()

        # 70/30 split of the training file into train and dev subsets.
        train_len = int(len(dataset['train']) * 0.7)
        train_dataset = dataset['train'].select(range(train_len))
        dev_dataset = dataset['train'].select(
            range(train_len, len(dataset['train'])))

        def cfg_modify_fn(cfg):
            cfg.task = 'token-classification'
            cfg['dataset'] = {
                'train': {
                    'labels': label_enumerate_values,
                    'first_sequence': 'tokens',
                    'label': 'labels',
                }
            }
            cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
            cfg.train.max_epochs = 2
            cfg.train.lr_scheduler = {
                'type': 'LinearLR',
                'start_factor': 1.0,
                'end_factor': 0.0,
                # Decay over all training iterations; the divisor 32 assumes
                # the trainer's batch size is 32.
                'total_iters':
                int(len(train_dataset) / 32) * cfg.train.max_epochs,
                'options': {
                    'by_epoch': False
                }
            }
            cfg.train.hooks = [{
                'type': 'CheckpointHook',
                'interval': 1
            }, {
                'type': 'TextLoggerHook',
                'interval': 1
            }, {
                'type': 'IterTimerHook'
            }, {
                'type': 'EvaluationHook',
                'by_epoch': False,
                'interval': 50
            }]
            return cfg

        self.finetune(
            'damo/nlp_structbert_backbone_base_std',
            train_dataset,
            dev_dataset,
            cfg_modify_fn=cfg_modify_fn)


if __name__ == '__main__':
    unittest.main()
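
# A minimal sketch of the sample format this test feeds the trainer, assuming
# WordSegmentationBlankSetToLabelPreprocessor follows the usual BIES-style
# tagging scheme for Chinese word segmentation (the exact label strings it
# emits may differ):
#
#   raw line from pku_training.utf8:  '迈向  充满  希望'
#   tokens:  ['迈', '向', '充', '满', '希', '望']
#   labels:  ['B',  'E',  'B',  'E',  'B',  'E']
#
# The 'first_sequence': 'tokens' and 'label': 'labels' entries in
# cfg_modify_fn point the trainer's preprocessor at these two columns.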