|
- import unittest
- import _pickle
- from fastNLP import cache_results
- from fastNLP.io import EmbedLoader
- from fastNLP import DataSet
- from fastNLP import Instance
- import time
- import os
- import torch
- from torch import nn
- from fastNLP.core.utils import _move_model_to_device, _get_model_device
- import numpy as np
- from fastNLP.core.utils import seq_len_to_mask, get_seq_len
- from fastNLP.core.utils import iob2, iob2bioes
-
class Model(nn.Module):
    """Minimal module with a single empty parameter.

    The device tests read ``param.device`` to see where the model lives.
    """

    def __init__(self):
        super(Model, self).__init__()
        empty = torch.zeros(0)
        self.param = nn.Parameter(empty)
-
-
class TestMoveModelDevice(unittest.TestCase):
    """Exercise ``_move_model_to_device`` with str, int, None, list and ``torch.device`` targets.

    GPU-specific checks only run when CUDA is available.

    NOTE(review): statement nesting was reconstructed from a flattened
    listing -- confirm the ``if torch.cuda.is_available()`` scopes.
    """

    def test_case1(self):
        """Move the model using string device specifications."""
        # Test a str device
        model = Model()
        model = _move_model_to_device(model, 'cpu')
        self.assertEqual(model.param.device, torch.device('cpu'))
        # A nonexistent device must raise
        with self.assertRaises(Exception):
            _move_model_to_device(model, 'cpuu')
        # Test GPU
        if torch.cuda.is_available():
            model = _move_model_to_device(model, 'cuda')
            self.assertTrue(model.param.is_cuda)
            model = _move_model_to_device(model, 'cuda:0')
            self.assertEqual(model.param.device, torch.device('cuda:0'))
            with self.assertRaises(Exception):
                _move_model_to_device(model, 'cuda:1000')
        # Test None: the call only has to succeed
        _move_model_to_device(model, None)

    def test_case2(self):
        """Move the model using an int device index."""
        # Test initializing with an int
        model = Model()
        if torch.cuda.is_available():
            model = _move_model_to_device(model, 0)
            self.assertEqual(model.param.device, torch.device('cuda:0'),
                             "The model should be on cuda:0.")
            # Out-of-range and negative indices must be rejected
            with self.assertRaises(Exception):
                _move_model_to_device(model, 100)
            with self.assertRaises(Exception):
                _move_model_to_device(model, -1)

    def test_case3(self):
        """Passing ``None`` must leave the model on its current device."""
        # Test None
        model = Model()
        device = _get_model_device(model)
        model = _move_model_to_device(model, None)
        self.assertEqual(device, _get_model_device(model), "The device should not change.")
        if torch.cuda.is_available():
            model.cuda()
            device = _get_model_device(model)
            model = _move_model_to_device(model, None)
            self.assertEqual(device, _get_model_device(model), "The device should not change.")

            # A DataParallel model accepts None but is expected to reject an explicit device
            model = nn.DataParallel(model, device_ids=[0])
            _move_model_to_device(model, None)
            with self.assertRaises(Exception):
                _move_model_to_device(model, 'cpu')

    def test_case4(self):
        """Move the model using a list of devices."""
        # Test passing a list
        model = Model()
        device = ['cpu']
        with self.assertRaises(Exception):
            _move_model_to_device(model, device)
        if torch.cuda.is_available():
            # A single-element list must not wrap the model in DataParallel
            device = [0]
            _model = _move_model_to_device(model, device)
            self.assertNotIsInstance(_model, nn.DataParallel)
            device = [torch.device('cuda:0'), torch.device('cuda:0')]
            with self.assertRaises(Exception):
                _move_model_to_device(model, device)
            if torch.cuda.device_count() > 1:
                device = [0, 1]
                _model = _move_model_to_device(model, device)
                self.assertIsInstance(_model, nn.DataParallel)
                device = ['cuda', 'cuda:1']
                with self.assertRaises(Exception):
                    _move_model_to_device(model, device)

    def test_case5(self):
        """Move the model using ``torch.device`` objects (requires CUDA)."""
        if not torch.cuda.is_available():
            # Report the test as skipped instead of silently passing
            self.skipTest('CUDA is not available')
        # torch.device()
        device = torch.device('cpu')
        model = Model()
        _move_model_to_device(model, device)
        device = torch.device('cuda')
        model = _move_model_to_device(model, device)
        self.assertEqual(model.param.device, torch.device('cuda:0'))
        with self.assertRaises(Exception):
            _move_model_to_device(model, torch.device('cuda:100'))
-
-
@cache_results('test/demo1.pkl')
def process_data_1(embed_file, cws_train):
    """Load embeddings from *embed_file* and wrap the non-empty lines of *cws_train* in a DataSet.

    Returns the tuple ``(embed, vocab, dataset)``. The one-second sleep makes
    a cache hit distinguishable from a recomputation by timing.
    """
    embed, vocab = EmbedLoader.load_without_vocab(embed_file)
    # Used to test whether the result was obtained by reading the cache
    time.sleep(1)
    with open(cws_train, 'r', encoding='utf-8') as f:
        dataset = DataSet()
        for raw_line in f:
            text = raw_line.strip()
            if not text:
                continue
            dataset.append(Instance(raw=text))
    return embed, vocab, dataset
-
-
class TestCache(unittest.TestCase):
    """Tests for the ``cache_results`` decorator, driven through ``process_data_1``.

    Timing is used to tell a cache hit from a recomputation:
    ``process_data_1`` sleeps for one second whenever it really runs.
    """

    EMBED_FILE = 'tests/data_for_tests/embedding/small_static_embedding/word2vec_test.txt'
    CWS_TRAIN = 'tests/data_for_tests/cws_train'

    @staticmethod
    def _remove_file(path):
        """Delete *path* if it exists, so cleanup never masks the real test failure."""
        if os.path.exists(path):
            os.remove(path)

    def _timed_process(self, **cache_kwargs):
        """Call ``process_data_1`` on the fixture files; return ``(embed, elapsed_seconds)``."""
        start_time = time.time()
        embed, _, _ = process_data_1(self.EMBED_FILE, self.CWS_TRAIN, **cache_kwargs)
        return embed, time.time() - start_time

    def _assert_cache_matches(self, cache_fp, embed):
        """Assert that the embedding pickled at *cache_fp* equals *embed* row by row."""
        # The pickle was written by this test run moments ago, so loading it is safe.
        with open(cache_fp, 'rb') as f:
            cached_embed, _, _ = _pickle.load(f)
        self.assertEqual(embed.shape, cached_embed.shape)
        for i in range(embed.shape[0]):
            self.assertListEqual(embed[i].tolist(), cached_embed[i].tolist())

    def test_cache_save(self):
        """The second call must be served from the default cache file."""
        cache_fp = 'test/demo1.pkl'
        try:
            embed, pre_time = self._timed_process()
            self._assert_cache_matches(cache_fp, embed)
            _, read_time = self._timed_process()
            print("Read using {:.3f}, while prepare using:{:.3f}".format(read_time, pre_time))
            self.assertGreater(pre_time - 0.5, read_time)
        finally:
            self._remove_file(cache_fp)

    def test_cache_save_overwrite_path(self):
        """``_cache_fp`` must override the cache path given to the decorator."""
        cache_fp = 'test/demo_overwrite.pkl'
        try:
            embed, pre_time = self._timed_process(_cache_fp=cache_fp)
            self._assert_cache_matches(cache_fp, embed)
            _, read_time = self._timed_process(_cache_fp=cache_fp)
            print("Read using {:.3f}, while prepare using:{:.3f}".format(read_time, pre_time))
            self.assertGreater(pre_time - 0.5, read_time)
        finally:
            self._remove_file(cache_fp)

    def test_cache_refresh(self):
        """With ``_refresh=True`` both calls recompute, so their timings stay close."""
        cache_fp = 'test/demo1.pkl'
        try:
            embed, pre_time = self._timed_process(_refresh=True)
            self._assert_cache_matches(cache_fp, embed)
            _, read_time = self._timed_process(_refresh=True)
            print("Read using {:.3f}, while prepare using:{:.3f}".format(read_time, pre_time))
            self.assertGreater(0.1, pre_time - read_time)
        finally:
            self._remove_file(cache_fp)

    def test_duplicate_keyword(self):
        """Functions whose parameters collide with the decorator's reserved keywords must raise."""
        with self.assertRaises(RuntimeError):
            @cache_results(None)
            def func_verbose(a, _verbose):
                pass

            func_verbose(0, 1)
        with self.assertRaises(RuntimeError):
            @cache_results(None)
            def func_cache(a, _cache_fp):
                pass

            func_cache(1, 2)
        with self.assertRaises(RuntimeError):
            @cache_results(None)
            def func_refresh(a, _refresh):
                pass

            func_refresh(1, 2)

    def test_create_cache_dir(self):
        """Caching into a nested path works; the created file and directory are removed afterwards."""
        @cache_results('test/demo1/demo.pkl')
        def cache():
            return 1, 2

        try:
            results = cache()
            print(results)
        finally:
            # Guarded cleanup: a failure inside cache() must not be hidden by a missing file
            self._remove_file('test/demo1/demo.pkl')
            if os.path.isdir('test/demo1'):
                os.rmdir('test/demo1')
-
-
class TestSeqLenToMask(unittest.TestCase):
    """Checks ``seq_len_to_mask`` with numpy and torch length vectors."""

    def evaluate_mask_seq_len(self, seq_len, mask):
        """Assert ``mask[i][j] == (j < seq_len[i])`` for every column up to the longest length."""
        longest = int(max(seq_len))
        for row, row_len in enumerate(seq_len):
            row_mask = mask[row]
            for col in range(longest):
                self.assertEqual(row_mask[col], col < row_len)

    def test_numpy_seq_len(self):
        """numpy input: random lengths, a wrongly shaped input, and padding to a fixed length."""
        # Test whether a numpy seq_len can be converted
        # 1. Random test
        lengths = np.random.randint(1, 10, size=(10, ))
        mask = seq_len_to_mask(lengths)
        self.assertEqual(lengths.max(), mask.shape[1])
        self.evaluate_mask_seq_len(lengths, mask)

        # 2. Error detection
        lengths = np.random.randint(10, size=(10, 1))
        self.assertRaises(AssertionError, seq_len_to_mask, lengths)

        # 3. Pad to a given length
        lengths = np.random.randint(1, 10, size=(10,))
        padded_mask = seq_len_to_mask(lengths, 100)
        self.assertEqual(100, padded_mask.shape[1])

    def test_pytorch_seq_len(self):
        """torch input: random lengths, a wrongly shaped input, and padding to a fixed length."""
        # 1. Random test
        lengths = torch.randint(1, 10, size=(10, ))
        longest = lengths.max()
        mask = seq_len_to_mask(lengths)
        self.assertEqual(longest, mask.shape[1])
        self.evaluate_mask_seq_len(lengths.tolist(), mask)

        # 2. Error detection
        lengths = torch.randn(3, 4)
        self.assertRaises(AssertionError, seq_len_to_mask, lengths)

        # 3. Pad to a given length
        lengths = torch.randint(1, 10, size=(10, ))
        padded_mask = seq_len_to_mask(lengths, 100)
        self.assertEqual(100, padded_mask.size(1))
-
-
class TestUtils(unittest.TestCase):
    """Tests for ``get_seq_len`` and the IOB tag conversion helpers."""

    def test_get_seq_len(self):
        """Lengths recovered from a mask must equal the lengths that produced it."""
        lengths = torch.randint(1, 10, size=(10, ))
        recovered = get_seq_len(seq_len_to_mask(lengths))
        self.assertSequenceEqual(lengths.tolist(), recovered.tolist())

    def test_iob2(self):
        """``iob2`` keeps already valid tags and rewrites span-initial ``I-`` tags to ``B-``."""
        expected = [
            'B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP',
            'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP',
        ]

        # Input that is already in the expected form
        already_iob2 = [
            'B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP',
            'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP',
        ]
        self.assertSequenceEqual(expected, iob2(already_iob2))

        # Input where several spans start with an I- tag
        starts_with_i = [
            'I-NP', 'O', 'I-NP', 'I-VP', 'B-NP', 'I-NP', 'O', 'I-NP', 'I-PP', 'B-NP',
            'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP',
        ]
        self.assertSequenceEqual(expected, iob2(starts_with_i))

    def test_iob2bioes(self):
        """``iob2bioes`` must produce the expected S-/B-/I-/E- tagging for a sample sequence."""
        source_tags = [
            'B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP',
            'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP',
        ]
        expected = [
            'S-NP', 'O', 'S-NP', 'S-VP', 'B-NP', 'E-NP', 'O', 'S-NP', 'S-PP', 'B-NP',
            'E-NP', 'O', 'B-NP', 'E-NP', 'S-NP', 'O', 'B-NP', 'I-NP', 'E-NP',
        ]

        self.assertSequenceEqual(expected, iob2bioes(source_tags))
-
class TestConfusionMatrix(unittest.TestCase):
    """Smoke test: a ConfusionMatrix fed random predictions can be printed."""

    def test1(self):
        """Print a confusion matrix with default settings, then with ``print_ratio=True``."""
        from fastNLP import Vocabulary
        from fastNLP.core.utils import ConfusionMatrix
        import numpy as np

        vocab = Vocabulary(unknown=None, padding=None)
        vocab.add_word_lst(list('abcdef'))

        # First pass tests that printing works; second pass tests print_ratio
        for options in ({}, {'print_ratio': True}):
            matrix = ConfusionMatrix(vocab, **options)
            for _ in range(3):
                n_items = np.random.randint(1, 5)
                pred = np.random.randint(0, 3, size=(n_items,))
                target = np.random.randint(0, 3, size=(n_items,))
                matrix.add_pred_target(pred, target)
            print(matrix)
|