diff --git a/legacy/test/api/test_pipeline.py b/legacy/test/api/test_pipeline.py
deleted file mode 100644
index c7094790..00000000
--- a/legacy/test/api/test_pipeline.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import unittest
-
-
-class TestPipeline(unittest.TestCase):
-    def test_case(self):
-        pass
diff --git a/legacy/test/api/test_processor.py b/legacy/test/api/test_processor.py
deleted file mode 100644
index 9611e458..00000000
--- a/legacy/test/api/test_processor.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import random
-import unittest
-
-import numpy as np
-
-from fastNLP import Vocabulary, Instance
-from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor, PreAppendProcessor, SliceProcessor, Num2TagProcessor, \
-    IndexerProcessor, VocabProcessor, SeqLenProcessor, ModelProcessor, Index2WordProcessor, SetTargetProcessor, \
-    SetInputProcessor, VocabIndexerProcessor
-from fastNLP.core.dataset import DataSet
-
-
-class TestProcessor(unittest.TestCase):
-    def test_FullSpaceToHalfSpaceProcessor(self):
-        ds = DataSet({"word": ["00, u1, u), (u2, u2"]})
-        proc = FullSpaceToHalfSpaceProcessor("word")
-        ds = proc(ds)
-        self.assertEqual(ds.field_arrays["word"].content, ["00, u1, u), (u2, u2"])
-
-    def test_PreAppendProcessor(self):
-        ds = DataSet({"word": [["1234", "3456"], ["8789", "3464"]]})
-        proc = PreAppendProcessor(data="abc", field_name="word")
-        ds = proc(ds)
-        self.assertEqual(ds.field_arrays["word"].content, [["abc", "1234", "3456"], ["abc", "8789", "3464"]])
-
-    def test_SliceProcessor(self):
-        ds = DataSet({"xx": [[random.randint(0, 10) for _ in range(30)]] * 40})
-        proc = SliceProcessor(10, 20, 2, "xx", new_added_field_name="yy")
-        ds = proc(ds)
-        self.assertEqual(len(ds.field_arrays["yy"].content[0]), 5)
-
-    def test_Num2TagProcessor(self):
-        ds = DataSet({"num": [["99.9982", "2134.0"], ["0.002", "234"]]})
-        proc = Num2TagProcessor("<num>", "num")
-        ds = proc(ds)
-        for data in ds.field_arrays["num"].content:
-            for d in data:
-                self.assertEqual(d, "<num>")
-
-    def test_VocabProcessor_and_IndexerProcessor(self):
-        ds = DataSet({"xx": [[str(random.randint(0, 10)) for _ in range(30)]] * 40})
-        vocab_proc = VocabProcessor("xx")
-        vocab_proc(ds)
-        vocab = vocab_proc.vocab
-        self.assertTrue(isinstance(vocab, Vocabulary))
-        self.assertTrue(len(vocab) > 5)
-
-        proc = IndexerProcessor(vocab, "xx", "yy")
-        ds = proc(ds)
-        for data in ds.field_arrays["yy"].content[0]:
-            self.assertTrue(isinstance(data, int))
-
-    def test_SeqLenProcessor(self):
-        ds = DataSet({"xx": [[str(random.randint(0, 10)) for _ in range(30)]] * 10})
-        proc = SeqLenProcessor("xx", "len")
-        ds = proc(ds)
-        for data in ds.field_arrays["len"].content:
-            self.assertEqual(data, 30)
-
-    def test_ModelProcessor(self):
-        from fastNLP.models.cnn_text_classification import CNNText
-        model = CNNText((100, 100), 5)
-        ins_list = []
-        for _ in range(64):
-            seq_len = np.random.randint(5, 30)
-            ins_list.append(Instance(word_seq=[np.random.randint(0, 100) for _ in range(seq_len)], seq_lens=seq_len))
-        data_set = DataSet(ins_list)
-        data_set.set_input("word_seq", "seq_lens")
-        proc = ModelProcessor(model)
-        data_set = proc(data_set)
-        self.assertTrue("pred" in data_set)
-
-    def test_Index2WordProcessor(self):
-        vocab = Vocabulary()
-        vocab.add_word_lst(["a", "b", "c", "d", "e"])
-        proc = Index2WordProcessor(vocab, "tag_id", "tag")
-        data_set = DataSet([Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
-        data_set = proc(data_set)
-        self.assertTrue("tag" in data_set)
-
-    def test_SetTargetProcessor(self):
-        proc = SetTargetProcessor("a", "b", "c")
-        data_set = DataSet({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
-        data_set = proc(data_set)
-        self.assertTrue(data_set["a"].is_target)
-        self.assertTrue(data_set["b"].is_target)
-        self.assertTrue(data_set["c"].is_target)
-
-    def test_SetInputProcessor(self):
-        proc = SetInputProcessor("a", "b", "c")
-        data_set = DataSet({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
-        data_set = proc(data_set)
-        self.assertTrue(data_set["a"].is_input)
-        self.assertTrue(data_set["b"].is_input)
-        self.assertTrue(data_set["c"].is_input)
-
-    def test_VocabIndexerProcessor(self):
-        proc = VocabIndexerProcessor("word_seq", "word_ids")
-        data_set = DataSet([Instance(word_seq=["a", "b", "c", "d", "e"])])
-        data_set = proc(data_set)
-        self.assertTrue("word_ids" in data_set)
diff --git a/legacy/test/automl/test_enas.py b/legacy/test/automl/test_enas.py
deleted file mode 100644
index 4fea1063..00000000
--- a/legacy/test/automl/test_enas.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import unittest
-
-from fastNLP import DataSet
-from fastNLP import Instance
-from fastNLP import Vocabulary
-from fastNLP.core.losses import CrossEntropyLoss
-from fastNLP.core.metrics import AccuracyMetric
-
-
-class TestENAS(unittest.TestCase):
-    def testENAS(self):
-        # Read data from a csv file into a DataSet
-        sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
-        dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
-                                   sep='\t')
-        print(len(dataset))
-        print(dataset[0])
-        print(dataset[-3])
-
-        dataset.append(Instance(raw_sentence='fake data', label='0'))
-        # Convert all text to lowercase
-        dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
-        # Convert label to int
-        dataset.apply(lambda x: int(x['label']), new_field_name='label')
-
-        # Split sentences on whitespace
-        def split_sent(ins):
-            return ins['raw_sentence'].split()
-
-        dataset.apply(split_sent, new_field_name='words')
-
-        # Add sequence length information
-        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
-        print(len(dataset))
-        print(dataset[0])
-
-        # Filter out instances with DataSet.drop(func)
-        dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
-        print(len(dataset))
-
-        # Specify which fields of the DataSet should be converted to tensors
-        # set target: the golden labels used to compute the loss and to evaluate the model
-        dataset.set_target("label")
-        # set input: the fields used in the model's forward
-        dataset.set_input("words", "seq_len")
-
-        # Split into a test set and a training set
-        test_data, train_data = dataset.split(0.5)
-        print(len(test_data))
-        print(len(train_data))
-
-        # Build the vocabulary with Vocabulary.add(word)
-        vocab = Vocabulary(min_freq=2)
-        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
-        vocab.build_vocab()
-
-        # Index the sentences with Vocabulary.to_index(word)
-        train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
-        test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
-        print(test_data[0])
-
-        # These preprocessing tools can also be used for projects such as reinforcement learning or GANs
-        from fastNLP.core.batch import Batch
-        from fastNLP.core.sampler import RandomSampler
-
-        batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
-        for batch_x, batch_y in batch_iterator:
-            print("batch_x has: ", batch_x)
-            print("batch_y has: ", batch_y)
-            break
-
-        from fastNLP.automl.enas_model import ENASModel
-        from fastNLP.automl.enas_controller import Controller
-        model = ENASModel(embed_num=len(vocab), num_classes=5)
-        controller = Controller()
-
-        from fastNLP.automl.enas_trainer import ENASTrainer
-
-        # Rename the DataSet fields to match the parameter names of the model's forward
-        train_data.rename_field('words', 'word_seq')  # input field must match the forward parameter
-        train_data.rename_field('label', 'label_seq')
-        test_data.rename_field('words', 'word_seq')
-        test_data.rename_field('label', 'label_seq')
-
-        loss = CrossEntropyLoss(pred="output", target="label_seq")
-        metric = AccuracyMetric(pred="predict", target="label_seq")
-
-        trainer = ENASTrainer(model=model, controller=controller, train_data=train_data, dev_data=test_data,
-                              loss=CrossEntropyLoss(pred="output", target="label_seq"),
-                              metrics=AccuracyMetric(pred="predict", target="label_seq"),
-                              check_code_level=-1,
-                              save_path=None,
-                              batch_size=32,
-                              print_every=1,
-                              n_epochs=3,
-                              final_epochs=1)
-        trainer.train()
-        print('Train finished!')
-
-        # Use Tester to evaluate performance on test_data
-        from fastNLP import Tester
-
-        tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
-                        batch_size=4)
-
-        acc = tester.test()
-        print(acc)
-
-
-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file