* dataset: check the slice start position so the result cannot be empty
* fieldarray: check that content is not empty
* optimizer: the model params it receives is a generator, which must not be assigned
* code style refine
@@ -87,6 +87,8 @@ class DataSet(object):
         if isinstance(idx, int):
             return Instance(**{name: self.field_arrays[name][idx] for name in self.field_arrays})
         elif isinstance(idx, slice):
+            if idx.start is not None and (idx.start >= len(self) or idx.start <= -len(self)):
+                raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}")
             data_set = DataSet()
             for field in self.field_arrays.values():
                 data_set.add_field(name=field.name,
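A minimal sketch of the new guard, building a toy DataSet via Instance the same way the tutorial test below does: a slice whose start lies outside the dataset used to silently return an empty DataSet, now it raises.

    from fastNLP import DataSet, Instance

    ds = DataSet()
    for i in range(3):
        ds.append(Instance(x=i))  # a toy 3-instance dataset
    sub = ds[1:]                  # fine: start 1 is within 0..2
    ds[5:]                        # now raises RuntimeError: Start index 5 out of range 0-2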
@@ -135,7 +137,9 @@ class DataSet(object):
         :param bool is_target: whether this field is label or target.
         """
         if len(self.field_arrays) != 0:
-            assert len(self) == len(fields)
+            if len(self) != len(fields):
+                raise RuntimeError(f"The field to append must have the same size as dataset. "
+                                   f"Dataset size {len(self)} != field size {len(fields)}")
         self.field_arrays[name] = FieldArray(name, fields, padding_val=padding_val, is_target=is_target,
                                              is_input=is_input)
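The mismatch now surfaces as a descriptive RuntimeError instead of a bare AssertionError. A sketch continuing the toy dataset above (any add_field keyword beyond the names visible in this hunk is an assumption):

    ds.add_field(name="y", fields=[0, 1, 2])  # ok: 3 values for 3 instances
    ds.add_field(name="z", fields=[0, 1])     # RuntimeError: ... Dataset size 3 != field size 2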
@@ -168,6 +172,7 @@ class DataSet(object):
         """
         if old_name in self.field_arrays:
             self.field_arrays[new_name] = self.field_arrays.pop(old_name)
+            self.field_arrays[new_name].name = new_name
         else:
             raise KeyError("{} is not a valid name. ".format(old_name))
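Previously only the dict key changed, so the FieldArray kept its old name attribute, and anything keyed on field.name (such as the add_field call in __getitem__ above) would resurrect the old name. A sketch of the fixed behaviour:

    ds.rename_field('x', 'word')
    assert ds.field_arrays['word'].name == 'word'  # used to stay 'x'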
@@ -33,7 +33,10 @@ class FieldArray(object):
             type_set = set([type(item) for item in content[0]])
         else:
             # 1-D list
+            if len(content) == 0:
+                raise RuntimeError("Cannot create FieldArray with an empty list.")
             type_set = set([type(item) for item in content])
         if len(type_set) == 1 and any(basic_type in type_set for basic_type in (str, int, float)):
             return type_set.pop()
         elif len(type_set) == 2 and float in type_set and int in type_set:
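Without this check, an empty 1-D list produces an empty type_set and type inference fails further down with an unrelated error. A sketch (the fastNLP.core.fieldarray module path is an assumption; the positional name/content arguments match the FieldArray construction in the hunk above):

    from fastNLP.core.fieldarray import FieldArray

    FieldArray("words", ["a", "b"])  # fine: element type inferred as str
    FieldArray("words", [])          # RuntimeError: Cannot create FieldArray with an empty list.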
@@ -42,8 +42,10 @@ class SGD(Optimizer):
     def construct_from_pytorch(self, model_params):
         if self.model_params is None:
-            self.model_params = model_params
-            return torch.optim.SGD(self.model_params, **self.settings)
+            # careful! generator cannot be assigned.
+            return torch.optim.SGD(model_params, **self.settings)
+        else:
+            return torch.optim.SGD(self.model_params, **self.settings)
@@ -75,5 +77,7 @@ class Adam(Optimizer):
     def construct_from_pytorch(self, model_params):
         if self.model_params is None:
-            self.model_params = model_params
-            return torch.optim.Adam(self.model_params, **self.settings)
+            # careful! generator cannot be assigned.
+            return torch.optim.Adam(model_params, **self.settings)
+        else:
+            return torch.optim.Adam(self.model_params, **self.settings)
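The "generator cannot be assigned" comment is about plain Python semantics: model.parameters() returns a generator, which is exhausted after one pass, so stashing it on self and iterating it again later would hand torch.optim an empty iterator. A minimal demonstration:

    def params():
        yield 1
        yield 2

    g = params()
    print(list(g))  # [1, 2]
    print(list(g))  # [] -- exhausted; hence the generator is passed straight through, not stored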
@@ -18,8 +18,8 @@ class CNNText(torch.nn.Module):
     def __init__(self, embed_num,
                  embed_dim,
                  num_classes,
-                 kernel_nums=(3,4,5),
-                 kernel_sizes=(3,4,5),
+                 kernel_nums=(3, 4, 5),
+                 kernel_sizes=(3, 4, 5),
                  padding=0,
                  dropout=0.5):
         super(CNNText, self).__init__()
| @@ -45,7 +45,7 @@ class CNNText(torch.nn.Module): | |||||
| x = self.conv_pool(x) # [N,L,C] -> [N,C] | x = self.conv_pool(x) # [N,L,C] -> [N,C] | ||||
| x = self.dropout(x) | x = self.dropout(x) | ||||
| x = self.fc(x) # [N,C] -> [N, N_class] | x = self.fc(x) # [N,C] -> [N, N_class] | ||||
| return {'output':x} | |||||
| return {'output': x} | |||||
| def predict(self, word_seq): | def predict(self, word_seq): | ||||
| """ | """ | ||||
@@ -78,4 +78,3 @@ class CNNText(torch.nn.Module):
         correct = (predict == label_seq).long().sum().item()
         total = label_seq.size(0)
         return {'acc': 1.0 * correct / total}
-
@@ -0,0 +1,95 @@
+import unittest
+
+from fastNLP import DataSet
+from fastNLP import Instance
+from fastNLP import Tester
+from fastNLP import Vocabulary
+from fastNLP.core.losses import CrossEntropyLoss
+from fastNLP.core.metrics import AccuracyMetric
+from fastNLP.models import CNNText
+
+
+class TestTutorial(unittest.TestCase):
+    def test_tutorial(self):
+        # read csv data into a DataSet
+        dataset = DataSet.read_csv("./data_for_tests/tutorial_sample_dataset.csv", headers=('raw_sentence', 'label'),
+                                   sep='\t')
+        print(len(dataset))
+        print(dataset[0])
+
+        dataset.append(Instance(raw_sentence='fake data', label='0'))
+        dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
+        # convert label to int
+        dataset.apply(lambda x: int(x['label']), new_field_name='label')
+
+        # split sentences on whitespace
+        def split_sent(ins):
+            return ins['raw_sentence'].split()
+
+        dataset.apply(split_sent, new_field_name='words')
+        # add sequence length information
+        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
+        print(len(dataset))
+        print(dataset[0])
+
+        # filter out instances with DataSet.drop(func)
+        dataset.drop(lambda x: x['seq_len'] <= 3)
+        print(len(dataset))
+
+        # specify which fields of the DataSet are to be converted to tensors
+        # set target: the golden labels used when computing the loss and evaluating the model
+        dataset.set_target("label")
+        # set input: used in the model's forward
+        dataset.set_input("words")
+
+        # split into a test set and a training set
+        test_data, train_data = dataset.split(0.5)
+        print(len(test_data))
+        print(len(train_data))
+
+        # build the vocabulary, Vocabulary.add(word)
+        vocab = Vocabulary(min_freq=2)
+        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
+        vocab.build_vocab()
+
+        # index the sentences, Vocabulary.to_index(word)
+        train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
+        test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
+        print(test_data[0])
+
+        model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
+
+        from fastNLP import Trainer
+        from copy import deepcopy
+
+        # rename the DataSet fields to match the parameter names of the model's forward, etc.
+        train_data.rename_field('words', 'word_seq')  # input fields must match forward's parameters
+        train_data.rename_field('label', 'label_seq')
+        test_data.rename_field('words', 'word_seq')
+        test_data.rename_field('label', 'label_seq')
+
+        # instantiate a Trainer with the model and data, then train
+        copy_model = deepcopy(model)
+        overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,
+                                  losser=CrossEntropyLoss(input="output", target="label_seq"),
+                                  metrics=AccuracyMetric(pred="predict", target="label_seq"),
+                                  save_path="./save",
+                                  batch_size=4,
+                                  n_epochs=10)
+        overfit_trainer.train()
+
+        trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
+                          losser=CrossEntropyLoss(input="output", target="label_seq"),
+                          metrics=AccuracyMetric(pred="predict", target="label_seq"),
+                          save_path="./save",
+                          batch_size=4,
+                          n_epochs=10)
+        trainer.train()
+        print('Train finished!')
+
+        # test the script with fastNLP's Tester
+        tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
+                        batch_size=4)
+        acc = tester.test()
+        print(acc)