* DataSet: check the slice start position to ensure the result is not empty
* FieldArray: check that content is not empty
* Optimizer: the model params received is a generator and cannot be assigned
* code style refine
@@ -87,6 +87,8 @@ class DataSet(object):
         if isinstance(idx, int):
             return Instance(**{name: self.field_arrays[name][idx] for name in self.field_arrays})
         elif isinstance(idx, slice):
+            if idx.start is not None and (idx.start >= len(self) or idx.start <= -len(self)):
+                raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}")
             data_set = DataSet()
             for field in self.field_arrays.values():
                 data_set.add_field(name=field.name,
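
A quick sketch of the new guard in use (toy data; not part of the patch):

    from fastNLP import DataSet, Instance

    ds = DataSet()
    for i in range(3):
        ds.append(Instance(x=i))
    sub = ds[1:3]  # fine: the slice starts inside the dataset
    ds[5:]         # now raises RuntimeError: Start index 5 out of range 0-2
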
@@ -135,7 +137,9 @@ class DataSet(object):
         :param bool is_target: whether this field is label or target.
         """
         if len(self.field_arrays) != 0:
-            assert len(self) == len(fields)
+            if len(self) != len(fields):
+                raise RuntimeError(f"The field to append must have the same size as dataset. "
+                                   f"Dataset size {len(self)} != field size {len(fields)}")
         self.field_arrays[name] = FieldArray(name, fields, padding_val=padding_val, is_target=is_target,
                                              is_input=is_input)
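
The bare assert becomes an informative error; a minimal sketch (hypothetical toy fields):

    ds = DataSet()
    ds.add_field(name="words", fields=[["a"], ["b", "c"]])  # dataset now holds 2 instances
    ds.add_field(name="label", fields=[0, 1, 2])
    # RuntimeError: The field to append must have the same size as dataset. Dataset size 2 != field size 3
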
@@ -168,6 +172,7 @@ class DataSet(object):
         """
         if old_name in self.field_arrays:
             self.field_arrays[new_name] = self.field_arrays.pop(old_name)
+            self.field_arrays[new_name].name = new_name
         else:
             raise KeyError("{} is not a valid name. ".format(old_name))
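
Without the added line the moved FieldArray would keep its old name attribute; a minimal check:

    ds = DataSet()
    ds.add_field(name="words", fields=[[1], [2]])
    ds.rename_field("words", "word_seq")
    assert ds.field_arrays["word_seq"].name == "word_seq"  # holds only with this fix
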
@@ -33,7 +33,10 @@ class FieldArray(object):
             type_set = set([type(item) for item in content[0]])
         else:
             # 1-D list
+            if len(content) == 0:
+                raise RuntimeError("Cannot create FieldArray with an empty list.")
             type_set = set([type(item) for item in content])
         if len(type_set) == 1 and any(basic_type in type_set for basic_type in (str, int, float)):
             return type_set.pop()
         elif len(type_set) == 2 and float in type_set and int in type_set:
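
Sketch of the fail-fast behavior (assuming FieldArray is importable from fastNLP.core.fieldarray):

    from fastNLP.core.fieldarray import FieldArray

    fa = FieldArray("words", [["a", "b"], ["c"]])  # fine: element type inferred as str
    FieldArray("words", [])  # now raises RuntimeError: Cannot create FieldArray with an empty list.
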
@@ -42,8 +42,10 @@ class SGD(Optimizer):
     def construct_from_pytorch(self, model_params):
         if self.model_params is None:
             self.model_params = model_params
-            return torch.optim.SGD(self.model_params, **self.settings)
+            # careful! generator cannot be assigned.
+            return torch.optim.SGD(model_params, **self.settings)
         else:
             return torch.optim.SGD(self.model_params, **self.settings)

class Adam(Optimizer):
@@ -75,5 +77,7 @@ class Adam(Optimizer):
     def construct_from_pytorch(self, model_params):
         if self.model_params is None:
             self.model_params = model_params
-            return torch.optim.Adam(self.model_params, **self.settings)
+            # careful! generator cannot be assigned.
+            return torch.optim.Adam(model_params, **self.settings)
         else:
             return torch.optim.Adam(self.model_params, **self.settings)
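
The pitfall both fixes work around, isolated in plain PyTorch: Module.parameters() returns a generator, so a stored reference is exhausted after one full pass.

    import torch

    model = torch.nn.Linear(4, 2)
    params = model.parameters()       # a generator, drained by the first full iteration
    list(params)                      # simulate an earlier consumer
    torch.optim.SGD(params, lr=0.1)   # ValueError: optimizer got an empty parameter list
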
@@ -18,8 +18,8 @@ class CNNText(torch.nn.Module):
     def __init__(self, embed_num,
                  embed_dim,
                  num_classes,
-                 kernel_nums=(3,4,5),
-                 kernel_sizes=(3,4,5),
+                 kernel_nums=(3, 4, 5),
+                 kernel_sizes=(3, 4, 5),
                  padding=0,
                  dropout=0.5):
         super(CNNText, self).__init__()
@@ -45,7 +45,7 @@ class CNNText(torch.nn.Module):
         x = self.conv_pool(x)  # [N,L,C] -> [N,C]
         x = self.dropout(x)
         x = self.fc(x)  # [N,C] -> [N, N_class]
-        return {'output':x}
+        return {'output': x}

     def predict(self, word_seq):
         """

@@ -78,4 +78,3 @@ class CNNText(torch.nn.Module):
         correct = (predict == label_seq).long().sum().item()
         total = label_seq.size(0)
         return {'acc': 1.0 * correct / total}
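
Forward returns a dict keyed 'output'; a minimal usage sketch (forward taking word_seq is assumed from predict above):

    import torch

    model = CNNText(embed_num=100, embed_dim=50, num_classes=5)
    word_seq = torch.randint(0, 100, (4, 20))  # a batch of 4 index sequences of length 20
    logits = model(word_seq)['output']         # shape [4, 5], per the [N, N_class] comment
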
@@ -0,0 +1,95 @@
import unittest

from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Tester
from fastNLP import Vocabulary
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.models import CNNText


class TestTutorial(unittest.TestCase):
    def test_tutorial(self):
        # read data from csv into a DataSet
        dataset = DataSet.read_csv("./data_for_tests/tutorial_sample_dataset.csv", headers=('raw_sentence', 'label'),
                                   sep='\t')
        print(len(dataset))
        print(dataset[0])

        dataset.append(Instance(raw_sentence='fake data', label='0'))
        dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
        # convert label to int
        dataset.apply(lambda x: int(x['label']), new_field_name='label')

        # split sentences on whitespace
        def split_sent(ins):
            return ins['raw_sentence'].split()

        dataset.apply(split_sent, new_field_name='words')
        # add sequence-length information
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
        print(len(dataset))
        print(dataset[0])

        # DataSet.drop(func) filters out instances
        dataset.drop(lambda x: x['seq_len'] <= 3)
        print(len(dataset))

        # mark which fields of the DataSet should be converted to tensors
        # set target: the golden labels used for loss computation and evaluation
        dataset.set_target("label")
        # set input: the fields fed to the model's forward
        dataset.set_input("words")

        # split into a test set and a training set
        test_data, train_data = dataset.split(0.5)
        print(len(test_data))
        print(len(train_data))

        # build the vocabulary, Vocabulary.add(word)
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()

        # index the sentences, Vocabulary.to_index(word)
        train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
        test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
        print(test_data[0])

        model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

        from fastNLP import Trainer
        from copy import deepcopy

        # rename the DataSet fields to match the parameter names of the model's forward method
        train_data.rename_field('words', 'word_seq')  # input field name must match the forward argument
        train_data.rename_field('label', 'label_seq')
        test_data.rename_field('words', 'word_seq')
        test_data.rename_field('label', 'label_seq')

        # instantiate a Trainer with the model and data, then train
        copy_model = deepcopy(model)
        overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,
                                  losser=CrossEntropyLoss(input="output", target="label_seq"),
                                  metrics=AccuracyMetric(pred="predict", target="label_seq"),
                                  save_path="./save",
                                  batch_size=4,
                                  n_epochs=10)
        overfit_trainer.train()

        trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
                          losser=CrossEntropyLoss(input="output", target="label_seq"),
                          metrics=AccuracyMetric(pred="predict", target="label_seq"),
                          save_path="./save",
                          batch_size=4,
                          n_epochs=10)
        trainer.train()
        print('Train finished!')

        # evaluate with fastNLP's Tester
        tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
                        batch_size=4)
        acc = tester.test()
        print(acc)