
refactor type system in FieldArray:

* Refactor the dtype detection code, sharing it between FieldArray's __init__ and append for better code reuse
* Type checking is now entirely FieldArray's responsibility; DataSet cooperates with it
Tests:
* Clean up the dtype-related test code
* Add tests for all tutorials
Other:
* Flesh out a complete Conll dataset loader
* Update the POS tag model training script
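For context, a minimal sketch of the two construction paths whose dtype handling this commit unifies, assuming the fastNLP 0.3 API exactly as it appears in the diffs below (the dtype named in the comment is an expectation, not verified output):

    import numpy as np
    from fastNLP.core.dataset import DataSet
    from fastNLP.core.instance import Instance

    # Path 1: dict initialization goes through DataSet.add_field -> FieldArray(...)
    ds1 = DataSet({"x": [[1, 2], [3, 4]]})

    # Path 2: list-of-Instance initialization goes through DataSet.append;
    # the first sample builds the FieldArray, later samples call FieldArray.append
    ds2 = DataSet([Instance(x=np.array([1, 2, 3, 4]))])
    ds2.append(Instance(x=np.array([5, 6])))

    # dtype detection is deferred until the field becomes an input or a target
    ds2.set_input("x")
    print(ds2.field_arrays["x"].dtype)  # expected: numpy.int64 under the new type system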
tags/v0.3.1^2
FengZiYjun 6 years ago
parent commit e4f997d52a
12 changed files with 725 additions and 202 deletions
  1. +10  -6   fastNLP/core/dataset.py
  2. +146 -68  fastNLP/core/fieldarray.py
  3. +5   -1   fastNLP/core/instance.py
  4. +20  -2   fastNLP/io/dataset_loader.py
  5. +1   -1   reproduction/POS_tagging/pos_tag.cfg
  6. +70  -20  reproduction/POS_tagging/train_pos_tag.py
  7. +9   -0   test/core/test_batch.py
  8. +18  -2   test/core/test_dataset.py
  9. +5   -5   test/core/test_fieldarray.py
  10. +9  -6   test/models/test_biaffine_parser.py
  11. +0  -91  test/test_tutorial.py
  12. +432 -0  test/test_tutorials.py

+10 -6   fastNLP/core/dataset.py

@@ -2,8 +2,8 @@ import _pickle as pickle
 
 import numpy as np
 
-from fastNLP.core.fieldarray import FieldArray
 from fastNLP.core.fieldarray import AutoPadder
+from fastNLP.core.fieldarray import FieldArray
 from fastNLP.core.instance import Instance
 from fastNLP.core.utils import get_func_signature
 from fastNLP.io.base_loader import DataLoaderRegister
@@ -142,7 +142,8 @@ class DataSet(object):
         if len(self.field_arrays) == 0:
             # DataSet has no field yet
             for name, field in ins.fields.items():
-                self.field_arrays[name] = FieldArray(name, [field])
+                field = field.tolist() if isinstance(field, np.ndarray) else field
+                self.field_arrays[name] = FieldArray(name, [field])  # the first sample must be wrapped in a list
         else:
             if len(self.field_arrays) != len(ins.fields):
                 raise ValueError(
@@ -290,9 +291,11 @@ class DataSet(object):
                     extra_param['is_input'] = old_field.is_input
                 if 'is_target' not in extra_param:
                     extra_param['is_target'] = old_field.is_target
-                self.add_field(name=new_field_name, fields=results)
+                self.add_field(name=new_field_name, fields=results, is_input=extra_param["is_input"],
+                               is_target=extra_param["is_target"])
             else:
-                self.add_field(name=new_field_name, fields=results)
+                self.add_field(name=new_field_name, fields=results, is_input=extra_param.get("is_input", None),
+                               is_target=extra_param.get("is_target", None))
         else:
             return results
 
@@ -334,13 +337,14 @@ class DataSet(object):
             train_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
             train_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
             train_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
-            train_set.field_arrays[field_name].is_2d_list = self.field_arrays[field_name].is_2d_list
+            train_set.field_arrays[field_name].content_dim = self.field_arrays[field_name].content_dim
 
             dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
             dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
             dev_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
             dev_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
             dev_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
-            dev_set.field_arrays[field_name].is_2d_list = self.field_arrays[field_name].is_2d_list
+            dev_set.field_arrays[field_name].content_dim = self.field_arrays[field_name].content_dim
 
         return train_set, dev_set




+146 -68  fastNLP/core/fieldarray.py

@@ -100,6 +100,22 @@ class FieldArray(object):
     """
 
     def __init__(self, name, content, is_target=None, is_input=None, padder=AutoPadder(pad_val=0)):
+        """DataSet touches FieldArray in two ways during initialization:
+        1) If the DataSet is built from a dict, add_field constructs the FieldArray:
+            1.1) 2-D list   DataSet({"x": [[1, 2], [3, 4]]})
+            1.2) 2-D array  DataSet({"x": np.array([[1, 2], [3, 4]])})
+            1.3) 3-D list   DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
+        2) If the DataSet is built from a list of Instance, append initializes the FieldArray
+           with the first sample; later samples are added through FieldArray.append.
+            2.1) 1-D list   DataSet([Instance(x=[1, 2, 3, 4])])
+            2.2) 1-D array  DataSet([Instance(x=np.array([1, 2, 3, 4]))])
+            2.3) 2-D list   DataSet([Instance(x=[[1, 2], [3, 4]])])
+            2.4) 2-D array  DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])
+
+        Note: np.array is only accepted at the outermost level; np.array([np.array, np.array]) and
+        lists of np.array are not supported.
+        Type checking (the dtype check) happens when the field is set as is_input or is_target.
+        """
         self.name = name
         if isinstance(content, list):
             content = content
@@ -107,31 +123,39 @@ class FieldArray(object):
             content = content.tolist()  # convert np.ndarray into 2-D list
         else:
             raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content)))
-        self.content = content
+        if len(content) == 0:
+            raise RuntimeError("Cannot initialize FieldArray with empty list.")
+
+        self.content = content  # a 1-D, 2-D or 3-D list whose shape may be ragged
+        self.content_dim = None  # how many list dimensions content has
         self.set_padder(padder)
 
-        self._is_target = None
-        self._is_input = None
+        self.BASIC_TYPES = (int, float, str)  # basic Python types accepted in content; np.array is not among them
 
-        self.BASIC_TYPES = (int, float, str, np.ndarray)
-        self.is_2d_list = False
-        self.pytype = None  # int, float, str, or np.ndarray
-        self.dtype = None  # np.int64, np.float64, np.str
+        self.pytype = None
+        self.dtype = None
+        self._is_input = None
+        self._is_target = None
 
-        if is_input is not None:
+        if is_input is not None or is_target is not None:
             self.is_input = is_input
-        if is_target is not None:
             self.is_target = is_target
 
+    def _set_dtype(self):
+        self.pytype = self._type_detection(self.content)
+        self.dtype = self._map_to_np_type(self.pytype)
+
     @property
     def is_input(self):
         return self._is_input
 
     @is_input.setter
     def is_input(self, value):
+        """
+        Called when field_array.is_input = True / False is assigned.
+        """
         if value is True:
-            self.pytype = self._type_detection(self.content)
-            self.dtype = self._map_to_np_type(self.pytype)
+            self._set_dtype()
         self._is_input = value
 
     @property
@@ -140,46 +164,99 @@ class FieldArray(object):
 
     @is_target.setter
     def is_target(self, value):
+        """
+        Called when field_array.is_target = True / False is assigned.
+        """
         if value is True:
-            self.pytype = self._type_detection(self.content)
-            self.dtype = self._map_to_np_type(self.pytype)
+            self._set_dtype()
         self._is_target = value
 
     def _type_detection(self, content):
-        """
-
-        :param content: a list of int, float, str or np.ndarray, or a list of list of one.
-        :return type: one of int, float, str, np.ndarray
+        """Called when this field is set as is_input or is_target.
+
         """
-        if isinstance(content, list) and len(content) > 0 and isinstance(content[0], list):
-            # content is a 2-D list
-            if not all(isinstance(_, list) for _ in content):  # strict check 2-D list
-                raise TypeError("Please provide 2-D list.")
-            type_set = set([self._type_detection(x) for x in content])
-            if len(type_set) == 2 and int in type_set and float in type_set:
-                type_set = {float}
-            elif len(type_set) > 1:
-                raise TypeError("Cannot create FieldArray with more than one type. Provided {}".format(type_set))
-            self.is_2d_list = True
+        if len(content) == 0:
+            raise RuntimeError("Empty list in Field {}.".format(self.name))
+
+        type_set = set([type(item) for item in content])
+
+        if list in type_set:
+            if len(type_set) > 1:
+                # list mixed with non-list
+                raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))
+            # a list of more than one dimension
+            inner_type_set = set()
+            for l in content:
+                [inner_type_set.add(type(obj)) for obj in l]
+            if list not in inner_type_set:
+                # 2-D list
+                self.content_dim = 2
+                return self._basic_type_detection(inner_type_set)
+            else:
+                if len(inner_type_set) == 1:
+                    # a list of more than two dimensions
+                    inner_inner_type_set = set()
+                    for _2d_list in content:
+                        for _1d_list in _2d_list:
+                            [inner_inner_type_set.add(type(obj)) for obj in _1d_list]
+                    if list in inner_inner_type_set:
+                        raise RuntimeError("FieldArray cannot handle 4-D or more-D list.")
+                    # 3-D list
+                    self.content_dim = 3
+                    return self._basic_type_detection(inner_inner_type_set)
+                else:
+                    # list mixed with non-list
+                    raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, inner_type_set))
+        else:
+            # 1-D list
+            for content_type in type_set:
+                if content_type not in self.BASIC_TYPES:
+                    raise RuntimeError("Unexpected data type in Field '{}'. Expect one of {}. Got {}.".format(
+                        self.name, self.BASIC_TYPES, content_type))
+            self.content_dim = 1
+            return self._basic_type_detection(type_set)
+
+    def _basic_type_detection(self, type_set):
+        """
+        :param type_set: a set of Python types
+        :return: one of self.BASIC_TYPES
+        """
+        if len(type_set) == 1:
             return type_set.pop()
-
-        elif isinstance(content, list):
-            # content is a 1-D list
-            if len(content) == 0:
-                # the old error is not informative enough.
-                raise RuntimeError("Cannot create FieldArray with an empty list. Or one element in the list is empty.")
-            type_set = set([type(item) for item in content])
-
-            if len(type_set) == 1 and tuple(type_set)[0] in self.BASIC_TYPES:
-                return type_set.pop()
-            elif len(type_set) == 2 and float in type_set and int in type_set:
+        elif len(type_set) == 2:
+            # more than one basic type; may need an up-cast
+            if float in type_set and int in type_set:
                 # up-cast int to float
                 return float
             else:
-                raise TypeError("Cannot create FieldArray with type {}".format(*type_set))
+                # str mixed with int or float
+                raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))
         else:
-            raise TypeError("Cannot create FieldArray with type {}".format(type(content)))
+            # str, int and float mixed together
+            raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))
+
+    def _1d_list_check(self, val):
+        """Raise an error if val is not a 1-D list.
+        """
+        type_set = set((type(obj) for obj in val))
+        if any(obj not in self.BASIC_TYPES for obj in type_set):
+            raise ValueError("Mixed data types in Field {}: {}".format(self.name, type_set))
+        self._basic_type_detection(type_set)
+        # otherwise: _basic_type_detection will raise an error
+        return True
+
+    def _2d_list_check(self, val):
+        """Raise an error if val is not a 2-D list.
+        """
+        type_set = set(type(obj) for obj in val)
+        if list(type_set) != [list]:
+            raise ValueError("Mixed data types in Field {}: {}".format(self.name, type_set))
+        inner_type_set = set()
+        for l in val:
+            for obj in l:
+                inner_type_set.add(type(obj))
+        self._basic_type_detection(inner_type_set)
+        return True
 
     @staticmethod
     def _map_to_np_type(basic_type):
@@ -194,38 +271,39 @@ class FieldArray(object):
 
         :param val: int, float, str, or a list of one.
         """
-        if self.is_target is True or self.is_input is True:
-            # only check type when used as target or input
+        if isinstance(val, list):
+            pass
+        elif isinstance(val, tuple):  # make sure the outermost layer is a list
+            val = list(val)
+        elif isinstance(val, np.ndarray):
+            val = val.tolist()
+        elif any((isinstance(val, t) for t in self.BASIC_TYPES)):
+            pass
+        else:
+            raise RuntimeError(
+                "Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES))
 
-            val_type = type(val)
-            if val_type == list:  # shape check
-                if self.is_2d_list is False:
-                    raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.")
+        if self.is_input is True or self.is_target is True:
+            if type(val) == list:
                 if len(val) == 0:
-                    raise RuntimeError("Cannot append an empty list.")
-                val_list_type = set([type(_) for _ in val])  # type check
-                if len(val_list_type) == 2 and int in val_list_type and float in val_list_type:
-                    # up-cast int to float
-                    val_type = float
-                elif len(val_list_type) == 1:
-                    val_type = val_list_type.pop()
+                    raise ValueError("Cannot append an empty list.")
+                if self.content_dim == 2 and self._1d_list_check(val):
+                    # 1-D list check
+                    pass
+                elif self.content_dim == 3 and self._2d_list_check(val):
+                    # 2-D list check
+                    pass
                 else:
-                    raise TypeError("Cannot append a list of {}".format(val_list_type))
-            else:
-                if self.is_2d_list is True:
-                    raise RuntimeError("Cannot append a non-list into a 2-D list. Please provide a list.")
-
-            if val_type == float and self.pytype == int:
-                # up-cast
-                self.pytype = float
-                self.dtype = self._map_to_np_type(self.pytype)
-            elif val_type == int and self.pytype == float:
-                pass
-            elif val_type == self.pytype:
-                pass
+                    raise RuntimeError(
+                        "Dimension not matched: expect dim={}, got {}.".format(self.content_dim - 1, val))
+            elif type(val) in self.BASIC_TYPES and self.content_dim == 1:
+                # scalar check
+                if type(val) == float and self.pytype == int:
+                    self.pytype = float
+                    self.dtype = self._map_to_np_type(self.pytype)
             else:
-                raise TypeError("Cannot append type {} into type {}".format(val_type, self.pytype))
+                raise RuntimeError(
+                    "Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES))
         self.content.append(val)
 
     def __getitem__(self, indices):
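To make the new checks concrete, a small sketch of how the refactored FieldArray is expected to behave, based only on the code in the hunks above (error types and dtype values are read off the diff, not verified against a running install):

    import numpy as np
    from fastNLP.core.fieldarray import FieldArray

    # a 2-D field: content_dim is detected once the field is marked as input
    fa = FieldArray("x", [[1, 2], [3, 4]], is_input=True)
    print(fa.pytype, fa.dtype)    # expected: int, numpy.int64

    fa.append([5, 6])             # a 1-D element fits a 2-D field
    fa.append(np.array([7, 8]))   # ndarrays are converted to lists before the check

    try:
        fa.append(9)              # a scalar does not match the elements of a 2-D field
    except RuntimeError as e:
        print("rejected:", e)

    # mixing int and float in a 1-D field up-casts the detected type to float
    fb = FieldArray("y", [1, 2.5], is_input=True)
    print(fb.pytype)              # expected: float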


+5 -1   fastNLP/core/instance.py

@@ -11,6 +11,10 @@ class Instance(object):
     """
 
     def __init__(self, **fields):
+        """
+
+        :param fields: each value may be a 1-D or 2-D list or np.array
+        """
         self.fields = fields
 
     def add_field(self, field_name, field):
@@ -32,5 +36,5 @@ class Instance(object):
     def __repr__(self):
         s = '\''
         return "{" + ",\n".join(
-            "\'" + field_name + "\': " + str(self.fields[field_name]) +\
+            "\'" + field_name + "\': " + str(self.fields[field_name]) + \
             f" type={(str(type(self.fields[field_name]))).split(s)[1]}" for field_name in self.fields) + "}"

+20 -2   fastNLP/io/dataset_loader.py

@@ -858,9 +858,22 @@ class ConllPOSReader(object):
 
                 ds.append(Instance(words=char_seq,
                                    tag=pos_seq))
 
         return ds
 
+    def get_one(self, sample):
+        if len(sample) == 0:
+            return None
+        text = []
+        pos_tags = []
+        for w in sample:
+            t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
+            if t3 == '_':
+                return None
+            text.append(t1)
+            pos_tags.append(t2)
+        return text, pos_tags
+
 
 class ConllxDataLoader(object):
     def load(self, path):
@@ -879,7 +892,12 @@ class ConllxDataLoader(object):
             datalist.append(sample)
 
         data = [self.get_one(sample) for sample in datalist]
-        return list(filter(lambda x: x is not None, data))
+        data_list = list(filter(lambda x: x is not None, data))
+
+        ds = DataSet()
+        for example in data_list:
+            ds.append(Instance(words=example[0], tag=example[1]))
+        return ds
 
     def get_one(self, sample):
         sample = list(map(list, zip(*sample)))
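A hedged usage sketch of the updated ConllxDataLoader, which now returns a DataSet of Instances with "words" and "tag" fields (the .conllx path is a placeholder; the rename_field call mirrors train_pos_tag.py below):

    from fastNLP.io.dataset_loader import ConllxDataLoader

    data = ConllxDataLoader().load("/path/to/sample.conllx")  # placeholder path
    data.rename_field("tag", "truth")                         # as done in the training script
    print(len(data), data[0])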


+1 -1   reproduction/POS_tagging/pos_tag.cfg

@@ -10,7 +10,7 @@ eval_sort_key = 'accuracy'
 
 [model]
 rnn_hidden_units = 300
-word_emb_dim = 100
+word_emb_dim = 300
 dropout = 0.5
 use_crf = true
 print_every_step = 10


+70 -20  reproduction/POS_tagging/train_pos_tag.py

@@ -8,16 +8,16 @@ import torch
 # in order to run fastNLP without installation
 sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
 
 
 from fastNLP.api.pipeline import Pipeline
-from fastNLP.api.processor import SeqLenProcessor, VocabIndexerProcessor
+from fastNLP.api.processor import SeqLenProcessor, VocabIndexerProcessor, SetInputProcessor, IndexerProcessor
 from fastNLP.core.metrics import SpanFPreRecMetric
 from fastNLP.core.trainer import Trainer
 from fastNLP.io.config_io import ConfigLoader, ConfigSection
 from fastNLP.models.sequence_modeling import AdvSeqLabel
-from fastNLP.io.dataset_loader import ZhConllPOSReader
+from fastNLP.io.dataset_loader import ZhConllPOSReader, ConllxDataLoader
 from fastNLP.api.processor import ModelProcessor, Index2WordProcessor
 
 
 cfgfile = './pos_tag.cfg'
 pickle_path = "save"
@@ -35,7 +35,7 @@ def load_tencent_embed(embed_path, word2id):
     return embedding_tensor
 
 
-def train(checkpoint=None):
+def train(train_data_path, dev_data_path, checkpoint=None):
     # load config
     train_param = ConfigSection()
     model_param = ConfigSection()
@@ -43,24 +43,36 @@ def train(checkpoint=None):
     print("config loaded")
 
     # Data Loader
-    dataset = ZhConllPOSReader().load("/home/hyan/train.conllx")
+    print("loading training set...")
+    dataset = ConllxDataLoader().load(train_data_path)
+    print("loading dev set...")
+    dev_data = ConllxDataLoader().load(dev_data_path)
     print(dataset)
-    print("dataset transformed")
+    print("================= dataset ready =====================")
 
     dataset.rename_field("tag", "truth")
+    dev_data.rename_field("tag", "truth")
 
     vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
    tag_proc = VocabIndexerProcessor("truth")
     seq_len_proc = SeqLenProcessor(field_name="word_seq", new_added_field_name="word_seq_origin_len", is_input=True)
+    set_input_proc = SetInputProcessor("word_seq", "word_seq_origin_len", "truth")
 
     vocab_proc(dataset)
     tag_proc(dataset)
     seq_len_proc(dataset)
 
+    # index the dev set
+    word_vocab, tag_vocab = vocab_proc.vocab, tag_proc.vocab
+    dev_data.apply(lambda ins: [word_vocab.to_index(w) for w in ins["words"]], new_field_name="word_seq")
+    dev_data.apply(lambda ins: [tag_vocab.to_index(w) for w in ins["truth"]], new_field_name="truth")
+    dev_data.apply(lambda ins: len(ins["word_seq"]), new_field_name="word_seq_origin_len")
+
+    # set input & target
     dataset.set_input("word_seq", "word_seq_origin_len", "truth")
+    dev_data.set_input("word_seq", "word_seq_origin_len", "truth")
     dataset.set_target("truth", "word_seq_origin_len")
-
-    print("processors defined")
+    dev_data.set_target("truth", "word_seq_origin_len")
 
     # dataset.set_is_target(tag_ids=True)
     model_param["vocab_size"] = vocab_proc.get_vocab_size()
@@ -71,7 +83,7 @@ def train(checkpoint=None):
     if checkpoint is None:
         # pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
         pre_trained = None
-        model = AdvSeqLabel(model_param, id2words=tag_proc.vocab.idx2word, emb=pre_trained)
+        model = AdvSeqLabel(model_param, id2words=None, emb=pre_trained)
         print(model)
     else:
         model = torch.load(checkpoint)
@@ -80,33 +92,71 @@ def train(checkpoint=None):
     trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
                                                                            target="truth",
                                                                            seq_lens="word_seq_origin_len"),
-                      dev_data=dataset, metric_key="f",
-                      use_tqdm=True, use_cuda=True, print_every=5, n_epochs=6, save_path="./save")
+                      dev_data=dev_data, metric_key="f",
+                      use_tqdm=True, use_cuda=True, print_every=5, n_epochs=6, save_path="./save_0")
     trainer.train(load_best_model=True)
 
     # save model & pipeline
     model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
     id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")
 
-    pp = Pipeline([vocab_proc, seq_len_proc, model_proc, id2tag])
+    pp = Pipeline([vocab_proc, seq_len_proc, set_input_proc, model_proc, id2tag])
     save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
     torch.save(save_dict, "model_pp.pkl")
     print("pipeline saved")
 
-    torch.save(model, "./save/best_model.pkl")
+
+def run_test(test_path):
+    test_data = ZhConllPOSReader().load(test_path)
+
+    with open("model_pp.pkl", "rb") as f:
+        save_dict = torch.load(f)
+    tag_vocab = save_dict["tag_vocab"]
+    pipeline = save_dict["pipeline"]
+    index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False)
+    pipeline.pipeline = [index_tag] + pipeline.pipeline
+
+    pipeline(test_data)
+    test_data.set_target("truth")
+    prediction = test_data.field_arrays["predict"].content
+    truth = test_data.field_arrays["truth"].content
+    seq_len = test_data.field_arrays["word_seq_origin_len"].content
+
+    # padding by hand
+    max_length = max([len(seq) for seq in prediction])
+    for idx in range(len(prediction)):
+        prediction[idx] = list(prediction[idx]) + ([0] * (max_length - len(prediction[idx])))
+        truth[idx] = list(truth[idx]) + ([0] * (max_length - len(truth[idx])))
+    evaluator = SpanFPreRecMetric(tag_vocab=tag_vocab, pred="predict", target="truth",
+                                  seq_lens="word_seq_origin_len")
+    evaluator({"predict": torch.Tensor(prediction), "word_seq_origin_len": torch.Tensor(seq_len)},
+              {"truth": torch.Tensor(truth)})
+    test_result = evaluator.get_metric()
+    f1 = round(test_result['f'] * 100, 2)
+    pre = round(test_result['pre'] * 100, 2)
+    rec = round(test_result['rec'] * 100, 2)
+
+    return {"F1": f1, "precision": pre, "recall": rec}
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--train", type=str, help="training conll file", default="/home/zyfeng/data/sample.conllx")
+    parser.add_argument("--dev", type=str, help="dev conll file", default="/home/zyfeng/data/sample.conllx")
+    parser.add_argument("--test", type=str, help="test conll file", default=None)
+
     parser.add_argument("-c", "--restart", action="store_true", help="whether to continue training")
     parser.add_argument("-cp", "--checkpoint", type=str, help="checkpoint of the trained model")
     args = parser.parse_args()
 
-    if args.restart is True:
-        # continue training: python train_pos_tag.py -c -cp ./save/best_model.pkl
-        if args.checkpoint is None:
-            raise RuntimeError("Please provide the checkpoint. -cp ")
-        train(args.checkpoint)
+    if args.test is not None:
+        print(run_test(args.test))
     else:
-        # a single training run: python train_pos_tag.py
-        train()
+        if args.restart is True:
+            # continue training: python train_pos_tag.py -c -cp ./save/best_model.pkl
            if args.checkpoint is None:
+                raise RuntimeError("Please provide the checkpoint. -cp ")
+            train(args.train, args.dev, args.checkpoint)
+        else:
+            # a single training run: python train_pos_tag.py
+            train(args.train, args.dev)

+9 -0   test/core/test_batch.py

@@ -89,3 +89,12 @@ class TestCase1(unittest.TestCase):
         self.assertEqual(tuple(x["x"].shape), (4, 4))
         self.assertTrue(isinstance(y["y"], torch.Tensor))
         self.assertEqual(tuple(y["y"].shape), (4, 4))
+
+    def test_list_of_numpy_to_tensor(self):
+        ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
+                     [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
+        for x, y in iter:
+            print(x, y)

+18 -2   test/core/test_dataset.py

@@ -6,15 +6,29 @@ from fastNLP.core.fieldarray import FieldArray
 from fastNLP.core.instance import Instance
 
 
-class TestDataSet(unittest.TestCase):
-
+class TestDataSetInit(unittest.TestCase):
+    """A DataSet can be initialized in the following ways:
+    1) from a dict:
+        1.1) 2-D list   DataSet({"x": [[1, 2], [3, 4]]})
+        1.2) 2-D array  DataSet({"x": np.array([[1, 2], [3, 4]])})
+        1.3) 3-D list   DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
+    2) from a list of Instance:
+        2.1) 1-D list   DataSet([Instance(x=[1, 2, 3, 4])])
+        2.2) 1-D array  DataSet([Instance(x=np.array([1, 2, 3, 4]))])
+        2.3) 2-D list   DataSet([Instance(x=[[1, 2], [3, 4]])])
+        2.4) 2-D array  DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])
+
+    Only plain lists, or an ndarray at the outermost level, are accepted.
+    """
     def test_init_v1(self):
+        # a 1-D list
         ds = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)
         self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
         self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4], ] * 40)
         self.assertEqual(ds.field_arrays["y"].content, [[5, 6], ] * 40)
 
     def test_init_v2(self):
+        # initialized from a dict
         ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
         self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
         self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4], ] * 40)
@@ -28,6 +42,8 @@ class TestDataSet(unittest.TestCase):
         with self.assertRaises(ValueError):
             _ = DataSet(0.00001)
 
+
+class TestDataSetMethods(unittest.TestCase):
     def test_append(self):
         dd = DataSet()
         for _ in range(3):


+5 -5   test/core/test_fieldarray.py

@@ -42,13 +42,13 @@ class TestFieldArray(unittest.TestCase):
         self.assertEqual(fa.pytype, str)
 
     def test_support_np_array(self):
-        fa = FieldArray("y", [np.array([1.1, 2.2, 3.3, 4.4, 5.5])], is_input=True)
-        self.assertEqual(fa.dtype, np.ndarray)
-        self.assertEqual(fa.pytype, np.ndarray)
+        fa = FieldArray("y", np.array([[1.1, 2.2, 3.3, 4.4, 5.5]]), is_input=True)
+        self.assertEqual(fa.dtype, np.float64)
+        self.assertEqual(fa.pytype, float)
 
         fa.append(np.array([1.1, 2.2, 3.3, 4.4, 5.5]))
-        self.assertEqual(fa.dtype, np.ndarray)
-        self.assertEqual(fa.pytype, np.ndarray)
+        self.assertEqual(fa.dtype, np.float64)
+        self.assertEqual(fa.pytype, float)
 
         fa = FieldArray("my_field", np.random.rand(3, 5), is_input=True)
         # in this case, pytype is actually a float. We do not care about it.


+9 -6   test/models/test_biaffine_parser.py

@@ -1,8 +1,8 @@
-from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric
-import fastNLP
-
 import unittest
 
+import fastNLP
+from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric
+
 data_file = """
 1 The _ DET DT _ 3 det _ _
 2 new _ ADJ JJ _ 3 amod _ _
@@ -41,6 +41,7 @@ data_file = """
 """
 
+
 def init_data():
     ds = fastNLP.DataSet()
     v = {'word_seq': fastNLP.Vocabulary(),
@@ -60,18 +61,19 @@ def init_data():
         data.append(line)
 
     for name in ['word_seq', 'pos_seq', 'label_true']:
-        ds.apply(lambda x: ['<st>']+list(x[name]), new_field_name=name)
+        ds.apply(lambda x: ['<st>'] + list(x[name]), new_field_name=name)
         ds.apply(lambda x: v[name].add_word_lst(x[name]))
 
     for name in ['word_seq', 'pos_seq', 'label_true']:
         ds.apply(lambda x: [v[name].to_index(w) for w in x[name]], new_field_name=name)
 
-    ds.apply(lambda x: [0]+list(map(int, x['arc_true'])), new_field_name='arc_true')
+    ds.apply(lambda x: [0] + list(map(int, x['arc_true'])), new_field_name='arc_true')
     ds.apply(lambda x: len(x['word_seq']), new_field_name='seq_lens')
     ds.set_input('word_seq', 'pos_seq', 'seq_lens', flag=True)
     ds.set_target('arc_true', 'label_true', 'seq_lens', flag=True)
     return ds, v['word_seq'], v['pos_seq'], v['label_true']
 
+
 class TestBiaffineParser(unittest.TestCase):
     def test_train(self):
         ds, v1, v2, v3 = init_data()
@@ -84,5 +86,6 @@ class TestBiaffineParser(unittest.TestCase):
                           n_epochs=10, use_cuda=False, use_tqdm=False)
         trainer.train(load_best_model=False)
 
+
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()

+0 -91  test/test_tutorial.py

@@ -1,91 +0,0 @@
import unittest

from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Tester
from fastNLP import Vocabulary
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.models import CNNText


class TestTutorial(unittest.TestCase):
def test_tutorial(self):
# read data from a csv file into a DataSet
sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
sep='\t')
print(len(dataset))
print(dataset[0])

dataset.append(Instance(raw_sentence='fake data', label='0'))
dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# convert label to int
dataset.apply(lambda x: int(x['label']), new_field_name='label')

# split sentences on whitespace
def split_sent(ins):
return ins['raw_sentence'].split()

dataset.apply(split_sent, new_field_name='words')
# add length information
dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
print(len(dataset))
print(dataset[0])

# filter out data with DataSet.drop(func)
dataset.drop(lambda x: x['seq_len'] <= 3)
print(len(dataset))

# choose which fields in the DataSet are converted to tensors
# set target: the golden labels used by loss or evaluate when computing the loss and evaluating the model
dataset.set_target("label")
# set input: used in the model's forward
dataset.set_input("words")

# split into a test set and a training set
test_data, train_data = dataset.split(0.5)
print(len(test_data))
print(len(train_data))

# build the vocabulary, Vocabulary.add(word)
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
vocab.build_vocab()

# index the sentences, Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
print(test_data[0])

model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

from fastNLP import Trainer
from copy import deepcopy

# rename the DataSet fields to match the parameter names of the model's forward
train_data.rename_field('words', 'word_seq')  # the input field must match the forward parameter
train_data.rename_field('label', 'label_seq')
test_data.rename_field('words', 'word_seq')
test_data.rename_field('label', 'label_seq')

# instantiate a Trainer with the model and data, then train
copy_model = deepcopy(model)
overfit_trainer = Trainer(train_data=test_data, model=copy_model,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4,
dev_data=test_data, save_path="./save")
overfit_trainer.train()

trainer = Trainer(train_data=train_data, model=model,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4,
dev_data=test_data, save_path="./save")
trainer.train()
print('Train finished!')

# evaluate with fastNLP's Tester
tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
batch_size=4)
acc = tester.test()
print(acc)

+432 -0  test/test_tutorials.py

@@ -0,0 +1,432 @@
import unittest

from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric


class TestTutorial(unittest.TestCase):
def test_fastnlp_10min_tutorial(self):
# read data from a csv file into a DataSet
sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
sep='\t')
print(len(dataset))
print(dataset[0])
print(dataset[-3])

dataset.append(Instance(raw_sentence='fake data', label='0'))
# convert all text to lowercase
dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# convert label to int
dataset.apply(lambda x: int(x['label']), new_field_name='label')

# split sentences on whitespace
def split_sent(ins):
return ins['raw_sentence'].split()

dataset.apply(split_sent, new_field_name='words')

# add length information
dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
print(len(dataset))
print(dataset[0])

# filter out data with DataSet.drop(func)
dataset.drop(lambda x: x['seq_len'] <= 3)
print(len(dataset))

# choose which fields in the DataSet are converted to tensors
# set target: the golden labels used by loss or evaluate when computing the loss and evaluating the model
dataset.set_target("label")
# set input: used in the model's forward
dataset.set_input("words", "seq_len")

# split into a test set and a training set
test_data, train_data = dataset.split(0.5)
print(len(test_data))
print(len(train_data))

# build the vocabulary, Vocabulary.add(word)
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
vocab.build_vocab()

# index the sentences, Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
print(test_data[0])

# if you work on projects such as reinforcement learning or GANs, you can also use these data preprocessing tools
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
print("batch_x has: ", batch_x)
print("batch_y has: ", batch_y)
break

from fastNLP.models import CNNText
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

from fastNLP import Trainer
from copy import deepcopy

# rename the DataSet fields to match the parameter names of the model's forward
train_data.rename_field('words', 'word_seq')  # the input field must match the forward parameter
train_data.rename_field('label', 'label_seq')
test_data.rename_field('words', 'word_seq')
test_data.rename_field('label', 'label_seq')

loss = CrossEntropyLoss(pred="output", target="label_seq")
metric = AccuracyMetric(pred="predict", target="label_seq")

# instantiate a Trainer with the model and data, then train
# first fit on test_data (to make sure the model implementation is correct)
copy_model = deepcopy(model)
overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,
loss=loss,
metrics=metric,
save_path=None,
batch_size=32,
n_epochs=5)
overfit_trainer.train()

# train on train_data, validate on test_data
trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"),
save_path=None,
batch_size=32,
n_epochs=5)
trainer.train()
print('Train finished!')

# use Tester to evaluate on test_data
from fastNLP import Tester

tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
batch_size=4)
acc = tester.test()
print(acc)

def test_fastnlp_1min_tutorial(self):
# tutorials/fastnlp_1min_tutorial.ipynb
data_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
print(ds[1])

# convert all text to lowercase
ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# convert label to int
ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)

def split_sent(ins):
return ins['raw_sentence'].split()

ds.apply(split_sent, new_field_name='words', is_input=True)

# split into training/validation sets
train_data, dev_data = ds.split(0.3)
print("Train size: ", len(train_data))
print("Test size: ", len(dev_data))

from fastNLP import Vocabulary
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

# index the sentences, Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq',
is_input=True)
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq',
is_input=True)

from fastNLP.models import CNNText
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
trainer = Trainer(model=model,
train_data=train_data,
dev_data=dev_data,
loss=CrossEntropyLoss(),
metrics=AccuracyMetric()
)
trainer.train()
print('Train finished!')

def test_fastnlp_advanced_tutorial(self):
import os
os.chdir("tutorials/fastnlp_advanced_tutorial")

from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Trainer
from fastNLP import Tester

# ### Instance
# An Instance represents one sample and consists of one or more fields (attributes / features); each field has its own name and value
# the fields an Instance contains can be defined at initialization, using the "field_name=field_value" syntax

# In[2]:

# build an Instance made up of three fields: premise, hypothesis and label
instance = Instance(premise='an premise example .', hypothesis='an hypothesis example.', label=1)
instance

# In[3]:

data_set = DataSet([instance] * 5)
data_set.append(instance)
data_set[-2:]

# In[4]:

# an instance can still be added to the dataset even if one field's type differs from the dataset's corresponding field type
instance2 = Instance(premise='the second premise example .', hypothesis='the second hypothesis example.',
label='1')
try:
data_set.append(instance2)
except:
pass
data_set[-2:]

# In[5]:

# if a field name is wrong, the instance cannot be appended to the dataset
instance3 = Instance(premises='the third premise example .', hypothesis='the third hypothesis example.',
label=1)
try:
data_set.append(instance3)
except:
print('cannot append instance')
pass
data_set[-2:]

# In[6]:

# besides text, a tensor can also be the value of a field
import torch
tensor_ins = Instance(image=torch.randn(5, 5), label=0)
ds = DataSet()
ds.append(tensor_ins)
ds

from fastNLP import DataSet
from fastNLP import Instance

# read data from a csv file into a DataSet
# any csv-like file, i.e. a file with one example per line, can be read this way
dataset = DataSet.read_csv('tutorial_sample_dataset.csv', headers=('raw_sentence', 'label'), sep='\t')
# check the size of the DataSet
len(dataset)

# In[8]:

# use an integer index [k] to get the k-th sample
dataset[0]

# In[9]:

# the retrieved sample is an Instance
type(dataset[0])

# In[10]:

# use a slice [a: b] to get samples a through b
dataset[0: 3]

# In[11]:

# indices can also be negative
dataset[-1]

data_path = ['premise', 'hypothesis', 'label']

# read the files
with open(data_path[0]) as f:
premise = f.readlines()

with open(data_path[1]) as f:
hypothesis = f.readlines()

with open(data_path[2]) as f:
label = f.readlines()

assert len(premise) == len(hypothesis) and len(hypothesis) == len(label)

# build the DataSet
data_set = DataSet()
for p, h, l in zip(premise, hypothesis, label):
p = p.strip()  # strip trailing whitespace
h = h.strip()  # strip trailing whitespace
data_set.append(Instance(premise=p, hypothesis=h, truth=l))

data_set[0]

# ### Other DataSet operations
# after a DataSet is built, its content can still be manipulated; the interface is DataSet.apply()

# In[13]:

# convert all text in the premise field to lowercase
data_set.apply(lambda x: x['premise'].lower(), new_field_name='premise')
data_set[-2:]

# In[14]:

# convert label to int
data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
data_set[-2:]

# In[15]:

# split sentences on whitespace
def split_sent(ins):
return ins['premise'].split()

data_set.apply(split_sent, new_field_name='premise')
data_set.apply(lambda x: x['hypothesis'].split(), new_field_name='hypothesis')
data_set[-2:]

# In[16]:

# filter data
origin_data_set_len = len(data_set)
data_set.drop(lambda x: len(x['premise']) <= 6)
origin_data_set_len, len(data_set)

# In[17]:

# add length information
data_set.apply(lambda x: [1] * len(x['premise']), new_field_name='premise_len')
data_set.apply(lambda x: [1] * len(x['hypothesis']), new_field_name='hypothesis_len')
data_set[-1]

# In[18]:

# set the input (feature) fields and the target (label) field
data_set.set_input("premise", "premise_len", "hypothesis", "hypothesis_len")
data_set.set_target("truth")

# In[19]:

# rename a field
data_set.rename_field('truth', 'label')
data_set[-1]

# In[20]:

# split into training, validation and test sets
train_data, vad_data = data_set.split(0.5)
dev_data, test_data = vad_data.split(0.4)
len(train_data), len(dev_data), len(test_data)

# In[21]:

# deep-copy a dataset
import copy
train_data_2, dev_data_2 = copy.deepcopy(train_data), copy.deepcopy(dev_data)
del copy

# initialize a vocabulary with max vocab_size 10000 and minimum word frequency 2; '<unk>' is the unknown token, '<pad>' the padding token
# Vocabulary's default init parameters are max_size=None, min_freq=None, unknown='<unk>', padding='<pad>'
vocab = Vocabulary(max_size=10000, min_freq=2, unknown='<unk>', padding='<pad>')

# build the vocabulary
train_data.apply(lambda x: [vocab.add(word) for word in x['premise']])
train_data.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
vocab.build_vocab()

# In[23]:

# index the sentences with the vocabulary
train_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
train_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
train_data[-1], dev_data[-1], test_data[-1]

# read the vocab file
with open('vocab.txt') as f:
lines = f.readlines()
vocabs = []
for line in lines:
vocabs.append(line.strip())

# instantiate a Vocabulary
vocab_bert = Vocabulary(unknown=None, padding=None)
# add the vocabs list to the Vocabulary
vocab_bert.add_word_lst(vocabs)
# build the vocabulary
vocab_bert.build_vocab()
# update the token text for unknown and padding
vocab_bert.unknown = '[UNK]'
vocab_bert.padding = '[PAD]'

# In[25]:

# index the sentences with the vocabulary
train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']], new_field_name='premise')
train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
new_field_name='hypothesis')
dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']], new_field_name='premise')
dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
train_data_2[-1], dev_data_2[-1]

# step 1: load model parameters (optional)
from fastNLP.io.config_io import ConfigSection, ConfigLoader
args = ConfigSection()
ConfigLoader().load_config("./data/config", {"esim_model": args})
args["vocab_size"] = len(vocab)
args.data

# In[27]:

# step 2: load the ESIM model
from fastNLP.models import ESIM
model = ESIM(**args.data)
model

# In[28]:

# another example: load the CNN text classification model
from fastNLP.models import CNNText
cnn_text_model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
cnn_text_model

from fastNLP import CrossEntropyLoss
from fastNLP import Adam
from fastNLP import AccuracyMetric
trainer = Trainer(
train_data=train_data,
model=model,
loss=CrossEntropyLoss(pred='pred', target='label'),
metrics=AccuracyMetric(),
n_epochs=5,
batch_size=16,
print_every=-1,
validate_every=-1,
dev_data=dev_data,
use_cuda=True,
optimizer=Adam(lr=1e-3, weight_decay=0),
check_code_level=-1,
metric_key='acc',
use_tqdm=False,
)
trainer.train()

tester = Tester(
data=test_data,
model=model,
metrics=AccuracyMetric(),
batch_size=args["batch_size"],
)
tester.test()

os.chdir("../..")
