@@ -0,0 +1,44 @@ | |||
import pickle | |||
import numpy as np | |||
from fastNLP.core.dataset import DataSet | |||
from fastNLP.loader.model_loader import ModelLoader | |||
from fastNLP.core.predictor import Predictor | |||
class POS_tagger:
    """Part-of-speech tagging entry point.

    ``predict`` builds a ``DataSet`` from raw strings, runs a pickled
    preprocessing pipeline over it, loads a saved model and hands both to a
    ``Predictor``. Converting the raw predictor output into the final answer
    is still a TODO, so ``predict`` currently always returns ``None``.
    """

    def __init__(self):
        pass

    def predict(self, query):
        """Run the pipeline and the model over ``query``.

        :param query: List[str]
        :return answer: List[str] (not implemented yet; currently ``None``)
        """
        # TODO: build the DataSet from query
        # NOTE(review): relies on DataSet supporting item assignment, which is
        # not visible in this change - confirm.
        pos_dataset = DataSet()
        pos_dataset["text_field"] = np.array(query)

        # load the pipeline (placeholder path)
        pipeline = self.load_pipeline("./xxxx")
        # run the pipeline with the DataSet as its argument
        pos_dataset = pipeline(pos_dataset)

        # load the model (placeholder path)
        model = ModelLoader().load_pytorch("./xxx")

        # call the predictor
        predictor = Predictor()
        output = predictor.predict(model, pos_dataset)

        # TODO: convert `output` into the final answer
        return None

    @staticmethod
    def load_pipeline(path):
        """Unpickle and return the pipeline object stored at ``path``.

        :param path: str, path to a pickle file.
        :return pipeline: the unpickled object.
        """
        # pickle data is binary: opening in text mode makes pickle.load fail.
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(path, "rb") as fp:
            pipeline = pickle.load(fp)
        return pipeline
@@ -56,8 +56,8 @@ class Batch(object): | |||
indices = self.idx_list[self.curidx:endidx] | |||
for field_name, field in self.dataset.get_fields(): | |||
batch = field.get(indices) | |||
if not field.tensorable: #TODO 修改 | |||
batch = torch.from_numpy(field.get(indices)) | |||
if not field.need_tensor: #TODO 修改 | |||
pass | |||
elif field.is_target: | |||
batch_y[field_name] = batch | |||
@@ -2,10 +2,12 @@ import random | |||
import sys | |||
from collections import defaultdict | |||
from copy import deepcopy | |||
import numpy as np | |||
from fastNLP.core.field import TextField, LabelField | |||
from fastNLP.core.instance import Instance | |||
from fastNLP.core.vocabulary import Vocabulary | |||
from fastNLP.core.fieldarray import FieldArray | |||
_READERS = {} | |||
@@ -14,43 +16,36 @@ class DataSet(object): | |||
""" | |||
def __init__(self, fields=None): | |||
""" | |||
""" | |||
pass | |||
def index_all(self, vocab): | |||
for ins in self: | |||
ins.index_all(vocab) | |||
return self | |||
def __init__(self, instance=None): | |||
if instance is not None: | |||
self._convert_ins(instance) | |||
else: | |||
self.field_arrays = {} | |||
def index_field(self, field_name, vocab): | |||
if isinstance(field_name, str): | |||
field_list = [field_name] | |||
vocab_list = [vocab] | |||
def _convert_ins(self, ins_list): | |||
if isinstance(ins_list, list): | |||
for ins in ins_list: | |||
self.append(ins) | |||
else: | |||
classes = (list, tuple) | |||
assert isinstance(field_name, classes) and isinstance(vocab, classes) and len(field_name) == len(vocab) | |||
field_list = field_name | |||
vocab_list = vocab | |||
for name, vocabs in zip(field_list, vocab_list): | |||
for ins in self: | |||
ins.index_field(name, vocabs) | |||
return self | |||
self.append(ins) | |||
def to_tensor(self, idx: int, padding_length: dict): | |||
"""Convert an instance in a dataset to tensor. | |||
def append(self, ins): | |||
# no field | |||
if len(self.field_arrays) == 0: | |||
for name, field in ins.field.items(): | |||
self.field_arrays[name] = FieldArray(name, [field]) | |||
else: | |||
assert len(self.field_arrays) == len(ins.field) | |||
for name, field in ins.field.items(): | |||
assert name in self.field_arrays | |||
self.field_arrays[name].append(field) | |||
:param idx: int, the index of the instance in the dataset. | |||
:param padding_length: int | |||
:return tensor_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ]) | |||
tensor_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ]) | |||
def get_fields(self): | |||
return self.field_arrays | |||
""" | |||
ins = self[idx] | |||
return ins.to_tensor(padding_length, self.origin_len) | |||
def __len__(self): | |||
field = self.field_arrays.values()[0] | |||
return len(field) | |||
def get_length(self): | |||
"""Fetch lengths of all fields in all instances in a dataset. | |||
@@ -59,15 +54,10 @@ class DataSet(object): | |||
The list contains lengths of this field in all instances. | |||
""" | |||
lengths = defaultdict(list) | |||
for ins in self: | |||
for field_name, field_length in ins.get_length().items(): | |||
lengths[field_name].append(field_length) | |||
return lengths | |||
pass | |||
def shuffle(self): | |||
random.shuffle(self) | |||
return self | |||
pass | |||
def split(self, ratio, shuffle=True): | |||
"""Train/dev splitting | |||
@@ -78,58 +68,37 @@ class DataSet(object): | |||
dev_set: a DataSet object, representing the validation set | |||
""" | |||
assert 0 < ratio < 1 | |||
if shuffle: | |||
self.shuffle() | |||
split_idx = int(len(self) * ratio) | |||
dev_set = deepcopy(self) | |||
train_set = deepcopy(self) | |||
del train_set[:split_idx] | |||
del dev_set[split_idx:] | |||
return train_set, dev_set | |||
pass | |||
def rename_field(self, old_name, new_name): | |||
"""rename a field | |||
""" | |||
for ins in self: | |||
ins.rename_field(old_name, new_name) | |||
if old_name in self.field_arrays: | |||
self.field_arrays[new_name] = self.field_arrays.pop(old_name) | |||
else: | |||
raise KeyError | |||
return self | |||
def set_target(self, **fields): | |||
def set_is_target(self, **fields): | |||
"""Change the flag of `is_target` for all instance. For fields not set here, leave their `is_target` unchanged. | |||
:param key-value pairs for field-name and `is_target` value(True, False or None). | |||
""" | |||
for ins in self: | |||
ins.set_target(**fields) | |||
return self | |||
def update_vocab(self, **name_vocab): | |||
"""using certain field data to update vocabulary. | |||
e.g. :: | |||
# update word vocab and label vocab seperately | |||
dataset.update_vocab(word_seq=word_vocab, label_seq=label_vocab) | |||
:param key-value pairs for field-name and `is_target` value(True, False). | |||
""" | |||
for field_name, vocab in name_vocab.items(): | |||
for ins in self: | |||
vocab.update(ins[field_name].contents()) | |||
for name, val in fields.items(): | |||
if name in self.field_arrays: | |||
assert isinstance(val, bool) | |||
self.field_arrays[name].is_target = val | |||
else: | |||
raise KeyError | |||
return self | |||
def set_origin_len(self, origin_field, origin_len_name=None): | |||
"""make dataset tensor output contain origin_len field. | |||
e.g. :: | |||
# output "word_seq_origin_len", lengths based on "word_seq" field | |||
dataset.set_origin_len("word_seq") | |||
""" | |||
if origin_field is None: | |||
self.origin_len = None | |||
else: | |||
self.origin_len = (origin_field + "_origin_len", origin_field) \ | |||
if origin_len_name is None else (origin_len_name, origin_field) | |||
def set_need_tensor(self, **kwargs): | |||
for name, val in kwargs.items(): | |||
if name in self.field_arrays: | |||
assert isinstance(val, bool) | |||
self.field_arrays[name].need_tensor = val | |||
else: | |||
raise KeyError | |||
return self | |||
def __getattribute__(self, name): | |||
@@ -7,10 +7,9 @@ class Field(object): | |||
""" | |||
def __init__(self, name, is_target: bool): | |||
self.name = name | |||
def __init__(self, content, is_target: bool): | |||
self.is_target = is_target | |||
self.content = None | |||
self.content = content | |||
def index(self, vocab): | |||
"""create index field | |||
@@ -29,23 +28,15 @@ class Field(object): | |||
raise NotImplementedError | |||
def __repr__(self): | |||
return self.contents().__repr__() | |||
def new(self, *args, **kwargs): | |||
return self.__class__(*args, **kwargs, is_target=self.is_target) | |||
return self.content.__repr__() | |||
class TextField(Field): | |||
def __init__(self, name, text, is_target): | |||
def __init__(self, text, is_target): | |||
""" | |||
:param text: list of strings | |||
:param is_target: bool | |||
""" | |||
super(TextField, self).__init__(name, is_target) | |||
self.content = text | |||
def index(self, vocab): | |||
idx_field = IndexField(self.name+'_idx', self.content, vocab, self.is_target) | |||
return idx_field | |||
super(TextField, self).__init__(text, is_target) | |||
class IndexField(Field): | |||
@@ -82,75 +73,19 @@ class LabelField(Field): | |||
""" | |||
def __init__(self, label, is_target=True): | |||
super(LabelField, self).__init__(is_target) | |||
self.label = label | |||
self._index = None | |||
super(LabelField, self).__init__(label, is_target) | |||
def get_length(self): | |||
"""Fetch the length of the label field. | |||
:return length: int, the length of the label, always 1. | |||
""" | |||
return 1 | |||
def index(self, vocab): | |||
if self._index is None: | |||
if isinstance(self.label, str): | |||
self._index = vocab[self.label] | |||
return self._index | |||
def to_tensor(self, padding_length): | |||
if self._index is None: | |||
if isinstance(self.label, int): | |||
return torch.tensor(self.label) | |||
elif isinstance(self.label, str): | |||
raise RuntimeError("Field {} not indexed. Call index method.".format(self.label)) | |||
else: | |||
raise RuntimeError( | |||
"Not support type for LabelField. Expect str or int, got {}.".format(type(self.label))) | |||
else: | |||
return torch.LongTensor([self._index]) | |||
def contents(self): | |||
return [self.label] | |||
class SeqLabelField(Field): | |||
def __init__(self, label_seq, is_target=True): | |||
super(SeqLabelField, self).__init__(is_target) | |||
self.label_seq = label_seq | |||
self._index = None | |||
def get_length(self): | |||
return len(self.label_seq) | |||
def index(self, vocab): | |||
if self._index is None: | |||
self._index = [vocab[c] for c in self.label_seq] | |||
return self._index | |||
def to_tensor(self, padding_length): | |||
pads = [0] * (padding_length - self.get_length()) | |||
if self._index is None: | |||
if self.get_length() == 0: | |||
return torch.LongTensor(pads) | |||
elif isinstance(self.label_seq[0], int): | |||
return torch.LongTensor(self.label_seq + pads) | |||
elif isinstance(self.label_seq[0], str): | |||
raise RuntimeError("Field {} not indexed. Call index method.".format(self.label)) | |||
else: | |||
raise RuntimeError( | |||
"Not support type for SeqLabelField. Expect str or int, got {}.".format(type(self.label))) | |||
else: | |||
return torch.LongTensor(self._index + pads) | |||
def contents(self): | |||
return self.label_seq.copy() | |||
super(SeqLabelField, self).__init__(label_seq, is_target) | |||
class CharTextField(Field): | |||
def __init__(self, text, max_word_len, is_target=False): | |||
super(CharTextField, self).__init__(is_target) | |||
self.text = text | |||
# TODO | |||
raise NotImplementedError | |||
self.max_word_len = max_word_len | |||
self._index = [] | |||
@@ -0,0 +1,39 @@ | |||
import torch | |||
import numpy as np | |||
class FieldArray(object):
    """A named column of per-instance values stored as numpy arrays.

    ``get`` with several indices returns one 2-D ``int32`` array in which
    shorter rows are right-padded with ``padding_val``.
    """

    def __init__(self, name, content, padding_val=0, is_target=True, need_tensor=True):
        """
        :param name: str, the name of this column.
        :param content: iterable of values; each one is converted to a numpy array.
        :param padding_val: value used to fill the padded positions in ``get``.
        :param is_target: bool flag stored on the column.
        :param need_tensor: bool flag stored on the column.
        """
        self.name = name
        self.data = [self._convert_np(val) for val in content]
        self.padding_val = padding_val
        self.is_target = is_target
        self.need_tensor = need_tensor

    def _convert_np(self, val):
        """Return ``val`` unchanged if it is already an ndarray, else wrap it in one."""
        # np.array is a factory function; the array *type* is np.ndarray.
        if isinstance(val, np.ndarray):
            return val
        return np.array(val)

    def append(self, val):
        """Append one value to the column, converting it to a numpy array."""
        self.data.append(self._convert_np(val))

    def get(self, idxes):
        """Fetch one value, or a padded batch of values.

        :param idxes: an int for a single value, or a list / tuple / 1-D
            ndarray of ints for a batch.
        :return: the stored array for an int index; otherwise an ``int32``
            array of shape [len(idxes), longest selected row] padded with
            ``padding_val``. An empty index sequence gives shape [0, 0].
        :raises TypeError: if ``idxes`` is of an unsupported type.

        Batch mode reads ``shape[0]`` of every selected value, so the stored
        values must be at least 1-D.
        """
        if isinstance(idxes, (int, np.integer)):
            return self.data[idxes]
        if not isinstance(idxes, (list, tuple, np.ndarray)):
            raise TypeError(
                "idxes must be an int or a sequence of ints, got {}".format(type(idxes)))

        rows = [self.data[i] for i in idxes]
        max_len = max((row.shape[0] for row in rows), default=0)
        array = np.full((len(rows), max_len), self.padding_val, dtype=np.int32)
        for pos, row in enumerate(rows):
            # positions past the row's own length keep padding_val
            array[pos, :row.shape[0]] = row
        return array

    def __len__(self):
        """Return the number of values stored in the column."""
        return len(self.data)
@@ -7,8 +7,6 @@ class Instance(object): | |||
def __init__(self, **fields): | |||
self.fields = fields | |||
self.has_index = False | |||
self.indexes = {} | |||
def add_field(self, field_name, field): | |||
self.fields[field_name] = field | |||
@@ -17,8 +15,6 @@ class Instance(object): | |||
def rename_field(self, old_name, new_name): | |||
if old_name in self.fields: | |||
self.fields[new_name] = self.fields.pop(old_name) | |||
if old_name in self.indexes: | |||
self.indexes[new_name] = self.indexes.pop(old_name) | |||
else: | |||
raise KeyError("error, no such field: {}".format(old_name)) | |||
return self | |||
@@ -38,53 +34,5 @@ class Instance(object): | |||
def __setitem__(self, name, field): | |||
return self.add_field(name, field) | |||
def get_length(self): | |||
"""Fetch the length of all fields in the instance. | |||
:return length: dict of (str: int), which means (field name: field length). | |||
""" | |||
length = {name: field.get_length() for name, field in self.fields.items()} | |||
return length | |||
def index_field(self, field_name, vocab): | |||
"""use `vocab` to index certain field | |||
""" | |||
self.indexes[field_name] = self.fields[field_name].index(vocab) | |||
return self | |||
def index_all(self, vocab): | |||
"""use `vocab` to index all fields | |||
""" | |||
if self.has_index: | |||
print("error") | |||
return self.indexes | |||
indexes = {name: field.index(vocab) for name, field in self.fields.items()} | |||
self.indexes = indexes | |||
return indexes | |||
def to_tensor(self, padding_length: dict, origin_len=None): | |||
"""Convert instance to tensor. | |||
:param padding_length: dict of (str: int), which means (field name: padding_length of this field) | |||
:return tensor_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ]) | |||
tensor_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ]) | |||
If is_target is False for all fields, tensor_y would be an empty dict. | |||
""" | |||
tensor_x = {} | |||
tensor_y = {} | |||
for name, field in self.fields.items(): | |||
if field.is_target is True: | |||
tensor_y[name] = field.to_tensor(padding_length[name]) | |||
elif field.is_target is False: | |||
tensor_x[name] = field.to_tensor(padding_length[name]) | |||
else: | |||
# is_target is None | |||
continue | |||
if origin_len is not None: | |||
name, field_name = origin_len | |||
tensor_x[name] = torch.LongTensor([self.fields[field_name].get_length()]) | |||
return tensor_x, tensor_y | |||
def __repr__(self): | |||
return self.fields.__repr__() |
@@ -2,9 +2,7 @@ import numpy as np | |||
import torch | |||
from fastNLP.core.batch import Batch | |||
from fastNLP.core.preprocess import load_pickle | |||
from fastNLP.core.sampler import SequentialSampler | |||
from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq2tag_dataset, convert_seq_dataset | |||
class Predictor(object): | |||
@@ -16,19 +14,9 @@ class Predictor(object): | |||
Currently, Predictor does not support GPU. | |||
""" | |||
def __init__(self, pickle_path, post_processor): | |||
""" | |||
:param pickle_path: str, the path to the pickle files. | |||
:param post_processor: a function or callable object, that takes list of batch outputs as input | |||
""" | |||
def __init__(self): | |||
self.batch_size = 1 | |||
self.batch_output = [] | |||
self.pickle_path = pickle_path | |||
self._post_processor = post_processor | |||
self.label_vocab = load_pickle(self.pickle_path, "label2id.pkl") | |||
self.word_vocab = load_pickle(self.pickle_path, "word2id.pkl") | |||
def predict(self, network, data): | |||
"""Perform inference using the trained model. | |||
@@ -37,9 +25,6 @@ class Predictor(object): | |||
:param data: a DataSet object. | |||
:return: list of list of strings, [num_examples, tag_seq_length] | |||
""" | |||
# transform strings into DataSet object | |||
# data = self.prepare_input(data) | |||
# turn on the testing mode; clean up the history | |||
self.mode(network, test=True) | |||
batch_output = [] | |||
@@ -51,7 +36,7 @@ class Predictor(object): | |||
prediction = self.data_forward(network, batch_x) | |||
batch_output.append(prediction) | |||
return self._post_processor(batch_output, self.label_vocab) | |||
return batch_output | |||
def mode(self, network, test=True): | |||
if test: | |||
@@ -64,37 +49,19 @@ class Predictor(object): | |||
y = network(**x) | |||
return y | |||
def prepare_input(self, data): | |||
"""Transform two-level list of strings into an DataSet object. | |||
In the training pipeline, this is done by Preprocessor. But in inference time, we do not call Preprocessor. | |||
:param data: list of list of strings. | |||
:: | |||
[ | |||
[word_11, word_12, ...], | |||
[word_21, word_22, ...], | |||
... | |||
] | |||
:return data_set: a DataSet instance. | |||
""" | |||
assert isinstance(data, list) | |||
data = convert_seq_dataset(data) | |||
data.index_field("word_seq", self.word_vocab) | |||
class SeqLabelInfer(Predictor): | |||
def __init__(self, pickle_path): | |||
print( | |||
"[FastNLP Warning] SeqLabelInfer will be deprecated. Please use Predictor directly.") | |||
super(SeqLabelInfer, self).__init__(pickle_path, seq_label_post_processor) | |||
super(SeqLabelInfer, self).__init__() | |||
class ClassificationInfer(Predictor): | |||
def __init__(self, pickle_path): | |||
print( | |||
"[FastNLP Warning] ClassificationInfer will be deprecated. Please use Predictor directly.") | |||
super(ClassificationInfer, self).__init__(pickle_path, text_classify_post_processor) | |||
super(ClassificationInfer, self).__init__() | |||
def seq_label_post_processor(batch_outputs, label_vocab): | |||
@@ -8,8 +8,8 @@ class ModelLoader(BaseLoader): | |||
Loader for models. | |||
""" | |||
def __init__(self, data_path): | |||
super(ModelLoader, self).__init__(data_path) | |||
def __init__(self): | |||
super(ModelLoader, self).__init__() | |||
@staticmethod | |||
def load_pytorch(empty_model, model_path): | |||
@@ -19,3 +19,10 @@ class ModelLoader(BaseLoader): | |||
:param model_path: str, the path to the saved model. | |||
""" | |||
empty_model.load_state_dict(torch.load(model_path)) | |||
@staticmethod | |||
def load_pytorch(model_path): | |||
"""Load the entire model. | |||
""" | |||
return torch.load(model_path) |
@@ -127,7 +127,8 @@ class AdvSeqLabel(SeqLabeling): | |||
:param word_seq: LongTensor, [batch_size, max_len] | |||
:param word_seq_origin_len: list of int. | |||
:param truth: LongTensor, [batch_size, max_len] | |||
:return y: | |||
:return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting. | |||
If truth is not None, return loss, a scalar. Used in training. | |||
""" | |||
self.mask = self.make_mask(word_seq, word_seq_origin_len) | |||
@@ -15,10 +15,14 @@ class ModelSaver(object): | |||
""" | |||
self.save_path = save_path | |||
def save_pytorch(self, model): | |||
def save_pytorch(self, model, param_only=True): | |||
"""Save a pytorch model into .pkl file. | |||
:param model: a PyTorch model | |||
:param param_only: bool, whether only to save the model parameters or the entire model. | |||
""" | |||
torch.save(model.state_dict(), self.save_path) | |||
if param_only is True: | |||
torch.save(model.state_dict(), self.save_path) | |||
else: | |||
torch.save(model, self.save_path) |
@@ -59,42 +59,37 @@ def infer(): | |||
print("Inference finished!") | |||
def train(): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
test_args = ConfigSection() | |||
ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args}) | |||
def train(): | |||
# load config | |||
trainer_args = ConfigSection() | |||
model_args = ConfigSection() | |||
ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args}) | |||
# Data Loader | |||
loader = PeopleDailyCorpusLoader() | |||
train_data, _ = loader.load() | |||
# Preprocessor | |||
preprocessor = SeqLabelPreprocess() | |||
data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) | |||
train_args["vocab_size"] = preprocessor.vocab_size | |||
train_args["num_classes"] = preprocessor.num_classes | |||
# TODO: define processors | |||
# define pipeline | |||
pp = Pipeline() | |||
# TODO: pp.add_processor() | |||
# Trainer | |||
trainer = SeqLabelTrainer(**train_args.data) | |||
# run the pipeline, get data_set | |||
train_data = pp(train_data) | |||
# Model | |||
# define a model | |||
model = AdvSeqLabel(train_args) | |||
try: | |||
ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
print('model parameter loaded!') | |||
except Exception as e: | |||
print("No saved model. Continue.") | |||
pass | |||
# Start training | |||
# call trainer to train | |||
trainer = SeqLabelTrainer(train_args) | |||
trainer.train(model, data_train, data_dev) | |||
print("Training finished!") | |||
# Saver | |||
saver = ModelSaver("./save/saved_model.pkl") | |||
saver.save_pytorch(model) | |||
print("Model saved!") | |||
# save model | |||
ModelSaver("./saved_model.pkl").save_pytorch(model, param_only=False) | |||
# TODO:save pipeline | |||
def test(): | |||
@@ -1,4 +1,4 @@ | |||
numpy>=1.14.2 | |||
torch==0.4.0 | |||
torch>=0.4.0 | |||
torchvision>=0.1.8 | |||
tensorboardX |