- [trainer] rename "batchify" to "make_batch" in trainer
- [trainer] pack (batch_x_pad, seq_len) into batch_x in make_batch for sequence labeling, because the sequence length before padding is needed to make masks
- [trainer] unpack it in data_forward
- [model] shorten the model definition
- [inference] build the inference class; test_POS_pipeline.py is now able to run inference
- [preprocessor] handle pickles in a nicer manner
- [FastNLP] add fastNLP.py as a high-level API, not finished yet

tags/v0.1.0
@@ -1,26 +1,116 @@
import torch

from fastNLP.action.action import Batchifier, SequentialSampler
from fastNLP.loader.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL


class Inference(object):
    """
    This is an interface focusing on predicting output based on trained models.
    It does not care about evaluations of the model, which is different from Tester.
    This is a high-level model wrapper to be called by FastNLP.
    """

    def __init__(self, pickle_path):
        self.batch_size = 1
        self.batch_output = []
        self.iterator = None
        self.pickle_path = pickle_path
        self.index2label = load_pickle(self.pickle_path, "id2class.pkl")
        self.word2index = load_pickle(self.pickle_path, "word2id.pkl")
    def predict(self, network, data):
        """
        Perform inference.
        :param network: a trained model
        :param data: two-level list of strings, a list of tokenized sentences
        :return result: the model outputs
        """
        # transform strings into indices
        data = self.prepare_input(data)

        # turn on the testing mode; clean up the history
        self.mode(network, test=True)

        self.iterator = iter(Batchifier(SequentialSampler(data), self.batch_size, drop_last=False))

        num_iter = len(data) // self.batch_size
        for step in range(num_iter):
            batch_x = self.batchify(data)

            prediction = self.data_forward(network, batch_x)

            self.batch_output.append(prediction)

        return self.prepare_output(self.batch_output)

    def mode(self, network, test=True):
        if test:
            network.eval()
        else:
            network.train()
        self.batch_output.clear()
    def data_forward(self, network, x):
        """
        This is only for sequence labeling with a CRF decoder. To do: make it more general.
        :param network: a trained model
        :param x: list of list, a padded batch of word indices
        :return: a flat list of predicted label indices
        """
        seq_len = [len(seq) for seq in x]
        x = torch.Tensor(x).long()
        y = network(x)
        prediction = network.prediction(y, seq_len)
        # To do: hide the underlying framework
        results = torch.Tensor(prediction).view(-1, )
        return list(results.data)
    def batchify(self, data):
        indices = next(self.iterator)
        batch_x = [data[idx] for idx in indices]
        batch_x = self.pad(batch_x)
        return batch_x

    @staticmethod
    def pad(batch, fill=0):
        """
        Pad a batch of samples to the maximum length in the batch.
        :param batch: list of list
        :param fill: word index to pad with, default 0.
        :return: a padded batch
        """
        max_length = max([len(x) for x in batch])
        for idx, sample in enumerate(batch):
            if len(sample) < max_length:
                batch[idx] = sample + [fill] * (max_length - len(sample))
        return batch
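A quick standalone check of the padding logic above, with the fill value repeated element-wise (plain Python, independent of the Inference class):

# toy batch of index sequences of unequal length
def pad(batch, fill=0):
    max_length = max(len(x) for x in batch)
    return [sample + [fill] * (max_length - len(sample)) for sample in batch]

print(pad([[4, 5], [7, 8, 9, 10], [3]]))
# [[4, 5, 0, 0], [7, 8, 9, 10, 0], [3, 0, 0, 0]]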
    def prepare_input(self, data):
        """
        Transform a two-level list of strings into a two-level list of indices.
        :param data:
            [
                [word_11, word_12, ...],
                [word_21, word_22, ...],
                ...
            ]
        :return data_index: the same shape as data, with each word replaced by its index
        """
        data_index = []
        default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL]
        for example in data:
            data_index.append([self.word2index.get(w, default_unknown_index) for w in example])
        return data_index

    def prepare_output(self, batch_outputs):
        """
        Transform a list of batch outputs into label strings.
        :param batch_outputs: list of list, [num_batch, tag_seq_length]
        :return: list of list of label strings
        """
        results = []
        for batch in batch_outputs:
            results.append([self.index2label[int(x.data)] for x in batch])
        return results
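For orientation, a hedged usage sketch of the new class. It assumes word2id.pkl and id2class.pkl already exist under pickle_path and that model is a trained SeqLabeling instance; the sample sentences are made up:

from fastNLP.action.inference import Inference

infer = Inference(pickle_path="./data_for_tests/")
raw_data = [["你", "好"], ["新", "年", "讲", "话"]]   # two-level list of tokens
results = infer.predict(model, raw_data)              # label strings, one list per batch
print(results)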
@@ -86,7 +86,7 @@ class BaseTrainer(Action):

            # training iterations in one epoch
            for step in range(iterations):
                batch_x, batch_y = self.make_batch(data_train)

                prediction = self.data_forward(network, batch_x)
@@ -180,7 +180,7 @@ class BaseTrainer(Action):
        """
        raise NotImplementedError

    def make_batch(self, data, output_length=True):
        """
        1. Perform batching from data and produce a batch of training data.
        2. Add padding.
        :param data: list. Each entry is a sample, which is also a list of features and label(s).
            E.g.
                [
                    [[word_11, word_12, word_13], [label_11, label_12]],  # sample 1
                    [[word_21, word_22, word_23], [label_21, label_22]],  # sample 2
                    ...
                ]
        :return (batch_x, seq_len), batch_y: if output_length is True.
            batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
            seq_len: list. The length of each sequence before padding.
            batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
        :return batch_x, batch_y: if output_length is False.
        """
        indices = next(self.iterator)
        batch = [data[idx] for idx in indices]
@@ -202,7 +205,7 @@ class BaseTrainer(Action):
        batch_x_pad = self.pad(batch_x)
        if output_length:
            seq_len = [len(x) for x in batch_x]
            return (batch_x_pad, seq_len), batch_y
        else:
            return batch_x_pad, batch_y
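The packed return value in the output_length=True case looks like this; a minimal sketch built from hypothetical toy samples instead of the real sampler and iterator:

samples = [[[1, 2, 3], [7, 7, 8]], [[4, 5], [8, 7]]]   # [[words], [labels]] per sample
batch_x = [s[0] for s in samples]                       # [[1, 2, 3], [4, 5]]
batch_y = [s[1] for s in samples]                       # [[7, 7, 8], [8, 7]]
seq_len = [len(x) for x in batch_x]                     # [3, 2], lengths before padding
max_len = max(seq_len)
batch_x_pad = [x + [0] * (max_len - len(x)) for x in batch_x]
batch = ((batch_x_pad, seq_len), batch_y)               # what data_forward has to unpack
print(batch)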
@@ -292,17 +295,23 @@ class POSTrainer(BaseTrainer):
        data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
        return data_train, data_dev, 0, 1

    def data_forward(self, network, inputs):
        """
        :param network: the PyTorch model
        :param inputs: list of list, [batch_size, max_len],
            or a tuple of (batch_x, seq_len), where batch_x is [batch_size, max_len]
        :return y: [batch_size, max_len, tag_size]
        """
        # unpack the value returned by make_batch
        if isinstance(inputs, tuple):
            x = inputs[0]
            self.seq_len = inputs[1]
        else:
            x = inputs
        x = torch.Tensor(x).long()
        self.batch_size = x.size(0)
        self.max_len = x.size(1)
        # self.mask = seq_mask(seq_len, self.max_len)
        y = network(x)
        return y
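The seq_len carried along by make_batch exists so that masks over padded positions can be built, as the commented-out seq_mask line suggests. The fastNLP implementation of seq_mask is not shown in this change, so the following is only an assumed sketch of what such a helper does:

import torch

def seq_mask(seq_len, max_len):
    # 1 marks real tokens, 0 marks padding positions
    mask = torch.zeros(len(seq_len), max_len, dtype=torch.long)
    for i, length in enumerate(seq_len):
        mask[i, :length] = 1
    return mask

print(seq_mask([3, 2], max_len=4))
# tensor([[1, 1, 1, 0],
#         [1, 1, 0, 0]])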
@@ -325,11 +334,12 @@ class POSTrainer(BaseTrainer):

    def get_loss(self, predict, truth):
        """
        Compute loss given prediction and ground truth.
        :param predict: prediction label vector, [batch_size, max_len, tag_size]
        :param truth: ground truth label vector, [batch_size, max_len]
        :return: a scalar
        """
        truth = torch.Tensor(truth)
        assert truth.shape == (self.batch_size, self.max_len)
        if self.loss_func is None:
            if hasattr(self.model, "loss"):
                self.loss_func = self.model.loss
@@ -347,6 +357,35 @@ class POSTrainer(BaseTrainer):
        else:
            return False

    def make_batch(self, data, output_length=True):
        """
        1. Perform batching from data and produce a batch of training data.
        2. Add padding.
        :param data: list. Each entry is a sample, which is also a list of features and label(s).
            E.g.
                [
                    [[word_11, word_12, word_13], [label_11, label_12]],  # sample 1
                    [[word_21, word_22, word_23], [label_21, label_22]],  # sample 2
                    ...
                ]
        :return (batch_x, seq_len), batch_y: if output_length is True.
            batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
            seq_len: list. The length of each sequence before padding.
            batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
        :return batch_x, batch_y: if output_length is False.
        """
        indices = next(self.iterator)
        batch = [data[idx] for idx in indices]
        batch_x = [sample[0] for sample in batch]
        batch_y = [sample[1] for sample in batch]
        batch_x_pad = self.pad(batch_x)
        if output_length:
            seq_len = [len(x) for x in batch_x]
            return (batch_x_pad, seq_len), batch_y
        else:
            return batch_x_pad, batch_y

class LanguageModelTrainer(BaseTrainer):
    """
@@ -438,7 +477,7 @@ class ClassTrainer(BaseTrainer):

        # training iterations in one epoch
        step = 0
        for batch_x, batch_y in self.make_batch(data_train):

            prediction = self.data_forward(network, batch_x)
            loss = self.get_loss(prediction, batch_y)
@@ -533,7 +572,7 @@ class ClassTrainer(BaseTrainer):
        """Apply gradient."""
        self.optimizer.step()

    def make_batch(self, data):
        """Batch and pad data."""
        for indices in self.iterator:
            batch = [data[idx] for idx in indices]
@@ -559,4 +598,4 @@ if __name__ == "__name__":
    train_args = {"epochs": 1, "validate": False, "batch_size": 3, "pickle_path": "./"}
    trainer = BaseTrainer(train_args)
    data_train = [[[1, 2, 3, 4], [0]] * 10] + [[[1, 3, 5, 2], [1]] * 10]
    trainer.make_batch(data=data_train)
@@ -0,0 +1,104 @@
from fastNLP.action.inference import Inference
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader

"""
mapping from model name to [URL, file_name.class_name]
Notice that the class of the model should be in the "models" directory.

Example:
    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
"""
FastNLP_MODEL_COLLECTION = {
    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
}


class FastNLP(object):
    """
    High-level interface for direct model inference.

    Usage:
        fastnlp = FastNLP()
        fastnlp.load("zh_pos_tag_model")
        text = "这是最好的基于深度学习的中文分词系统。"
        result = fastnlp.run(text)
        print(result)  # ["这", "是", "最好", "的", "基于", "深度学习", "的", "中文", "分词", "系统", "。"]
    """

    def __init__(self, model_dir="./"):
        self.model_dir = model_dir
        self.model = None

    def load(self, model_name):
        """
        Load a pre-trained FastNLP model together with additional data.
        :param model_name: str, the name of a FastNLP model.
        """
        assert type(model_name) is str
        if model_name not in FastNLP_MODEL_COLLECTION:
            raise ValueError("No FastNLP model named {}.".format(model_name))

        if not self.model_exist(model_dir=self.model_dir):
            self._download(model_name, FastNLP_MODEL_COLLECTION[model_name][0])

        model_class = self._get_model_class(FastNLP_MODEL_COLLECTION[model_name][1])

        model_args = ConfigSection()
        # To do: customized config file for model init parameters
        ConfigLoader.load_config(self.model_dir + "default.cfg", model_args)

        model = model_class(model_args)

        # To do: framework independent
        ModelLoader.load_pytorch(model, self.model_dir + model_name)

        self.model = model

        print("Model loaded.")

    def run(self, infer_input):
        """
        Perform inference over the given input using the loaded model.
        :param infer_input: str, raw text
        :return results:
        """
        infer = Inference()

        data = infer.prepare_input(infer_input)
        results = infer.predict(self.model, data)
        return results

    @staticmethod
    def _get_model_class(file_class_name):
        """
        Fetch the class specified by <file_class_name>.
        :param file_class_name: str, the name of the Python module followed by the name of the class.
            Example: "sequence_modeling.SeqLabeling"
        :return module: the model class
        """
        import_prefix = "fastNLP.models."
        parts = (import_prefix + file_class_name).split(".")
        from_module = ".".join(parts[:-1])
        module = __import__(from_module)
        for sub in parts[1:]:
            module = getattr(module, sub)
        return module

    def _load(self, model_dir, model_name):
        # To do
        return 0

    def _download(self, model_name, url):
        """
        Download the model weights from <url> and save them in <self.model_dir>.
        :param model_name:
        :param url:
        """
        print("Downloading {} from {}".format(model_name, url))
        # To do

    def model_exist(self, model_dir):
        """
        Check whether the desired model is already in the directory.
        :param model_dir:
        """
        pass
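_get_model_class resolves a dotted path with __import__ plus getattr. A standalone illustration of the same mechanism using a standard-library target instead of a fastNLP model class:

# Resolve "os.path.join" the way _get_model_class resolves
# "fastNLP.models.sequence_modeling.SeqLabeling".
parts = "os.path.join".split(".")
obj = __import__(".".join(parts[:-1]))  # imports the top-level package "os"
for sub in parts[1:]:
    obj = getattr(obj, sub)             # walk down: os -> os.path -> os.path.join
print(obj("data_for_tests", "config"))  # data_for_tests/config on POSIX systems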
@@ -17,7 +17,7 @@ class BaseLoader(object):

    def load_lines(self):
        with open(self.data_path, "r", encoding="utf-8") as f:
            text = f.readlines()
        return [line.strip() for line in text]


class ToyLoader0(BaseLoader):
@@ -11,9 +11,11 @@ class ModelLoader(BaseLoader):
    def __init__(self, data_name, data_path):
        super(ModelLoader, self).__init__(data_name, data_path)

    @staticmethod
    def load_pytorch(empty_model, model_path):
        """
        Load model parameters from .pkl files into the empty PyTorch model.
        :param empty_model: a PyTorch model with initialized parameters.
        :param model_path: str, the path to the saved model.
        """
        empty_model.load_state_dict(torch.load(model_path))
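Making load_pytorch a static method means restoring a model is just a state_dict round trip against a freshly constructed network. A minimal hedged sketch with a toy module (the file name and toy architecture are made up for illustration):

import torch
import torch.nn as nn

toy = nn.Linear(4, 2)
torch.save(toy.state_dict(), "./toy_model.pkl")        # what a saver writes to disk

empty = nn.Linear(4, 2)                                 # same architecture, fresh weights
empty.load_state_dict(torch.load("./toy_model.pkl"))   # what ModelLoader.load_pytorch does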
@@ -1,346 +1,361 @@ | |||||
import _pickle | |||||
import os | |||||
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0 | |||||
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1 | |||||
DEFAULT_RESERVED_LABEL = ['<reserved-2>', | |||||
'<reserved-3>', | |||||
'<reserved-4>'] # dict index = 2~4 | |||||
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||||
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, | |||||
DEFAULT_RESERVED_LABEL[2]: 4} | |||||
# the first vocab in dict with the index = 5 | |||||
class BasePreprocess(object): | |||||
def __init__(self, data, pickle_path): | |||||
super(BasePreprocess, self).__init__() | |||||
self.data = data | |||||
self.pickle_path = pickle_path | |||||
if not self.pickle_path.endswith('/'): | |||||
self.pickle_path = self.pickle_path + '/' | |||||
class POSPreprocess(BasePreprocess): | |||||
""" | |||||
This class are used to preprocess the pos datasets. | |||||
""" | |||||
def __init__(self, data, pickle_path="./", train_dev_split=0): | |||||
""" | |||||
Preprocess pipeline, including building mapping from words to index, from index to words, | |||||
from labels/classes to index, from index to labels/classes. | |||||
:param data: three-level list | |||||
[ | |||||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||||
... | |||||
] | |||||
:param pickle_path: str, the directory to the pickle files. Default: "./" | |||||
:param train_dev_split: float in [0, 1]. The ratio of dev data split from training data. Default: 0. | |||||
To do: | |||||
1. simplify __init__ | |||||
""" | |||||
super(POSPreprocess, self).__init__(data, pickle_path) | |||||
self.pickle_path = pickle_path | |||||
if self.pickle_exist("word2id.pkl"): | |||||
# load word2index because the construction of the following objects needs it | |||||
with open(os.path.join(self.pickle_path, "word2id.pkl"), "rb") as f: | |||||
self.word2index = _pickle.load(f) | |||||
else: | |||||
self.word2index, self.label2index = self.build_dict(data) | |||||
with open(os.path.join(self.pickle_path, "word2id.pkl"), "wb") as f: | |||||
_pickle.dump(self.word2index, f) | |||||
if self.pickle_exist("class2id.pkl"): | |||||
with open(os.path.join(self.pickle_path, "class2id.pkl"), "rb") as f: | |||||
self.label2index = _pickle.load(f) | |||||
else: | |||||
with open(os.path.join(self.pickle_path, "class2id.pkl"), "wb") as f: | |||||
_pickle.dump(self.label2index, f) | |||||
#something will be wrong if word2id.pkl is found but class2id.pkl is not found | |||||
if not self.pickle_exist("id2word.pkl"): | |||||
index2word = self.build_reverse_dict(self.word2index) | |||||
with open(os.path.join(self.pickle_path, "id2word.pkl"), "wb") as f: | |||||
_pickle.dump(index2word, f) | |||||
if not self.pickle_exist("id2class.pkl"): | |||||
index2label = self.build_reverse_dict(self.label2index) | |||||
with open(os.path.join(self.pickle_path, "word2id.pkl"), "wb") as f: | |||||
_pickle.dump(index2label, f) | |||||
if not self.pickle_exist("data_train.pkl"): | |||||
data_train = self.to_index(data) | |||||
if train_dev_split > 0 and not self.pickle_exist("data_dev.pkl"): | |||||
data_dev = data_train[: int(len(data_train) * train_dev_split)] | |||||
with open(os.path.join(self.pickle_path, "data_dev.pkl"), "wb") as f: | |||||
_pickle.dump(data_dev, f) | |||||
with open(os.path.join(self.pickle_path, "data_train.pkl"), "wb") as f: | |||||
_pickle.dump(data_train, f) | |||||
def build_dict(self, data): | |||||
""" | |||||
Add new words with indices into self.word_dict, new labels with indices into self.label_dict. | |||||
:param data: three-level list | |||||
[ | |||||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||||
... | |||||
] | |||||
:return word2index: dict of {str, int} | |||||
label2index: dict of {str, int} | |||||
""" | |||||
label2index = {} | |||||
word2index = DEFAULT_WORD_TO_INDEX | |||||
for example in data: | |||||
for word, label in zip(example[0], example[1]): | |||||
if word not in word2index: | |||||
word2index[word] = len(word2index) | |||||
if label not in label2index: | |||||
label2index[label] = len(label2index) | |||||
return word2index, label2index | |||||
def pickle_exist(self, pickle_name): | |||||
""" | |||||
:param pickle_name: the filename of target pickle file | |||||
:return: True if file exists else False | |||||
""" | |||||
if not os.path.exists(self.pickle_path): | |||||
os.makedirs(self.pickle_path) | |||||
file_name = os.path.join(self.pickle_path, pickle_name) | |||||
if os.path.exists(file_name): | |||||
return True | |||||
else: | |||||
return False | |||||
def build_reverse_dict(self, word_dict): | |||||
id2word = {word_dict[w]: w for w in word_dict} | |||||
return id2word | |||||
def to_index(self, data): | |||||
""" | |||||
Convert word strings and label strings into indices. | |||||
:param data: three-level list | |||||
[ | |||||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||||
... | |||||
] | |||||
:return data_index: the shape of data, but each string is replaced by its corresponding index | |||||
""" | |||||
data_index = [] | |||||
for example in data: | |||||
word_list = [] | |||||
label_list = [] | |||||
for word, label in zip(example[0], example[1]): | |||||
word_list.append(self.word2index[word]) | |||||
label_list.append(self.label2index[label]) | |||||
data_index.append([word_list, label_list]) | |||||
return data_index | |||||
@property | |||||
def vocab_size(self): | |||||
return len(self.word2index) | |||||
@property | |||||
def num_classes(self): | |||||
return len(self.label2index) | |||||
class ClassPreprocess(BasePreprocess): | |||||
""" | |||||
Pre-process the classification datasets. | |||||
Params: | |||||
pickle_path - directory to save result of pre-processing | |||||
Saves: | |||||
word2id.pkl | |||||
id2word.pkl | |||||
class2id.pkl | |||||
id2class.pkl | |||||
embedding.pkl | |||||
data_train.pkl | |||||
data_dev.pkl | |||||
data_test.pkl | |||||
""" | |||||
def __init__(self, pickle_path): | |||||
# super(ClassPreprocess, self).__init__(data, pickle_path) | |||||
self.word_dict = None | |||||
self.label_dict = None | |||||
self.pickle_path = pickle_path # save directory | |||||
def process(self, data, save_name): | |||||
""" | |||||
Process data. | |||||
Params: | |||||
data - nested list, data = [sample1, sample2, ...], | |||||
sample = [sentence, label], sentence = [word1, word2, ...] | |||||
save_name - name of processed data, such as data_train.pkl | |||||
Returns: | |||||
vocab_size - vocabulary size | |||||
n_classes - number of classes | |||||
""" | |||||
self.build_dict(data) | |||||
self.word2id() | |||||
vocab_size = self.id2word() | |||||
self.class2id() | |||||
num_classes = self.id2class() | |||||
self.embedding() | |||||
self.data_generate(data, save_name) | |||||
return vocab_size, num_classes | |||||
def build_dict(self, data): | |||||
"""Build vocabulary.""" | |||||
# just read if word2id.pkl and class2id.pkl exists | |||||
if self.pickle_exist("word2id.pkl") and \ | |||||
self.pickle_exist("class2id.pkl"): | |||||
file_name = os.path.join(self.pickle_path, "word2id.pkl") | |||||
with open(file_name, 'rb') as f: | |||||
self.word_dict = _pickle.load(f) | |||||
file_name = os.path.join(self.pickle_path, "class2id.pkl") | |||||
with open(file_name, 'rb') as f: | |||||
self.label_dict = _pickle.load(f) | |||||
return | |||||
# build vocabulary from scratch if nothing exists | |||||
self.word_dict = { | |||||
DEFAULT_PADDING_LABEL: 0, | |||||
DEFAULT_UNKNOWN_LABEL: 1, | |||||
DEFAULT_RESERVED_LABEL[0]: 2, | |||||
DEFAULT_RESERVED_LABEL[1]: 3, | |||||
DEFAULT_RESERVED_LABEL[2]: 4} | |||||
self.label_dict = {} | |||||
# collect every word and label | |||||
for sent, label in data: | |||||
if len(sent) <= 1: | |||||
continue | |||||
if label not in self.label_dict: | |||||
index = len(self.label_dict) | |||||
self.label_dict[label] = index | |||||
for word in sent: | |||||
if word not in self.word_dict: | |||||
index = len(self.word_dict) | |||||
self.word_dict[word[0]] = index | |||||
def pickle_exist(self, pickle_name): | |||||
""" | |||||
Check whether a pickle file exists. | |||||
Params | |||||
pickle_name: the filename of target pickle file | |||||
Return | |||||
True if file exists else False | |||||
""" | |||||
if not os.path.exists(self.pickle_path): | |||||
os.makedirs(self.pickle_path) | |||||
file_name = os.path.join(self.pickle_path, pickle_name) | |||||
if os.path.exists(file_name): | |||||
return True | |||||
else: | |||||
return False | |||||
def word2id(self): | |||||
"""Save vocabulary of {word:id} mapping format.""" | |||||
# nothing will be done if word2id.pkl exists | |||||
if self.pickle_exist("word2id.pkl"): | |||||
return | |||||
file_name = os.path.join(self.pickle_path, "word2id.pkl") | |||||
with open(file_name, "wb") as f: | |||||
_pickle.dump(self.word_dict, f) | |||||
def id2word(self): | |||||
"""Save vocabulary of {id:word} mapping format.""" | |||||
# nothing will be done if id2word.pkl exists | |||||
if self.pickle_exist("id2word.pkl"): | |||||
file_name = os.path.join(self.pickle_path, "id2word.pkl") | |||||
with open(file_name, 'rb') as f: | |||||
id2word_dict = _pickle.load(f) | |||||
return len(id2word_dict) | |||||
id2word_dict = {self.word_dict[w]: w for w in self.word_dict} | |||||
file_name = os.path.join(self.pickle_path, "id2word.pkl") | |||||
with open(file_name, "wb") as f: | |||||
_pickle.dump(id2word_dict, f) | |||||
return len(id2word_dict) | |||||
def class2id(self): | |||||
"""Save mapping of {class:id}.""" | |||||
# nothing will be done if class2id.pkl exists | |||||
if self.pickle_exist("class2id.pkl"): | |||||
return | |||||
file_name = os.path.join(self.pickle_path, "class2id.pkl") | |||||
with open(file_name, "wb") as f: | |||||
_pickle.dump(self.label_dict, f) | |||||
def id2class(self): | |||||
"""Save mapping of {id:class}.""" | |||||
# nothing will be done if id2class.pkl exists | |||||
if self.pickle_exist("id2class.pkl"): | |||||
file_name = os.path.join(self.pickle_path, "id2class.pkl") | |||||
with open(file_name, "rb") as f: | |||||
id2class_dict = _pickle.load(f) | |||||
return len(id2class_dict) | |||||
id2class_dict = {self.label_dict[c]: c for c in self.label_dict} | |||||
file_name = os.path.join(self.pickle_path, "id2class.pkl") | |||||
with open(file_name, "wb") as f: | |||||
_pickle.dump(id2class_dict, f) | |||||
return len(id2class_dict) | |||||
def embedding(self): | |||||
"""Save embedding lookup table corresponding to vocabulary.""" | |||||
# nothing will be done if embedding.pkl exists | |||||
if self.pickle_exist("embedding.pkl"): | |||||
return | |||||
# retrieve vocabulary from pre-trained embedding (not implemented) | |||||
def data_generate(self, data_src, save_name): | |||||
"""Convert dataset from text to digit.""" | |||||
# nothing will be done if file exists | |||||
save_path = os.path.join(self.pickle_path, save_name) | |||||
if os.path.exists(save_path): | |||||
return | |||||
data = [] | |||||
# for every sample | |||||
for sent, label in data_src: | |||||
if len(sent) <= 1: | |||||
continue | |||||
label_id = self.label_dict[label] # label id | |||||
sent_id = [] # sentence ids | |||||
for word in sent: | |||||
if word in self.word_dict: | |||||
sent_id.append(self.word_dict[word]) | |||||
else: | |||||
sent_id.append(self.word_dict[DEFAULT_UNKNOWN_LABEL]) | |||||
data.append([sent_id, label_id]) | |||||
# save data | |||||
with open(save_path, "wb") as f: | |||||
_pickle.dump(data, f) | |||||
class LMPreprocess(BasePreprocess): | |||||
def __init__(self, data, pickle_path): | |||||
super(LMPreprocess, self).__init__(data, pickle_path) | |||||
import _pickle | |||||
import os | |||||
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0 | |||||
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1 | |||||
DEFAULT_RESERVED_LABEL = ['<reserved-2>', | |||||
'<reserved-3>', | |||||
'<reserved-4>'] # dict index = 2~4 | |||||
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||||
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, | |||||
DEFAULT_RESERVED_LABEL[2]: 4} | |||||
# the first vocab in dict with the index = 5 | |||||
def save_pickle(obj, pickle_path, file_name): | |||||
with open(os.path.join(pickle_path, file_name), "wb") as f: | |||||
_pickle.dump(obj, f) | |||||
print("{} saved. ".format(file_name)) | |||||
def load_pickle(pickle_path, file_name): | |||||
with open(os.path.join(pickle_path, file_name), "rb") as f: | |||||
obj = _pickle.load(f) | |||||
return obj | |||||
def pickle_exist(pickle_path, pickle_name): | |||||
""" | |||||
:param pickle_path: the directory of target pickle file | |||||
:param pickle_name: the filename of target pickle file | |||||
:return: True if file exists else False | |||||
""" | |||||
if not os.path.exists(pickle_path): | |||||
os.makedirs(pickle_path) | |||||
file_name = os.path.join(pickle_path, pickle_name) | |||||
if os.path.exists(file_name): | |||||
return True | |||||
else: | |||||
return False | |||||
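Taken together, save_pickle, load_pickle, and pickle_exist centralize all pickle I/O for the preprocessors. A short hedged usage sketch, assuming this module is importable and using a made-up toy dictionary (file names follow the ones POSPreprocess writes):

word2index = {"<pad>": 0, "<unk>": 1, "迈": 5}   # toy vocabulary
save_pickle(word2index, "./data_for_tests/", "word2id.pkl")
if pickle_exist("./data_for_tests/", "word2id.pkl"):
    restored = load_pickle("./data_for_tests/", "word2id.pkl")
    assert restored == word2index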
class BasePreprocess(object): | |||||
def __init__(self, data, pickle_path): | |||||
super(BasePreprocess, self).__init__() | |||||
# self.data = data | |||||
self.pickle_path = pickle_path | |||||
if not self.pickle_path.endswith('/'): | |||||
self.pickle_path = self.pickle_path + '/' | |||||
class POSPreprocess(BasePreprocess): | |||||
""" | |||||
This class is used to preprocess POS tagging datasets.
""" | |||||
def __init__(self, data, pickle_path="./", train_dev_split=0): | |||||
""" | |||||
Preprocess pipeline, including building mapping from words to index, from index to words, | |||||
from labels/classes to index, from index to labels/classes. | |||||
:param data: three-level list | |||||
[ | |||||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||||
... | |||||
] | |||||
:param pickle_path: str, the directory to the pickle files. Default: "./" | |||||
:param train_dev_split: float in [0, 1]. The ratio of dev data split from training data. Default: 0. | |||||
""" | |||||
super(POSPreprocess, self).__init__(data, pickle_path) | |||||
self.pickle_path = pickle_path | |||||
if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): | |||||
self.word2index = load_pickle(self.pickle_path, "word2id.pkl") | |||||
self.label2index = load_pickle(self.pickle_path, "class2id.pkl") | |||||
else: | |||||
self.word2index, self.label2index = self.build_dict(data) | |||||
save_pickle(self.word2index, self.pickle_path, "word2id.pkl") | |||||
save_pickle(self.label2index, self.pickle_path, "class2id.pkl") | |||||
if not pickle_exist(pickle_path, "id2word.pkl"): | |||||
index2word = self.build_reverse_dict(self.word2index) | |||||
save_pickle(index2word, self.pickle_path, "id2word.pkl") | |||||
if not pickle_exist(pickle_path, "id2class.pkl"): | |||||
index2label = self.build_reverse_dict(self.label2index) | |||||
save_pickle(index2label, self.pickle_path, "id2class.pkl") | |||||
if not pickle_exist(pickle_path, "data_train.pkl"): | |||||
data_train = self.to_index(data) | |||||
if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"): | |||||
data_dev = data_train[: int(len(data_train) * train_dev_split)] | |||||
save_pickle(data_dev, self.pickle_path, "data_dev.pkl") | |||||
save_pickle(data_train, self.pickle_path, "data_train.pkl") | |||||
def build_dict(self, data): | |||||
""" | |||||
Add new words with indices into self.word_dict, new labels with indices into self.label_dict. | |||||
:param data: three-level list | |||||
[ | |||||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||||
... | |||||
] | |||||
:return word2index: dict of {str, int} | |||||
label2index: dict of {str, int} | |||||
""" | |||||
label2index = {} | |||||
word2index = DEFAULT_WORD_TO_INDEX | |||||
for example in data: | |||||
for word, label in zip(example[0], example[1]): | |||||
if word not in word2index: | |||||
word2index[word] = len(word2index) | |||||
if label not in label2index: | |||||
label2index[label] = len(label2index) | |||||
return word2index, label2index | |||||
def build_reverse_dict(self, word_dict): | |||||
id2word = {word_dict[w]: w for w in word_dict} | |||||
return id2word | |||||
def to_index(self, data): | |||||
""" | |||||
Convert word strings and label strings into indices. | |||||
:param data: three-level list | |||||
[ | |||||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||||
... | |||||
] | |||||
:return data_index: the shape of data, but each string is replaced by its corresponding index | |||||
""" | |||||
data_index = [] | |||||
for example in data: | |||||
word_list = [] | |||||
label_list = [] | |||||
for word, label in zip(example[0], example[1]): | |||||
word_list.append(self.word2index[word]) | |||||
label_list.append(self.label2index[label]) | |||||
data_index.append([word_list, label_list]) | |||||
return data_index | |||||
@property | |||||
def vocab_size(self): | |||||
return len(self.word2index) | |||||
@property | |||||
def num_classes(self): | |||||
return len(self.label2index) | |||||
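For orientation, a hedged sketch of driving POSPreprocess end to end, mirroring its use in test_POS_pipeline.py; the three-level toy data below is made up:

data = [
    [["你", "好"], ["r", "a"]],
    [["新", "年", "讲", "话"], ["a", "n", "v", "n"]],
]
p = POSPreprocess(data, pickle_path="./data_for_tests/", train_dev_split=0.5)
print(p.vocab_size, p.num_classes)   # sizes of the word2id.pkl and class2id.pkl mappings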
class ClassPreprocess(BasePreprocess): | |||||
""" | |||||
Pre-process the classification datasets. | |||||
Params: | |||||
pickle_path - directory to save result of pre-processing | |||||
Saves: | |||||
word2id.pkl | |||||
id2word.pkl | |||||
class2id.pkl | |||||
id2class.pkl | |||||
embedding.pkl | |||||
data_train.pkl | |||||
data_dev.pkl | |||||
data_test.pkl | |||||
""" | |||||
def __init__(self, pickle_path): | |||||
# super(ClassPreprocess, self).__init__(data, pickle_path) | |||||
self.word_dict = None | |||||
self.label_dict = None | |||||
self.pickle_path = pickle_path # save directory | |||||
def process(self, data, save_name): | |||||
""" | |||||
Process data. | |||||
Params: | |||||
data - nested list, data = [sample1, sample2, ...], | |||||
sample = [sentence, label], sentence = [word1, word2, ...] | |||||
save_name - name of processed data, such as data_train.pkl | |||||
Returns: | |||||
vocab_size - vocabulary size | |||||
n_classes - number of classes | |||||
""" | |||||
self.build_dict(data) | |||||
self.word2id() | |||||
vocab_size = self.id2word() | |||||
self.class2id() | |||||
num_classes = self.id2class() | |||||
self.embedding() | |||||
self.data_generate(data, save_name) | |||||
return vocab_size, num_classes | |||||
def build_dict(self, data): | |||||
"""Build vocabulary.""" | |||||
# just read if word2id.pkl and class2id.pkl exists | |||||
if self.pickle_exist("word2id.pkl") and \ | |||||
self.pickle_exist("class2id.pkl"): | |||||
file_name = os.path.join(self.pickle_path, "word2id.pkl") | |||||
with open(file_name, 'rb') as f: | |||||
self.word_dict = _pickle.load(f) | |||||
file_name = os.path.join(self.pickle_path, "class2id.pkl") | |||||
with open(file_name, 'rb') as f: | |||||
self.label_dict = _pickle.load(f) | |||||
return | |||||
# build vocabulary from scratch if nothing exists | |||||
self.word_dict = { | |||||
DEFAULT_PADDING_LABEL: 0, | |||||
DEFAULT_UNKNOWN_LABEL: 1, | |||||
DEFAULT_RESERVED_LABEL[0]: 2, | |||||
DEFAULT_RESERVED_LABEL[1]: 3, | |||||
DEFAULT_RESERVED_LABEL[2]: 4} | |||||
self.label_dict = {} | |||||
# collect every word and label | |||||
for sent, label in data: | |||||
if len(sent) <= 1: | |||||
continue | |||||
if label not in self.label_dict: | |||||
index = len(self.label_dict) | |||||
self.label_dict[label] = index | |||||
for word in sent:
if word not in self.word_dict:
index = len(self.word_dict)
self.word_dict[word] = index
def pickle_exist(self, pickle_name): | |||||
""" | |||||
Check whether a pickle file exists. | |||||
Params | |||||
pickle_name: the filename of target pickle file | |||||
Return | |||||
True if file exists else False | |||||
""" | |||||
if not os.path.exists(self.pickle_path): | |||||
os.makedirs(self.pickle_path) | |||||
file_name = os.path.join(self.pickle_path, pickle_name) | |||||
if os.path.exists(file_name): | |||||
return True | |||||
else: | |||||
return False | |||||
def word2id(self): | |||||
"""Save vocabulary of {word:id} mapping format.""" | |||||
# nothing will be done if word2id.pkl exists | |||||
if self.pickle_exist("word2id.pkl"): | |||||
return | |||||
file_name = os.path.join(self.pickle_path, "word2id.pkl") | |||||
with open(file_name, "wb") as f: | |||||
_pickle.dump(self.word_dict, f) | |||||
def id2word(self): | |||||
"""Save vocabulary of {id:word} mapping format.""" | |||||
# nothing will be done if id2word.pkl exists | |||||
if self.pickle_exist("id2word.pkl"): | |||||
file_name = os.path.join(self.pickle_path, "id2word.pkl") | |||||
with open(file_name, 'rb') as f: | |||||
id2word_dict = _pickle.load(f) | |||||
return len(id2word_dict) | |||||
id2word_dict = {self.word_dict[w]: w for w in self.word_dict} | |||||
file_name = os.path.join(self.pickle_path, "id2word.pkl") | |||||
with open(file_name, "wb") as f: | |||||
_pickle.dump(id2word_dict, f) | |||||
return len(id2word_dict) | |||||
def class2id(self): | |||||
"""Save mapping of {class:id}.""" | |||||
# nothing will be done if class2id.pkl exists | |||||
if self.pickle_exist("class2id.pkl"): | |||||
return | |||||
file_name = os.path.join(self.pickle_path, "class2id.pkl") | |||||
with open(file_name, "wb") as f: | |||||
_pickle.dump(self.label_dict, f) | |||||
def id2class(self): | |||||
"""Save mapping of {id:class}.""" | |||||
# nothing will be done if id2class.pkl exists | |||||
if self.pickle_exist("id2class.pkl"): | |||||
file_name = os.path.join(self.pickle_path, "id2class.pkl") | |||||
with open(file_name, "rb") as f: | |||||
id2class_dict = _pickle.load(f) | |||||
return len(id2class_dict) | |||||
id2class_dict = {self.label_dict[c]: c for c in self.label_dict} | |||||
file_name = os.path.join(self.pickle_path, "id2class.pkl") | |||||
with open(file_name, "wb") as f: | |||||
_pickle.dump(id2class_dict, f) | |||||
return len(id2class_dict) | |||||
def embedding(self): | |||||
"""Save embedding lookup table corresponding to vocabulary.""" | |||||
# nothing will be done if embedding.pkl exists | |||||
if self.pickle_exist("embedding.pkl"): | |||||
return | |||||
# retrieve vocabulary from pre-trained embedding (not implemented) | |||||
def data_generate(self, data_src, save_name): | |||||
"""Convert dataset from text to digit.""" | |||||
# nothing will be done if file exists | |||||
save_path = os.path.join(self.pickle_path, save_name) | |||||
if os.path.exists(save_path): | |||||
return | |||||
data = [] | |||||
# for every sample | |||||
for sent, label in data_src: | |||||
if len(sent) <= 1: | |||||
continue | |||||
label_id = self.label_dict[label] # label id | |||||
sent_id = [] # sentence ids | |||||
for word in sent: | |||||
if word in self.word_dict: | |||||
sent_id.append(self.word_dict[word]) | |||||
else: | |||||
sent_id.append(self.word_dict[DEFAULT_UNKNOWN_LABEL]) | |||||
data.append([sent_id, label_id]) | |||||
# save data | |||||
with open(save_path, "wb") as f: | |||||
_pickle.dump(data, f) | |||||
class LMPreprocess(BasePreprocess): | |||||
def __init__(self, data, pickle_path): | |||||
super(LMPreprocess, self).__init__(data, pickle_path) | |||||
def infer_preprocess(pickle_path, data):
    """
    Preprocess the inference data.
    Transform a two-level list of strings into a two-level list of indices:
        [
            [word_11, word_12, ...],
            [word_21, word_22, ...],
            ...
        ]
    :param pickle_path: str, the directory holding word2id.pkl
    :param data: two-level list of word strings
    :return data_index: the same shape as data, with unknown words mapped to the <unk> index
    """
    word2index = load_pickle(pickle_path, "word2id.pkl")
    default_unknown_index = word2index[DEFAULT_UNKNOWN_LABEL]
    data_index = []
    for example in data:
        data_index.append([word2index.get(w, default_unknown_index) for w in example])
    return data_index
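A tiny standalone check of the unknown-word fallback used above, with plain dicts instead of pickles:

word2index = {"<pad>": 0, "<unk>": 1, "你": 5, "好": 6}
sentence = ["你", "好", "吗"]                        # "吗" is out of vocabulary
unk = word2index["<unk>"]
print([word2index.get(w, unk) for w in sentence])    # [5, 6, 1]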
@@ -9,17 +9,12 @@ class SeqLabeling(BaseModel):
    PyTorch Network for sequence labeling
    """

    def __init__(self, args):
        super(SeqLabeling, self).__init__()
        vocab_size = args["vocab_size"]
        word_emb_dim = args["word_emb_dim"]
        hidden_dim = args["rnn_hidden_units"]
        num_classes = args["num_classes"]

        self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim)
        self.Rnn = encoder.lstm.Lstm(word_emb_dim, hidden_dim)
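With the shortened definition, SeqLabeling is built from a config-style mapping rather than a long positional signature. A hedged construction sketch; the values below simply echo the config keys added in this change, and the test script passes a ConfigSection loaded from the config file in the same way:

model_args = {
    "vocab_size": 20,          # normally len of the word2id.pkl mapping
    "word_emb_dim": 100,
    "rnn_hidden_units": 100,
    "num_classes": 10,         # normally len of the class2id.pkl mapping
}
model = SeqLabeling(model_args)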
@@ -29,7 +24,7 @@ class SeqLabeling(BaseModel):
    def forward(self, x):
        """
        :param x: LongTensor, [batch_size, max_len]
        :return y: [batch_size, max_len, tag_size]
        """
        x = self.Embedding(x)
        # [batch_size, max_len, word_emb_dim]
@@ -64,7 +59,7 @@ class SeqLabeling(BaseModel):

    def prediction(self, x, seq_length):
        """
        :param x: FloatTensor, [batch_size, max_len, tag_size]
        :param seq_length: int
        :return prediction: list of tuple of (decode path(list), best score)
        """
@@ -13,7 +13,7 @@ class Lstm(nn.Module):
        bidirectional : If True, becomes a bidirectional RNN. Default: False.
    """

    def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0, bidirectional=False):
        super(Lstm, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,
                            dropout=dropout, bidirectional=bidirectional)
@@ -74,3 +74,9 @@ save_dev_input = false
save_loss = true
batch_size = 1
pickle_path = "./data_for_tests/"
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
word_emb_dim = 100
dropout = 0.5
use_crf = true
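These new keys are exactly what the shortened SeqLabeling constructor reads. A hedged sketch of pulling them out with ConfigLoader, mirroring the calls in test_POS_pipeline.py (the section name "POS" is the one the test script uses; the snippet above may belong to a different section of the same file):

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
print(train_args["rnn_hidden_units"], train_args["word_emb_dim"], train_args["use_crf"])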
@@ -0,0 +1,2 @@
迈向充满希望的新世纪——一九九八年新年讲话
(附图片1张)
@@ -4,8 +4,8 @@ sys.path.append("..")

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.action.trainer import POSTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.action.tester import POSTester
@@ -15,32 +15,49 @@ from fastNLP.action.inference import Inference

data_name = "people.txt"
data_path = "data_for_tests/people.txt"
pickle_path = "data_for_tests"
data_infer_path = "data_for_tests/people_infer.txt"


def infer():
    # Load the inference configuration, the same as for testing
    test_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

    # fetch the dictionary size and the number of labels from the pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "id2class.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = SeqLabeling(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./saved_model.pkl")
    print("model loaded!")

    # Data Loader
    raw_data_loader = BaseLoader(data_name, data_infer_path)
    infer_data = raw_data_loader.load_lines()
    """
    Transform the raw text into a two-level list of strings:
        [
            [word_11, word_12, ...],
            [word_21, word_22, ...],
            ...
        ]
    In this case, each line in "people_infer.txt" is already a sentence, so load_lines() simply reads and strips the lines.
    """

    # Inference interface
    infer = Inference(pickle_path)
    results = infer.predict(model, infer_data)

    print(results)
    print("Inference finished!")
def train_test():
    # Config Loader
    train_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
@@ -58,10 +75,7 @@ if __name__ == "__main__":
    trainer = POSTrainer(train_args)

    # Model
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model)
@@ -75,13 +89,10 @@ if __name__ == "__main__":
    del model, trainer, pos_loader

    # Define the same model
    model = SeqLabeling(train_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./saved_model.pkl")
    print("model loaded!")

    # Load test configuration
@@ -97,3 +108,7 @@ if __name__ == "__main__":

    # print test results
    print(tester.show_matrices())
    print("model tested!")


if __name__ == "__main__":
    infer()