
Changes to preprocessor, trainer, inference & seq modeling

- [trainer] rename "batchify" to "make_batch"
- [trainer] pack (batch_x_pad, seq_len) into batch_x in make_batch for sequence labeling, because the sequence lengths before padding are needed to build masks (see the sketch after this list)
- [trainer] unpack the tuple in data_forward
- [model] shorten the model definition
- [inference] build the Inference class; test_POS_pipeline.py now runs inference
- [preprocessor] handle pickles in a nicer manner
- [FastNLP] add fastNLP.py as a high-level API (not finished yet)
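
A minimal sketch of the make_batch/data_forward handshake described above (toy data, standalone; not the trainer code itself):

# POSTrainer.make_batch now returns ((batch_x_pad, seq_len), batch_y) when output_length=True,
# so data_forward can recover the lengths before padding and build masks from them.
batch = [[[4, 7, 2], [1, 0, 1]],                  # sample 1: word indices, label indices
         [[9, 3], [0, 1]]]                        # sample 2
batch_x = [sample[0] for sample in batch]
batch_y = [sample[1] for sample in batch]
seq_len = [len(x) for x in batch_x]               # [3, 2], lengths before padding
max_len = max(seq_len)
batch_x_pad = [x + [0] * (max_len - len(x)) for x in batch_x]
packed = ((batch_x_pad, seq_len), batch_y)        # what make_batch hands back
(x, lengths), y = packed                          # what data_forward unpacks
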
tags/v0.1.0
FengZiYjun, 6 years ago
commit fe17f611b6
11 changed files with 679 additions and 411 deletions
  1. fastNLP/action/inference.py (+102, -12)
  2. fastNLP/action/trainer.py (+52, -13)
  3. fastNLP/fastNLP.py (+104, -0)
  4. fastNLP/loader/base_loader.py (+1, -1)
  5. fastNLP/loader/model_loader.py (+4, -2)
  6. fastNLP/loader/preprocess.py (+361, -346)
  7. fastNLP/models/sequence_modeling.py (+7, -12)
  8. fastNLP/modules/encoder/lstm.py (+1, -1)
  9. test/data_for_tests/config (+6, -0)
  10. test/data_for_tests/people_infer.txt (+2, -0)
  11. test/test_POS_pipeline.py (+39, -24)

fastNLP/action/inference.py (+102, -12)

@@ -1,26 +1,116 @@
import torch

from fastNLP.action.action import Batchifier, SequentialSampler
from fastNLP.loader.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL


class Inference(object):
"""
This is an interface focusing on predicting output based on trained models.
It does not care about evaluations of the model.
It does not care about evaluations of the model, which is different from Tester.
This is a high-level model wrapper to be called by FastNLP.

"""

def __init__(self):
pass
def __init__(self, pickle_path):
self.batch_size = 1
self.batch_output = []
self.iterator = None
self.pickle_path = pickle_path
self.index2label = load_pickle(self.pickle_path, "id2class.pkl")
self.word2index = load_pickle(self.pickle_path, "word2id.pkl")

def predict(self, network, data):
"""
Perform inference.
:param network:
:param data: multi-level lists of strings
:return result: the model outputs
"""
# transform strings into indices
data = self.prepare_input(data)

# turn on the testing mode; clean up the history
self.mode(network, test=True)

self.iterator = iter(Batchifier(SequentialSampler(data), self.batch_size, drop_last=False))

num_iter = len(data) // self.batch_size

for step in range(num_iter):
batch_x = self.batchify(data)

prediction = self.data_forward(network, batch_x)

self.batch_output.append(prediction)

return self.prepare_output(self.batch_output)

def mode(self, network, test=True):
if test:
network.eval()
else:
network.train()
self.batch_output.clear()

def data_forward(self, network, x):
"""
This is only for sequence labeling with a CRF decoder. To do: make it more general?
:param network:
:param x:
:return:
"""
seq_len = [len(seq) for seq in x]
x = torch.Tensor(x).long()
y = network(x)
prediction = network.prediction(y, seq_len)
# To do: hide framework
results = torch.Tensor(prediction).view(-1, )
return list(results.data)

def batchify(self, data):
indices = next(self.iterator)
batch_x = [data[idx] for idx in indices]
batch_x = self.pad(batch_x)
return batch_x

@staticmethod
def pad(batch, fill=0):
"""
Pad a batch of samples to maximum length.
:param batch: list of list
:param fill: word index to pad, default 0.
:return: a padded batch
"""
max_length = max([len(x) for x in batch])
for idx, sample in enumerate(batch):
if len(sample) < max_length:
batch[idx] = sample + [fill * (max_length - len(sample))]
return batch

def predict(self, model, data):
def prepare_input(self, data):
"""
this is actually a forward pass. shall be shared by Trainer/Tester
:param model:
Transform three-level list of strings into that of index.
:param data:
:return result: the output results
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
"""
raise NotImplementedError
data_index = []
default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL]
for example in data:
data_index.append([self.word2index.get(w, default_unknown_index) for w in example])
return data_index

def prepare_input(self, data_path):
def prepare_output(self, batch_outputs):
"""
This can also be shared.
:param data_path:
Transform list of batch outputs into strings.
:param batch_outputs: list of list [num_batch, tag_seq_length]
:return:
"""
raise NotImplementedError
results = []
for batch in batch_outputs:
results.append([self.index2label[int(x.data)] for x in batch])
return results
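
For orientation, a hedged usage sketch of the new Inference class; the pickle directory, the untrained stand-in model and its sizes are assumptions, not part of this commit:

from fastNLP.action.inference import Inference
from fastNLP.models.sequence_modeling import SeqLabeling

# stand-in model; in practice the sizes must match the saved word2id.pkl / id2class.pkl
model = SeqLabeling({"vocab_size": 1000, "word_emb_dim": 100,
                     "rnn_hidden_units": 100, "num_classes": 10})
infer = Inference(pickle_path="./data_for_tests/")   # directory holding word2id.pkl and id2class.pkl
data = [["新", "年", "讲", "话"],                     # tokenized sentences: list of list of strings
        ["附", "图", "片"]]
results = infer.predict(model, data)                  # label strings, one list per processed batch
print(results)
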

fastNLP/action/trainer.py (+52, -13)

@@ -86,7 +86,7 @@ class BaseTrainer(Action):

# training iterations in one epoch
for step in range(iterations):
batch_x, batch_y = self.batchify(data_train) # pad ?
batch_x, batch_y = self.make_batch(data_train)

prediction = self.data_forward(network, batch_x)

@@ -180,7 +180,7 @@ class BaseTrainer(Action):
"""
raise NotImplementedError

def batchify(self, data, output_length=True):
def make_batch(self, data, output_length=True):
"""
1. Perform batching from data and produce a batch of training data.
2. Add padding.
@@ -191,9 +191,12 @@ class BaseTrainer(Action):
[[word_21, word_22, word_23], [label_21. label_22]], # sample 2
...
]
:return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
:return (batch_x, seq_len): tuple of two elements, if output_length is true.
batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
seq_len: list. The length of the pre-padded sequence, if output_length is True.
batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
seq_len: list. The length of the pre-padded sequence, if output_length is True.

return batch_x and batch_y, if output_length is False
"""
indices = next(self.iterator)
batch = [data[idx] for idx in indices]
@@ -202,7 +205,7 @@ class BaseTrainer(Action):
batch_x_pad = self.pad(batch_x)
if output_length:
seq_len = [len(x) for x in batch_x]
return batch_x_pad, batch_y, seq_len
return (batch_x_pad, seq_len), batch_y
else:
return batch_x_pad, batch_y

@@ -292,17 +295,23 @@ class POSTrainer(BaseTrainer):
data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
return data_train, data_dev, 0, 1

def data_forward(self, network, x):
def data_forward(self, network, inputs):
"""
:param network: the PyTorch model
:param x: list of list, [batch_size, max_len]
:param inputs: list of list, [batch_size, max_len],
or tuple of (batch_x, seq_len), batch_x == [batch_size, max_len]
:return y: [batch_size, max_len, tag_size]
"""
self.seq_len = [len(seq) for seq in x]
# unpack the returned value from make_batch
if isinstance(inputs, tuple):
x = inputs[0]
self.seq_len = inputs[1]
else:
x = inputs
x = torch.Tensor(x).long()
self.batch_size = x.size(0)
self.max_len = x.size(1)
# self.mask = seq_mask(seq_len, self.max_len)
y = network(x)
return y

@@ -325,11 +334,12 @@ class POSTrainer(BaseTrainer):
def get_loss(self, predict, truth):
"""
Compute loss given prediction and ground truth.
:param predict: prediction label vector, [batch_size, tag_size, tag_size]
:param predict: prediction label vector, [batch_size, max_len, tag_size]
:param truth: ground truth label vector, [batch_size, max_len]
:return: a scalar
"""
truth = torch.Tensor(truth)
assert truth.shape == (self.batch_size, self.max_len)
if self.loss_func is None:
if hasattr(self.model, "loss"):
self.loss_func = self.model.loss
@@ -347,6 +357,35 @@ class POSTrainer(BaseTrainer):
else:
return False

def make_batch(self, data, output_length=True):
"""
1. Perform batching from data and produce a batch of training data.
2. Add padding.
:param data: list. Each entry is a sample, which is also a list of features and label(s).
E.g.
[
[[word_11, word_12, word_13], [label_11. label_12]], # sample 1
[[word_21, word_22, word_23], [label_21. label_22]], # sample 2
...
]
:return (batch_x, seq_len): tuple of two elements, if output_length is true.
batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
seq_len: list. The length of the pre-padded sequence, if output_length is True.
batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]

return batch_x and batch_y, if output_length is False
"""
indices = next(self.iterator)
batch = [data[idx] for idx in indices]
batch_x = [sample[0] for sample in batch]
batch_y = [sample[1] for sample in batch]
batch_x_pad = self.pad(batch_x)
if output_length:
seq_len = [len(x) for x in batch_x]
return (batch_x_pad, seq_len), batch_y
else:
return batch_x_pad, batch_y


class LanguageModelTrainer(BaseTrainer):
"""
@@ -438,7 +477,7 @@ class ClassTrainer(BaseTrainer):

# training iterations in one epoch
step = 0
for batch_x, batch_y in self.batchify(data_train):
for batch_x, batch_y in self.make_batch(data_train):
prediction = self.data_forward(network, batch_x)

loss = self.get_loss(prediction, batch_y)
@@ -533,7 +572,7 @@ class ClassTrainer(BaseTrainer):
"""Apply gradient."""
self.optimizer.step()

def batchify(self, data):
def make_batch(self, data):
"""Batch and pad data."""
for indices in self.iterator:
batch = [data[idx] for idx in indices]
@@ -559,4 +598,4 @@ if __name__ == "__name__":
train_args = {"epochs": 1, "validate": False, "batch_size": 3, "pickle_path": "./"}
trainer = BaseTrainer(train_args)
data_train = [[[1, 2, 3, 4], [0]] * 10] + [[[1, 3, 5, 2], [1]] * 10]
trainer.batchify(data=data_train)
trainer.make_batch(data=data_train)
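
The commented-out seq_mask call in data_forward above hints at why the pre-padding lengths are threaded through: masking the padded positions. A standalone sketch of such a mask (not necessarily the exact seq_mask implementation):

import torch

seq_len = [3, 2]                 # lengths before padding, as returned by make_batch
max_len = 4
mask = torch.arange(max_len).unsqueeze(0) < torch.tensor(seq_len).unsqueeze(1)
# [[ True,  True,  True, False],
#  [ True,  True, False, False]]   True marks real tokens, False marks padding
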

fastNLP/fastNLP.py (+104, -0)

@@ -0,0 +1,104 @@
from fastNLP.action.inference import Inference
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader

"""
mapping from model name to [URL, file_name.class_name]
Notice that the class of the model should be in "models" directory.

Example:
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
"""
FastNLP_MODEL_COLLECTION = {
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
}


class FastNLP(object):
"""
High-level interface for direct model inference.
Usage:
fastnlp = FastNLP()
fastnlp.load("zh_pos_tag_model")
text = "这是最好的基于深度学习的中文分词系统。"
result = fastnlp.run(text)
print(result) # ["这", "是", "最好", "的", "基于", "深度学习", "的", "中文", "分词", "系统", "。"]
"""

def __init__(self, model_dir="./"):
self.model_dir = model_dir
self.model = None

def load(self, model_name):
"""
Load a pre-trained FastNLP model together with additional data.
:param model_name: str, the name of a FastNLP model.
"""
assert type(model_name) is str
if model_name not in FastNLP_MODEL_COLLECTION:
raise ValueError("No FastNLP model named {}.".format(model_name))

if not self.model_exist(model_dir=self.model_dir):
self._download(model_name, FastNLP_MODEL_COLLECTION[model_name][0])

model_class = self._get_model_class(FastNLP_MODEL_COLLECTION[model_name][1])

model_args = ConfigSection()
# To do: customized config file for model init parameters
ConfigLoader.load_config(self.model_dir + "default.cfg", model_args)

model = model_class(model_args)

# To do: framework independent
ModelLoader.load_pytorch(model, self.model_dir + model_name)

self.model = model

print("Model loaded. ")

def run(self, infer_input):
"""
Perform inference over given input using the loaded model.
:param infer_input: str, raw text
:return results:
"""
infer = Inference()
data = infer.prepare_input(infer_input)
results = infer.predict(self.model, data)
return results

@staticmethod
def _get_model_class(file_class_name):
"""
Fetch the class specified by <file_class_name>
:param file_class_name: str, contains the name of the Python module followed by the name of the class.
Example: "sequence_modeling.SeqLabeling"
:return module: the model class
"""
import_prefix = "fastNLP.models."
parts = (import_prefix + file_class_name).split(".")
from_module = ".".join(parts[:-1])
module = __import__(from_module)
for sub in parts[1:]:
module = getattr(module, sub)
return module

def _load(self, model_dir, model_name):
# To do
return 0

def _download(self, model_name, url):
"""
Download the model weights from <url> and save in <self.model_dir>.
:param model_name:
:param url:
"""
print("Downloading {} from {}".format(model_name, url))
# To do

def model_exist(self, model_dir):
"""
Check whether the desired model is already in the directory.
:param model_dir:
"""
pass
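
The one subtle piece in FastNLP._get_model_class is the __import__/getattr walk; the same resolution can be reproduced standalone (assuming the fastNLP package is importable):

parts = "fastNLP.models.sequence_modeling.SeqLabeling".split(".")
module = __import__(".".join(parts[:-1]))   # __import__ returns the top-level package `fastNLP`
for sub in parts[1:]:
    module = getattr(module, sub)           # fastNLP -> models -> sequence_modeling -> SeqLabeling
print(module)                               # the SeqLabeling class itself
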

fastNLP/loader/base_loader.py (+1, -1)

@@ -17,7 +17,7 @@ class BaseLoader(object):
def load_lines(self):
with open(self.data_path, "r", encoding="utf=8") as f:
text = f.readlines()
return text
return [line.strip() for line in text]


class ToyLoader0(BaseLoader):


fastNLP/loader/model_loader.py (+4, -2)

@@ -11,9 +11,11 @@ class ModelLoader(BaseLoader):
def __init__(self, data_name, data_path):
super(ModelLoader, self).__init__(data_name, data_path)

def load_pytorch(self, empty_model):
@staticmethod
def load_pytorch(empty_model, model_path):
"""
Load model parameters from .pkl files into the empty PyTorch model.
:param empty_model: a PyTorch model with initialized parameters.
:param model_path: str, the path to the saved model.
"""
empty_model.load_state_dict(torch.load(self.data_path))
empty_model.load_state_dict(torch.load(model_path))
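
With load_pytorch now a static method that takes an explicit path, loading a checkpoint looks roughly like this (the model sizes and the .pkl path are assumptions and must match whatever ModelSaver wrote):

from fastNLP.loader.model_loader import ModelLoader
from fastNLP.models.sequence_modeling import SeqLabeling

model = SeqLabeling({"vocab_size": 1000, "word_emb_dim": 100,
                     "rnn_hidden_units": 100, "num_classes": 10})   # same architecture as the checkpoint
ModelLoader.load_pytorch(model, "./saved_model.pkl")                 # fills the state_dict in place
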

fastNLP/loader/preprocess.py (+361, -346)

@@ -1,346 +1,361 @@
import _pickle
import os
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
'<reserved-3>',
'<reserved-4>'] # dict index = 2~4
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
DEFAULT_RESERVED_LABEL[2]: 4}
# the first vocab in dict with the index = 5
class BasePreprocess(object):
def __init__(self, data, pickle_path):
super(BasePreprocess, self).__init__()
self.data = data
self.pickle_path = pickle_path
if not self.pickle_path.endswith('/'):
self.pickle_path = self.pickle_path + '/'
class POSPreprocess(BasePreprocess):
"""
This class are used to preprocess the pos datasets.
"""
def __init__(self, data, pickle_path="./", train_dev_split=0):
"""
Preprocess pipeline, including building mapping from words to index, from index to words,
from labels/classes to index, from index to labels/classes.
:param data: three-level list
[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]
:param pickle_path: str, the directory to the pickle files. Default: "./"
:param train_dev_split: float in [0, 1]. The ratio of dev data split from training data. Default: 0.
To do:
1. simplify __init__
"""
super(POSPreprocess, self).__init__(data, pickle_path)
self.pickle_path = pickle_path
if self.pickle_exist("word2id.pkl"):
# load word2index because the construction of the following objects needs it
with open(os.path.join(self.pickle_path, "word2id.pkl"), "rb") as f:
self.word2index = _pickle.load(f)
else:
self.word2index, self.label2index = self.build_dict(data)
with open(os.path.join(self.pickle_path, "word2id.pkl"), "wb") as f:
_pickle.dump(self.word2index, f)
if self.pickle_exist("class2id.pkl"):
with open(os.path.join(self.pickle_path, "class2id.pkl"), "rb") as f:
self.label2index = _pickle.load(f)
else:
with open(os.path.join(self.pickle_path, "class2id.pkl"), "wb") as f:
_pickle.dump(self.label2index, f)
#something will be wrong if word2id.pkl is found but class2id.pkl is not found
if not self.pickle_exist("id2word.pkl"):
index2word = self.build_reverse_dict(self.word2index)
with open(os.path.join(self.pickle_path, "id2word.pkl"), "wb") as f:
_pickle.dump(index2word, f)
if not self.pickle_exist("id2class.pkl"):
index2label = self.build_reverse_dict(self.label2index)
with open(os.path.join(self.pickle_path, "word2id.pkl"), "wb") as f:
_pickle.dump(index2label, f)
if not self.pickle_exist("data_train.pkl"):
data_train = self.to_index(data)
if train_dev_split > 0 and not self.pickle_exist("data_dev.pkl"):
data_dev = data_train[: int(len(data_train) * train_dev_split)]
with open(os.path.join(self.pickle_path, "data_dev.pkl"), "wb") as f:
_pickle.dump(data_dev, f)
with open(os.path.join(self.pickle_path, "data_train.pkl"), "wb") as f:
_pickle.dump(data_train, f)
def build_dict(self, data):
"""
Add new words with indices into self.word_dict, new labels with indices into self.label_dict.
:param data: three-level list
[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]
:return word2index: dict of {str, int}
label2index: dict of {str, int}
"""
label2index = {}
word2index = DEFAULT_WORD_TO_INDEX
for example in data:
for word, label in zip(example[0], example[1]):
if word not in word2index:
word2index[word] = len(word2index)
if label not in label2index:
label2index[label] = len(label2index)
return word2index, label2index
def pickle_exist(self, pickle_name):
"""
:param pickle_name: the filename of target pickle file
:return: True if file exists else False
"""
if not os.path.exists(self.pickle_path):
os.makedirs(self.pickle_path)
file_name = os.path.join(self.pickle_path, pickle_name)
if os.path.exists(file_name):
return True
else:
return False
def build_reverse_dict(self, word_dict):
id2word = {word_dict[w]: w for w in word_dict}
return id2word
def to_index(self, data):
"""
Convert word strings and label strings into indices.
:param data: three-level list
[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]
:return data_index: the shape of data, but each string is replaced by its corresponding index
"""
data_index = []
for example in data:
word_list = []
label_list = []
for word, label in zip(example[0], example[1]):
word_list.append(self.word2index[word])
label_list.append(self.label2index[label])
data_index.append([word_list, label_list])
return data_index
@property
def vocab_size(self):
return len(self.word2index)
@property
def num_classes(self):
return len(self.label2index)
class ClassPreprocess(BasePreprocess):
"""
Pre-process the classification datasets.
Params:
pickle_path - directory to save result of pre-processing
Saves:
word2id.pkl
id2word.pkl
class2id.pkl
id2class.pkl
embedding.pkl
data_train.pkl
data_dev.pkl
data_test.pkl
"""
def __init__(self, pickle_path):
# super(ClassPreprocess, self).__init__(data, pickle_path)
self.word_dict = None
self.label_dict = None
self.pickle_path = pickle_path # save directory
def process(self, data, save_name):
"""
Process data.
Params:
data - nested list, data = [sample1, sample2, ...],
sample = [sentence, label], sentence = [word1, word2, ...]
save_name - name of processed data, such as data_train.pkl
Returns:
vocab_size - vocabulary size
n_classes - number of classes
"""
self.build_dict(data)
self.word2id()
vocab_size = self.id2word()
self.class2id()
num_classes = self.id2class()
self.embedding()
self.data_generate(data, save_name)
return vocab_size, num_classes
def build_dict(self, data):
"""Build vocabulary."""
# just read if word2id.pkl and class2id.pkl exists
if self.pickle_exist("word2id.pkl") and \
self.pickle_exist("class2id.pkl"):
file_name = os.path.join(self.pickle_path, "word2id.pkl")
with open(file_name, 'rb') as f:
self.word_dict = _pickle.load(f)
file_name = os.path.join(self.pickle_path, "class2id.pkl")
with open(file_name, 'rb') as f:
self.label_dict = _pickle.load(f)
return
# build vocabulary from scratch if nothing exists
self.word_dict = {
DEFAULT_PADDING_LABEL: 0,
DEFAULT_UNKNOWN_LABEL: 1,
DEFAULT_RESERVED_LABEL[0]: 2,
DEFAULT_RESERVED_LABEL[1]: 3,
DEFAULT_RESERVED_LABEL[2]: 4}
self.label_dict = {}
# collect every word and label
for sent, label in data:
if len(sent) <= 1:
continue
if label not in self.label_dict:
index = len(self.label_dict)
self.label_dict[label] = index
for word in sent:
if word not in self.word_dict:
index = len(self.word_dict)
self.word_dict[word[0]] = index
def pickle_exist(self, pickle_name):
"""
Check whether a pickle file exists.
Params
pickle_name: the filename of target pickle file
Return
True if file exists else False
"""
if not os.path.exists(self.pickle_path):
os.makedirs(self.pickle_path)
file_name = os.path.join(self.pickle_path, pickle_name)
if os.path.exists(file_name):
return True
else:
return False
def word2id(self):
"""Save vocabulary of {word:id} mapping format."""
# nothing will be done if word2id.pkl exists
if self.pickle_exist("word2id.pkl"):
return
file_name = os.path.join(self.pickle_path, "word2id.pkl")
with open(file_name, "wb") as f:
_pickle.dump(self.word_dict, f)
def id2word(self):
"""Save vocabulary of {id:word} mapping format."""
# nothing will be done if id2word.pkl exists
if self.pickle_exist("id2word.pkl"):
file_name = os.path.join(self.pickle_path, "id2word.pkl")
with open(file_name, 'rb') as f:
id2word_dict = _pickle.load(f)
return len(id2word_dict)
id2word_dict = {self.word_dict[w]: w for w in self.word_dict}
file_name = os.path.join(self.pickle_path, "id2word.pkl")
with open(file_name, "wb") as f:
_pickle.dump(id2word_dict, f)
return len(id2word_dict)
def class2id(self):
"""Save mapping of {class:id}."""
# nothing will be done if class2id.pkl exists
if self.pickle_exist("class2id.pkl"):
return
file_name = os.path.join(self.pickle_path, "class2id.pkl")
with open(file_name, "wb") as f:
_pickle.dump(self.label_dict, f)
def id2class(self):
"""Save mapping of {id:class}."""
# nothing will be done if id2class.pkl exists
if self.pickle_exist("id2class.pkl"):
file_name = os.path.join(self.pickle_path, "id2class.pkl")
with open(file_name, "rb") as f:
id2class_dict = _pickle.load(f)
return len(id2class_dict)
id2class_dict = {self.label_dict[c]: c for c in self.label_dict}
file_name = os.path.join(self.pickle_path, "id2class.pkl")
with open(file_name, "wb") as f:
_pickle.dump(id2class_dict, f)
return len(id2class_dict)
def embedding(self):
"""Save embedding lookup table corresponding to vocabulary."""
# nothing will be done if embedding.pkl exists
if self.pickle_exist("embedding.pkl"):
return
# retrieve vocabulary from pre-trained embedding (not implemented)
def data_generate(self, data_src, save_name):
"""Convert dataset from text to digit."""
# nothing will be done if file exists
save_path = os.path.join(self.pickle_path, save_name)
if os.path.exists(save_path):
return
data = []
# for every sample
for sent, label in data_src:
if len(sent) <= 1:
continue
label_id = self.label_dict[label] # label id
sent_id = [] # sentence ids
for word in sent:
if word in self.word_dict:
sent_id.append(self.word_dict[word])
else:
sent_id.append(self.word_dict[DEFAULT_UNKNOWN_LABEL])
data.append([sent_id, label_id])
# save data
with open(save_path, "wb") as f:
_pickle.dump(data, f)
class LMPreprocess(BasePreprocess):
def __init__(self, data, pickle_path):
super(LMPreprocess, self).__init__(data, pickle_path)
import _pickle
import os

DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
'<reserved-3>',
'<reserved-4>'] # dict index = 2~4

DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
DEFAULT_RESERVED_LABEL[2]: 4}


# the first vocab in dict with the index = 5

def save_pickle(obj, pickle_path, file_name):
with open(os.path.join(pickle_path, file_name), "wb") as f:
_pickle.dump(obj, f)
print("{} saved. ".format(file_name))


def load_pickle(pickle_path, file_name):
with open(os.path.join(pickle_path, file_name), "rb") as f:
obj = _pickle.load(f)
return obj


def pickle_exist(pickle_path, pickle_name):
"""
:param pickle_path: the directory of target pickle file
:param pickle_name: the filename of target pickle file
:return: True if file exists else False
"""
if not os.path.exists(pickle_path):
os.makedirs(pickle_path)
file_name = os.path.join(pickle_path, pickle_name)
if os.path.exists(file_name):
return True
else:
return False


class BasePreprocess(object):

def __init__(self, data, pickle_path):
super(BasePreprocess, self).__init__()
# self.data = data
self.pickle_path = pickle_path
if not self.pickle_path.endswith('/'):
self.pickle_path = self.pickle_path + '/'


class POSPreprocess(BasePreprocess):
"""
This class is used to preprocess POS tagging datasets.

"""

def __init__(self, data, pickle_path="./", train_dev_split=0):
"""
Preprocess pipeline, including building mapping from words to index, from index to words,
from labels/classes to index, from index to labels/classes.
:param data: three-level list
[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]
:param pickle_path: str, the directory to the pickle files. Default: "./"
:param train_dev_split: float in [0, 1]. The ratio of dev data split from training data. Default: 0.

"""
super(POSPreprocess, self).__init__(data, pickle_path)

self.pickle_path = pickle_path

if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
self.word2index = load_pickle(self.pickle_path, "word2id.pkl")
self.label2index = load_pickle(self.pickle_path, "class2id.pkl")
else:
self.word2index, self.label2index = self.build_dict(data)
save_pickle(self.word2index, self.pickle_path, "word2id.pkl")
save_pickle(self.label2index, self.pickle_path, "class2id.pkl")

if not pickle_exist(pickle_path, "id2word.pkl"):
index2word = self.build_reverse_dict(self.word2index)
save_pickle(index2word, self.pickle_path, "id2word.pkl")

if not pickle_exist(pickle_path, "id2class.pkl"):
index2label = self.build_reverse_dict(self.label2index)
save_pickle(index2label, self.pickle_path, "id2class.pkl")

if not pickle_exist(pickle_path, "data_train.pkl"):
data_train = self.to_index(data)
if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"):
data_dev = data_train[: int(len(data_train) * train_dev_split)]
save_pickle(data_dev, self.pickle_path, "data_dev.pkl")
save_pickle(data_train, self.pickle_path, "data_train.pkl")

def build_dict(self, data):
"""
Add new words with indices into self.word_dict, new labels with indices into self.label_dict.
:param data: three-level list
[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]
:return word2index: dict of {str, int}
label2index: dict of {str, int}
"""
label2index = {}
word2index = DEFAULT_WORD_TO_INDEX
for example in data:
for word, label in zip(example[0], example[1]):
if word not in word2index:
word2index[word] = len(word2index)
if label not in label2index:
label2index[label] = len(label2index)
return word2index, label2index

def build_reverse_dict(self, word_dict):
id2word = {word_dict[w]: w for w in word_dict}
return id2word

def to_index(self, data):
"""
Convert word strings and label strings into indices.
:param data: three-level list
[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]
:return data_index: the shape of data, but each string is replaced by its corresponding index
"""
data_index = []
for example in data:
word_list = []
label_list = []
for word, label in zip(example[0], example[1]):
word_list.append(self.word2index[word])
label_list.append(self.label2index[label])
data_index.append([word_list, label_list])
return data_index

@property
def vocab_size(self):
return len(self.word2index)

@property
def num_classes(self):
return len(self.label2index)


class ClassPreprocess(BasePreprocess):
"""
Pre-process the classification datasets.

Params:
pickle_path - directory to save result of pre-processing
Saves:
word2id.pkl
id2word.pkl
class2id.pkl
id2class.pkl
embedding.pkl
data_train.pkl
data_dev.pkl
data_test.pkl
"""

def __init__(self, pickle_path):
# super(ClassPreprocess, self).__init__(data, pickle_path)
self.word_dict = None
self.label_dict = None
self.pickle_path = pickle_path # save directory

def process(self, data, save_name):
"""
Process data.

Params:
data - nested list, data = [sample1, sample2, ...],
sample = [sentence, label], sentence = [word1, word2, ...]
save_name - name of processed data, such as data_train.pkl
Returns:
vocab_size - vocabulary size
n_classes - number of classes
"""
self.build_dict(data)
self.word2id()
vocab_size = self.id2word()
self.class2id()
num_classes = self.id2class()
self.embedding()
self.data_generate(data, save_name)

return vocab_size, num_classes

def build_dict(self, data):
"""Build vocabulary."""

# just read if word2id.pkl and class2id.pkl exists
if self.pickle_exist("word2id.pkl") and \
self.pickle_exist("class2id.pkl"):
file_name = os.path.join(self.pickle_path, "word2id.pkl")
with open(file_name, 'rb') as f:
self.word_dict = _pickle.load(f)
file_name = os.path.join(self.pickle_path, "class2id.pkl")
with open(file_name, 'rb') as f:
self.label_dict = _pickle.load(f)
return

# build vocabulary from scratch if nothing exists
self.word_dict = {
DEFAULT_PADDING_LABEL: 0,
DEFAULT_UNKNOWN_LABEL: 1,
DEFAULT_RESERVED_LABEL[0]: 2,
DEFAULT_RESERVED_LABEL[1]: 3,
DEFAULT_RESERVED_LABEL[2]: 4}
self.label_dict = {}

# collect every word and label
for sent, label in data:
if len(sent) <= 1:
continue

if label not in self.label_dict:
index = len(self.label_dict)
self.label_dict[label] = index

for word in sent:
if word not in self.word_dict:
index = len(self.word_dict)
self.word_dict[word[0]] = index

def pickle_exist(self, pickle_name):
"""
Check whether a pickle file exists.

Params
pickle_name: the filename of target pickle file
Return
True if file exists else False
"""
if not os.path.exists(self.pickle_path):
os.makedirs(self.pickle_path)
file_name = os.path.join(self.pickle_path, pickle_name)
if os.path.exists(file_name):
return True
else:
return False

def word2id(self):
"""Save vocabulary of {word:id} mapping format."""
# nothing will be done if word2id.pkl exists
if self.pickle_exist("word2id.pkl"):
return

file_name = os.path.join(self.pickle_path, "word2id.pkl")
with open(file_name, "wb") as f:
_pickle.dump(self.word_dict, f)

def id2word(self):
"""Save vocabulary of {id:word} mapping format."""
# nothing will be done if id2word.pkl exists
if self.pickle_exist("id2word.pkl"):
file_name = os.path.join(self.pickle_path, "id2word.pkl")
with open(file_name, 'rb') as f:
id2word_dict = _pickle.load(f)
return len(id2word_dict)

id2word_dict = {self.word_dict[w]: w for w in self.word_dict}
file_name = os.path.join(self.pickle_path, "id2word.pkl")
with open(file_name, "wb") as f:
_pickle.dump(id2word_dict, f)
return len(id2word_dict)

def class2id(self):
"""Save mapping of {class:id}."""
# nothing will be done if class2id.pkl exists
if self.pickle_exist("class2id.pkl"):
return

file_name = os.path.join(self.pickle_path, "class2id.pkl")
with open(file_name, "wb") as f:
_pickle.dump(self.label_dict, f)

def id2class(self):
"""Save mapping of {id:class}."""
# nothing will be done if id2class.pkl exists
if self.pickle_exist("id2class.pkl"):
file_name = os.path.join(self.pickle_path, "id2class.pkl")
with open(file_name, "rb") as f:
id2class_dict = _pickle.load(f)
return len(id2class_dict)

id2class_dict = {self.label_dict[c]: c for c in self.label_dict}
file_name = os.path.join(self.pickle_path, "id2class.pkl")
with open(file_name, "wb") as f:
_pickle.dump(id2class_dict, f)
return len(id2class_dict)

def embedding(self):
"""Save embedding lookup table corresponding to vocabulary."""
# nothing will be done if embedding.pkl exists
if self.pickle_exist("embedding.pkl"):
return

# retrieve vocabulary from pre-trained embedding (not implemented)

def data_generate(self, data_src, save_name):
"""Convert dataset from text to digit."""

# nothing will be done if file exists
save_path = os.path.join(self.pickle_path, save_name)
if os.path.exists(save_path):
return

data = []
# for every sample
for sent, label in data_src:
if len(sent) <= 1:
continue

label_id = self.label_dict[label] # label id
sent_id = [] # sentence ids
for word in sent:
if word in self.word_dict:
sent_id.append(self.word_dict[word])
else:
sent_id.append(self.word_dict[DEFAULT_UNKNOWN_LABEL])
data.append([sent_id, label_id])

# save data
with open(save_path, "wb") as f:
_pickle.dump(data, f)


class LMPreprocess(BasePreprocess):
def __init__(self, data, pickle_path):
super(LMPreprocess, self).__init__(data, pickle_path)


def infer_preprocess(pickle_path, data):
"""
Preprocess over inference data.
Transform three-level list of strings into that of index.
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
"""
word2index = load_pickle(pickle_path, "word2id.pkl")
data_index = []
for example in data:
data_index.append([word2index.get(w, DEFAULT_UNKNOWN_LABEL) for w in example])
return data_index
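
A toy run of the rewritten POSPreprocess, mainly to show the expected three-level input and the pickles it now writes through save_pickle (the cache directory and the tag set are made up):

from fastNLP.loader.preprocess import POSPreprocess

data = [
    [["新", "年", "讲", "话"], ["n", "n", "v", "n"]],   # [word list, label list] per sample
    [["附", "图", "片"], ["v", "n", "n"]],
]
p = POSPreprocess(data, pickle_path="./cache/", train_dev_split=0.5)
# writes word2id.pkl, class2id.pkl, id2word.pkl, id2class.pkl, data_train.pkl and data_dev.pkl
print(p.vocab_size, p.num_classes)   # vocab count includes the 5 reserved <pad>/<unk>/... entries
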

fastNLP/models/sequence_modeling.py (+7, -12)

@@ -9,17 +9,12 @@ class SeqLabeling(BaseModel):
PyTorch Network for sequence labeling
"""

def __init__(self, hidden_dim,
rnn_num_layer,
num_classes,
vocab_size,
word_emb_dim=100,
init_emb=None,
rnn_mode="gru",
bi_direction=False,
dropout=0.5,
use_crf=True):
def __init__(self, args):
super(SeqLabeling, self).__init__()
vocab_size = args["vocab_size"]
word_emb_dim = args["word_emb_dim"]
hidden_dim = args["rnn_hidden_units"]
num_classes = args["num_classes"]

self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim)
self.Rnn = encoder.lstm.Lstm(word_emb_dim, hidden_dim)
@@ -29,7 +24,7 @@ class SeqLabeling(BaseModel):
def forward(self, x):
"""
:param x: LongTensor, [batch_size, max_len]
:return y: [batch_size, tag_size, tag_size]
:return y: [batch_size, max_len, tag_size]
"""
x = self.Embedding(x)
# [batch_size, max_len, word_emb_dim]
@@ -64,7 +59,7 @@ class SeqLabeling(BaseModel):

def prediction(self, x, seq_length):
"""
:param x: FloatTensor, [batch_size, tag_size, tag_size]
:param x: FloatTensor, [batch_size, max_len, tag_size]
:param seq_length: int
:return prediction: list of tuple of (decode path(list), best score)
"""


fastNLP/modules/encoder/lstm.py (+1, -1)

@@ -13,7 +13,7 @@ class Lstm(nn.Module):
bidirectional : If True, becomes a bidirectional RNN. Default: False.
"""

def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.5, bidirectional=False):
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0, bidirectional=False):
super(Lstm, self).__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,
dropout=dropout, bidirectional=bidirectional)


test/data_for_tests/config (+6, -0)

@@ -74,3 +74,9 @@ save_dev_input = false
save_loss = true
batch_size = 1
pickle_path = "./data_for_tests/"
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
word_emb_dim = 100
dropout = 0.5
use_crf = true
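
These keys are what the shortened SeqLabeling constructor expects; they are read back through ConfigLoader exactly as in test_POS_pipeline.py below (the section name "POS" is taken from that test):

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
print(train_args["rnn_hidden_units"], train_args["word_emb_dim"])   # hyper-parameters defined above
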

test/data_for_tests/people_infer.txt (+2, -0)

@@ -0,0 +1,2 @@
迈向充满希望的新世纪——一九九八年新年讲话
(附图片1张)

test/test_POS_pipeline.py (+39, -24)

@@ -4,8 +4,8 @@ sys.path.append("..")

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.action.trainer import POSTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.loader.dataset_loader import POSDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.action.tester import POSTester
@@ -15,32 +15,49 @@ from fastNLP.action.inference import Inference
data_name = "people.txt"
data_path = "data_for_tests/people.txt"
pickle_path = "data_for_tests"
data_infer_path = "data_for_tests/people_infer.txt"


def test_infer():
def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl")
test_args["num_classes"] = len(index2label)

# Define the same model
model = SeqLabeling(hidden_dim=train_args["rnn_hidden_units"], rnn_num_layer=train_args["rnn_layers"],
num_classes=train_args["num_classes"], vocab_size=train_args["vocab_size"],
word_emb_dim=train_args["word_emb_dim"], bi_direction=train_args["rnn_bi_direction"],
rnn_mode="gru", dropout=train_args["dropout"], use_crf=train_args["use_crf"])
model = SeqLabeling(test_args)

# Dump trained parameters into the model
ModelLoader("arbitrary_name", "./saved_model.pkl").load_pytorch(model)
ModelLoader.load_pytorch(model, "./saved_model.pkl")
print("model loaded!")

# Data Loader
pos_loader = POSDatasetLoader(data_name, data_path)
infer_data = pos_loader.load_lines()

# Preprocessor
POSPreprocess(infer_data, pickle_path)
raw_data_loader = BaseLoader(data_name, data_infer_path)
infer_data = raw_data_loader.load_lines()
"""
Transform strings into list of list of strings.
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
In this case, each line in "people_infer.txt" is already a sentence, so load_lines() only needs to strip it.
"""

# Inference interface
infer = Inference()
infer = Inference(pickle_path)
results = infer.predict(model, infer_data)

print(results)
print("Inference finished!")

if __name__ == "__main__":

def train_test():
# Config Loader
train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
@@ -58,10 +75,7 @@ if __name__ == "__main__":
trainer = POSTrainer(train_args)

# Model
model = SeqLabeling(hidden_dim=train_args["rnn_hidden_units"], rnn_num_layer=train_args["rnn_layers"],
num_classes=train_args["num_classes"], vocab_size=train_args["vocab_size"],
word_emb_dim=train_args["word_emb_dim"], bi_direction=train_args["rnn_bi_direction"],
rnn_mode="gru", dropout=train_args["dropout"], use_crf=train_args["use_crf"])
model = SeqLabeling(train_args)

# Start training
trainer.train(model)
@@ -75,13 +89,10 @@ if __name__ == "__main__":
del model, trainer, pos_loader

# Define the same model
model = SeqLabeling(hidden_dim=train_args["rnn_hidden_units"], rnn_num_layer=train_args["rnn_layers"],
num_classes=train_args["num_classes"], vocab_size=train_args["vocab_size"],
word_emb_dim=train_args["word_emb_dim"], bi_direction=train_args["rnn_bi_direction"],
rnn_mode="gru", dropout=train_args["dropout"], use_crf=train_args["use_crf"])
model = SeqLabeling(train_args)

# Dump trained parameters into the model
ModelLoader("arbitrary_name", "./saved_model.pkl").load_pytorch(model)
ModelLoader.load_pytorch(model, "./saved_model.pkl")
print("model loaded!")

# Load test configuration
@@ -97,3 +108,7 @@ if __name__ == "__main__":
# print test results
print(tester.show_matrices())
print("model tested!")


if __name__ == "__main__":
infer()
