@@ -38,7 +38,7 @@ class Inference(object):
num_iter = len(data) // self.batch_size
for step in range(num_iter):
batch_x = self.batchify(data)
batch_x = self.make_batch(data)
prediction = self.data_forward(network, batch_x)
@@ -68,10 +68,11 @@ class Inference(object):
results = torch.Tensor(prediction).view(-1, )
return list(results.data)
def batchify(self, data):
def make_batch(self, data):
indices = next(self.iterator)
batch_x = [data[idx] for idx in indices]
batch_x = self.pad(batch_x)
if self.batch_size > 1:
batch_x = self.pad(batch_x)
return batch_x
@staticmethod
@@ -98,6 +99,7 @@ class Inference(object):
...
]
"""
assert isinstance(data, list)
data_index = []
default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL]
for example in data:
@@ -107,7 +109,7 @@ class Inference(object):
def prepare_output(self, batch_outputs):
"""
Transform list of batch outputs into strings.
:param batch_outputs: list of list [num_batch, tag_seq_length]
:param batch_outputs: list of list, of shape [num_batch, tag_seq_length]. Element type is Tensor.
:return:
"""
results = []
@@ -3,14 +3,14 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader
"""
mapping from model name to [URL, file_name.class_name]
mapping from model name to [URL, file_name.class_name, model_pickle_name]
Notice that the class of the model should be in "models" directory.
Example:
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
"""
FastNLP_MODEL_COLLECTION = {
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
}
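Note: a rough sketch of how an entry in this collection might be consumed; the importlib-based lookup and the helper name below are assumptions for illustration only, the actual resolution is done by _get_model_class() inside load().

    import importlib

    def resolve_model_entry(model_name):
        # entry format: [URL, "module_file.ClassName", "model_pickle_name"], as documented above
        url, file_class_name, pickle_name = FastNLP_MODEL_COLLECTION[model_name]
        module_name, class_name = file_class_name.split(".")
        # assumption: model classes live in the "models" package, per the notice above
        module = importlib.import_module("fastNLP.models." + module_name)
        return getattr(module, class_name), pickle_name

    # resolve_model_entry("zh_pos_tag_model") -> (SeqLabeling, "saved_model.pkl")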
@@ -26,6 +26,13 @@ class FastNLP(object):
"""
def __init__(self, model_dir="./"):
"""
:param model_dir: this directory should contain the following files:
1. a pre-trained model
2. a config file
3. "id2class.pkl"
4. "word2id.pkl"
"""
self.model_dir = model_dir
self.model = None
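Note: as an illustration of the model_dir layout described in the docstring above, a directory such as "./data_for_tests/" used by the tests in this change would look roughly like the following; all four file names appear elsewhere in this diff, but the exact layout is an assumption.

    ./data_for_tests/
        saved_model.pkl   # 1. pre-trained model weights
        config            # 2. config file providing the [POS_infer] section
        id2class.pkl      # 3. index-to-label mapping
        word2id.pkl       # 4. word-to-index mapping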
@@ -45,27 +52,32 @@ class FastNLP(object):
model_args = ConfigSection()
# To do: customized config file for model init parameters
ConfigLoader.load_config(self.model_dir + "default.cfg", model_args)
ConfigLoader.load_config(self.model_dir + "config", {"POS_infer": model_args})
# Construct the model
model = model_class(model_args)
# To do: framework independent
ModelLoader.load_pytorch(model, self.model_dir + model_name)
ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name][2])
self.model = model
print("Model loaded. ")
def run(self, infer_input):
def run(self, raw_input):
"""
Perform inference over given input using the loaded model.
:param infer_input: str, raw text
:param raw_input: str, raw text
:return results:
"""
infer = Inference()
data = infer.prepare_input(infer_input)
results = infer.predict(self.model, data)
return results
infer = Inference(self.model_dir)
infer_input = self.string_to_list(raw_input)
results = infer.predict(self.model, infer_input)
outputs = self.make_output(results)
return outputs
@staticmethod
def _get_model_class(file_class_name):
@@ -101,4 +113,61 @@ class FastNLP(object):
Check whether the desired model is already in the directory.
:param model_dir:
"""
pass
return True
def string_to_list(self, text, delimiter="\n"):
"""
For word segmentation only, currently.
Transform raw input into lists, mirroring what DatasetLoader does during training.
Split the text string into two-level lists:
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
:param text: string
:param delimiter: str, the character used to split the text into sentences.
:return data: two-level lists
"""
data = []
sents = text.strip().split(delimiter)
for sent in sents:
characters = []
for ch in sent:
characters.append(ch)
data.append(characters)
# To refactor: this is used in make_output
self.data = data
return data
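Note: a worked example of string_to_list on two newline-separated sentences; the instance name `nlp` and the input text are made up for illustration.

    data = nlp.string_to_list("这是好的\n中文分词")
    # data == [["这", "是", "好", "的"],
    #          ["中", "文", "分", "词"]]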
def make_output(self, results):
"""
Transform model output into user-friendly contents.
Example: In CWS, convert <BMES> labeling into segmented text.
:param results:
:return:
"""
outputs = []
for sent_char, sent_label in zip(self.data, results):
words = []
word = ""
for char, label in zip(sent_char, sent_label):
if label[0] == "B":
if word != "":
words.append(word)
word = char
elif label[0] == "M":
word += char
elif label[0] == "E":
word += char
words.append(word)
word = ""
elif label[0] == "S":
if word != "":
words.append(word)
word = ""
words.append(char)
else:
raise ValueError("invalid label")
outputs.append(" ".join(words))
return outputs
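Note: a hedged usage sketch of make_output; the instance name `nlp`, the characters, and the labels below are made up for illustration, with self.data assumed to have been set by string_to_list.

    nlp.data = [["这", "是", "中", "文", "分", "词"]]   # as set by string_to_list
    labels = [["S", "S", "B", "E", "B", "E"]]           # as returned by Inference.predict
    nlp.make_output(labels)
    # -> ["这 是 中文 分词"]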
@@ -20,9 +20,13 @@ class ConfigLoader(BaseLoader):
def load_config(file_path, sections):
"""
:param file_path: the path of config file
:param sections: the dict of sections
:param sections: the dict of {section_name(string): Section instance}
Example:
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
:return:
"""
assert isinstance(sections, dict)
cfg = configparser.ConfigParser()
if not os.path.exists(file_path):
raise FileNotFoundError("config file {} not found. ".format(file_path))
@@ -22,6 +22,7 @@ class POSDatasetLoader(DatasetLoader):
and label2
Jerry label1
. label3
(separated by an empty line)
Hello label4
world label5
! label3
@@ -77,6 +78,51 @@ class POSDatasetLoader(DatasetLoader):
return data
class TokenizeDatasetLoader(DatasetLoader):
"""
Data set loader for tokenization data sets
"""
def __init__(self, data_name, data_path):
super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
def load_pku(self):
"""
load the pku dataset for Chinese word segmentation
CWS (Chinese Word Segmentation) pku training dataset format:
1. Each line is a sentence.
2. Each word in a sentence is separated by spaces.
This function converts the pku dataset into three-level lists with <BMES> labels.
B: beginning of a word
M: middle of a word
E: ending of a word
S: single character
:return: three-level lists
"""
with open(self.data_path, "r", encoding="utf-8") as f:
sentences = f.readlines()
data = []
for sent in sentences:
words = []
labels = []
tokens = sent.strip().split()
for token in tokens:
if len(token) == 1:
words.append(token)
labels.append("S")
else:
words.append(token[0])
labels.append("B")
for idx in range(1, len(token) - 1):
words.append(token[idx])
labels.append("M")
words.append(token[-1])
labels.append("E")
data.append([words, labels])
return data
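Note: for example, a single pku-style line such as "迈向 充满 希望 的 新 世纪" would be converted into one [words, labels] pair appended to data:

    [["迈", "向", "充", "满", "希", "望", "的", "新", "世", "纪"],
     ["B", "E", "B", "E", "B", "E", "S", "S", "B", "E"]]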
class ClassDatasetLoader(DatasetLoader):
"""Loader for classification data sets"""
@@ -163,7 +209,12 @@ class LMDatasetLoader(DatasetLoader):
if __name__ == "__main__":
"""
data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
for example in data:
for w, l in zip(example[0], example[1]):
print(w, l)
"""
ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
print(ans)
@@ -54,8 +54,8 @@ test = 5
new_attr = 40
[POS]
epochs = 20
batch_size = 1
epochs = 1
batch_size = 32
pickle_path = "./data_for_tests/"
validate = true
save_best_dev = true
@@ -80,3 +80,12 @@ rnn_bi_direction = true
word_emb_dim = 100
dropout = 0.5
use_crf = true
[POS_infer]
pickle_path = "./data_for_tests/"
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
word_emb_dim = 100
vocab_size = 52
num_classes = 22
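Note: the new [POS_infer] section is read like the existing ones, e.g. mirroring the ConfigLoader call used elsewhere in this change; the variable name infer_args is illustrative only.

    infer_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_infer": infer_args})
    # the section supplies model init parameters, e.g. vocab_size = 52 and num_classes = 22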
@@ -0,0 +1,115 @@
import sys
sys.path.append("..")
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.action.trainer import POSTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.action.tester import POSTester
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.action.inference import Inference
data_name = "pku_training.utf8"
cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
pickle_path = "data_for_tests"
data_infer_path = "data_for_tests/people_infer.txt"
def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl")
test_args["num_classes"] = len(index2label)
# Define the same model
model = SeqLabeling(test_args)
# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")
# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
infer_data = raw_data_loader.load_lines()
"""
Transform strings into list of list of strings.
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them.
"""
# Inference interface
infer = Inference(pickle_path)
results = infer.predict(model, infer_data)
print(results)
print("Inference finished!")
def train_test():
# Config Loader
train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
# Data Loader
loader = TokenizeDatasetLoader(data_name, cws_data_path)
train_data = loader.load_pku()
# Preprocessor
p = POSPreprocess(train_data, pickle_path)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes
# Trainer
trainer = POSTrainer(train_args)
# Model
model = SeqLabeling(train_args)
# Start training
trainer.train(model)
print("Training finished!")
# Saver
saver = ModelSaver("./data_for_tests/saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")
del model, trainer, loader
# Define the same model
model = SeqLabeling(train_args)
# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")
# Load test configuration
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
# Tester
tester = POSTester(test_args)
# Start testing
tester.test(model)
# print test results
print(tester.show_matrices())
print("model tested!")
if __name__ == "__main__":
train_test()
# infer()
@@ -0,0 +1,14 @@
from fastNLP.fastnlp import FastNLP
def foo():
nlp = FastNLP("./data_for_tests/")
nlp.load("zh_pos_tag_model")
text = "这是最好的基于深度学习的中文分词系统。"
result = nlp.run(text)
print(result)
print("FastNLP finished!")
if __name__ == "__main__":
foo()
@@ -1,28 +0,0 @@
import aggregation
import decoder
import encoder
class Input(object):
def __init__(self):
pass
class Trainer(object):
def __init__(self, input, target, truth):
pass
def train(self):
pass
def test_keras_like():
data_train, label_train = dataLoader("./data_path")
x = Input()
x = encoder.LSTM(input=x)
x = aggregation.max_pool(input=x)
y = decoder.CRF(input=x)
trainer = Trainer(input=data_train, target=y, truth=label_train)
trainer.train()
@@ -23,7 +23,7 @@ def infer():
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
# fetch dictinary size and number of labels from pickle files
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl")
@@ -33,7 +33,7 @@ def infer():
model = SeqLabeling(test_args)
# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./saved_model.pkl")
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")
# Data Loader
@@ -82,7 +82,7 @@ def train_test():
print("Training finished!")
# Saver
saver = ModelSaver("./saved_model.pkl")
saver = ModelSaver("./data_for_tests/saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")
@@ -92,7 +92,7 @@ def train_test():
model = SeqLabeling(train_args)
# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./saved_model.pkl")
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")
# Load test configuration
@@ -111,4 +111,5 @@ def train_test():
if __name__ == "__main__":
infer()
train_test()
# infer()