@@ -38,7 +38,7 @@ class Inference(object):
         num_iter = len(data) // self.batch_size
         for step in range(num_iter):
-            batch_x = self.batchify(data)
+            batch_x = self.make_batch(data)
             prediction = self.data_forward(network, batch_x)
@@ -68,10 +68,11 @@ class Inference(object):
         results = torch.Tensor(prediction).view(-1, )
         return list(results.data)

-    def batchify(self, data):
+    def make_batch(self, data):
         indices = next(self.iterator)
         batch_x = [data[idx] for idx in indices]
-        batch_x = self.pad(batch_x)
+        if self.batch_size > 1:
+            batch_x = self.pad(batch_x)
         return batch_x

     @staticmethod
@@ -98,6 +99,7 @@ class Inference(object):
             ...
         ]
         """
+        assert isinstance(data, list)
         data_index = []
         default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL]
         for example in data:
@@ -107,7 +109,7 @@ class Inference(object):
     def prepare_output(self, batch_outputs):
         """
         Transform list of batch outputs into strings.
-        :param batch_outputs: list of list [num_batch, tag_seq_length]
+        :param batch_outputs: list of list, of shape [num_batch, tag_seq_length]. Element type is Tensor.
         :return:
         """
         results = []
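Note on the rename above: `batchify` becomes `make_batch`, and padding is now skipped when `batch_size` is 1, since a single sequence never needs length alignment. The `pad` helper itself is not part of this diff; a minimal sketch of the assumed behavior (right-padding every index sequence to the longest one in the batch):

    # Sketch only -- Inference.pad is not shown in this diff.
    # Assumed behavior: right-pad each sequence to the batch maximum.
    def pad(batch, fill=0):
        max_length = max(len(seq) for seq in batch)
        for i, seq in enumerate(batch):
            if len(seq) < max_length:
                batch[i] = seq + [fill] * (max_length - len(seq))
        return batch

    # pad([[4, 7], [4, 7, 9, 2]]) -> [[4, 7, 0, 0], [4, 7, 9, 2]]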
@@ -3,14 +3,14 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.model_loader import ModelLoader

 """
-mapping from model name to [URL, file_name.class_name]
+mapping from model name to [URL, file_name.class_name, model_pickle_name]
 Notice that the class of the model should be in "models" directory.

 Example:
-    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
+    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
 """
 FastNLP_MODEL_COLLECTION = {
-    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
+    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
 }
@@ -26,6 +26,13 @@ class FastNLP(object):
     """

     def __init__(self, model_dir="./"):
+        """
+        :param model_dir: this directory should contain the following files:
+            1. a pre-trained model
+            2. a config file
+            3. "id2class.pkl"
+            4. "word2id.pkl"
+        """
         self.model_dir = model_dir
         self.model = None
@@ -45,27 +52,32 @@ class FastNLP(object):
         model_args = ConfigSection()
         # To do: customized config file for model init parameters
-        ConfigLoader.load_config(self.model_dir + "default.cfg", model_args)
+        ConfigLoader.load_config(self.model_dir + "config", {"POS_infer": model_args})

+        # Construct the model
         model = model_class(model_args)

         # To do: framework independent
-        ModelLoader.load_pytorch(model, self.model_dir + model_name)
+        ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name][2])

         self.model = model

         print("Model loaded. ")

-    def run(self, infer_input):
+    def run(self, raw_input):
         """
         Perform inference over given input using the loaded model.
-        :param infer_input: str, raw text
+        :param raw_input: str, raw text
         :return results:
         """

-        infer = Inference()
-        data = infer.prepare_input(infer_input)
-        results = infer.predict(self.model, data)
-        return results
+        infer = Inference(self.model_dir)
+        infer_input = self.string_to_list(raw_input)
+        results = infer.predict(self.model, infer_input)
+        outputs = self.make_output(results)
+        return outputs

     @staticmethod
     def _get_model_class(file_class_name):
@@ -101,4 +113,61 @@ class FastNLP(object):
         Check whether the desired model is already in the directory.
         :param model_dir:
         """
-        pass
+        return True
+
+    def string_to_list(self, text, delimiter="\n"):
+        """
+        For word seg only, currently.
+        This function is used to transform raw input to lists, which is done by DatasetLoader in training.
+        Split text string into three-level lists.
+        [
+            [word_11, word_12, ...],
+            [word_21, word_22, ...],
+            ...
+        ]
+        :param text: string
+        :param delimiter: str, character used to split text into sentences.
+        :return data: three-level lists
+        """
+        data = []
+        sents = text.strip().split(delimiter)
+        for sent in sents:
+            characters = []
+            for ch in sent:
+                characters.append(ch)
+            data.append(characters)
+        # To refactor: this is used in make_output
+        self.data = data
+        return data
+
+    def make_output(self, results):
+        """
+        Transform model output into user-friendly contents.
+        Example: In CWS, convert <BMES> labeling into segmented text.
+        :param results:
+        :return:
+        """
+        outputs = []
+        for sent_char, sent_label in zip(self.data, results):
+            words = []
+            word = ""
+            for char, label in zip(sent_char, sent_label):
+                if label[0] == "B":
+                    if word != "":
+                        words.append(word)
+                    word = char
+                elif label[0] == "M":
+                    word += char
+                elif label[0] == "E":
+                    word += char
+                    words.append(word)
+                    word = ""
+                elif label[0] == "S":
+                    if word != "":
+                        words.append(word)
+                        word = ""
+                    words.append(char)
+                else:
+                    raise ValueError("invalid label")
+            outputs.append(" ".join(words))
+        return outputs
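The BMES decoding in `make_output` is easiest to follow on a concrete (hypothetical) tag sequence: "B" opens a word, "M" extends it, "E" closes it, and "S" emits a single-character word.

    # Hypothetical input/output pair for make_output's inner loop:
    sent_char = ["中", "文", "分", "词", "很", "好"]
    sent_label = ["B", "E", "B", "E", "S", "S"]
    # -> words = ["中文", "分词", "很", "好"]
    # -> output string: "中文 分词 很 好"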
@@ -20,9 +20,13 @@ class ConfigLoader(BaseLoader):
     def load_config(file_path, sections):
         """
         :param file_path: the path of config file
-        :param sections: the dict of sections
+        :param sections: the dict of {section_name(string): Section instance}
+        Example:
+            test_args = ConfigSection()
+            ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
         :return:
         """
+        assert isinstance(sections, dict)
         cfg = configparser.ConfigParser()
         if not os.path.exists(file_path):
             raise FileNotFoundError("config file {} not found. ".format(file_path))
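Since `sections` now maps INI section names to `ConfigSection` instances, one call can presumably fill several sections at once; a usage sketch (assuming `ConfigSection` supports dict-style item access, as the test scripts below suggest with `test_args["vocab_size"]`):

    # Usage sketch under the assumptions stated above.
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config(
        "./data_for_tests/config", {"POS": train_args, "POS_test": test_args})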
@@ -22,6 +22,7 @@ class POSDatasetLoader(DatasetLoader):
            and     label2
            Jerry   label1
            .       label3
+            (separated by an empty line)
            Hello   label4
            world   label5
            !       label3
@@ -77,6 +78,51 @@ class POSDatasetLoader(DatasetLoader):
         return data


+class TokenizeDatasetLoader(DatasetLoader):
+    """
+    Data set loader for tokenization data sets
+    """
+
+    def __init__(self, data_name, data_path):
+        super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
+
+    def load_pku(self):
+        """
+        load pku dataset for Chinese word segmentation
+        CWS (Chinese Word Segmentation) pku training dataset format:
+        1. Each line is a sentence.
+        2. Each word in a sentence is separated by space.
+        This function converts the pku dataset into three-level lists with labels <BMES>.
+            B: beginning of a word
+            M: middle of a word
+            E: ending of a word
+            S: single character
+        :return: three-level lists
+        """
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            sentences = f.readlines()
+        data = []
+        for sent in sentences:
+            words = []
+            labels = []
+            tokens = sent.strip().split()
+            for token in tokens:
+                if len(token) == 1:
+                    words.append(token)
+                    labels.append("S")
+                else:
+                    words.append(token[0])
+                    labels.append("B")
+                    for idx in range(1, len(token) - 1):
+                        words.append(token[idx])
+                        labels.append("M")
+                    words.append(token[-1])
+                    labels.append("E")
+            data.append([words, labels])
+        return data
+
+
 class ClassDatasetLoader(DatasetLoader):
     """Loader for classification data sets"""
@@ -163,7 +209,12 @@ class LMDatasetLoader(DatasetLoader):

 if __name__ == "__main__":
+    """
     data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
     for example in data:
         for w, l in zip(example[0], example[1]):
             print(w, l)
+    """
+    ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
+    print(ans)
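A worked example of the <BMES> scheme that `load_pku` applies to one (hypothetical) PKU-style line:

    # Input line, words separated by spaces: "迈向 新 世纪"
    # "迈向" (len 2) -> chars ["迈", "向"], labels ["B", "E"]
    # "新"   (len 1) -> chars ["新"],       labels ["S"]
    # "世纪" (len 2) -> chars ["世", "纪"], labels ["B", "E"]
    # => data entry: [["迈", "向", "新", "世", "纪"], ["B", "E", "S", "B", "E"]]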
@@ -54,8 +54,8 @@ test = 5
 new_attr = 40

 [POS]
-epochs = 20
-batch_size = 1
+epochs = 1
+batch_size = 32
 pickle_path = "./data_for_tests/"
 validate = true
 save_best_dev = true
@@ -80,3 +80,12 @@ rnn_bi_direction = true
 word_emb_dim = 100
 dropout = 0.5
 use_crf = true
+
+[POS_infer]
+pickle_path = "./data_for_tests/"
+rnn_hidden_units = 100
+rnn_layers = 1
+rnn_bi_direction = true
+word_emb_dim = 100
+vocab_size = 52
+num_classes = 22
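The new [POS_infer] section is what `FastNLP.load()` reads via `ConfigLoader.load_config(self.model_dir + "config", {"POS_infer": model_args})`, so its keys must match the model's constructor arguments. A quick standard-library sanity check of the section (sketch, assuming the file stays plain INI syntax):

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read("./data_for_tests/config")
    print(cfg.getint("POS_infer", "vocab_size"))            # 52
    print(cfg.getint("POS_infer", "num_classes"))           # 22
    print(cfg.getboolean("POS_infer", "rnn_bi_direction"))  # True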
@@ -0,0 +1,115 @@
+import sys
+
+sys.path.append("..")
+
+from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.action.trainer import POSTrainer
+from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
+from fastNLP.loader.preprocess import POSPreprocess, load_pickle
+from fastNLP.saver.model_saver import ModelSaver
+from fastNLP.loader.model_loader import ModelLoader
+from fastNLP.action.tester import POSTester
+from fastNLP.models.sequence_modeling import SeqLabeling
+from fastNLP.action.inference import Inference
+
+data_name = "pku_training.utf8"
+cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
+pickle_path = "data_for_tests"
+data_infer_path = "data_for_tests/people_infer.txt"
+
+
+def infer():
+    # Load infer configuration, the same as test
+    test_args = ConfigSection()
+    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+
+    # fetch dictionary size and number of labels from pickle files
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+
+    # Define the same model
+    model = SeqLabeling(test_args)
+
+    # Dump trained parameters into the model
+    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
+    print("model loaded!")
+
+    # Data Loader
+    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    infer_data = raw_data_loader.load_lines()
+    """
+    Transform strings into list of list of strings.
+    [
+        [word_11, word_12, ...],
+        [word_21, word_22, ...],
+        ...
+    ]
+    In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them.
+    """
+
+    # Inference interface
+    infer = Inference(pickle_path)
+    results = infer.predict(model, infer_data)
+
+    print(results)
+    print("Inference finished!")
+
+
+def train_test():
+    # Config Loader
+    train_args = ConfigSection()
+    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
+
+    # Data Loader
+    loader = TokenizeDatasetLoader(data_name, cws_data_path)
+    train_data = loader.load_pku()
+
+    # Preprocessor
+    p = POSPreprocess(train_data, pickle_path)
+    train_args["vocab_size"] = p.vocab_size
+    train_args["num_classes"] = p.num_classes
+
+    # Trainer
+    trainer = POSTrainer(train_args)
+
+    # Model
+    model = SeqLabeling(train_args)
+
+    # Start training
+    trainer.train(model)
+    print("Training finished!")
+
+    # Saver
+    saver = ModelSaver("./data_for_tests/saved_model.pkl")
+    saver.save_pytorch(model)
+    print("Model saved!")
+
+    del model, trainer, loader
+
+    # Define the same model
+    model = SeqLabeling(train_args)
+
+    # Dump trained parameters into the model
+    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
+    print("model loaded!")
+
+    # Load test configuration
+    test_args = ConfigSection()
+    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+
+    # Tester
+    tester = POSTester(test_args)
+
+    # Start testing
+    tester.test(model)
+
+    # print test results
+    print(tester.show_matrices())
+    print("model tested!")
+
+
+if __name__ == "__main__":
+    train_test()
+    # infer()
@@ -0,0 +1,14 @@
+from fastNLP.fastnlp import FastNLP
+
+
+def foo():
+    nlp = FastNLP("./data_for_tests/")
+    nlp.load("zh_pos_tag_model")
+    text = "这是最好的基于深度学习的中文分词系统。"
+    result = nlp.run(text)
+    print(result)
+    print("FastNLP finished!")
+
+
+if __name__ == "__main__":
+    foo()
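Given `make_output` above, `nlp.run(text)` returns one space-joined string per input sentence; hypothetically, something of the form:

    # Hypothetical output shape -- the actual segmentation depends on the
    # trained model:
    # ["这 是 最好 的 基于 深度 学习 的 中文 分词 系统 。"]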
@@ -1,28 +0,0 @@
-import aggregation
-import decoder
-import encoder
-
-
-class Input(object):
-    def __init__(self):
-        pass
-
-
-class Trainer(object):
-    def __init__(self, input, target, truth):
-        pass
-
-    def train(self):
-        pass
-
-
-def test_keras_like():
-    data_train, label_train = dataLoader("./data_path")
-
-    x = Input()
-    x = encoder.LSTM(input=x)
-    x = aggregation.max_pool(input=x)
-    y = decoder.CRF(input=x)
-
-    trainer = Trainer(input=data_train, target=y, truth=label_train)
-    trainer.train()
@@ -23,7 +23,7 @@ def infer():
     test_args = ConfigSection()
     ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

-    # fetch dictinary size and number of labels from pickle files
+    # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
     test_args["vocab_size"] = len(word2index)
     index2label = load_pickle(pickle_path, "id2class.pkl")
@@ -33,7 +33,7 @@ def infer():
     model = SeqLabeling(test_args)

     # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./saved_model.pkl")
+    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
     print("model loaded!")

     # Data Loader
@@ -82,7 +82,7 @@ def train_test():
     print("Training finished!")

     # Saver
-    saver = ModelSaver("./saved_model.pkl")
+    saver = ModelSaver("./data_for_tests/saved_model.pkl")
     saver.save_pytorch(model)
     print("Model saved!")
@@ -92,7 +92,7 @@ def train_test():
     model = SeqLabeling(train_args)

     # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./saved_model.pkl")
+    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
     print("model loaded!")

     # Load test configuration
@@ -111,4 +111,5 @@ def train_test():

 if __name__ == "__main__":
-    infer()
+    train_test()
+    # infer()