Browse Source

fastnlp.py works, see test/test_fastNLP.py for high-level API

tags/v0.1.0
FengZiYjun 6 years ago
parent
commit
c83008add9
9 changed files with 289 additions and 52 deletions
  1. +6
    -4
      fastNLP/action/inference.py
  2. +81
    -12
      fastNLP/fastnlp.py
  3. +5
    -1
      fastNLP/loader/config_loader.py
  4. +51
    -0
      fastNLP/loader/dataset_loader.py
  5. +11
    -2
      test/data_for_tests/config
  6. +115
    -0
      test/test_cws.py
  7. +14
    -0
      test/test_fastNLP.py
  8. +0
    -28
      test/test_keras_like.py
  9. +6
    -5
      test/test_seq_labeling.py

+ 6
- 4
fastNLP/action/inference.py View File

@@ -38,7 +38,7 @@ class Inference(object):
num_iter = len(data) // self.batch_size

for step in range(num_iter):
batch_x = self.batchify(data)
batch_x = self.make_batch(data)

prediction = self.data_forward(network, batch_x)

@@ -68,10 +68,11 @@ class Inference(object):
results = torch.Tensor(prediction).view(-1, )
return list(results.data)

def batchify(self, data):
def make_batch(self, data):
indices = next(self.iterator)
batch_x = [data[idx] for idx in indices]
batch_x = self.pad(batch_x)
if self.batch_size > 1:
batch_x = self.pad(batch_x)
return batch_x

@staticmethod
@@ -98,6 +99,7 @@ class Inference(object):
...
]
"""
assert isinstance(data, list)
data_index = []
default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL]
for example in data:
@@ -107,7 +109,7 @@ class Inference(object):
def prepare_output(self, batch_outputs):
"""
Transform list of batch outputs into strings.
:param batch_outputs: list of list [num_batch, tag_seq_length]
:param batch_outputs: list of list, of shape [num_batch, tag_seq_length]. Element type is Tensor.
:return:
"""
results = []


fastNLP/fastNLP.py → fastNLP/fastnlp.py View File

@@ -3,14 +3,14 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader

"""
mapping from model name to [URL, file_name.class_name]
mapping from model name to [URL, file_name.class_name, model_pickle_name]
Notice that the class of the model should be in "models" directory.

Example:
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
"""
FastNLP_MODEL_COLLECTION = {
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
}


@@ -26,6 +26,13 @@ class FastNLP(object):
"""

def __init__(self, model_dir="./"):
"""
:param model_dir: this directory should contain the following files:
1. a pre-trained model
2. a config file
3. "id2class.pkl"
4. "word2id.pkl"
"""
self.model_dir = model_dir
self.model = None

@@ -45,27 +52,32 @@ class FastNLP(object):

model_args = ConfigSection()
# To do: customized config file for model init parameters
ConfigLoader.load_config(self.model_dir + "default.cfg", model_args)
ConfigLoader.load_config(self.model_dir + "config", {"POS_infer": model_args})

# Construct the model
model = model_class(model_args)

# To do: framework independent
ModelLoader.load_pytorch(model, self.model_dir + model_name)
ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name][2])

self.model = model

print("Model loaded. ")

def run(self, infer_input):
def run(self, raw_input):
"""
Perform inference over given input using the loaded model.
:param infer_input: str, raw text
:param raw_input: str, raw text
:return results:
"""
infer = Inference()
data = infer.prepare_input(infer_input)
results = infer.predict(self.model, data)
return results

infer = Inference(self.model_dir)
infer_input = self.string_to_list(raw_input)

results = infer.predict(self.model, infer_input)

outputs = self.make_output(results)
return outputs

@staticmethod
def _get_model_class(file_class_name):
@@ -101,4 +113,61 @@ class FastNLP(object):
Check whether the desired model is already in the directory.
:param model_dir:
"""
pass
return True

def string_to_list(self, text, delimiter="\n"):
    """
    Split raw text into sentences, and each sentence into single characters.

    For word seg only, currently.
    This function is used to transform raw input to lists, which is done by DatasetLoader in training.
    The result is a two-level list:
    [
        [char_11, char_12, ...],
        [char_21, char_22, ...],
        ...
    ]
    :param text: str, raw input text. Leading and trailing whitespace is stripped before splitting.
    :param delimiter: str, separator used to split text into sentences.
    :return data: list of list of str, one list of characters per sentence.
    """
    data = [list(sent) for sent in text.strip().split(delimiter)]
    # To refactor: make_output reads self.data to pair characters with predicted labels.
    self.data = data
    return data

def make_output(self, results):
    """
    Transform model output into user-friendly contents.
    Example: In CWS, convert <BMES> labeling into segmented text.

    The characters stored in self.data (set by string_to_list) are paired with the labels
    in results, sentence by sentence.
    :param results: list of label sequences, one per sentence in self.data. Only the first
        character of each label is inspected (B / M / E / S).
    :return outputs: list of str, one per sentence, with the words joined by a single space.
    :raise ValueError: if a label does not start with B, M, E or S.
    """
    outputs = []
    for sent_char, sent_label in zip(self.data, results):
        words = []
        word = ""
        for char, label in zip(sent_char, sent_label):
            tag = label[0]
            if tag == "B":
                # A new word starts: flush any unfinished word first.
                if word:
                    words.append(word)
                word = char
            elif tag == "M":
                word += char
            elif tag == "E":
                word += char
                words.append(word)
                word = ""
            elif tag == "S":
                if word:
                    words.append(word)
                    word = ""
                words.append(char)
            else:
                raise ValueError("invalid label")
        # The label sequence may stop in the middle of a word (last tag is B or M);
        # keep those characters instead of silently dropping them from the output.
        if word:
            words.append(word)
        outputs.append(" ".join(words))
    return outputs

+ 5
- 1
fastNLP/loader/config_loader.py View File

@@ -20,9 +20,13 @@ class ConfigLoader(BaseLoader):
def load_config(file_path, sections):
"""
:param file_path: the path of config file
:param sections: the dict of sections
:param sections: the dict of {section_name(string): Section instance}
Example:
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
:return:
"""
assert isinstance(sections, dict)
cfg = configparser.ConfigParser()
if not os.path.exists(file_path):
raise FileNotFoundError("config file {} not found. ".format(file_path))


+ 51
- 0
fastNLP/loader/dataset_loader.py View File

@@ -22,6 +22,7 @@ class POSDatasetLoader(DatasetLoader):
and label2
Jerry label1
. label3
(separated by an empty line)
Hello label4
world label5
! label3
@@ -77,6 +78,51 @@ class POSDatasetLoader(DatasetLoader):
return data


class TokenizeDatasetLoader(DatasetLoader):
    """
    Data set loader for tokenization data sets
    """

    def __init__(self, data_name, data_path):
        super(TokenizeDatasetLoader, self).__init__(data_name, data_path)

    def load_pku(self):
        """
        Load the pku dataset for Chinese word segmentation.

        CWS (Chinese Word Segmentation) pku training dataset format:
            1. Each line is a sentence.
            2. Each word in a sentence is separated by space.
        This function converts the pku dataset into three-level lists with labels <BMES>.
            B: beginning of a word
            M: middle of a word
            E: ending of a word
            S: single character
        Blank lines are skipped so that no empty example ends up in the data set.

        :return: three-level lists, [[characters, labels], ...] with one entry per sentence
        """
        data = []
        with open(self.data_path, "r", encoding="utf-8") as f:
            # Stream the file line by line instead of loading every line up front.
            for sent in f:
                tokens = sent.split()
                if not tokens:
                    # An empty or whitespace-only line carries no sentence.
                    continue
                words = []
                labels = []
                for token in tokens:
                    # Every character of the token becomes one labelled item.
                    words.extend(token)
                    if len(token) == 1:
                        labels.append("S")
                    else:
                        labels.append("B")
                        labels.extend(["M"] * (len(token) - 2))
                        labels.append("E")
                data.append([words, labels])
        return data


class ClassDatasetLoader(DatasetLoader):
"""Loader for classification data sets"""

@@ -163,7 +209,12 @@ class LMDatasetLoader(DatasetLoader):


if __name__ == "__main__":
"""
data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
for example in data:
for w, l in zip(example[0], example[1]):
print(w, l)
"""

ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
print(ans)

+ 11
- 2
test/data_for_tests/config View File

@@ -54,8 +54,8 @@ test = 5
new_attr = 40

[POS]
epochs = 20
batch_size = 1
epochs = 1
batch_size = 32
pickle_path = "./data_for_tests/"
validate = true
save_best_dev = true
@@ -80,3 +80,12 @@ rnn_bi_direction = true
word_emb_dim = 100
dropout = 0.5
use_crf = true

[POS_infer]
pickle_path = "./data_for_tests/"
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
word_emb_dim = 100
vocab_size = 52
num_classes = 22

+ 115
- 0
test/test_cws.py View File

@@ -0,0 +1,115 @@
import sys

sys.path.append("..")

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.action.trainer import POSTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.action.tester import POSTester
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.action.inference import Inference

data_name = "pku_training.utf8"
cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
pickle_path = "data_for_tests"
data_infer_path = "data_for_tests/people_infer.txt"


def infer():
    # Inference reuses the configuration section of the test stage
    test_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

    # Dictionary size and number of labels are taken from the pickled mappings
    test_args["vocab_size"] = len(load_pickle(pickle_path, "word2id.pkl"))
    test_args["num_classes"] = len(load_pickle(pickle_path, "id2class.pkl"))

    # Build the same model, then dump the trained parameters into it
    model = SeqLabeling(test_args)
    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
    print("model loaded!")

    # Data Loader: strings are transformed into a list of list of strings,
    #     [
    #         [word_11, word_12, ...],
    #         [word_21, word_22, ...],
    #         ...
    #     ]
    # Each line in "people_infer.txt" is expected to be a sentence already,
    # so load_lines() only has to split them.
    infer_data = BaseLoader(data_name, data_infer_path).load_lines()

    # Inference interface
    predictor = Inference(pickle_path)
    results = predictor.predict(model, infer_data)

    print(results)
    print("Inference finished!")


def train_test():
    # ---- training stage ----
    # Read the "POS" section of the config file into train_args
    train_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})

    # Load the segmentation corpus
    loader = TokenizeDatasetLoader(data_name, cws_data_path)
    corpus = loader.load_pku()

    # The preprocessor supplies the vocabulary size and the number of classes
    preprocessor = POSPreprocess(corpus, pickle_path)
    train_args["vocab_size"] = preprocessor.vocab_size
    train_args["num_classes"] = preprocessor.num_classes

    # Create trainer and model, then train
    trainer = POSTrainer(train_args)
    model = SeqLabeling(train_args)
    trainer.train(model)
    print("Training finished!")

    # Save the trained model
    ModelSaver("./data_for_tests/saved_model.pkl").save_pytorch(model)
    print("Model saved!")

    del model, trainer, loader

    # ---- testing stage ----
    # Define the same model and dump the saved parameters into it
    model = SeqLabeling(train_args)
    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
    print("model loaded!")

    # Read the "POS_test" section of the config file into test_args
    test_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

    # Run the tester on the reloaded model
    tester = POSTester(test_args)
    tester.test(model)

    # Print test results
    print(tester.show_matrices())
    print("model tested!")


if __name__ == "__main__":
train_test()
# infer()

+ 14
- 0
test/test_fastNLP.py View File

@@ -0,0 +1,14 @@
from fastNLP.fastnlp import FastNLP


def foo():
    # Smoke test of the high-level API: load a model by name, then run it on raw text.
    text = "这是最好的基于深度学习的中文分词系统。"
    nlp = FastNLP("./data_for_tests/")
    nlp.load("zh_pos_tag_model")
    print(nlp.run(text))
    print("FastNLP finished!")


if __name__ == "__main__":
foo()

+ 0
- 28
test/test_keras_like.py View File

@@ -1,28 +0,0 @@
import aggregation
import decoder
import encoder


class Input(object):
def __init__(self):
pass


class Trainer(object):
def __init__(self, input, target, truth):
pass

def train(self):
pass


def test_keras_like():
data_train, label_train = dataLoader("./data_path")

x = Input()
x = encoder.LSTM(input=x)
x = aggregation.max_pool(input=x)
y = decoder.CRF(input=x)

trainer = Trainer(input=data_train, target=y, truth=label_train)
trainer.train()

test/test_POS_pipeline.py → test/test_seq_labeling.py View File

@@ -23,7 +23,7 @@ def infer():
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

# fetch dictinary size and number of labels from pickle files
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl")
@@ -33,7 +33,7 @@ def infer():
model = SeqLabeling(test_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./saved_model.pkl")
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")

# Data Loader
@@ -82,7 +82,7 @@ def train_test():
print("Training finished!")

# Saver
saver = ModelSaver("./saved_model.pkl")
saver = ModelSaver("./data_for_tests/saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")

@@ -92,7 +92,7 @@ def train_test():
model = SeqLabeling(train_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./saved_model.pkl")
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")

# Load test configuration
@@ -111,4 +111,5 @@ def train_test():


if __name__ == "__main__":
infer()
train_test()
# infer()

Loading…
Cancel
Save