[add] PeopleDailyCorpusLoader, to parse PeopleDaily Corpus [update] add CWS + POS_tag interface at FastNLP, see example in test_fastNLP.py [update] modify README.md and readme_example.py to the latest version.tags/v0.1.0
@@ -27,15 +27,16 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa | |||||
A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model. | A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model. | ||||
```python | ```python | ||||
from fastNLP.core.preprocess import ClassPreprocess | |||||
from fastNLP.core.predictor import ClassificationInfer | |||||
from fastNLP.core.trainer import ClassificationTrainer | |||||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||||
from fastNLP.models.base_model import BaseModel | from fastNLP.models.base_model import BaseModel | ||||
from fastNLP.modules import encoder | |||||
from fastNLP.modules import aggregation | from fastNLP.modules import aggregation | ||||
from fastNLP.modules import encoder | |||||
from fastNLP.modules import decoder | from fastNLP.modules import decoder | ||||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||||
from fastNLP.loader.preprocess import ClassPreprocess | |||||
from fastNLP.core.trainer import ClassificationTrainer | |||||
from fastNLP.core.inference import ClassificationInfer | |||||
from fastNLP.core.loss import Loss | |||||
from fastNLP.core.optimizer import Optimizer | |||||
class ClassificationModel(BaseModel): | class ClassificationModel(BaseModel): | ||||
@@ -50,7 +51,7 @@ class ClassificationModel(BaseModel): | |||||
self.enc = encoder.Conv( | self.enc = encoder.Conv( | ||||
in_channels=300, out_channels=100, kernel_size=3) | in_channels=300, out_channels=100, kernel_size=3) | ||||
self.agg = aggregation.MaxPool() | self.agg = aggregation.MaxPool() | ||||
self.dec = decoder.MLP(100, num_classes=num_classes) | |||||
self.dec = decoder.MLP(size_layer=[100, num_classes]) | |||||
def forward(self, x): | def forward(self, x): | ||||
x = self.emb(x) # [N,L] -> [N,L,C] | x = self.emb(x) # [N,L] -> [N,L,C] | ||||
@@ -60,16 +61,17 @@ class ClassificationModel(BaseModel): | |||||
return x | return x | ||||
data_dir = 'data' # directory to save data and model | |||||
train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||||
data_dir = 'save/' # directory to save data and model | |||||
train_path = './data_for_tests/text_classify.txt' # training set file | |||||
# load dataset | # load dataset | ||||
ds_loader = ClassDatasetLoader("train", train_path) | ds_loader = ClassDatasetLoader("train", train_path) | ||||
data = ds_loader.load() | data = ds_loader.load() | ||||
# pre-process dataset | # pre-process dataset | ||||
pre = ClassPreprocess(data_dir) | |||||
vocab_size, n_classes = pre.process(data, "data_train.pkl") | |||||
pre = ClassPreprocess() | |||||
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir) | |||||
n_classes, vocab_size = pre.num_classes, pre.vocab_size | |||||
# construct model | # construct model | ||||
model_args = { | model_args = { | ||||
@@ -78,28 +80,36 @@ model_args = { | |||||
} | } | ||||
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | ||||
# train model | |||||
# construct trainer | |||||
train_args = { | train_args = { | ||||
"epochs": 20, | |||||
"batch_size": 50, | |||||
"epochs": 3, | |||||
"batch_size": 16, | |||||
"pickle_path": data_dir, | "pickle_path": data_dir, | ||||
"validate": False, | "validate": False, | ||||
"save_best_dev": False, | "save_best_dev": False, | ||||
"model_saved_path": None, | "model_saved_path": None, | ||||
"use_cuda": True, | "use_cuda": True, | ||||
"learn_rate": 1e-3, | |||||
"momentum": 0.9} | |||||
trainer = ClassificationTrainer(train_args) | |||||
trainer.train(model) | |||||
"loss": Loss("cross_entropy"), | |||||
"optimizer": Optimizer("Adam", lr=0.001) | |||||
} | |||||
trainer = ClassificationTrainer(**train_args) | |||||
# start training | |||||
trainer.train(model, train_data=train_set, dev_data=dev_set) | |||||
# predict using model | # predict using model | ||||
seqs = [x[0] for x in data] | |||||
data_infer = [x[0] for x in data] | |||||
infer = ClassificationInfer(data_dir) | infer = ClassificationInfer(data_dir) | ||||
labels_pred = infer.predict(model, seqs) | |||||
labels_pred = infer.predict(model.cpu(), data_infer) | |||||
print(labels_pred) | |||||
``` | ``` | ||||
## Installation | ## Installation | ||||
Run the following command to install the fastNLP package. | |||||
```shell | |||||
pip install fastNLP | |||||
``` | |||||
### Cloning From GitHub | ### Cloning From GitHub | ||||
@@ -86,7 +86,7 @@ class BaseTester(object): | |||||
iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) | iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) | ||||
step = 0 | step = 0 | ||||
for batch_x, batch_y in self.make_batch(iterator, dev_data): | |||||
for batch_x, batch_y in self.make_batch(iterator): | |||||
with torch.no_grad(): | with torch.no_grad(): | ||||
prediction = self.data_forward(network, batch_x) | prediction = self.data_forward(network, batch_x) | ||||
eval_results = self.evaluate(prediction, batch_y) | eval_results = self.evaluate(prediction, batch_y) | ||||
@@ -123,14 +123,14 @@ class BaseTester(object): | |||||
"""Return a list of metrics. """ | """Return a list of metrics. """ | ||||
raise NotImplementedError | raise NotImplementedError | ||||
def show_matrices(self): | |||||
def show_metrics(self): | |||||
"""This is called by Trainer to print evaluation results on dev set during training. | """This is called by Trainer to print evaluation results on dev set during training. | ||||
:return print_str: str | :return print_str: str | ||||
""" | """ | ||||
raise NotImplementedError | raise NotImplementedError | ||||
def make_batch(self, iterator, data): | |||||
def make_batch(self, iterator): | |||||
raise NotImplementedError | raise NotImplementedError | ||||
@@ -194,7 +194,7 @@ class SeqLabelTester(BaseTester): | |||||
batch_accuracy = np.mean([x[1] for x in self.eval_history]) | batch_accuracy = np.mean([x[1] for x in self.eval_history]) | ||||
return batch_loss, batch_accuracy | return batch_loss, batch_accuracy | ||||
def show_matrices(self): | |||||
def show_metrics(self): | |||||
""" | """ | ||||
This is called by Trainer to print evaluation on dev set. | This is called by Trainer to print evaluation on dev set. | ||||
:return print_str: str | :return print_str: str | ||||
@@ -202,7 +202,7 @@ class SeqLabelTester(BaseTester): | |||||
loss, accuracy = self.metrics() | loss, accuracy = self.metrics() | ||||
return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy) | return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy) | ||||
def make_batch(self, iterator, data): | |||||
def make_batch(self, iterator): | |||||
return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True) | return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True) | ||||
@@ -216,7 +216,7 @@ class ClassificationTester(BaseTester): | |||||
""" | """ | ||||
super(ClassificationTester, self).__init__(**test_args) | super(ClassificationTester, self).__init__(**test_args) | ||||
def make_batch(self, iterator, data, max_len=None): | |||||
def make_batch(self, iterator, max_len=None): | |||||
return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len) | return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len) | ||||
def data_forward(self, network, x): | def data_forward(self, network, x): | ||||
@@ -144,7 +144,7 @@ class BaseTrainer(object): | |||||
print("Saved better model selected by validation.") | print("Saved better model selected by validation.") | ||||
logger.info("Saved better model selected by validation.") | logger.info("Saved better model selected by validation.") | ||||
valid_results = validator.show_matrices() | |||||
valid_results = validator.show_metrics() | |||||
print("[epoch {}] {}".format(epoch, valid_results)) | print("[epoch {}] {}".format(epoch, valid_results)) | ||||
logger.info("[epoch {}] {}".format(epoch, valid_results)) | logger.info("[epoch {}] {}".format(epoch, valid_results)) | ||||
@@ -31,7 +31,16 @@ FastNLP_MODEL_COLLECTION = { | |||||
"type": "seq_label", | "type": "seq_label", | ||||
"config_file_name": "config", | "config_file_name": "config", | ||||
"config_section_name": "text_class_model" | "config_section_name": "text_class_model" | ||||
}, | |||||
"pos_tag_model": { | |||||
"url": "", | |||||
"class": "sequence_modeling.AdvSeqLabel", | |||||
"pickle": "pos_tag_model_v_0.pkl", | |||||
"type": "seq_label", | |||||
"config_file_name": "pos_tag.config", | |||||
"config_section_name": "pos_tag_model" | |||||
} | } | ||||
} | } | ||||
@@ -259,3 +268,38 @@ def interpret_word_seg_results(char_seq, label_seq): | |||||
else: | else: | ||||
raise ValueError("invalid label {}".format(label[0])) | raise ValueError("invalid label {}".format(label[0])) | ||||
return words | return words | ||||
def interpret_cws_pos_results(char_seq, label_seq):
    """Transform model output into user-friendly contents.

    Every label has the form ``"<cws>-<pos>"``. The segmentation part decides how
    characters are grouped: ``B`` starts a word, ``M`` continues it, ``E`` ends
    it and ``S`` is a one-character word. Each word is paired with the POS tag
    shared by its characters.

    A word that is still open when a new word starts (``B``/``S``) or when the
    input ends is emitted as-is instead of being dropped or merged into a later
    word.

    :param char_seq: list of string
    :param label_seq: list of string, the same length as char_seq.
    :return outputs: list of tuple (word, pos_tag)
    :raises RuntimeError: if the characters of one word carry different POS tags.
    :raises ValueError: if the segmentation part of a label is not B, M, E or S.
    """
    outputs = []
    word = []
    word_pos = []

    def flush_word():
        """Emit the pending word, if any, and reset the buffers."""
        if not word:
            return
        # All characters of one word must agree on the POS tag.
        if len(set(word_pos)) > 1:
            raise RuntimeError("character-wise pos tags inconsistent. ")
        outputs.append(("".join(word), word_pos[0]))
        word.clear()
        word_pos.clear()

    for char, label in zip(char_seq, label_seq):
        tmp = label.split("-")
        cws_label, pos_tag = tmp[0], tmp[1]
        if cws_label == "B":
            # A new word begins: close whatever was left open before it.
            flush_word()
            word.append(char)
            word_pos.append(pos_tag)
        elif cws_label == "M":
            word.append(char)
            word_pos.append(pos_tag)
        elif cws_label == "E":
            word.append(char)
            word_pos.append(pos_tag)
            flush_word()
        elif cws_label == "S":
            # Keep output in input order even if the previous word never ended.
            flush_word()
            outputs.append((char, pos_tag))
        else:
            # Same policy as interpret_word_seg_results: never drop input silently.
            raise ValueError("invalid label {}".format(cws_label))

    # Do not lose the characters of a word that never received its "E" label.
    flush_word()
    return outputs
@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader): | |||||
return text.strip().split() | return text.strip().split() | ||||
if __name__ == "__main__": | |||||
class PeopleDailyCorpusLoader(DatasetLoader):
    """
    People Daily Corpus: Chinese word segmentation, POS tag, NER
    """

    def __init__(self, data_path):
        super(PeopleDailyCorpusLoader, self).__init__("people_daily_corpus", data_path)

    def load(self):
        """Parse the corpus file into POS-tagging and NER examples.

        Every line is split on whitespace and its first field is skipped. Each
        remaining field is read as ``token/pos``. Square brackets mark a
        multi-token span and drive the NER tag of each token:

        - ``B``: token that opens a bracket,
        - ``I``: token between an opening and a closing bracket,
        - ``L``: token that closes a bracket (text after ``]`` is discarded),
        - ``U``: token that both opens and closes a bracket,
        - ``O``: token outside any bracket.

        :return pos_tag_examples: list of [words, pos_tags], one entry per line
        :return ner_examples: list of [words, ner_tags], one entry per line
        :raises RuntimeError: if a closing bracket appears without an opening one.
        """
        pos_tag_examples = []
        ner_examples = []
        with open(self.data_path, "r", encoding="utf-8") as f:
            for sent in f:
                inside_ne = False
                sent_pos_tag = []
                sent_words = []
                sent_ner = []
                # The first whitespace-separated field of a line is not a token.
                for word in sent.strip().split()[1:]:
                    if "[" in word and "]" in word:
                        ner_tag = "U"
                        # Strip the bracket markup so it leaks into neither the
                        # token nor the POS tag.
                        word = word[1:word.index("]")]
                    elif "[" in word:
                        inside_ne = True
                        ner_tag = "B"
                        word = word[1:]
                    elif "]" in word:
                        if not inside_ne:
                            raise RuntimeError("only ] appears!")
                        inside_ne = False
                        ner_tag = "L"
                        word = word[:word.index("]")]
                    else:
                        ner_tag = "I" if inside_ne else "O"
                    tmp = word.split("/")
                    token, pos = tmp[0], tmp[1]
                    sent_ner.append(ner_tag)
                    sent_pos_tag.append(pos)
                    sent_words.append(token)
                pos_tag_examples.append([sent_words, sent_pos_tag])
                ner_examples.append([sent_words, sent_ner])
        return pos_tag_examples, ner_examples
if __name__ == "__main__":
    # Manual smoke check: parse the corpus and show the first examples of each task.
    corpus_path = "/home/zyfeng/data/CWS_POS_TAG_NER_people_daily.txt"
    pos, ner = PeopleDailyCorpusLoader(corpus_path).load()
    for examples in (pos, ner):
        print(examples[:10])
@@ -1,3 +1,4 @@ | |||||
from .CRF import ConditionalRandomField | from .CRF import ConditionalRandomField | ||||
from .MLP import MLP | |||||
__all__ = ["ConditionalRandomField"] | |||||
__all__ = ["ConditionalRandomField", "MLP"] |
@@ -1,114 +0,0 @@ | |||||
import sys | |||||
sys.path.append("..") | |||||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||||
from fastNLP.core.trainer import SeqLabelTrainer | |||||
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader | |||||
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||||
from fastNLP.saver.model_saver import ModelSaver | |||||
from fastNLP.loader.model_loader import ModelLoader | |||||
from fastNLP.core.tester import SeqLabelTester | |||||
from fastNLP.models.sequence_modeling import SeqLabeling | |||||
from fastNLP.core.predictor import Predictor | |||||
data_name = "pku_training.utf8" | |||||
cws_data_path = "/home/zyfeng/data/pku_training.utf8" | |||||
pickle_path = "./save/" | |||||
data_infer_path = "/home/zyfeng/data/pku_test.utf8" | |||||
def infer():
    """Label the raw inference file with a saved SeqLabeling model and print the result."""
    # Inference reuses the test-time configuration section.
    config = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": config})

    # Vocabulary size and label count are taken from the pickled dictionaries.
    config["vocab_size"] = len(load_pickle(pickle_path, "word2id.pkl"))
    config["num_classes"] = len(load_pickle(pickle_path, "id2class.pkl"))

    # Rebuild the network, then restore its trained parameters.
    network = SeqLabeling(config)
    ModelLoader.load_pytorch(network, "./data_for_tests/saved_model.pkl")
    print("model loaded!")

    # Read the raw lines to be labelled.
    raw_lines = BaseLoader(data_name, data_infer_path).load_lines()

    # Predict and report.
    predictor = Predictor(pickle_path)
    print(predictor.predict(network, raw_lines))
    print("Inference finished!")
def train_test():
    """Train a SeqLabeling model on the PKU data, save it, then test it on the dev split."""
    # Read the "train" and "test" sections of the CWS configuration.
    train_cfg = ConfigSection()
    test_cfg = ConfigSection()
    ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_cfg, "test": test_cfg})

    # Load the PKU-format training corpus.
    corpus = TokenizeDatasetLoader(data_name, cws_data_path).load_pku()

    # Pre-process and split into train / dev parts.
    preprocess = SeqLabelPreprocess()
    data_train, data_dev = preprocess.run(corpus, pickle_path=pickle_path, train_dev_split=0.3)
    train_cfg["vocab_size"] = preprocess.vocab_size
    train_cfg["num_classes"] = preprocess.num_classes

    # Build trainer and model from the same arguments, then train.
    trainer = SeqLabelTrainer(train_cfg)
    network = SeqLabeling(train_cfg)
    trainer.train(network, data_train, data_dev)
    print("Training finished!")

    # Persist the trained parameters.
    ModelSaver("./save/saved_model.pkl").save_pytorch(network)
    print("Model saved!")

    # Evaluate on the held-out dev split.
    test(data_dev)
def test(test_data):
    """Evaluate the saved SeqLabeling model on ``test_data`` and print the results.

    :param test_data: examples handed to ``SeqLabelTester.test``.
    """
    # Model hyper-parameters come from the "POS" section.
    model_cfg = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": model_cfg})

    # Rebuild the network, then restore its trained parameters.
    network = SeqLabeling(model_cfg)
    ModelLoader.load_pytorch(network, "./data_for_tests/saved_model.pkl")
    print("model loaded!")

    # Tester settings come from the "POS_test" section.
    tester_cfg = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": tester_cfg})

    # Run the evaluation and show its summary.
    evaluator = SeqLabelTester(tester_cfg)
    evaluator.test(network, test_data)
    print(evaluator.show_matrices())
    print("model tested!")
if __name__ == "__main__": | |||||
train_test() |
@@ -31,4 +31,16 @@ pickle_path = "./save/" | |||||
use_crf = true | use_crf = true | ||||
use_cuda = true | use_cuda = true | ||||
rnn_hidden_units = 100 | rnn_hidden_units = 100 | ||||
word_emb_dim = 100 | |||||
[model] | |||||
save_output = true | |||||
validate_in_training = true | |||||
save_dev_input = false | |||||
save_loss = true | |||||
batch_size = 640 | |||||
pickle_path = "./save/" | |||||
use_crf = true | |||||
use_cuda = true | |||||
rnn_hidden_units = 100 | |||||
word_emb_dim = 100 | word_emb_dim = 100 |
@@ -125,7 +125,7 @@ def test(): | |||||
tester.test(model, dev_data) | tester.test(model, dev_data) | ||||
# print test results | # print test results | ||||
print(tester.show_matrices()) | |||||
print(tester.show_metrics()) | |||||
print("model tested!") | print("model tested!") | ||||
@@ -1,29 +1,35 @@ | |||||
[train] | [train] | ||||
epochs = 10 | |||||
batch_size = 32 | |||||
epochs = 30 | |||||
batch_size = 64 | |||||
pickle_path = "./save/" | pickle_path = "./save/" | ||||
validate = true | validate = true | ||||
save_best_dev = true | save_best_dev = true | ||||
model_saved_path = "./save/" | model_saved_path = "./save/" | ||||
rnn_hidden_units = 100 | rnn_hidden_units = 100 | ||||
rnn_layers = 2 | |||||
rnn_bi_direction = true | |||||
word_emb_dim = 100 | word_emb_dim = 100 | ||||
dropout = 0.5 | |||||
use_crf = true | use_crf = true | ||||
use_cuda = true | use_cuda = true | ||||
print_every_step = 10 | |||||
[test] | [test] | ||||
save_output = true | save_output = true | ||||
validate_in_training = true | validate_in_training = true | ||||
save_dev_input = false | save_dev_input = false | ||||
save_loss = true | save_loss = true | ||||
batch_size = 64 | |||||
batch_size = 640 | |||||
pickle_path = "./save/" | |||||
use_crf = true | |||||
use_cuda = true | |||||
[POS_test] | |||||
save_output = true | |||||
validate_in_training = true | |||||
save_dev_input = false | |||||
save_loss = true | |||||
batch_size = 640 | |||||
pickle_path = "./save/" | pickle_path = "./save/" | ||||
rnn_hidden_units = 100 | |||||
rnn_layers = 1 | |||||
rnn_bi_direction = true | |||||
word_emb_dim = 100 | |||||
dropout = 0.5 | |||||
use_crf = true | use_crf = true | ||||
use_cuda = true | use_cuda = true | ||||
rnn_hidden_units = 100 | |||||
word_emb_dim = 100 |
@@ -0,0 +1,146 @@ | |||||
import os | |||||
import sys | |||||
sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) | |||||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||||
from fastNLP.core.trainer import SeqLabelTrainer | |||||
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader | |||||
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||||
from fastNLP.saver.model_saver import ModelSaver | |||||
from fastNLP.loader.model_loader import ModelLoader | |||||
from fastNLP.core.tester import SeqLabelTester | |||||
from fastNLP.models.sequence_modeling import AdvSeqLabel | |||||
from fastNLP.core.predictor import SeqLabelInfer | |||||
# not in the file's dir | |||||
if len(os.path.dirname(__file__)) != 0: | |||||
os.chdir(os.path.dirname(__file__)) | |||||
datadir = "/home/zyfeng/data/" | |||||
cfgfile = './pos_tag.cfg' | |||||
data_name = "CWS_POS_TAG_NER_people_daily.txt" | |||||
pos_tag_data_path = os.path.join(datadir, data_name) | |||||
pickle_path = "save" | |||||
data_infer_path = os.path.join(datadir, "infer.utf8") | |||||
def infer():
    """Label the raw inference file with the saved AdvSeqLabel model and print the result."""
    # Inference reuses the "POS_test" configuration section.
    config = ConfigSection()
    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": config})

    # Vocabulary size and label count are taken from the pickled dictionaries.
    config["vocab_size"] = len(load_pickle(pickle_path, "word2id.pkl"))
    config["num_classes"] = len(load_pickle(pickle_path, "id2class.pkl"))

    # Rebuild the network; inference cannot proceed without trained parameters,
    # so a load failure is reported and re-raised.
    network = AdvSeqLabel(config)
    try:
        ModelLoader.load_pytorch(network, "./save/saved_model.pkl")
    except Exception:
        print('cannot load model!')
        raise
    else:
        print('model loaded!')

    # Read the raw lines to be labelled.
    raw_lines = BaseLoader(data_name, data_infer_path).load_lines()
    print('data loaded')

    # Predict and report.
    predictor = SeqLabelInfer(pickle_path)
    print(predictor.predict(network, raw_lines))
    print("Inference finished!")
def train():
    """Train an AdvSeqLabel POS tagger on the People Daily corpus and save it.

    If a checkpoint already exists at ``./save/saved_model.pkl`` its parameters
    are loaded first so training continues from them. Loading stays best-effort:
    a failure never aborts training, but it is reported with its cause instead
    of being mistaken for a missing checkpoint.
    """
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})

    # Data Loader: only the POS-tagging examples are used, the NER ones are ignored.
    loader = PeopleDailyCorpusLoader(pos_tag_data_path)
    train_data, _ = loader.load()

    # Preprocessor
    preprocessor = SeqLabelPreprocess()
    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = preprocessor.vocab_size
    train_args["num_classes"] = preprocessor.num_classes

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = AdvSeqLabel(train_args)
    model_path = "./save/saved_model.pkl"
    if os.path.exists(model_path):
        try:
            ModelLoader.load_pytorch(model, model_path)
            print('model parameter loaded!')
        except Exception as e:
            # Keep going from scratch, but say why the checkpoint was rejected.
            print("Failed to load saved model ({}). Continue.".format(e))
    else:
        print("No saved model. Continue.")

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver(model_path)
    saver.save_pytorch(model)
    print("Model saved!")
def test():
    """Evaluate the saved AdvSeqLabel model on the pickled dev set and print the metrics."""
    # Tester and model settings both come from the "POS_test" section.
    config = ConfigSection()
    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": config})

    # Vocabulary size and label count are taken from the pickled dictionaries.
    config["vocab_size"] = len(load_pickle(pickle_path, "word2id.pkl"))
    config["num_classes"] = len(load_pickle(pickle_path, "id2class.pkl"))

    # Dev examples stored under pickle_path.
    dev_examples = load_pickle(pickle_path, "data_dev.pkl")

    # Rebuild the network, then restore its trained parameters.
    network = AdvSeqLabel(config)
    ModelLoader.load_pytorch(network, "./save/saved_model.pkl")
    print("model loaded!")

    # Run the evaluation and show its summary.
    evaluator = SeqLabelTester(**config.data)
    evaluator.test(network, dev_examples)
    print(evaluator.show_metrics())
    print("model tested!")
if __name__ == "__main__":
    import argparse

    # Command-line entry point: --mode selects which stage of the POS tagging
    # pipeline to run.
    parser = argparse.ArgumentParser(description='Run a Chinese POS tagging model')
    parser.add_argument('--mode', help='which stage to run', choices=['train', 'test', 'infer'])
    args = parser.parse_args()
    if args.mode == 'train':
        train()
    elif args.mode == 'test':
        test()
    elif args.mode == 'infer':
        infer()
    else:
        # argparse already rejects unknown choices, so this is the "--mode omitted" case.
        print('no mode specified for model!')
        parser.print_help()
@@ -68,7 +68,7 @@ class MyNERTester(SeqLabelTester): | |||||
def metrics(self): | def metrics(self): | ||||
return np.mean(self.eval_history) | return np.mean(self.eval_history) | ||||
def show_matrices(self): | |||||
def show_metrics(self): | |||||
return "dev accuracy={:.2f}".format(float(self.metrics())) | return "dev accuracy={:.2f}".format(float(self.metrics())) | ||||
@@ -1,19 +1,13 @@ | |||||
# python: 3.5 | |||||
# pytorch: 0.4 | |||||
################ | |||||
# Test cross validation. | |||||
################ | |||||
from fastNLP.loader.preprocess import ClassPreprocess | |||||
from fastNLP.core.loss import Loss | |||||
from fastNLP.core.optimizer import Optimizer | |||||
from fastNLP.core.predictor import ClassificationInfer | from fastNLP.core.predictor import ClassificationInfer | ||||
from fastNLP.core.preprocess import ClassPreprocess | |||||
from fastNLP.core.trainer import ClassificationTrainer | from fastNLP.core.trainer import ClassificationTrainer | ||||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | from fastNLP.loader.dataset_loader import ClassDatasetLoader | ||||
from fastNLP.models.base_model import BaseModel | from fastNLP.models.base_model import BaseModel | ||||
from fastNLP.modules import aggregation | from fastNLP.modules import aggregation | ||||
from fastNLP.modules import encoder | |||||
from fastNLP.modules import decoder | from fastNLP.modules import decoder | ||||
from fastNLP.modules import encoder | |||||
class ClassificationModel(BaseModel): | class ClassificationModel(BaseModel): | ||||
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel): | |||||
self.enc = encoder.Conv( | self.enc = encoder.Conv( | ||||
in_channels=300, out_channels=100, kernel_size=3) | in_channels=300, out_channels=100, kernel_size=3) | ||||
self.agg = aggregation.MaxPool() | self.agg = aggregation.MaxPool() | ||||
self.dec = decoder.MLP(100, num_classes=num_classes) | |||||
self.dec = decoder.MLP(size_layer=[100, num_classes]) | |||||
def forward(self, x): | def forward(self, x): | ||||
x = self.emb(x) # [N,L] -> [N,L,C] | x = self.emb(x) # [N,L] -> [N,L,C] | ||||
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel): | |||||
return x | return x | ||||
data_dir = 'data' # directory to save data and model | |||||
train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||||
data_dir = 'save/' # directory to save data and model | |||||
train_path = './data_for_tests/text_classify.txt' # training set file | |||||
# load dataset | # load dataset | ||||
ds_loader = ClassDatasetLoader("train", train_path) | ds_loader = ClassDatasetLoader("train", train_path) | ||||
data = ds_loader.load() | data = ds_loader.load() | ||||
# pre-process dataset | # pre-process dataset | ||||
pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5) | |||||
# pre = ClassPreprocess(data, data_dir) | |||||
n_classes = pre.num_classes | |||||
vocab_size = pre.vocab_size | |||||
pre = ClassPreprocess() | |||||
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir) | |||||
n_classes, vocab_size = pre.num_classes, pre.vocab_size | |||||
# construct model | # construct model | ||||
model_args = { | model_args = { | ||||
@@ -58,22 +51,25 @@ model_args = { | |||||
} | } | ||||
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | ||||
# train model | |||||
# construct trainer | |||||
train_args = { | train_args = { | ||||
"epochs": 10, | |||||
"batch_size": 50, | |||||
"epochs": 3, | |||||
"batch_size": 16, | |||||
"pickle_path": data_dir, | "pickle_path": data_dir, | ||||
"validate": False, | "validate": False, | ||||
"save_best_dev": False, | "save_best_dev": False, | ||||
"model_saved_path": None, | "model_saved_path": None, | ||||
"use_cuda": True, | "use_cuda": True, | ||||
"learn_rate": 1e-3, | |||||
"momentum": 0.9} | |||||
trainer = ClassificationTrainer(train_args) | |||||
# trainer.train(model, ['data_train.pkl', 'data_dev.pkl']) | |||||
trainer.cross_validate(model) | |||||
"loss": Loss("cross_entropy"), | |||||
"optimizer": Optimizer("Adam", lr=0.001) | |||||
} | |||||
trainer = ClassificationTrainer(**train_args) | |||||
# start training | |||||
trainer.train(model, train_data=train_set, dev_data=dev_set) | |||||
# predict using model | # predict using model | ||||
data_infer = [x[0] for x in data] | data_infer = [x[0] for x in data] | ||||
infer = ClassificationInfer(data_dir) | infer = ClassificationInfer(data_dir) | ||||
labels_pred = infer.predict(model, data_infer) | |||||
labels_pred = infer.predict(model.cpu(), data_infer) | |||||
print(labels_pred) |
@@ -134,7 +134,7 @@ def train_and_test(): | |||||
tester.test(model, data_dev) | tester.test(model, data_dev) | ||||
# print test results | # print test results | ||||
print(tester.show_matrices()) | |||||
print(tester.show_metrics()) | |||||
print("model tested!") | print("model tested!") | ||||
@@ -108,7 +108,7 @@ def train_test(): | |||||
tester.test(model, data_train) | tester.test(model, data_train) | ||||
# print test results | # print test results | ||||
print(tester.show_matrices()) | |||||
print(tester.show_metrics()) | |||||
print("model tested!") | print("model tested!") | ||||
@@ -1,9 +1,12 @@ | |||||
import sys | import sys | ||||
sys.path.append("..") | sys.path.append("..") | ||||
from fastNLP.fastnlp import FastNLP | from fastNLP.fastnlp import FastNLP | ||||
from fastNLP.fastnlp import interpret_word_seg_results | |||||
from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results | |||||
PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" | PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" | ||||
PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/" | |||||
def word_seg(): | def word_seg(): | ||||
nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) | nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) | ||||
@@ -39,5 +42,33 @@ def test_word_seg_interpret(): | |||||
print(interpret_word_seg_results(chars, labels)) | print(interpret_word_seg_results(chars, labels)) | ||||
def test_interpret_cws_pos_results():
    """Print the (word, pos_tag) pairs produced for one hand-labelled sentence."""
    # One sentence given as (character, "<cws>-<pos>") pairs.
    tagged_chars = [
        ('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'),
        ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'),
        ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w'),
    ]
    chars = [char for char, _ in tagged_chars]
    labels = [label for _, label in tagged_chars]
    print(interpret_cws_pos_results(chars, labels))
def test_pos_tag():
    """Run the pre-trained POS tagging model on sample sentences and print the tagged words."""
    nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
    nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
    sentences = [
        "这是最好的基于深度学习的中文分词系统。",
        "大王叫我来巡山。",
        "我党多年来致力于改善人民生活水平。",
    ]
    # Each result is a sequence of pairs; split it into parallel
    # character and label lists before interpreting.
    for example in nlp.run(sentences):
        words = [res[0] for res in example]
        labels = [res[1] for res in example]
        print(interpret_cws_pos_results(words, labels))
if __name__ == "__main__": | if __name__ == "__main__": | ||||
word_seg() | word_seg() |
@@ -5,7 +5,6 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader | |||||
from fastNLP.models.sequence_modeling import SeqLabeling | from fastNLP.models.sequence_modeling import SeqLabeling | ||||
data_name = "pku_training.utf8" | data_name = "pku_training.utf8" | ||||
cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" | |||||
pickle_path = "data_for_tests" | pickle_path = "data_for_tests" | ||||
@@ -17,7 +16,8 @@ def foo(): | |||||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | ||||
# Preprocessor | # Preprocessor | ||||
p = SeqLabelPreprocess(train_data, pickle_path) | |||||
p = SeqLabelPreprocess() | |||||
p.run(train_data) | |||||
train_args["vocab_size"] = p.vocab_size | train_args["vocab_size"] = p.vocab_size | ||||
train_args["num_classes"] = p.num_classes | train_args["num_classes"] = p.num_classes | ||||
@@ -30,7 +30,7 @@ def foo(): | |||||
print("start validation.") | print("start validation.") | ||||
validator.test(model) | validator.test(model) | ||||
print(validator.show_matrices()) | |||||
print(validator.show_metrics()) | |||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||