[add] PeopleDailyCorpusLoader, to parse the People's Daily corpus. [update] Add a CWS + POS tagging interface to FastNLP; see the example in test_fastNLP.py. [update] Bring README.md and readme_example.py up to date with the latest version. (tags/v0.1.0)
@@ -27,15 +27,16 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa | |||
A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model. | |||
```python | |||
from fastNLP.core.preprocess import ClassPreprocess | |||
from fastNLP.core.predictor import ClassificationInfer | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
from fastNLP.models.base_model import BaseModel | |||
from fastNLP.modules import encoder | |||
from fastNLP.modules import aggregation | |||
from fastNLP.modules import encoder | |||
from fastNLP.modules import decoder | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
from fastNLP.loader.preprocess import ClassPreprocess | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.core.inference import ClassificationInfer | |||
from fastNLP.core.loss import Loss | |||
from fastNLP.core.optimizer import Optimizer | |||
class ClassificationModel(BaseModel): | |||
@@ -50,7 +51,7 @@ class ClassificationModel(BaseModel): | |||
self.enc = encoder.Conv( | |||
in_channels=300, out_channels=100, kernel_size=3) | |||
self.agg = aggregation.MaxPool() | |||
self.dec = decoder.MLP(100, num_classes=num_classes) | |||
self.dec = decoder.MLP(size_layer=[100, num_classes]) | |||
def forward(self, x): | |||
x = self.emb(x) # [N,L] -> [N,L,C] | |||
@@ -60,16 +61,17 @@ class ClassificationModel(BaseModel): | |||
return x | |||
data_dir = 'data' # directory to save data and model | |||
train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
data_dir = 'save/' # directory to save data and model | |||
train_path = './data_for_tests/text_classify.txt' # training set file | |||
# load dataset | |||
ds_loader = ClassDatasetLoader("train", train_path) | |||
data = ds_loader.load() | |||
# pre-process dataset | |||
pre = ClassPreprocess(data_dir) | |||
vocab_size, n_classes = pre.process(data, "data_train.pkl") | |||
pre = ClassPreprocess() | |||
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir) | |||
n_classes, vocab_size = pre.num_classes, pre.vocab_size | |||
# construct model | |||
model_args = { | |||
@@ -78,28 +80,36 @@ model_args = { | |||
} | |||
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | |||
# train model | |||
# construct trainer | |||
train_args = { | |||
"epochs": 20, | |||
"batch_size": 50, | |||
"epochs": 3, | |||
"batch_size": 16, | |||
"pickle_path": data_dir, | |||
"validate": False, | |||
"save_best_dev": False, | |||
"model_saved_path": None, | |||
"use_cuda": True, | |||
"learn_rate": 1e-3, | |||
"momentum": 0.9} | |||
trainer = ClassificationTrainer(train_args) | |||
trainer.train(model) | |||
"loss": Loss("cross_entropy"), | |||
"optimizer": Optimizer("Adam", lr=0.001) | |||
} | |||
trainer = ClassificationTrainer(**train_args) | |||
# start training | |||
trainer.train(model, train_data=train_set, dev_data=dev_set) | |||
# predict using model | |||
seqs = [x[0] for x in data] | |||
data_infer = [x[0] for x in data] | |||
infer = ClassificationInfer(data_dir) | |||
labels_pred = infer.predict(model, seqs) | |||
labels_pred = infer.predict(model.cpu(), data_infer) | |||
print(labels_pred) | |||
``` | |||
## Installation | |||
Run the following command to install the fastNLP package. | |||
```shell | |||
pip install fastNLP | |||
``` | |||
### Cloning From GitHub | |||
@@ -86,7 +86,7 @@ class BaseTester(object): | |||
iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) | |||
step = 0 | |||
for batch_x, batch_y in self.make_batch(iterator, dev_data): | |||
for batch_x, batch_y in self.make_batch(iterator): | |||
with torch.no_grad(): | |||
prediction = self.data_forward(network, batch_x) | |||
eval_results = self.evaluate(prediction, batch_y) | |||
@@ -123,14 +123,14 @@ class BaseTester(object): | |||
"""Return a list of metrics. """ | |||
raise NotImplementedError | |||
def show_matrices(self): | |||
def show_metrics(self): | |||
"""This is called by Trainer to print evaluation results on dev set during training. | |||
:return print_str: str | |||
""" | |||
raise NotImplementedError | |||
def make_batch(self, iterator, data): | |||
def make_batch(self, iterator): | |||
raise NotImplementedError | |||
@@ -194,7 +194,7 @@ class SeqLabelTester(BaseTester): | |||
batch_accuracy = np.mean([x[1] for x in self.eval_history]) | |||
return batch_loss, batch_accuracy | |||
def show_matrices(self): | |||
def show_metrics(self): | |||
""" | |||
This is called by Trainer to print evaluation on dev set. | |||
:return print_str: str | |||
@@ -202,7 +202,7 @@ class SeqLabelTester(BaseTester): | |||
loss, accuracy = self.metrics() | |||
return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy) | |||
def make_batch(self, iterator, data): | |||
def make_batch(self, iterator): | |||
return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True) | |||
@@ -216,7 +216,7 @@ class ClassificationTester(BaseTester): | |||
""" | |||
super(ClassificationTester, self).__init__(**test_args) | |||
def make_batch(self, iterator, data, max_len=None): | |||
def make_batch(self, iterator, max_len=None): | |||
return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len) | |||
def data_forward(self, network, x): | |||
@@ -144,7 +144,7 @@ class BaseTrainer(object): | |||
print("Saved better model selected by validation.") | |||
logger.info("Saved better model selected by validation.") | |||
valid_results = validator.show_matrices() | |||
valid_results = validator.show_metrics() | |||
print("[epoch {}] {}".format(epoch, valid_results)) | |||
logger.info("[epoch {}] {}".format(epoch, valid_results)) | |||
@@ -31,7 +31,16 @@ FastNLP_MODEL_COLLECTION = { | |||
"type": "seq_label", | |||
"config_file_name": "config", | |||
"config_section_name": "text_class_model" | |||
}, | |||
"pos_tag_model": { | |||
"url": "", | |||
"class": "sequence_modeling.AdvSeqLabel", | |||
"pickle": "pos_tag_model_v_0.pkl", | |||
"type": "seq_label", | |||
"config_file_name": "pos_tag.config", | |||
"config_section_name": "pos_tag_model" | |||
} | |||
} | |||
@@ -259,3 +268,38 @@ def interpret_word_seg_results(char_seq, label_seq): | |||
else: | |||
raise ValueError("invalid label {}".format(label[0])) | |||
return words | |||
def interpret_cws_pos_results(char_seq, label_seq):
    """Transform model output into user-friendly contents.

    Each label has the form "<seg>-<pos>", where <seg> is one of B (begin),
    M (middle), E (end) or S (single-character word) and <pos> is the
    part-of-speech tag predicted for that character.

    Well-formed sequences are decoded exactly as before. Ill-formed sequences
    are decoded defensively so that no character is lost or glued onto an
    unrelated word:

    * a word still open when a new "B" or an "S" arrives is emitted first;
    * a word still open at the end of the sequence is emitted as well.

    :param char_seq: list of string
    :param label_seq: list of string, the same length as char_seq.
    :return outputs: list of tuple (words, pos_tag):
    :raises ValueError: if a label has no "-" separator or an unknown
        segmentation prefix (same style as interpret_word_seg_results).
    :raises RuntimeError: if the characters of one word carry different POS tags.
    """
    outputs = []
    word = []
    word_pos = []

    def flush():
        """Emit the buffered characters as one word and reset the buffers."""
        if not word:
            return
        # all characters of a word must agree on the POS tag
        if len(set(word_pos)) > 1:
            raise RuntimeError("character-wise pos tags inconsistent. ")
        outputs.append(("".join(word), word_pos[0]))
        word.clear()
        word_pos.clear()

    for char, label in zip(char_seq, label_seq):
        # split on the first "-" only, so the POS part is kept whole
        cws_label, sep, pos_tag = label.partition("-")
        if not sep:
            raise ValueError("invalid label {}".format(label))
        if cws_label == "B":
            # a new word starts: do not let a dangling prefix leak into it
            flush()
            word.append(char)
            word_pos.append(pos_tag)
        elif cws_label == "M":
            word.append(char)
            word_pos.append(pos_tag)
        elif cws_label == "E":
            word.append(char)
            word_pos.append(pos_tag)
            flush()
        elif cws_label == "S":
            flush()
            outputs.append((char, pos_tag))
        else:
            raise ValueError("invalid label {}".format(label))
    # keep a trailing word that never received its "E"
    flush()
    return outputs
@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader): | |||
return text.strip().split() | |||
if __name__ == "__main__": | |||
class PeopleDailyCorpusLoader(DatasetLoader): | |||
""" | |||
data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines() | |||
for example in data: | |||
for w, l in zip(example[0], example[1]): | |||
print(w, l) | |||
People Daily Corpus: Chinese word segmentation, POS tag, NER | |||
""" | |||
ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku() | |||
print(ans) | |||
def __init__(self, data_path):
    """Register the corpus under the fixed name "people_daily_corpus".

    :param data_path: path to the corpus text file, forwarded to the parent loader.
    """
    super().__init__("people_daily_corpus", data_path)
def load(self):
    """Parse the corpus file into POS tagging and NER examples.

    Every line is split on whitespace and its first field is dropped
    (presumably a sentence identifier -- confirm against the corpus).
    The remaining fields are "token/pos" items. Square brackets group
    several items into one named entity, e.g. "[a/n b/n c/n]nt".

    NER tags: "B" for the item opening a bracket, "I" inside, "L" for the
    item closing it, "U" for a bracket holding a single item, "O" outside.

    Lines that contain no items (blank or identifier-only) are skipped, so
    they do not produce empty examples.

    :return: (pos_tag_examples, ner_examples), two parallel lists whose
        entries are [words, pos_tags] and [words, ner_tags] respectively.
    :raises RuntimeError: if a closing bracket appears without an opening one.
    :raises ValueError: if an item has no "/" separating token and POS tag.
    """
    pos_tag_examples = []
    ner_examples = []
    with open(self.data_path, "r", encoding="utf-8") as f:
        # stream the file instead of materialising every line up front
        for sent in f:
            words = sent.strip().split()[1:]
            if not words:
                continue
            inside_ne = False
            sent_pos_tag = []
            sent_words = []
            sent_ner = []
            for word in words:
                opens = "[" in word
                closes = "]" in word
                if opens and closes:
                    # single-item entity: strip both the bracket and the
                    # entity-type suffix so token and POS stay clean
                    ner_tag = "U"
                    word = word[1:word.index("]")]
                elif opens:
                    inside_ne = True
                    ner_tag = "B"
                    word = word[1:]
                elif closes:
                    if not inside_ne:
                        raise RuntimeError("only ] appears!")
                    inside_ne = False
                    ner_tag = "L"
                    word = word[:word.index("]")]
                else:
                    ner_tag = "I" if inside_ne else "O"
                tmp = word.split("/")
                if len(tmp) < 2:
                    raise ValueError("token without pos tag: {!r}".format(word))
                token, pos = tmp[0], tmp[1]
                sent_ner.append(ner_tag)
                sent_pos_tag.append(pos)
                sent_words.append(token)
            pos_tag_examples.append([sent_words, sent_pos_tag])
            ner_examples.append([sent_words, sent_ner])
    return pos_tag_examples, ner_examples
if __name__ == "__main__": | |||
loader = PeopleDailyCorpusLoader("/home/zyfeng/data/CWS_POS_TAG_NER_people_daily.txt") | |||
pos, ner = loader.load() | |||
print(pos[:10]) | |||
print(ner[:10]) |
@@ -1,3 +1,4 @@ | |||
from .CRF import ConditionalRandomField | |||
from .MLP import MLP | |||
__all__ = ["ConditionalRandomField"] | |||
__all__ = ["ConditionalRandomField", "MLP"] |
@@ -1,114 +0,0 @@ | |||
import sys | |||
sys.path.append("..") | |||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
from fastNLP.core.trainer import SeqLabelTrainer | |||
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader | |||
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
from fastNLP.saver.model_saver import ModelSaver | |||
from fastNLP.loader.model_loader import ModelLoader | |||
from fastNLP.core.tester import SeqLabelTester | |||
from fastNLP.models.sequence_modeling import SeqLabeling | |||
from fastNLP.core.predictor import Predictor | |||
data_name = "pku_training.utf8" | |||
cws_data_path = "/home/zyfeng/data/pku_training.utf8" | |||
pickle_path = "./save/" | |||
data_infer_path = "/home/zyfeng/data/pku_test.utf8" | |||
def infer(): | |||
# Load infer configuration, the same as test | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# Define the same model | |||
model = SeqLabeling(test_args) | |||
# Dump trained parameters into the model | |||
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
print("model loaded!") | |||
# Data Loader | |||
raw_data_loader = BaseLoader(data_name, data_infer_path) | |||
infer_data = raw_data_loader.load_lines() | |||
# Inference interface | |||
infer = Predictor(pickle_path) | |||
results = infer.predict(model, infer_data) | |||
print(results) | |||
print("Inference finished!") | |||
def train_test(): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
test_args = ConfigSection() | |||
ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args}) | |||
# Data Loader | |||
loader = TokenizeDatasetLoader(data_name, cws_data_path) | |||
train_data = loader.load_pku() | |||
# Preprocessor | |||
preprocess = SeqLabelPreprocess() | |||
data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) | |||
train_args["vocab_size"] = preprocess.vocab_size | |||
train_args["num_classes"] = preprocess.num_classes | |||
# Trainer | |||
trainer = SeqLabelTrainer(train_args) | |||
# Model | |||
model = SeqLabeling(train_args) | |||
# Start training | |||
trainer.train(model, data_train, data_dev) | |||
print("Training finished!") | |||
# Saver | |||
saver = ModelSaver("./save/saved_model.pkl") | |||
saver.save_pytorch(model) | |||
print("Model saved!") | |||
# testing with validation set | |||
test(data_dev) | |||
def test(test_data): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
# Define the same model | |||
model = SeqLabeling(train_args) | |||
# Dump trained parameters into the model | |||
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
print("model loaded!") | |||
# Load test configuration | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
# Tester | |||
tester = SeqLabelTester(test_args) | |||
# Start testing | |||
tester.test(model, test_data) | |||
# print test results | |||
print(tester.show_matrices()) | |||
print("model tested!") | |||
if __name__ == "__main__": | |||
train_test() |
@@ -31,4 +31,16 @@ pickle_path = "./save/" | |||
use_crf = true | |||
use_cuda = true | |||
rnn_hidden_units = 100 | |||
word_emb_dim = 100 | |||
[model] | |||
save_output = true | |||
validate_in_training = true | |||
save_dev_input = false | |||
save_loss = true | |||
batch_size = 640 | |||
pickle_path = "./save/" | |||
use_crf = true | |||
use_cuda = true | |||
rnn_hidden_units = 100 | |||
word_emb_dim = 100 |
@@ -125,7 +125,7 @@ def test(): | |||
tester.test(model, dev_data) | |||
# print test results | |||
print(tester.show_matrices()) | |||
print(tester.show_metrics()) | |||
print("model tested!") | |||
@@ -1,29 +1,35 @@ | |||
[train] | |||
epochs = 10 | |||
batch_size = 32 | |||
epochs = 30 | |||
batch_size = 64 | |||
pickle_path = "./save/" | |||
validate = true | |||
save_best_dev = true | |||
model_saved_path = "./save/" | |||
rnn_hidden_units = 100 | |||
rnn_layers = 2 | |||
rnn_bi_direction = true | |||
word_emb_dim = 100 | |||
dropout = 0.5 | |||
use_crf = true | |||
use_cuda = true | |||
print_every_step = 10 | |||
[test] | |||
save_output = true | |||
validate_in_training = true | |||
save_dev_input = false | |||
save_loss = true | |||
batch_size = 64 | |||
batch_size = 640 | |||
pickle_path = "./save/" | |||
use_crf = true | |||
use_cuda = true | |||
[POS_test] | |||
save_output = true | |||
validate_in_training = true | |||
save_dev_input = false | |||
save_loss = true | |||
batch_size = 640 | |||
pickle_path = "./save/" | |||
rnn_hidden_units = 100 | |||
rnn_layers = 1 | |||
rnn_bi_direction = true | |||
word_emb_dim = 100 | |||
dropout = 0.5 | |||
use_crf = true | |||
use_cuda = true | |||
rnn_hidden_units = 100 | |||
word_emb_dim = 100 |
@@ -0,0 +1,146 @@ | |||
import os | |||
import sys | |||
sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) | |||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
from fastNLP.core.trainer import SeqLabelTrainer | |||
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader | |||
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
from fastNLP.saver.model_saver import ModelSaver | |||
from fastNLP.loader.model_loader import ModelLoader | |||
from fastNLP.core.tester import SeqLabelTester | |||
from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
from fastNLP.core.predictor import SeqLabelInfer | |||
# not in the file's dir | |||
if len(os.path.dirname(__file__)) != 0: | |||
os.chdir(os.path.dirname(__file__)) | |||
datadir = "/home/zyfeng/data/" | |||
cfgfile = './pos_tag.cfg' | |||
data_name = "CWS_POS_TAG_NER_people_daily.txt" | |||
pos_tag_data_path = os.path.join(datadir, data_name) | |||
pickle_path = "save" | |||
data_infer_path = os.path.join(datadir, "infer.utf8") | |||
def infer():
    """Load the saved model and print its predictions for the inference file.

    Reads the "POS_test" section of ``cfgfile``, sizes the model from the
    pickled vocabulary and label tables under ``pickle_path``, restores the
    weights from ./save/saved_model.pkl and runs prediction on the lines
    loaded from ``data_infer_path``.
    """
    # configuration for the model under test
    section = ConfigSection()
    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": section})

    # vocabulary / label-set sizes come from the pickles written by preprocessing
    section["vocab_size"] = len(load_pickle(pickle_path, "word2id.pkl"))
    section["num_classes"] = len(load_pickle(pickle_path, "id2class.pkl"))

    # rebuild the architecture, then restore the trained weights
    network = AdvSeqLabel(section)
    try:
        ModelLoader.load_pytorch(network, "./save/saved_model.pkl")
        print('model loaded!')
    except Exception:
        # report, then let the original error propagate
        print('cannot load model!')
        raise

    # raw input lines
    lines = BaseLoader(data_name, data_infer_path).load_lines()
    print('data loaded')

    # run prediction and show the output
    predictor = SeqLabelInfer(pickle_path)
    print(predictor.predict(network, lines))
    print("Inference finished!")
def train():
    """Train the POS tagging model on the People's Daily corpus and save it.

    Reads the "train" and "test" sections of ``cfgfile``, preprocesses the
    POS examples with a 0.3 dev split, optionally warm-starts from
    ./save/saved_model.pkl, trains, and writes the model back to that path.
    """
    # both sections are requested from the config file; only "train" is used below
    train_args = ConfigSection()
    test_args = ConfigSection()
    sections = {"train": train_args, "test": test_args}
    ConfigLoader("good_name", "good_path").load_config(cfgfile, sections)

    # the loader returns (pos examples, ner examples); only the POS part is needed
    corpus = PeopleDailyCorpusLoader(pos_tag_data_path)
    pos_examples, _ = corpus.load()

    # preprocessing also determines vocabulary and label-set sizes
    prep = SeqLabelPreprocess()
    data_train, data_dev = prep.run(pos_examples, pickle_path=pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = prep.vocab_size
    train_args["num_classes"] = prep.num_classes

    trainer = SeqLabelTrainer(**train_args.data)
    network = AdvSeqLabel(train_args)

    # best-effort warm start: a missing or unreadable checkpoint is not fatal
    try:
        ModelLoader.load_pytorch(network, "./save/saved_model.pkl")
        print('model parameter loaded!')
    except Exception:
        print("No saved model. Continue.")

    trainer.train(network, data_train, data_dev)
    print("Training finished!")

    # persist the trained parameters
    ModelSaver("./save/saved_model.pkl").save_pytorch(network)
    print("Model saved!")
def test():
    """Evaluate the saved model on the pickled dev set and print the metrics.

    Reads the "POS_test" section of ``cfgfile``, sizes the model from the
    pickles under ``pickle_path``, restores ./save/saved_model.pkl and runs
    SeqLabelTester on data_dev.pkl.
    """
    section = ConfigSection()
    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": section})

    # sizes come from the tables written during preprocessing
    section["vocab_size"] = len(load_pickle(pickle_path, "word2id.pkl"))
    section["num_classes"] = len(load_pickle(pickle_path, "id2class.pkl"))

    # held-out split saved by the preprocessor
    held_out = load_pickle(pickle_path, "data_dev.pkl")

    # same architecture as in training, with the trained weights restored
    network = AdvSeqLabel(section)
    ModelLoader.load_pytorch(network, "./save/saved_model.pkl")
    print("model loaded!")

    evaluator = SeqLabelTester(**section.data)
    evaluator.test(network, held_out)

    # report evaluation results
    print(evaluator.show_metrics())
    print("model tested!")
if __name__ == "__main__":
    # Command-line entry point: --mode selects which of train/test/infer runs.
    import argparse

    # The description and help text previously referred to word segmentation
    # and to "the model's model"; this script drives the POS tagging model.
    parser = argparse.ArgumentParser(description='Run a Chinese POS tagging model')
    parser.add_argument('--mode', help="set the model's mode", choices=['train', 'test', 'infer'])
    args = parser.parse_args()
    if args.mode == 'train':
        train()
    elif args.mode == 'test':
        test()
    elif args.mode == 'infer':
        infer()
    else:
        # --mode is optional, so a missing value lands here
        print('no mode specified for model!')
        parser.print_help()
@@ -68,7 +68,7 @@ class MyNERTester(SeqLabelTester): | |||
def metrics(self): | |||
return np.mean(self.eval_history) | |||
def show_matrices(self): | |||
def show_metrics(self): | |||
return "dev accuracy={:.2f}".format(float(self.metrics())) | |||
@@ -1,19 +1,13 @@ | |||
# python: 3.5 | |||
# pytorch: 0.4 | |||
################ | |||
# Test cross validation. | |||
################ | |||
from fastNLP.loader.preprocess import ClassPreprocess | |||
from fastNLP.core.loss import Loss | |||
from fastNLP.core.optimizer import Optimizer | |||
from fastNLP.core.predictor import ClassificationInfer | |||
from fastNLP.core.preprocess import ClassPreprocess | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
from fastNLP.models.base_model import BaseModel | |||
from fastNLP.modules import aggregation | |||
from fastNLP.modules import encoder | |||
from fastNLP.modules import decoder | |||
from fastNLP.modules import encoder | |||
class ClassificationModel(BaseModel): | |||
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel): | |||
self.enc = encoder.Conv( | |||
in_channels=300, out_channels=100, kernel_size=3) | |||
self.agg = aggregation.MaxPool() | |||
self.dec = decoder.MLP(100, num_classes=num_classes) | |||
self.dec = decoder.MLP(size_layer=[100, num_classes]) | |||
def forward(self, x): | |||
x = self.emb(x) # [N,L] -> [N,L,C] | |||
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel): | |||
return x | |||
data_dir = 'data' # directory to save data and model | |||
train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
data_dir = 'save/' # directory to save data and model | |||
train_path = './data_for_tests/text_classify.txt' # training set file | |||
# load dataset | |||
ds_loader = ClassDatasetLoader("train", train_path) | |||
data = ds_loader.load() | |||
# pre-process dataset | |||
pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5) | |||
# pre = ClassPreprocess(data, data_dir) | |||
n_classes = pre.num_classes | |||
vocab_size = pre.vocab_size | |||
pre = ClassPreprocess() | |||
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir) | |||
n_classes, vocab_size = pre.num_classes, pre.vocab_size | |||
# construct model | |||
model_args = { | |||
@@ -58,22 +51,25 @@ model_args = { | |||
} | |||
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | |||
# train model | |||
# construct trainer | |||
train_args = { | |||
"epochs": 10, | |||
"batch_size": 50, | |||
"epochs": 3, | |||
"batch_size": 16, | |||
"pickle_path": data_dir, | |||
"validate": False, | |||
"save_best_dev": False, | |||
"model_saved_path": None, | |||
"use_cuda": True, | |||
"learn_rate": 1e-3, | |||
"momentum": 0.9} | |||
trainer = ClassificationTrainer(train_args) | |||
# trainer.train(model, ['data_train.pkl', 'data_dev.pkl']) | |||
trainer.cross_validate(model) | |||
"loss": Loss("cross_entropy"), | |||
"optimizer": Optimizer("Adam", lr=0.001) | |||
} | |||
trainer = ClassificationTrainer(**train_args) | |||
# start training | |||
trainer.train(model, train_data=train_set, dev_data=dev_set) | |||
# predict using model | |||
data_infer = [x[0] for x in data] | |||
infer = ClassificationInfer(data_dir) | |||
labels_pred = infer.predict(model, data_infer) | |||
labels_pred = infer.predict(model.cpu(), data_infer) | |||
print(labels_pred) |
@@ -134,7 +134,7 @@ def train_and_test(): | |||
tester.test(model, data_dev) | |||
# print test results | |||
print(tester.show_matrices()) | |||
print(tester.show_metrics()) | |||
print("model tested!") | |||
@@ -108,7 +108,7 @@ def train_test(): | |||
tester.test(model, data_train) | |||
# print test results | |||
print(tester.show_matrices()) | |||
print(tester.show_metrics()) | |||
print("model tested!") | |||
@@ -1,9 +1,12 @@ | |||
import sys | |||
sys.path.append("..") | |||
from fastNLP.fastnlp import FastNLP | |||
from fastNLP.fastnlp import interpret_word_seg_results | |||
from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results | |||
PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" | |||
PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/" | |||
def word_seg(): | |||
nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) | |||
@@ -39,5 +42,33 @@ def test_word_seg_interpret(): | |||
print(interpret_word_seg_results(chars, labels)) | |||
def test_interpret_cws_pos_results():
    """Print the decoding of one hand-labelled sentence (character, "seg-pos" label)."""
    sample = [
        ('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'),
        ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'), ('度', 'E-d'),
        ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'),
        ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'), ('词', 'E-vn'),
        ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w'),
    ]
    # split the pairs into two parallel lists
    chars, labels = map(list, zip(*sample))
    print(interpret_cws_pos_results(chars, labels))
def test_pos_tag():
    """Run the packaged POS tagging model on three sentences and print the decoded words."""
    nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
    nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
    sentences = [
        "这是最好的基于深度学习的中文分词系统。",
        "大王叫我来巡山。",
        "我党多年来致力于改善人民生活水平。",
    ]
    for example in nlp.run(sentences):
        # take each result's first two fields in a single pass
        pairs = [(res[0], res[1]) for res in example]
        words = [pair[0] for pair in pairs]
        labels = [pair[1] for pair in pairs]
        print(interpret_cws_pos_results(words, labels))
if __name__ == "__main__": | |||
word_seg() |
@@ -5,7 +5,6 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader | |||
from fastNLP.models.sequence_modeling import SeqLabeling | |||
data_name = "pku_training.utf8" | |||
cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" | |||
pickle_path = "data_for_tests" | |||
@@ -17,7 +16,8 @@ def foo(): | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
# Preprocessor | |||
p = SeqLabelPreprocess(train_data, pickle_path) | |||
p = SeqLabelPreprocess() | |||
p.run(train_data) | |||
train_args["vocab_size"] = p.vocab_size | |||
train_args["num_classes"] = p.num_classes | |||
@@ -30,7 +30,7 @@ def foo(): | |||
print("start validation.") | |||
validator.test(model) | |||
print(validator.show_matrices()) | |||
print(validator.show_metrics()) | |||
if __name__ == "__main__": | |||