
[fix] drop the "data" argument from Tester.make_batch; correct the spelling of "show_metrics"

[add] PeopleDailyCorpusLoader, to parse the People's Daily Corpus
[update] add a CWS + POS-tag interface to FastNLP; see the example in test_fastNLP.py
[update] bring README.md and readme_example.py up to date with the latest API
FengZiYjun · 6 years ago · commit 32a036e8e6 · tags/v0.1.0
17 changed files with 370 additions and 194 deletions
  1. README.md (+30, -20)
  2. fastNLP/core/tester.py (+6, -6)
  3. fastNLP/core/trainer.py (+1, -1)
  4. fastNLP/fastnlp.py (+44, -0)
  5. fastNLP/loader/dataset_loader.py (+51, -7)
  6. fastNLP/modules/decoder/__init__.py (+2, -1)
  7. reproduction/chinese_word_seg/cws_train.py (+0, -114)
  8. reproduction/chinese_word_segment/cws.cfg (+12, -0)
  9. reproduction/chinese_word_segment/run.py (+1, -1)
  10. reproduction/pos_tag_model/pos_tag.cfg (+17, -11)
  11. reproduction/pos_tag_model/train_pos_tag.py (+146, -0)
  12. test/ner.py (+1, -1)
  13. test/readme_example.py (+22, -26)
  14. test/seq_labeling.py (+1, -1)
  15. test/test_cws.py (+1, -1)
  16. test/test_fastNLP.py (+32, -1)
  17. test/test_tester.py (+3, -3)

README.md (+30, -20)

@@ -27,15 +27,16 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa
 
 A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model.
 ```python
+from fastNLP.core.preprocess import ClassPreprocess
+from fastNLP.core.predictor import ClassificationInfer
+from fastNLP.core.trainer import ClassificationTrainer
+from fastNLP.loader.dataset_loader import ClassDatasetLoader
 from fastNLP.models.base_model import BaseModel
+from fastNLP.modules import encoder
 from fastNLP.modules import aggregation
-from fastNLP.modules import encoder
 from fastNLP.modules import decoder
-
-from fastNLP.loader.dataset_loader import ClassDatasetLoader
-from fastNLP.loader.preprocess import ClassPreprocess
-from fastNLP.core.trainer import ClassificationTrainer
-from fastNLP.core.inference import ClassificationInfer
+from fastNLP.core.loss import Loss
+from fastNLP.core.optimizer import Optimizer
 
 
 class ClassificationModel(BaseModel):
@@ -50,7 +51,7 @@ class ClassificationModel(BaseModel):
         self.enc = encoder.Conv(
             in_channels=300, out_channels=100, kernel_size=3)
         self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
+        self.dec = decoder.MLP(size_layer=[100, num_classes])
 
     def forward(self, x):
         x = self.emb(x)  # [N,L] -> [N,L,C]
@@ -60,16 +61,17 @@ class ClassificationModel(BaseModel):
         return x
 
 
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+data_dir = 'save/'  # directory to save data and model
+train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
 ds_loader = ClassDatasetLoader("train", train_path)
 data = ds_loader.load()
 
 # pre-process dataset
-pre = ClassPreprocess(data_dir)
-vocab_size, n_classes = pre.process(data, "data_train.pkl")
+pre = ClassPreprocess()
+train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
+n_classes, vocab_size = pre.num_classes, pre.vocab_size
 
 # construct model
 model_args = {
@@ -78,28 +80,36 @@ model_args = {
 }
 model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
 
-# train model
+# construct trainer
 train_args = {
-    "epochs": 20,
-    "batch_size": 50,
+    "epochs": 3,
+    "batch_size": 16,
     "pickle_path": data_dir,
    "validate": False,
    "save_best_dev": False,
    "model_saved_path": None,
    "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
-trainer.train(model)
+    "loss": Loss("cross_entropy"),
+    "optimizer": Optimizer("Adam", lr=0.001)
+}
+trainer = ClassificationTrainer(**train_args)
+
+# start training
+trainer.train(model, train_data=train_set, dev_data=dev_set)
 
 # predict using model
-seqs = [x[0] for x in data]
+data_infer = [x[0] for x in data]
 infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, seqs)
+labels_pred = infer.predict(model.cpu(), data_infer)
+print(labels_pred)
 ```
 
 
 ## Installation
+Run the following commands to install fastNLP package.
+```shell
+pip install fastNLP
+```
 
 ### Cloning From GitHub

fastNLP/core/tester.py (+6, -6)

@@ -86,7 +86,7 @@ class BaseTester(object):
         iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
         step = 0
 
-        for batch_x, batch_y in self.make_batch(iterator, dev_data):
+        for batch_x, batch_y in self.make_batch(iterator):
             with torch.no_grad():
                 prediction = self.data_forward(network, batch_x)
             eval_results = self.evaluate(prediction, batch_y)
@@ -123,14 +123,14 @@ class BaseTester(object):
         """Return a list of metrics. """
         raise NotImplementedError
 
-    def show_matrices(self):
+    def show_metrics(self):
         """This is called by Trainer to print evaluation results on dev set during training.
 
         :return print_str: str
         """
         raise NotImplementedError
 
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         raise NotImplementedError
 
@@ -194,7 +194,7 @@ class SeqLabelTester(BaseTester):
         batch_accuracy = np.mean([x[1] for x in self.eval_history])
         return batch_loss, batch_accuracy
 
-    def show_matrices(self):
+    def show_metrics(self):
         """
         This is called by Trainer to print evaluation on dev set.
         :return print_str: str
@@ -202,7 +202,7 @@ class SeqLabelTester(BaseTester):
         loss, accuracy = self.metrics()
         return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy)
 
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True)
 
@@ -216,7 +216,7 @@ class ClassificationTester(BaseTester):
         """
         super(ClassificationTester, self).__init__(**test_args)
 
-    def make_batch(self, iterator, data, max_len=None):
+    def make_batch(self, iterator, max_len=None):
         return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len)
 
     def data_forward(self, network, x):
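With these changes, a custom tester implements the one-argument `make_batch` and the renamed `show_metrics`. A minimal sketch of a conforming subclass, modeled on `MyNERTester` in test/ner.py below; the `fastNLP.core.action` import path for `Action` is an assumption, since it is not shown in this diff:

```python
import numpy as np

from fastNLP.core.action import Action  # assumed import path; not shown in this diff
from fastNLP.core.tester import BaseTester


class MyTester(BaseTester):
    """Sketch of a tester written against the corrected interface."""

    def make_batch(self, iterator):
        # dev data is no longer passed alongside the iterator
        return Action.make_batch(iterator, use_cuda=self.use_cuda)

    def metrics(self):
        return np.mean(self.eval_history)

    def show_metrics(self):
        # renamed from the misspelled show_matrices()
        return "dev accuracy={:.2f}".format(float(self.metrics()))
```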


fastNLP/core/trainer.py (+1, -1)

@@ -144,7 +144,7 @@ class BaseTrainer(object):
                 print("Saved better model selected by validation.")
                 logger.info("Saved better model selected by validation.")
 
-            valid_results = validator.show_matrices()
+            valid_results = validator.show_metrics()
             print("[epoch {}] {}".format(epoch, valid_results))
             logger.info("[epoch {}] {}".format(epoch, valid_results))




fastNLP/fastnlp.py (+44, -0)

@@ -31,7 +31,16 @@ FastNLP_MODEL_COLLECTION = {
         "type": "seq_label",
         "config_file_name": "config",
         "config_section_name": "text_class_model"
+    },
+    "pos_tag_model": {
+        "url": "",
+        "class": "sequence_modeling.AdvSeqLabel",
+        "pickle": "pos_tag_model_v_0.pkl",
+        "type": "seq_label",
+        "config_file_name": "pos_tag.config",
+        "config_section_name": "pos_tag_model"
     }
 }
@@ -259,3 +268,38 @@ def interpret_word_seg_results(char_seq, label_seq):
     else:
         raise ValueError("invalid label {}".format(label[0]))
     return words
+
+
+def interpret_cws_pos_results(char_seq, label_seq):
+    """Transform model output into user-friendly contents.
+
+    :param char_seq: list of string
+    :param label_seq: list of string, the same length as char_seq.
+    :return outputs: list of tuple (words, pos_tag):
+    """
+
+    def pos_tag_check(seq):
+        """check whether all entries are the same """
+        return len(set(seq)) <= 1
+
+    word = []
+    word_pos = []
+    outputs = []
+    for char, label in zip(char_seq, label_seq):
+        tmp = label.split("-")
+        cws_label, pos_tag = tmp[0], tmp[1]
+
+        if cws_label == "B" or cws_label == "M":
+            word.append(char)
+            word_pos.append(pos_tag)
+        elif cws_label == "E":
+            word.append(char)
+            word_pos.append(pos_tag)
+            if not pos_tag_check(word_pos):
+                raise RuntimeError("character-wise pos tags inconsistent. ")
+            outputs.append(("".join(word), word_pos[0]))
+            word.clear()
+            word_pos.clear()
+        elif cws_label == "S":
+            outputs.append((char, pos_tag))
+    return outputs
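Since each label fuses a segmentation tag (`B`/`M`/`E`/`S`) with a POS tag after the hyphen, the function merges every B…E span into one word and keeps a single POS tag for it. A short worked example; the expected output is derived by tracing the code above:

```python
from fastNLP.fastnlp import interpret_cws_pos_results

chars = ["这", "是", "中", "文"]
labels = ["S-r", "S-v", "B-nz", "E-nz"]  # two single-char words, then one two-char word

print(interpret_cws_pos_results(chars, labels))
# -> [('这', 'r'), ('是', 'v'), ('中文', 'nz')]
```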

fastNLP/loader/dataset_loader.py (+51, -7)

@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader):
         return text.strip().split()
 
 
-if __name__ == "__main__":
+class PeopleDailyCorpusLoader(DatasetLoader):
     """
-    data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
-    for example in data:
-        for w, l in zip(example[0], example[1]):
-            print(w, l)
+    People Daily Corpus: Chinese word segmentation, POS tag, NER
     """
 
-    ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
-    print(ans)
+    def __init__(self, data_path):
+        super(PeopleDailyCorpusLoader, self).__init__("people_daily_corpus", data_path)
+
+    def load(self):
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            sents = f.readlines()
+
+        pos_tag_examples = []
+        ner_examples = []
+        for sent in sents:
+            inside_ne = False
+            sent_pos_tag = []
+            sent_words = []
+            sent_ner = []
+            words = sent.strip().split()[1:]
+            for word in words:
+                if "[" in word and "]" in word:
+                    ner_tag = "U"
+                    print(word)
+                elif "[" in word:
+                    inside_ne = True
+                    ner_tag = "B"
+                    word = word[1:]
+                elif "]" in word:
+                    ner_tag = "L"
+                    word = word[:word.index("]")]
+                    if inside_ne is True:
+                        inside_ne = False
+                    else:
+                        raise RuntimeError("only ] appears!")
+                else:
+                    if inside_ne is True:
+                        ner_tag = "I"
+                    else:
+                        ner_tag = "O"
+                tmp = word.split("/")
+                token, pos = tmp[0], tmp[1]
+                sent_ner.append(ner_tag)
+                sent_pos_tag.append(pos)
+                sent_words.append(token)
+            pos_tag_examples.append([sent_words, sent_pos_tag])
+            ner_examples.append([sent_words, sent_ner])
+        return pos_tag_examples, ner_examples
+
+
+if __name__ == "__main__":
+    loader = PeopleDailyCorpusLoader("/home/zyfeng/data/CWS_POS_TAG_NER_people_daily.txt")
+    pos, ner = loader.load()
+    print(pos[:10])
+    print(ner[:10])
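`load()` assumes one sentence per line in the People's Daily annotation style: a leading id token (dropped by the `[1:]` slice), whitespace-separated `word/pos` pairs, and named entities bracketed with `[` and `]`. A sketch with a made-up line in that format; the path and sentence are hypothetical, and the return values follow from the code above:

```python
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader

# Hypothetical one-line corpus file, people_daily_sample.txt, containing:
# 19980101-01-001-001/m 我/r 爱/v [北/j 京/j]ns 。/w
loader = PeopleDailyCorpusLoader("people_daily_sample.txt")  # made-up path
pos, ner = loader.load()
# pos == [[['我', '爱', '北', '京', '。'], ['r', 'v', 'j', 'j', 'w']]]
# ner == [[['我', '爱', '北', '京', '。'], ['O', 'O', 'B', 'L', 'O']]]
```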

fastNLP/modules/decoder/__init__.py (+2, -1)

@@ -1,3 +1,4 @@
 from .CRF import ConditionalRandomField
+from .MLP import MLP
 
-__all__ = ["ConditionalRandomField"]
+__all__ = ["ConditionalRandomField", "MLP"]

reproduction/chinese_word_seg/cws_train.py (+0, -114) — file deleted

@@ -1,114 +0,0 @@
import sys

sys.path.append("..")

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.core.predictor import Predictor

data_name = "pku_training.utf8"
cws_data_path = "/home/zyfeng/data/pku_training.utf8"
pickle_path = "./save/"
data_infer_path = "/home/zyfeng/data/pku_test.utf8"


def infer():
    # Load infer configuration, the same as test
    test_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "id2class.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = SeqLabeling(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
    print("model loaded!")

    # Data Loader
    raw_data_loader = BaseLoader(data_name, data_infer_path)
    infer_data = raw_data_loader.load_lines()

    # Inference interface
    infer = Predictor(pickle_path)
    results = infer.predict(model, infer_data)

    print(results)
    print("Inference finished!")


def train_test():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args})

    # Data Loader
    loader = TokenizeDatasetLoader(data_name, cws_data_path)
    train_data = loader.load_pku()

    # Preprocessor
    preprocess = SeqLabelPreprocess()
    data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = preprocess.vocab_size
    train_args["num_classes"] = preprocess.num_classes

    # Trainer
    trainer = SeqLabelTrainer(train_args)

    # Model
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")

    # testing with validation set
    test(data_dev)


def test(test_data):
    # Config Loader
    train_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})

    # Define the same model
    model = SeqLabeling(train_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
    print("model loaded!")

    # Load test configuration
    test_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

    # Tester
    tester = SeqLabelTester(test_args)

    # Start testing
    tester.test(model, test_data)

    # print test results
    print(tester.show_matrices())
    print("model tested!")


if __name__ == "__main__":
    train_test()

reproduction/chinese_word_segment/cws.cfg (+12, -0)

@@ -31,4 +31,16 @@ pickle_path = "./save/"
 use_crf = true
 use_cuda = true
 rnn_hidden_units = 100
+word_emb_dim = 100
+
+[model]
+save_output = true
+validate_in_training = true
+save_dev_input = false
+save_loss = true
+batch_size = 640
+pickle_path = "./save/"
+use_crf = true
+use_cuda = true
+rnn_hidden_units = 100
 word_emb_dim = 100

reproduction/chinese_word_segment/run.py (+1, -1)

@@ -125,7 +125,7 @@ def test():
     tester.test(model, dev_data)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")






reproduction/chinese_word_seg/cws.cfg → reproduction/pos_tag_model/pos_tag.cfg (+17, -11)

@@ -1,29 +1,35 @@
 [train]
-epochs = 10
-batch_size = 32
+epochs = 30
+batch_size = 64
 pickle_path = "./save/"
 validate = true
 save_best_dev = true
 model_saved_path = "./save/"
 rnn_hidden_units = 100
+rnn_layers = 2
+rnn_bi_direction = true
 word_emb_dim = 100
+dropout = 0.5
 use_crf = true
 use_cuda = true
+print_every_step = 10
 
 [test]
 save_output = true
 validate_in_training = true
 save_dev_input = false
 save_loss = true
-batch_size = 64
+batch_size = 640
+pickle_path = "./save/"
+use_crf = true
+use_cuda = true
+
+[POS_test]
+save_output = true
+validate_in_training = true
+save_dev_input = false
+save_loss = true
+batch_size = 640
 pickle_path = "./save/"
+rnn_hidden_units = 100
+rnn_layers = 1
+rnn_bi_direction = true
+word_emb_dim = 100
+dropout = 0.5
 use_crf = true
 use_cuda = true
-rnn_hidden_units = 100
-word_emb_dim = 100

reproduction/pos_tag_model/train_pos_tag.py (+146, -0) — new file

@@ -0,0 +1,146 @@
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer

# not in the file's dir
if len(os.path.dirname(__file__)) != 0:
    os.chdir(os.path.dirname(__file__))
datadir = "/home/zyfeng/data/"
cfgfile = './pos_tag.cfg'
data_name = "CWS_POS_TAG_NER_people_daily.txt"

pos_tag_data_path = os.path.join(datadir, data_name)
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")


def infer():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "id2class.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = AdvSeqLabel(test_args)

    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print('model loaded!')
    except Exception as e:
        print('cannot load model!')
        raise

    # Data Loader
    raw_data_loader = BaseLoader(data_name, data_infer_path)
    infer_data = raw_data_loader.load_lines()
    print('data loaded')

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)

    print(results)
    print("Inference finished!")


def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})

    # Data Loader
    loader = PeopleDailyCorpusLoader(pos_tag_data_path)
    train_data, _ = loader.load()

    # Preprocessor
    preprocessor = SeqLabelPreprocess()
    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = preprocessor.vocab_size
    train_args["num_classes"] = preprocessor.num_classes

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print('model parameter loaded!')
    except Exception as e:
        print("No saved model. Continue.")
        pass

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")


def test():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "id2class.pkl")
    test_args["num_classes"] = len(index2label)

    # load dev data
    dev_data = load_pickle(pickle_path, "data_dev.pkl")

    # Define the same model
    model = AdvSeqLabel(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
    print("model loaded!")

    # Tester
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    tester.test(model, dev_data)

    # print test results
    print(tester.show_metrics())
    print("model tested!")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
    parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
    args = parser.parse_args()
    if args.mode == 'train':
        train()
    elif args.mode == 'test':
        test()
    elif args.mode == 'infer':
        infer()
    else:
        print('no mode specified for model!')
        parser.print_help()
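The `__main__` block above dispatches on `--mode`; a quick reference for the three intended invocations, run from `reproduction/pos_tag_model/` (the script also `chdir`s there itself). The behaviors noted are read off the functions above:

```python
# python train_pos_tag.py --mode train   # parse the People's Daily file, train AdvSeqLabel, save to ./save/
# python train_pos_tag.py --mode test    # reload ./save/saved_model.pkl, score the pickled dev split
# python train_pos_tag.py --mode infer   # label raw sentences read from infer.utf8
```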

test/ner.py (+1, -1)

@@ -68,7 +68,7 @@ class MyNERTester(SeqLabelTester):
     def metrics(self):
         return np.mean(self.eval_history)
 
-    def show_matrices(self):
+    def show_metrics(self):
         return "dev accuracy={:.2f}".format(float(self.metrics()))






test/readme_example.py (+22, -26)

@@ -1,19 +1,13 @@
-# python: 3.5
-# pytorch: 0.4
-
-################
-# Test cross validation.
-################
-
-from fastNLP.loader.preprocess import ClassPreprocess
-
+from fastNLP.core.loss import Loss
+from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.predictor import ClassificationInfer
+from fastNLP.core.preprocess import ClassPreprocess
 from fastNLP.core.trainer import ClassificationTrainer
 from fastNLP.loader.dataset_loader import ClassDatasetLoader
 from fastNLP.models.base_model import BaseModel
 from fastNLP.modules import aggregation
-from fastNLP.modules import encoder
 from fastNLP.modules import decoder
+from fastNLP.modules import encoder
 
 
 class ClassificationModel(BaseModel):
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel):
         self.enc = encoder.Conv(
             in_channels=300, out_channels=100, kernel_size=3)
         self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
+        self.dec = decoder.MLP(size_layer=[100, num_classes])
 
     def forward(self, x):
         x = self.emb(x)  # [N,L] -> [N,L,C]
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel):
         return x
 
 
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+data_dir = 'save/'  # directory to save data and model
+train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
 ds_loader = ClassDatasetLoader("train", train_path)
 data = ds_loader.load()
 
 # pre-process dataset
-pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5)
-# pre = ClassPreprocess(data, data_dir)
-n_classes = pre.num_classes
-vocab_size = pre.vocab_size
+pre = ClassPreprocess()
+train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
+n_classes, vocab_size = pre.num_classes, pre.vocab_size
 
 # construct model
 model_args = {
@@ -58,22 +51,25 @@ model_args = {
 }
 model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
 
-# train model
+# construct trainer
 train_args = {
-    "epochs": 10,
-    "batch_size": 50,
+    "epochs": 3,
+    "batch_size": 16,
     "pickle_path": data_dir,
    "validate": False,
    "save_best_dev": False,
    "model_saved_path": None,
    "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
-# trainer.train(model, ['data_train.pkl', 'data_dev.pkl'])
-trainer.cross_validate(model)
+    "loss": Loss("cross_entropy"),
+    "optimizer": Optimizer("Adam", lr=0.001)
+}
+trainer = ClassificationTrainer(**train_args)
+
+# start training
+trainer.train(model, train_data=train_set, dev_data=dev_set)
 
 # predict using model
 data_infer = [x[0] for x in data]
 infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, data_infer)
+labels_pred = infer.predict(model.cpu(), data_infer)
+print(labels_pred)

test/seq_labeling.py (+1, -1)

@@ -134,7 +134,7 @@ def train_and_test():
     tester.test(model, data_dev)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")






test/test_cws.py (+1, -1)

@@ -108,7 +108,7 @@ def train_test():
     tester.test(model, data_train)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")






test/test_fastNLP.py (+32, -1)

@@ -1,9 +1,12 @@
 import sys
 
 sys.path.append("..")
 from fastNLP.fastnlp import FastNLP
-from fastNLP.fastnlp import interpret_word_seg_results
+from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results
 
 PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
+PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
 
 
 def word_seg():
     nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)
@@ -39,5 +42,33 @@ def test_word_seg_interpret():
     print(interpret_word_seg_results(chars, labels))
 
 
+def test_interpret_cws_pos_results():
+    foo = [
+        [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'),
+         ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'),
+         ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')]
+    ]
+    chars = [x[0] for x in foo[0]]
+    labels = [x[1] for x in foo[0]]
+    print(interpret_cws_pos_results(chars, labels))
+
+
+def test_pos_tag():
+    nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
+    nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
+    text = ["这是最好的基于深度学习的中文分词系统。",
+            "大王叫我来巡山。",
+            "我党多年来致力于改善人民生活水平。"]
+    results = nlp.run(text)
+    for example in results:
+        words, labels = [], []
+        for res in example:
+            words.append(res[0])
+            labels.append(res[1])
+        print(interpret_cws_pos_results(words, labels))
+
+
 if __name__ == "__main__":
     word_seg()

test/test_tester.py (+3, -3)

@@ -5,7 +5,6 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
 from fastNLP.models.sequence_modeling import SeqLabeling
 
 data_name = "pku_training.utf8"
-cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
 pickle_path = "data_for_tests"
 
 
@@ -17,7 +16,8 @@ def foo():
     ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
 
     # Preprocessor
-    p = SeqLabelPreprocess(train_data, pickle_path)
+    p = SeqLabelPreprocess()
+    p.run(train_data)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
 
@@ -30,7 +30,7 @@ def foo():
     print("start validation.")
     validator.test(model)
-    print(validator.show_matrices())
+    print(validator.show_metrics())
 
 if __name__ == "__main__":
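The preprocessing change here matches the two-step API used throughout this commit: construct `SeqLabelPreprocess` with no arguments, then hand the data to `run()`. A minimal sketch; the toy `train_data` below is made up, in the `[words, labels]` shape the loaders in this commit return:

```python
from fastNLP.core.preprocess import SeqLabelPreprocess

# made-up example list in the [words, labels] shape a DatasetLoader returns
train_data = [[["我", "爱", "北", "京"], ["r", "v", "j", "j"]],
              [["大", "王"], ["B", "E"]]]

p = SeqLabelPreprocess()  # the constructor no longer takes data
data_train, data_dev = p.run(train_data, pickle_path="./save/", train_dev_split=0.3)
vocab_size = p.vocab_size      # vocabulary and label maps are exposed
num_classes = p.num_classes    # as attributes after run()
```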

