
Updates:

* Rename: chinese_word_segment ---> Chinese_word_segmentation
* Rename: pos_tag_model ---> POS_tagging
* Add 4 tests for Batch
* Delete the unused chinese_word_segment/run.py
tags/v0.3.1^2
FengZiYjun, 6 years ago
parent commit: 1fdaf236d2
16 changed files with 50 additions and 156 deletions
1.  +0  -0    reproduction/Chinese_word_segmentation/cws.cfg
2.  +0  -0    reproduction/Chinese_word_segmentation/cws_io/__init__.py
3.  +0  -0    reproduction/Chinese_word_segmentation/cws_io/cws_reader.py
4.  +0  -0    reproduction/Chinese_word_segmentation/models/__init__.py
5.  +4  -4    reproduction/Chinese_word_segmentation/models/cws_model.py
6.  +0  -0    reproduction/Chinese_word_segmentation/process/__init__.py
7.  +1  -1    reproduction/Chinese_word_segmentation/process/cws_processor.py
8.  +0  -0    reproduction/Chinese_word_segmentation/process/span_converter.py
9.  +0  -0    reproduction/Chinese_word_segmentation/utils.py
10. +0  -0    reproduction/POS_tagging/pos_processor.py
11. +0  -0    reproduction/POS_tagging/pos_reader.py
12. +0  -0    reproduction/POS_tagging/pos_tag.cfg
13. +0  -0    reproduction/POS_tagging/train_pos_tag.py
14. +0  -0    reproduction/POS_tagging/utils.py
15. +0  -151  reproduction/chinese_word_segment/run.py
16. +45 -0    test/core/test_batch.py

reproduction/chinese_word_segment/cws.cfg → reproduction/Chinese_word_segmentation/cws.cfg


reproduction/chinese_word_segment/cws_io/__init__.py → reproduction/Chinese_word_segmentation/cws_io/__init__.py


reproduction/chinese_word_segment/cws_io/cws_reader.py → reproduction/Chinese_word_segmentation/cws_io/cws_reader.py


reproduction/chinese_word_segment/models/__init__.py → reproduction/Chinese_word_segmentation/models/__init__.py


reproduction/chinese_word_segment/models/cws_model.py → reproduction/Chinese_word_segmentation/models/cws_model.py

@@ -1,11 +1,11 @@
 
-from torch import nn
 import torch
 import torch.nn.functional as F
+from torch import nn
 
-from fastNLP.modules.decoder.MLP import MLP
 from fastNLP.models.base_model import BaseModel
-from reproduction.chinese_word_segment.utils import seq_lens_to_mask
+from fastNLP.modules.decoder.MLP import MLP
+from reproduction.Chinese_word_segmentation.utils import seq_lens_to_mask
 
 
 class CWSBiLSTMEncoder(BaseModel):
     def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,

reproduction/chinese_word_segment/process/__init__.py → reproduction/Chinese_word_segmentation/process/__init__.py


reproduction/chinese_word_segment/process/cws_processor.py → reproduction/Chinese_word_segmentation/process/cws_processor.py

@@ -4,7 +4,7 @@ import re
 from fastNLP.api.processor import Processor
 from fastNLP.core.dataset import DataSet
 from fastNLP.core.vocabulary import Vocabulary
-from reproduction.chinese_word_segment.process.span_converter import SpanConverter
+from reproduction.Chinese_word_segmentation.process.span_converter import SpanConverter
 
 
 _SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'



reproduction/chinese_word_segment/process/span_converter.py → reproduction/Chinese_word_segmentation/process/span_converter.py


reproduction/chinese_word_segment/utils.py → reproduction/Chinese_word_segmentation/utils.py


reproduction/pos_tag_model/pos_processor.py → reproduction/POS_tagging/pos_processor.py


reproduction/pos_tag_model/pos_reader.py → reproduction/POS_tagging/pos_reader.py


reproduction/pos_tag_model/pos_tag.cfg → reproduction/POS_tagging/pos_tag.cfg


reproduction/pos_tag_model/train_pos_tag.py → reproduction/POS_tagging/train_pos_tag.py


reproduction/pos_tag_model/utils.py → reproduction/POS_tagging/utils.py


+0 -151  reproduction/chinese_word_segment/run.py

@@ -1,151 +0,0 @@
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.io.dataset_loader import BaseLoader, TokenizeDataSetLoader
from fastNLP.core.utils import load_pickle
from fastNLP.io.model_io import ModelLoader, ModelSaver
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.utils import save_pickle
from fastNLP.core.metrics import SeqLabelEvaluator

# not in the file's dir
if len(os.path.dirname(__file__)) != 0:
    os.chdir(os.path.dirname(__file__))
datadir = "/home/zyfeng/data/"
cfgfile = './cws.cfg'

cws_data_path = os.path.join(datadir, "pku_training.utf8")
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")


def infer():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = AdvSeqLabel(test_args)

    try:
        ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
        print('model loaded!')
    except Exception as e:
        print('cannot load model!')
        raise

    # Data Loader
    infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
    infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
    print('data loaded')

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)

    print(results)
    print("Inference finished!")


def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})

    print("loading data set...")
    data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
    data.load(cws_data_path)
    data_train, data_dev = data.split(ratio=0.3)
    train_args["vocab_size"] = len(data.word_vocab)
    train_args["num_classes"] = len(data.label_vocab)
    print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))

    change_field_is_target(data_dev, "truth", True)
    save_pickle(data_dev, "./save/", "data_dev.pkl")
    save_pickle(data.word_vocab, "./save/", "word2id.pkl")
    save_pickle(data.label_vocab, "./save/", "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"],
                              validate=train_args["validate"],
                              use_cuda=train_args["use_cuda"], pickle_path=train_args["pickle_path"],
                              save_best_dev=True, print_every_step=10, model_name="trained_model.pkl",
                              evaluator=SeqLabelEvaluator())

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print('model parameter loaded!')
    except Exception as e:
        print("No saved model. Continue.")
        pass

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/trained_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")


def predict():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # load dev data
    dev_data = load_pickle(pickle_path, "data_dev.pkl")

    # Define the same model
    model = AdvSeqLabel(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
    print("model loaded!")

    # Tester
    test_args["evaluator"] = SeqLabelEvaluator()
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    tester.test(model, dev_data)


if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
    parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
    args = parser.parse_args()
    if args.mode == 'train':
        train()
    elif args.mode == 'test':
        predict()
    elif args.mode == 'infer':
        infer()
    else:
        print('no mode specified for model!')
        parser.print_help()

+45 -0  test/core/test_batch.py

@@ -1,6 +1,7 @@
 import unittest
 
 import numpy as np
+import torch
 
 from fastNLP.core.batch import Batch
 from fastNLP.core.dataset import DataSet
@@ -31,3 +32,47 @@ class TestCase1(unittest.TestCase):
         self.assertEqual(len(y["y"]), 4)
         self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
         self.assertListEqual(list(y["y"][-1]), [5, 6])
+
+    def test_list_padding(self):
+        ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
+                      "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
+        for x, y in iter:
+            self.assertEqual(x["x"].shape, (4, 4))
+            self.assertEqual(y["y"].shape, (4, 4))
+
+    def test_numpy_padding(self):
+        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
+                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
+        for x, y in iter:
+            self.assertEqual(x["x"].shape, (4, 4))
+            self.assertEqual(y["y"].shape, (4, 4))
+
+    def test_list_to_tensor(self):
+        ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
+                      "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
+        for x, y in iter:
+            self.assertTrue(isinstance(x["x"], torch.Tensor))
+            self.assertEqual(tuple(x["x"].shape), (4, 4))
+            self.assertTrue(isinstance(y["y"], torch.Tensor))
+            self.assertEqual(tuple(y["y"].shape), (4, 4))
+
+    def test_numpy_to_tensor(self):
+        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
+                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
+        for x, y in iter:
+            self.assertTrue(isinstance(x["x"], torch.Tensor))
+            self.assertEqual(tuple(x["x"].shape), (4, 4))
+            self.assertTrue(isinstance(y["y"], torch.Tensor))
+            self.assertEqual(tuple(y["y"].shape), (4, 4))
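For reference, the padding contract these new tests pin down can be exercised standalone. The sketch below mirrors the test code above; it is a minimal sketch, not part of the commit, and it assumes the fastNLP API of this era (Batch, DataSet, and SequentialSampler imported from fastNLP.core.sampler, the usual location in these releases).

# Minimal standalone sketch of the Batch padding behavior asserted above.
# Assumption: SequentialSampler lives in fastNLP.core.sampler, as in fastNLP 0.3.x.
import torch

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import SequentialSampler

# Four variable-length instances per batch; Batch pads each field
# to the longest sequence in that batch (length 4 here).
ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
              "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")

# as_numpy=False yields torch.Tensor batches; as_numpy=True yields numpy arrays.
for batch_x, batch_y in Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False):
    assert isinstance(batch_x["x"], torch.Tensor)
    assert tuple(batch_x["x"].shape) == (4, 4)
    assert tuple(batch_y["y"].shape) == (4, 4)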
