
Merge pull request #54 from fastnlp/pos_tag_service

CWS+POS tag interface
tags/v0.1.0
Coet committed (via GitHub) 6 years ago
commit 63c687d906
27 changed files with 509 additions and 587 deletions
  1. README.md (+15 -55)
  2. fastNLP/core/loss.py (+1 -1)
  3. fastNLP/core/predictor.py (+1 -1)
  4. fastNLP/core/tester.py (+36 -16)
  5. fastNLP/core/trainer.py (+4 -25)
  6. fastNLP/fastnlp.py (+44 -0)
  7. fastNLP/loader/base_loader.py (+3 -4)
  8. fastNLP/loader/config_loader.py (+2 -2)
  9. fastNLP/loader/dataset_loader.py (+62 -18)
  10. fastNLP/loader/embed_loader.py (+44 -2)
  11. fastNLP/loader/model_loader.py (+2 -2)
  12. fastNLP/modules/decoder/__init__.py (+2 -1)
  13. reproduction/chinese_word_seg/cws_train.py (+0 -114)
  14. reproduction/chinese_word_segment/cws.cfg (+12 -0)
  15. reproduction/chinese_word_segment/run.py (+6 -6)
  16. reproduction/pos_tag_model/pos_tag.cfg (+17 -11)
  17. reproduction/pos_tag_model/train_pos_tag.py (+146 -0)
  18. setup.py (+24 -0)
  19. test/loader/test_loader.py (+7 -8)
  20. test/ner.py (+0 -138)
  21. test/ner_decode.py (+0 -129)
  22. test/readme_example.py (+23 -27)
  23. test/seq_labeling.py (+8 -8)
  24. test/test_cws.py (+8 -8)
  25. test/test_fastNLP.py (+33 -2)
  26. test/test_tester.py (+7 -7)
  27. test/text_classify.py (+2 -2)

README.md (+15 -55)

@@ -23,9 +23,11 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa
- [Documentation](https://fastnlp.readthedocs.io/en/latest/)
- [Source Code](https://github.com/fastnlp/fastNLP)



## Installation
Run the following command to install the fastNLP package.
```shell
pip install fastNLP
```

### Cloning From GitHub

@@ -51,14 +53,15 @@ pip3 install torch torchvision
```
FastNLP
├── docs
│   └── quick_tutorial.md
├── fastNLP
│   ├── action
│   ├── core
│   │   ├── action.py
│   │   ├── inference.py
│   │   ├── __init__.py
│   │   ├── loss.py
│   │   ├── metrics.py
│   │   ├── optimizer.py
│   │   ├── predictor.py
│   │   ├── preprocess.py
│   │   ├── README.md
│   │   ├── tester.py
│   │   └── trainer.py
@@ -70,71 +73,28 @@ FastNLP
│   │   ├── dataset_loader.py
│   │   ├── embed_loader.py
│   │   ├── __init__.py
│   │   ├── model_loader.py
│   │   └── preprocess.py
│   │   └── model_loader.py
│   ├── models
│   │   ├── base_model.py
│   │   ├── char_language_model.py
│   │   ├── cnn_text_classification.py
│   │   ├── __init__.py
│   │   └── sequence_modeling.py
│   ├── modules
│   │   ├── aggregation
│   │   │   ├── attention.py
│   │   │   ├── avg_pool.py
│   │   │   ├── __init__.py
│   │   │   ├── kmax_pool.py
│   │   │   ├── max_pool.py
│   │   │   └── self_attention.py
│   │   ├── decoder
│   │   │   ├── CRF.py
│   │   │   └── __init__.py
│   │   ├── encoder
│   │   │   ├── char_embedding.py
│   │   │   ├── conv_maxpool.py
│   │   │   ├── conv.py
│   │   │   ├── embedding.py
│   │   │   ├── __init__.py
│   │   │   ├── linear.py
│   │   │   ├── lstm.py
│   │   │   ├── masked_rnn.py
│   │   │   └── variational_rnn.py
│   │   ├── __init__.py
│   │   ├── interaction
│   │   │   └── __init__.py
│   │   ├── other_modules.py
│   │   └── utils.py
│   └── saver
│       ├── base_saver.py
│       ├── __init__.py
│       ├── logger.py
│       └── model_saver.py
├── LICENSE
├── README.md
├── reproduction
│   ├── Char-aware_NLM
│   │  
│   ├── CNN-sentence_classification
│   │  
│   ├── HAN-document_classification
│   │  
│   └── LSTM+self_attention_sentiment_analysis
│
├── requirements.txt
├── setup.py
└── test
    ├── core
    ├── data_for_tests
    │   ├── charlm.txt
    │   ├── config
    │   ├── cws_test
    │   ├── cws_train
    │   ├── people_infer.txt
    │   └── people.txt
    ├── test_charlm.py
    ├── test_cws.py
    ├── test_fastNLP.py
    ├── test_loader.py
    ├── test_seq_labeling.py
    ├── test_tester.py
    └── test_trainer.py
    ├── __init__.py
    ├── loader
    ├── modules
    └── readme_example.py

```

fastNLP/core/loss.py (+1 -1)

@@ -9,7 +9,7 @@ class Loss(object):

def __init__(self, args):
if args is None:
# this is useful when
# this is useful when Trainer.__init__ performs type check
self._loss = None
elif isinstance(args, str):
self._loss = self._borrow_from_pytorch(args)


fastNLP/core/predictor.py (+1 -1)

@@ -70,7 +70,7 @@ class Predictor(object):
def predict(self, network, data):
"""Perform inference using the trained model.

:param network: a PyTorch model
:param network: a PyTorch model (cpu)
:param data: list of list of strings
:return: list of list of strings, [num_examples, tag_seq_length]
"""


fastNLP/core/tester.py (+36 -16)

@@ -38,7 +38,7 @@ class BaseTester(object):
Obviously, "required_args" is the subset of "default_args".
The value in "default_args" to the keys in "required_args" is simply for type check.
"""
# TODO: required arguments
# add required arguments here
required_args = {}

for req_key in required_args:
@@ -56,7 +56,7 @@ class BaseTester(object):
logger.error(msg)
raise ValueError(msg)
else:
# BeseTester doesn't care about extra arguments
# BaseTester doesn't care about extra arguments
pass
print(default_args)

@@ -69,8 +69,8 @@ class BaseTester(object):
self.print_every_step = default_args["print_every_step"]

self._model = None
self.eval_history = []
self.batch_output = []
self.eval_history = [] # evaluation results of all batches
self.batch_output = [] # outputs of all batches

def test(self, network, dev_data):
if torch.cuda.is_available() and self.use_cuda:
@@ -83,10 +83,10 @@ class BaseTester(object):
self.eval_history.clear()
self.batch_output.clear()

iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=False))
step = 0

for batch_x, batch_y in self.make_batch(iterator, dev_data):
for batch_x, batch_y in self.make_batch(iterator):
with torch.no_grad():
prediction = self.data_forward(network, batch_x)
eval_results = self.evaluate(prediction, batch_y)
@@ -99,7 +99,7 @@ class BaseTester(object):
print_output = "[test step {}] {}".format(step, eval_results)
logger.info(print_output)
if self.print_every_step > 0 and step % self.print_every_step == 0:
print(print_output)
print(self.make_eval_output(prediction, eval_results))
step += 1

def mode(self, model, test):
@@ -115,24 +115,44 @@ class BaseTester(object):
raise NotImplementedError

def evaluate(self, predict, truth):
"""Compute evaluation metrics for the model. """
"""Compute evaluation metrics.

:param predict: Tensor
:param truth: Tensor
:return eval_results: can be anything. It will be stored in self.eval_history
"""
raise NotImplementedError

@property
def metrics(self):
"""Return a list of metrics. """
"""Compute and return metrics.
Use self.eval_history to compute metrics over the whole dev set.
Please refer to metrics.py for common metric functions.

:return : variable number of outputs
"""
raise NotImplementedError

def show_matrices(self):
"""This is called by Trainer to print evaluation results on dev set during training.
def show_metrics(self):
"""Customize evaluation outputs in Trainer.
Called by Trainer to print evaluation results on dev set during training.
Use self.metrics to fetch available metrics.

:return print_str: str
"""
raise NotImplementedError

def make_batch(self, iterator, data):
def make_batch(self, iterator):
raise NotImplementedError

def make_eval_output(self, predictions, eval_results):
"""Customize Tester outputs.

:param predictions: Tensor
:param eval_results: Tensor
:return: str, to be printed.
"""
raise NotImplementedError

class SeqLabelTester(BaseTester):
"""
@@ -194,7 +214,7 @@ class SeqLabelTester(BaseTester):
batch_accuracy = np.mean([x[1] for x in self.eval_history])
return batch_loss, batch_accuracy

def show_matrices(self):
def show_metrics(self):
"""
This is called by Trainer to print evaluation on dev set.
:return print_str: str
@@ -202,7 +222,7 @@ class SeqLabelTester(BaseTester):
loss, accuracy = self.metrics()
return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy)

def make_batch(self, iterator, data):
def make_batch(self, iterator):
return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True)


@@ -211,12 +231,12 @@ class ClassificationTester(BaseTester):

def __init__(self, **test_args):
"""
:param test_args: a dict-like object that has __getitem__ method, \
:param test_args: a dict-like object that has __getitem__ method.
can be accessed by "test_args["key_str"]"
"""
super(ClassificationTester, self).__init__(**test_args)

def make_batch(self, iterator, data, max_len=None):
def make_batch(self, iterator, max_len=None):
return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len)

def data_forward(self, network, x):
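
With `show_matrices` renamed to `show_metrics` and the new `make_eval_output` hook, the subclassing contract is: `evaluate` returns per-batch results that are appended to `self.eval_history`, `metrics` aggregates them over the whole dev set, and `show_metrics` renders a string for the Trainer. A minimal illustrative subclass (the accuracy computation here is an assumption, not from the diff):

```python
import numpy as np

from fastNLP.core.tester import BaseTester


class AccuracyTester(BaseTester):
    def evaluate(self, predict, truth):
        # Whatever is returned here is stored in self.eval_history per batch.
        return float((predict.argmax(dim=-1) == truth).float().mean())

    def metrics(self):
        # Aggregate the per-batch results over the whole dev set.
        return np.mean(self.eval_history)

    def show_metrics(self):
        # Called by Trainer to print validation results during training.
        return "dev accuracy={:.2f}".format(self.metrics())
```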


fastNLP/core/trainer.py (+4 -25)

@@ -1,6 +1,4 @@
import _pickle
import copy
import os
import time
from datetime import timedelta

@@ -15,16 +13,12 @@ from fastNLP.modules import utils
from fastNLP.saver.logger import create_logger
from fastNLP.saver.model_saver import ModelSaver

DEFAULT_QUEUE_SIZE = 300
logger = create_logger(__name__, "./train_test.log")


class BaseTrainer(object):
"""Operations to train a model, including data loading, SGD, and validation.
"""Operations of training a model, including data loading, gradient descent, and validation.

Subclasses must implement the following abstract methods:
- grad_backward
- get_loss
"""

def __init__(self, **kwargs):
@@ -47,7 +41,7 @@ class BaseTrainer(object):
"""
default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/",
"save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1,
"loss": Loss(None),
"loss": Loss(None), # used to pass type check
"optimizer": Optimizer("Adam", lr=0.001, weight_decay=0)
}
"""
@@ -56,7 +50,7 @@ class BaseTrainer(object):
Obviously, "required_args" is the subset of "default_args".
The value in "default_args" to the keys in "required_args" is simply for type check.
"""
# TODO: required arguments
# add required arguments here
required_args = {}

for req_key in required_args:
@@ -144,7 +138,7 @@ class BaseTrainer(object):
print("Saved better model selected by validation.")
logger.info("Saved better model selected by validation.")

valid_results = validator.show_matrices()
valid_results = validator.show_metrics()
print("[epoch {}] {}".format(epoch, valid_results))
logger.info("[epoch {}] {}".format(epoch, valid_results))

@@ -198,21 +192,6 @@ class BaseTrainer(object):
network_copy = copy.deepcopy(network)
self.train(network_copy, train_data_cv[i], dev_data_cv[i])

def load_train_data(self, pickle_path):
"""
For task-specific processing.
:param pickle_path:
:return data_train
"""
file_path = os.path.join(pickle_path, "data_train.pkl")
if os.path.exists(file_path):
with open(file_path, 'rb') as f:
data = _pickle.load(f)
else:
logger.error("cannot find training data {}. invalid input path for training data.".format(file_path))
raise RuntimeError("cannot find training data {}".format(file_path))
return data

def make_batch(self, iterator):
raise NotImplementedError
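
Elsewhere in this PR the trainers are constructed directly from a config section (see `train_pos_tag.py` below). A minimal sketch of that pattern, with the config path as a placeholder:

```python
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

# Read the [train] section of a config file and expand it as keyword
# arguments, matching the new SeqLabelTrainer(**train_args.data) usage.
train_args = ConfigSection()
ConfigLoader("config").load_config("./pos_tag.cfg", {"train": train_args})
trainer = SeqLabelTrainer(**train_args.data)
```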



fastNLP/fastnlp.py (+44 -0)

@@ -31,7 +31,16 @@ FastNLP_MODEL_COLLECTION = {
"type": "seq_label",
"config_file_name": "config",
"config_section_name": "text_class_model"
},
"pos_tag_model": {
"url": "",
"class": "sequence_modeling.AdvSeqLabel",
"pickle": "pos_tag_model_v_0.pkl",
"type": "seq_label",
"config_file_name": "pos_tag.config",
"config_section_name": "pos_tag_model"
}

}
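
The new registry entry makes the tagger loadable through the high-level `FastNLP` API, as exercised in `test/test_fastNLP.py` later in this diff (the model directory is a placeholder):

```python
from fastNLP.fastnlp import FastNLP

# model_dir must contain the pickles plus the "pos_tag.config" file
# named by the registry entry above.
nlp = FastNLP(model_dir="/path/to/pos_tag_pickles/")
nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
results = nlp.run(["这是最好的基于深度学习的中文分词系统。"])
```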


@@ -259,3 +268,38 @@ def interpret_word_seg_results(char_seq, label_seq):
        else:
            raise ValueError("invalid label {}".format(label[0]))
    return words


def interpret_cws_pos_results(char_seq, label_seq):
    """Transform model output into user-friendly contents.

    :param char_seq: list of string
    :param label_seq: list of string, the same length as char_seq.
    :return outputs: list of tuple (words, pos_tag):
    """

    def pos_tag_check(seq):
        """check whether all entries are the same """
        return len(set(seq)) <= 1

    word = []
    word_pos = []
    outputs = []
    for char, label in zip(char_seq, label_seq):
        tmp = label.split("-")
        cws_label, pos_tag = tmp[0], tmp[1]

        if cws_label == "B" or cws_label == "M":
            word.append(char)
            word_pos.append(pos_tag)
        elif cws_label == "E":
            word.append(char)
            word_pos.append(pos_tag)
            if not pos_tag_check(word_pos):
                raise RuntimeError("character-wise pos tags inconsistent. ")
            outputs.append(("".join(word), word_pos[0]))
            word.clear()
            word_pos.clear()
        elif cws_label == "S":
            outputs.append((char, pos_tag))
    return outputs
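
For example, with `B/M/E/S` segmentation labels fused to POS tags, the function merges characters into words and keeps one tag per word:

```python
from fastNLP.fastnlp import interpret_cws_pos_results

chars = ["这", "是", "中", "文"]
labels = ["S-r", "S-v", "B-nz", "E-nz"]   # "<cws label>-<pos tag>" per character
print(interpret_cws_pos_results(chars, labels))
# [('这', 'r'), ('是', 'v'), ('中文', 'nz')]
```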

fastNLP/loader/base_loader.py (+3 -4)

@@ -1,9 +1,8 @@
class BaseLoader(object):
"""docstring for BaseLoader"""

def __init__(self, data_name, data_path):
def __init__(self, data_path):
super(BaseLoader, self).__init__()
self.data_name = data_name
self.data_path = data_path

def load(self):
@@ -25,8 +24,8 @@ class ToyLoader0(BaseLoader):
For charLM
"""

def __init__(self, name, path):
super(ToyLoader0, self).__init__(name, path)
def __init__(self, data_path):
super(ToyLoader0, self).__init__(data_path)

def load(self):
with open(self.data_path, 'r') as f:


fastNLP/loader/config_loader.py (+2 -2)

@@ -9,7 +9,7 @@ class ConfigLoader(BaseLoader):
"""loader for configuration files"""

def __int__(self, data_name, data_path):
super(ConfigLoader, self).__init__(data_name, data_path)
super(ConfigLoader, self).__init__(data_path)
self.config = self.parse(super(ConfigLoader, self).load())

@staticmethod
@@ -100,7 +100,7 @@ class ConfigSection(object):


if __name__ == "__main__":
config = ConfigLoader('configLoader', 'there is no data')
config = ConfigLoader('there is no data')

section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()}
"""


fastNLP/loader/dataset_loader.py (+62 -18)

@@ -6,8 +6,8 @@ from fastNLP.loader.base_loader import BaseLoader
class DatasetLoader(BaseLoader):
""""loader for data sets"""

def __init__(self, data_name, data_path):
super(DatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(DatasetLoader, self).__init__(data_path)


class POSDatasetLoader(DatasetLoader):
@@ -31,8 +31,8 @@ class POSDatasetLoader(DatasetLoader):
to label5.
"""

def __init__(self, data_name, data_path):
super(POSDatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(POSDatasetLoader, self).__init__(data_path)

def load(self):
assert os.path.exists(self.data_path)
@@ -84,8 +84,8 @@ class TokenizeDatasetLoader(DatasetLoader):
Data set loader for tokenization data sets
"""

def __init__(self, data_name, data_path):
super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(TokenizeDatasetLoader, self).__init__(data_path)

def load_pku(self, max_seq_len=32):
"""
@@ -138,8 +138,8 @@ class TokenizeDatasetLoader(DatasetLoader):
class ClassDatasetLoader(DatasetLoader):
"""Loader for classification data sets"""

def __init__(self, data_name, data_path):
super(ClassDatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(ClassDatasetLoader, self).__init__(data_path)

def load(self):
assert os.path.exists(self.data_path)
@@ -177,7 +177,7 @@ class ConllLoader(DatasetLoader):
:param str data_name: the name of the conll data set
:param str data_path: the path to the conll data set
"""
super(ConllLoader, self).__init__(data_name, data_path)
super(ConllLoader, self).__init__(data_path)
self.data_set = self.parse(self.load())

def load(self):
@@ -209,8 +209,8 @@ class ConllLoader(DatasetLoader):


class LMDatasetLoader(DatasetLoader):
def __init__(self, data_name, data_path):
super(LMDatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(LMDatasetLoader, self).__init__(data_path)

def load(self):
if not os.path.exists(self.data_path):
@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader):
return text.strip().split()


if __name__ == "__main__":
    data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
    for example in data:
        for w, l in zip(example[0], example[1]):
            print(w, l)

    ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
    print(ans)


class PeopleDailyCorpusLoader(DatasetLoader):
    """
    People Daily Corpus: Chinese word segmentation, POS tag, NER
    """

    def __init__(self, data_path):
        super(PeopleDailyCorpusLoader, self).__init__(data_path)

    def load(self):
        with open(self.data_path, "r", encoding="utf-8") as f:
            sents = f.readlines()

        pos_tag_examples = []
        ner_examples = []
        for sent in sents:
            inside_ne = False
            sent_pos_tag = []
            sent_words = []
            sent_ner = []
            words = sent.strip().split()[1:]
            for word in words:
                if "[" in word and "]" in word:
                    ner_tag = "U"
                    print(word)
                elif "[" in word:
                    inside_ne = True
                    ner_tag = "B"
                    word = word[1:]
                elif "]" in word:
                    ner_tag = "L"
                    word = word[:word.index("]")]
                    if inside_ne is True:
                        inside_ne = False
                    else:
                        raise RuntimeError("only ] appears!")
                else:
                    if inside_ne is True:
                        ner_tag = "I"
                    else:
                        ner_tag = "O"
                tmp = word.split("/")
                token, pos = tmp[0], tmp[1]
                sent_ner.append(ner_tag)
                sent_pos_tag.append(pos)
                sent_words.append(token)
            pos_tag_examples.append([sent_words, sent_pos_tag])
            ner_examples.append([sent_words, sent_ner])
        return pos_tag_examples, ner_examples


if __name__ == "__main__":
    loader = PeopleDailyCorpusLoader("./")
    pos, ner = loader.load()
    print(pos[:10])
    print(ner[:10])
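
The loader expects one sentence per line in People's Daily style: a document-id token first, `word/pos` tokens after it, and `[` `]` marking named-entity spans. A small self-contained sketch (the sample line is illustrative, not from the corpus file used in this PR):

```python
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader

# One annotated sentence: id token, then word/pos tokens, with [ ... ]
# wrapping a named-entity span.
sample = "19980101-01-001-001/m [中共/j 中央/n] 总书记/n 、/w 国家/n 主席/n\n"
with open("people_daily_sample.txt", "w", encoding="utf-8") as f:
    f.write(sample)

pos_examples, ner_examples = PeopleDailyCorpusLoader("people_daily_sample.txt").load()
print(pos_examples[0])  # [words, per-word POS tags]
print(ner_examples[0])  # [words, B/I/L/U/O NER tags]
```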

fastNLP/loader/embed_loader.py (+44 -2)

@@ -1,8 +1,50 @@
import _pickle
import os

import numpy as np

from fastNLP.loader.base_loader import BaseLoader


class EmbedLoader(BaseLoader):
    """docstring for EmbedLoader"""

    def __init__(self, data_name, data_path):
        super(EmbedLoader, self).__init__(data_name, data_path)
    def __init__(self, data_path):
        super(EmbedLoader, self).__init__(data_path)

    @staticmethod
    def load_embedding(emb_dim, emb_file, word_dict, emb_pkl):
        """Load the pre-trained embedding and combine with the given dictionary.

        :param emb_file: str, the pre-trained embedding.
            The embedding file should have the following format:
            Each line is a word embedding, where a word string is followed by multiple floats.
            Floats are separated by space. The word and the first float are separated by space.
        :param word_dict: dict, a mapping from word to index.
        :param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding.
        :param emb_pkl: str, the embedding pickle file.
        :return embedding_np: numpy array of shape (len(word_dict), emb_dim)

        TODO: fragile code
        """
        # If the embedding pickle exists, load it and return.
        if os.path.exists(emb_pkl):
            with open(emb_pkl, "rb") as f:
                embedding_np = _pickle.load(f)
            return embedding_np
        # Otherwise, load the pre-trained embedding.
        with open(emb_file, "r", encoding="utf-8") as f:
            # begin with a random embedding
            embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
            for line in f:
                line = line.strip().split()
                if len(line) != emb_dim + 1:
                    # skip this line if the embedding dimension does not match
                    continue
                if line[0] in word_dict:
                    # find the word and replace its embedding with a pre-trained one
                    embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
        # save and return the result
        with open(emb_pkl, "wb") as f:
            _pickle.dump(embedding_np, f)
        return embedding_np
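
A minimal sketch of the new static loader (file names are placeholders; `word_dict` would normally come from the preprocessor's `word2id.pkl`):

```python
from fastNLP.loader.embed_loader import EmbedLoader

word_dict = {"中": 0, "文": 1, "<unk>": 2}
embedding = EmbedLoader.load_embedding(
    emb_dim=50,
    emb_file="pretrained_vectors.txt",   # one "word f1 f2 ... f50" per line
    word_dict=word_dict,
    emb_pkl="./save/embedding.pkl",      # cached here on the first load
)
print(embedding.shape)  # (3, 50): one row per word in word_dict
```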

fastNLP/loader/model_loader.py (+2 -2)

@@ -8,8 +8,8 @@ class ModelLoader(BaseLoader):
Loader for models.
"""

def __init__(self, data_name, data_path):
super(ModelLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(ModelLoader, self).__init__(data_path)

@staticmethod
def load_pytorch(empty_model, model_path):


fastNLP/modules/decoder/__init__.py (+2 -1)

@@ -1,3 +1,4 @@
from .CRF import ConditionalRandomField
from .MLP import MLP

__all__ = ["ConditionalRandomField"]
__all__ = ["ConditionalRandomField", "MLP"]

reproduction/chinese_word_seg/cws_train.py (+0 -114)

@@ -1,114 +0,0 @@
import sys

sys.path.append("..")

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.core.predictor import Predictor

data_name = "pku_training.utf8"
cws_data_path = "/home/zyfeng/data/pku_training.utf8"
pickle_path = "./save/"
data_infer_path = "/home/zyfeng/data/pku_test.utf8"


def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl")
test_args["num_classes"] = len(index2label)

# Define the same model
model = SeqLabeling(test_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")

# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
infer_data = raw_data_loader.load_lines()

# Inference interface
infer = Predictor(pickle_path)
results = infer.predict(model, infer_data)

print(results)
print("Inference finished!")


def train_test():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args})

# Data Loader
loader = TokenizeDatasetLoader(data_name, cws_data_path)
train_data = loader.load_pku()

# Preprocessor
preprocess = SeqLabelPreprocess()
data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = preprocess.vocab_size
train_args["num_classes"] = preprocess.num_classes

# Trainer
trainer = SeqLabelTrainer(train_args)

# Model
model = SeqLabeling(train_args)

# Start training
trainer.train(model, data_train, data_dev)
print("Training finished!")

# Saver
saver = ModelSaver("./save/saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")

# testing with validation set
test(data_dev)


def test(test_data):
# Config Loader
train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})

# Define the same model
model = SeqLabeling(train_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")

# Load test configuration
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

# Tester
tester = SeqLabelTester(test_args)

# Start testing
tester.test(model, test_data)

# print test results
print(tester.show_matrices())
print("model tested!")


if __name__ == "__main__":
train_test()

reproduction/chinese_word_segment/cws.cfg (+12 -0)

@@ -31,4 +31,16 @@ pickle_path = "./save/"
use_crf = true
use_cuda = true
rnn_hidden_units = 100
word_emb_dim = 100

[model]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 640
pickle_path = "./save/"
use_crf = true
use_cuda = true
rnn_hidden_units = 100
word_emb_dim = 100

reproduction/chinese_word_segment/run.py (+6 -6)

@@ -27,7 +27,7 @@ data_infer_path = os.path.join(datadir, "infer.utf8")
def infer():
# Config Loader
test_args = ConfigSection()
ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -47,7 +47,7 @@ def infer():
raise

# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines()
print('data loaded')

@@ -63,10 +63,10 @@ def train():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})

# Data Loader
loader = TokenizeDatasetLoader(data_name, cws_data_path)
loader = TokenizeDatasetLoader(cws_data_path)
train_data = loader.load_pku()

# Preprocessor
@@ -100,7 +100,7 @@ def train():
def test():
# Config Loader
test_args = ConfigSection()
ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -125,7 +125,7 @@ def test():
tester.test(model, dev_data)

# print test results
print(tester.show_matrices())
print(tester.show_metrics())
print("model tested!")




reproduction/chinese_word_seg/cws.cfg → reproduction/pos_tag_model/pos_tag.cfg (+17 -11)

@@ -1,29 +1,35 @@
[train]
epochs = 10
batch_size = 32
epochs = 30
batch_size = 64
pickle_path = "./save/"
validate = true
save_best_dev = true
model_saved_path = "./save/"
rnn_hidden_units = 100
rnn_layers = 2
rnn_bi_direction = true
word_emb_dim = 100
dropout = 0.5
use_crf = true
use_cuda = true
print_every_step = 10

[test]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 64
batch_size = 640
pickle_path = "./save/"
use_crf = true
use_cuda = true


[POS_test]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 640
pickle_path = "./save/"
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
word_emb_dim = 100
dropout = 0.5
use_crf = true
use_cuda = true
rnn_hidden_units = 100
word_emb_dim = 100

reproduction/pos_tag_model/train_pos_tag.py (+146 -0)

@@ -0,0 +1,146 @@
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer

# not in the file's dir
if len(os.path.dirname(__file__)) != 0:
    os.chdir(os.path.dirname(__file__))
datadir = "/home/zyfeng/data/"
cfgfile = './pos_tag.cfg'
data_name = "CWS_POS_TAG_NER_people_daily.txt"

pos_tag_data_path = os.path.join(datadir, data_name)
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")


def infer():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "id2class.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = AdvSeqLabel(test_args)

    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print('model loaded!')
    except Exception as e:
        print('cannot load model!')
        raise

    # Data Loader
    raw_data_loader = BaseLoader(data_infer_path)
    infer_data = raw_data_loader.load_lines()
    print('data loaded')

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)

    print(results)
    print("Inference finished!")


def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args})

    # Data Loader
    loader = PeopleDailyCorpusLoader(pos_tag_data_path)
    train_data, _ = loader.load()

    # Preprocessor
    preprocessor = SeqLabelPreprocess()
    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = preprocessor.vocab_size
    train_args["num_classes"] = preprocessor.num_classes

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print('model parameter loaded!')
    except Exception as e:
        print("No saved model. Continue.")
        pass

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")


def test():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "id2class.pkl")
    test_args["num_classes"] = len(index2label)

    # load dev data
    dev_data = load_pickle(pickle_path, "data_dev.pkl")

    # Define the same model
    model = AdvSeqLabel(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
    print("model loaded!")

    # Tester
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    tester.test(model, dev_data)

    # print test results
    print(tester.show_metrics())
    print("model tested!")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
    parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
    args = parser.parse_args()
    if args.mode == 'train':
        train()
    elif args.mode == 'test':
        test()
    elif args.mode == 'infer':
        infer()
    else:
        print('no mode specified for model!')
        parser.print_help()
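
The script is driven by `--mode`, e.g. `python train_pos_tag.py --mode train`, followed by `--mode test` or `--mode infer`.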

setup.py (+24 -0)

@@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding=utf-8
from setuptools import setup, find_packages

with open('README.md') as f:
    readme = f.read()

with open('LICENSE') as f:
    license = f.read()

with open('requirements.txt') as f:
    reqs = f.read()

setup(
    name='fastNLP',
    version='1.0',
    description=('fudan fastNLP '),
    long_description=readme,
    license=license,
    author='fudanNLP',
    python_requires='>=3.5',
    packages=find_packages(),
    install_requires=reqs.strip().split('\n'),
)

test/loader/test_loader.py (+7 -8)

@@ -1,13 +1,12 @@
import os
import configparser

import json
import os
import unittest


from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader


class TestConfigLoader(unittest.TestCase):
def test_case_ConfigLoader(self):

@@ -33,8 +32,8 @@ class TestConfigLoader(unittest.TestCase):
return dict

test_arg = ConfigSection()
ConfigLoader("config", "").load_config(os.path.join("./test/loader", "config"), {"test": test_arg})
#ConfigLoader("config", "").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config",
ConfigLoader("config").load_config(os.path.join("./test/loader", "config"), {"test": test_arg})
# ConfigLoader("config").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config",
# {"test": test_arg})

#dict = read_section_from_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", "test")
@@ -58,18 +57,18 @@ class TestConfigLoader(unittest.TestCase):

class TestDatasetLoader(unittest.TestCase):
def test_case_TokenizeDatasetLoader(self):
loader = TokenizeDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8")
loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
data = loader.load_pku(max_seq_len=32)
print("pass TokenizeDatasetLoader test!")

def test_case_POSDatasetLoader(self):
loader = POSDatasetLoader("people", "./test/data_for_tests/people.txt")
loader = POSDatasetLoader("./test/data_for_tests/people.txt")
data = loader.load()
datas = loader.load_lines()
print("pass POSDatasetLoader test!")

def test_case_LMDatasetLoader(self):
loader = LMDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8")
loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
data = loader.load()
datas = loader.load_lines()
print("pass TokenizeDatasetLoader test!")

test/ner.py (+0 -138)

@@ -1,138 +0,0 @@
import _pickle
import os

import numpy as np
import torch

from fastNLP.core.preprocess import SeqLabelPreprocess
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.models.sequence_modeling import AdvSeqLabel


class MyNERTrainer(SeqLabelTrainer):
def __init__(self, train_args):
super(MyNERTrainer, self).__init__(train_args)
self.scheduler = None

def define_optimizer(self):
"""
override
:return:
"""
self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001)
self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5)

def update(self):
"""
override
:return:
"""
self.optimizer.step()
self.scheduler.step()

def _create_validator(self, valid_args):
return MyNERTester(valid_args)

def best_eval_result(self, validator):
accuracy = validator.metrics()
if accuracy > self.best_accuracy:
self.best_accuracy = accuracy
return True
else:
return False


class MyNERTester(SeqLabelTester):
def __init__(self, test_args):
super(MyNERTester, self).__init__(test_args)

def _evaluate(self, prediction, batch_y, seq_len):
"""
:param prediction: [batch_size, seq_len, num_classes]
:param batch_y: [batch_size, seq_len]
:param seq_len: [batch_size]
:return:
"""
summ = 0
correct = 0
_, indices = torch.max(prediction, 2)
for p, y, l in zip(indices, batch_y, seq_len):
summ += l
correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy())
return float(correct / summ)

def evaluate(self, predict, truth):
return self._evaluate(predict, truth, self.seq_len)

def metrics(self):
return np.mean(self.eval_history)

def show_matrices(self):
return "dev accuracy={:.2f}".format(float(self.metrics()))


def embedding_process(emb_file, word_dict, emb_dim, emb_pkl):
if os.path.exists(emb_pkl):
with open(emb_pkl, "rb") as f:
embedding_np = _pickle.load(f)
return embedding_np
with open(emb_file, "r", encoding="utf-8") as f:
embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
for line in f:
line = line.strip().split()
if len(line) != emb_dim + 1:
continue
if line[0] in word_dict:
embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
with open(emb_pkl, "wb") as f:
_pickle.dump(embedding_np, f)
return embedding_np


def data_load(data_file):
with open(data_file, "r", encoding="utf-8") as f:
all_data = []
sent = []
label = []
for line in f:
line = line.strip().split()

if not len(line) <= 1:
sent.append(line[0])
label.append(line[1])
else:
all_data.append([sent, label])
sent = []
label = []
return all_data


data_path = "data_for_tests/people.txt"
pick_path = "data_for_tests/"
emb_path = "data_for_tests/emb50.txt"
save_path = "data_for_tests/"
if __name__ == "__main__":
data = data_load(data_path)
preprocess = SeqLabelPreprocess()
data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3)
# emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
emb = None
args = {"epochs": 20,
"batch_size": 1,
"pickle_path": pick_path,
"validate": True,
"save_best_dev": True,
"model_saved_path": save_path,
"use_cuda": True,

"vocab_size": preprocess.vocab_size,
"num_classes": preprocess.num_classes,
"word_emb_dim": 50,
"rnn_hidden_units": 100
}
# emb = torch.Tensor(emb).float().cuda()
networks = AdvSeqLabel(args, emb)
trainer = MyNERTrainer(args)
trainer.train(networks, data_train, data_dev)
print("Training finished!")

test/ner_decode.py (+0 -129)

@@ -1,129 +0,0 @@
import _pickle
import os

import torch

from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.models.sequence_modeling import AdvSeqLabel


class Decode(SeqLabelTrainer):
def __init__(self, args):
super(Decode, self).__init__(args)

def decoder(self, network, sents, model_path):
self.model = network
self.model.load_state_dict(torch.load(model_path))
out_put = []
self.mode(network, test=True)
for batch_x in sents:
prediction = self.data_forward(self.model, batch_x)

seq_tag = self.model.prediction(prediction, batch_x[1])

out_put.append(list(seq_tag)[0])
return out_put


def process_sent(sents, word2id):
sents_num = []
for s in sents:
sent_num = []
for c in s:
if c in word2id:
sent_num.append(word2id[c])
else:
sent_num.append(word2id["<unk>"])
sents_num.append(([sent_num], [len(sent_num)])) # batch_size is 1

return sents_num


def process_tag(sents, tags, id2class):
Tags = []
for ttt in tags:
Tags.append([id2class[t] for t in ttt])

Segs = []
PosNers = []
for sent, tag in zip(sents, tags):
word__ = []
lll__ = []
for c, t in zip(sent, tag):

t = id2class[t]
l = t.split("-")
split_ = l[0]
pn = l[1]

if split_ == "S":
word__.append(c)
lll__.append(pn)
word_1 = ""
elif split_ == "E":
word_1 += c
word__.append(word_1)
lll__.append(pn)
word_1 = ""
elif split_ == "B":
word_1 = ""
word_1 += c
else:
word_1 += c
Segs.append(word__)
PosNers.append(lll__)
return Segs, PosNers


pickle_path = "data_for_tests/"
model_path = "data_for_tests/model_best_dev.pkl"
if __name__ == "__main__":

with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f:
id2word = _pickle.load(f)
with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f:
word2id = _pickle.load(f)
with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f:
id2class = _pickle.load(f)

sent = ["中共中央总书记、国家主席江泽民",
"逆向处理输入序列并返回逆序后的序列"] # here is input

args = {"epochs": 1,
"batch_size": 1,
"pickle_path": "data_for_tests/",
"validate": True,
"save_best_dev": True,
"model_saved_path": "data_for_tests/",
"use_cuda": False,

"vocab_size": len(word2id),
"num_classes": len(id2class),
"word_emb_dim": 50,
"rnn_hidden_units": 100,
}
"""
network = AdvSeqLabel(args, None)
decoder_ = Decode(args)
tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path)
output_seg, output_pn = process_tag(sent, tags_num, id2class) # here is output
print(output_seg)
print(output_pn)
"""
# Define the same model
model = AdvSeqLabel(args, None)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl")
print("model loaded!")

# Inference interface
infer = SeqLabelInfer(pickle_path)
sent = [[ch for ch in s] for s in sent]
results = infer.predict(model, sent)

for res in results:
print(res)
print("Inference finished!")

test/readme_example.py (+23 -27)

@@ -1,19 +1,13 @@
# python: 3.5
# pytorch: 0.4

################
# Test cross validation.
################

from fastNLP.loader.preprocess import ClassPreprocess

from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.predictor import ClassificationInfer
from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import aggregation
from fastNLP.modules import encoder
from fastNLP.modules import decoder
from fastNLP.modules import encoder


class ClassificationModel(BaseModel):
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel):
self.enc = encoder.Conv(
in_channels=300, out_channels=100, kernel_size=3)
self.agg = aggregation.MaxPool()
self.dec = decoder.MLP(100, num_classes=num_classes)
self.dec = decoder.MLP(size_layer=[100, num_classes])

def forward(self, x):
x = self.emb(x) # [N,L] -> [N,L,C]
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel):
return x


data_dir = 'data' # directory to save data and model
train_path = 'test/data_for_tests/text_classify.txt' # training set file
data_dir = 'save/' # directory to save data and model
train_path = './data_for_tests/text_classify.txt' # training set file

# load dataset
ds_loader = ClassDatasetLoader("train", train_path)
ds_loader = ClassDatasetLoader(train_path)
data = ds_loader.load()

# pre-process dataset
pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5)
# pre = ClassPreprocess(data, data_dir)
n_classes = pre.num_classes
vocab_size = pre.vocab_size
pre = ClassPreprocess()
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
n_classes, vocab_size = pre.num_classes, pre.vocab_size

# construct model
model_args = {
@@ -58,22 +51,25 @@ model_args = {
}
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)

# train model
# construct trainer
train_args = {
"epochs": 10,
"batch_size": 50,
"epochs": 3,
"batch_size": 16,
"pickle_path": data_dir,
"validate": False,
"save_best_dev": False,
"model_saved_path": None,
"use_cuda": True,
"learn_rate": 1e-3,
"momentum": 0.9}
trainer = ClassificationTrainer(train_args)
# trainer.train(model, ['data_train.pkl', 'data_dev.pkl'])
trainer.cross_validate(model)
"loss": Loss("cross_entropy"),
"optimizer": Optimizer("Adam", lr=0.001)
}
trainer = ClassificationTrainer(**train_args)

# start training
trainer.train(model, train_data=train_set, dev_data=dev_set)

# predict using model
data_infer = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model, data_infer)
labels_pred = infer.predict(model.cpu(), data_infer)
print(labels_pred)

test/seq_labeling.py (+8 -8)

@@ -33,7 +33,7 @@ data_infer_path = args.infer
def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args})
ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -49,7 +49,7 @@ def infer():
print("model loaded!")

# Data Loader
raw_data_loader = BaseLoader("xxx", data_infer_path)
raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines()

# Inference interface
@@ -65,11 +65,11 @@ def train_and_test():
# Config Loader
trainer_args = ConfigSection()
model_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config(config_dir, {
ConfigLoader("config.cfg").load_config(config_dir, {
"test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})

# Data Loader
pos_loader = POSDatasetLoader("xxx", data_path)
pos_loader = POSDatasetLoader(data_path)
train_data = pos_loader.load_lines()

# Preprocessor
@@ -117,7 +117,7 @@ def train_and_test():

# Load test configuration
tester_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args})
ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args})

# Tester
tester = SeqLabelTester(save_output=False,
@@ -134,10 +134,10 @@ def train_and_test():
tester.test(model, data_dev)

# print test results
print(tester.show_matrices())
print(tester.show_metrics())
print("model tested!")


if __name__ == "__main__":
train_and_test()
# infer()
# train_and_test()
infer()

test/test_cws.py (+8 -8)

@@ -22,7 +22,7 @@ data_infer_path = "data_for_tests/people_infer.txt"
def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -38,7 +38,7 @@ def infer():
print("model loaded!")

# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines()
"""
Transform strings into list of list of strings.
@@ -61,10 +61,10 @@ def infer():
def train_test():
# Config Loader
train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

# Data Loader
loader = TokenizeDatasetLoader(data_name, cws_data_path)
loader = TokenizeDatasetLoader(cws_data_path)
train_data = loader.load_pku()

# Preprocessor
@@ -74,7 +74,7 @@ def train_test():
train_args["num_classes"] = p.num_classes

# Trainer
trainer = SeqLabelTrainer(train_args)
trainer = SeqLabelTrainer(**train_args.data)

# Model
model = SeqLabeling(train_args)
@@ -99,16 +99,16 @@ def train_test():

# Load test configuration
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

# Tester
tester = SeqLabelTester(test_args)
tester = SeqLabelTester(**test_args.data)

# Start testing
tester.test(model, data_train)

# print test results
print(tester.show_matrices())
print(tester.show_metrics())
print("model tested!")




test/test_fastNLP.py (+33 -2)

@@ -1,9 +1,12 @@
import sys

sys.path.append("..")
from fastNLP.fastnlp import FastNLP
from fastNLP.fastnlp import interpret_word_seg_results
from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results

PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"


def word_seg():
nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)
@@ -39,5 +42,33 @@ def test_word_seg_interpret():
print(interpret_word_seg_results(chars, labels))


def test_interpret_cws_pos_results():
    foo = [
        [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'),
         ('深', 'B-d'), ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'),
         ('分', 'B-vn'), ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')]
    ]
    chars = [x[0] for x in foo[0]]
    labels = [x[1] for x in foo[0]]
    print(interpret_cws_pos_results(chars, labels))


def pos_tag():
    nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
    nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
    text = ["这是最好的基于深度学习的中文分词系统。",
            "大王叫我来巡山。",
            "我党多年来致力于改善人民生活水平。"]
    results = nlp.run(text)
    for example in results:
        words, labels = [], []
        for res in example:
            words.append(res[0])
            labels.append(res[1])
        print(interpret_cws_pos_results(words, labels))




if __name__ == "__main__":
word_seg()
pos_tag()

test/test_tester.py (+7 -7)

@@ -5,19 +5,19 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "pku_training.utf8"
cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
pickle_path = "data_for_tests"


def foo():
loader = TokenizeDatasetLoader(data_name, "./data_for_tests/cws_pku_utf_8")
loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8")
train_data = loader.load_pku()

train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

# Preprocessor
p = SeqLabelPreprocess(train_data, pickle_path)
p = SeqLabelPreprocess()
train_data = p.run(train_data)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes

@@ -26,11 +26,11 @@ def foo():
valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
"save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/",
"use_cuda": True}
validator = SeqLabelTester(valid_args)
validator = SeqLabelTester(**valid_args)

print("start validation.")
validator.test(model)
print(validator.show_matrices())
validator.test(model, train_data)
print(validator.show_metrics())


if __name__ == "__main__":


test/text_classify.py (+2 -2)

@@ -34,7 +34,7 @@ config_dir = args.config
def infer():
# load dataset
print("Loading data...")
ds_loader = ClassDatasetLoader("train", train_data_dir)
ds_loader = ClassDatasetLoader(train_data_dir)
data = ds_loader.load()
unlabeled_data = [x[0] for x in data]

@@ -69,7 +69,7 @@ def train():

# load dataset
print("Loading data...")
ds_loader = ClassDatasetLoader("train", train_data_dir)
ds_loader = ClassDatasetLoader(train_data_dir)
data = ds_loader.load()
print(data[0])


