Browse Source

Merge pull request #38 from FengZiYjun/new_updates

New updates
tags/v0.1.0
Coet GitHub 6 years ago
parent
commit
ceac3f2e1f
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 475 additions and 433 deletions
  1. +1
    -1
      fastNLP/core/predictor.py
  2. +310
    -0
      fastNLP/core/preprocess.py
  3. +3
    -3
      fastNLP/core/tester.py
  4. +29
    -7
      fastNLP/core/trainer.py
  5. +0
    -366
      fastNLP/loader/preprocess.py
  6. +3
    -4
      reproduction/CNN-sentence_classification/train.py
  7. +13
    -9
      reproduction/chinese_word_seg/cws_train.py
  8. +2
    -2
      test/data_for_tests/config
  9. +6
    -5
      test/ner.py
  10. +78
    -0
      test/readme_example.py
  11. +10
    -18
      test/seq_labeling.py
  12. +5
    -4
      test/test_cws.py
  13. +2
    -2
      test/test_tester.py
  14. +13
    -12
      test/text_classify.py

+ 1
- 1
fastNLP/core/predictor.py View File

@@ -3,7 +3,7 @@ import torch

from fastNLP.core.action import Batchifier, SequentialSampler
from fastNLP.core.action import convert_to_torch_tensor
from fastNLP.loader.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL
from fastNLP.core.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL
from fastNLP.modules import utils




+ 310
- 0
fastNLP/core/preprocess.py View File

@@ -0,0 +1,310 @@
import _pickle
import os

import numpy as np

# Special tokens that always occupy the first five slots of a vocabulary.
DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
DEFAULT_RESERVED_LABEL = [
    '<reserved-2>',
    '<reserved-3>',
    '<reserved-4>',
]  # dict index = 2~4

# Seed mapping: every special token is numbered by its position in the list below.
DEFAULT_WORD_TO_INDEX = {
    token: index
    for index, token in enumerate(
        [DEFAULT_PADDING_LABEL, DEFAULT_UNKNOWN_LABEL] + DEFAULT_RESERVED_LABEL
    )
}


# Indices 0-4 are taken by the special tokens above, so the first real vocabulary entry gets index 5.

def save_pickle(obj, pickle_path, file_name):
    """Pickle ``obj`` into the file ``file_name`` inside directory ``pickle_path``.

    :param obj: the object to serialize
    :param pickle_path: str, directory that receives the file
    :param file_name: str, name of the pickle file to write
    """
    target = os.path.join(pickle_path, file_name)
    with open(target, "wb") as out_file:
        _pickle.dump(obj, out_file)
    print("{} saved. ".format(file_name))


def load_pickle(pickle_path, file_name):
    """Read back an object previously pickled into ``pickle_path/file_name``.

    NOTE(review): unpickling executes arbitrary code; only load files this
    project wrote itself.

    :param pickle_path: str, directory that contains the file
    :param file_name: str, name of the pickle file to read
    :return: the deserialized object
    """
    source = os.path.join(pickle_path, file_name)
    with open(source, "rb") as in_file:
        loaded = _pickle.load(in_file)
    print("{} loaded. ".format(file_name))
    return loaded


def pickle_exist(pickle_path, pickle_name):
    """Tell whether ``pickle_name`` is present in ``pickle_path``.

    As a side effect the directory ``pickle_path`` is created when it is missing,
    so that later saves into it succeed.

    :param pickle_path: the directory of target pickle file
    :param pickle_name: the filename of target pickle file
    :return: True if file exists else False
    """
    if not os.path.exists(pickle_path):
        os.makedirs(pickle_path)
    return os.path.exists(os.path.join(pickle_path, pickle_name))


class BasePreprocess(object):
    """Base class of the preprocessing pipelines.

    Subclasses provide ``build_dict`` and ``to_index``. ``run`` drives the whole
    pipeline and caches every intermediate result as a pickle file, so a second
    run over the same ``pickle_path`` reuses the stored results.
    """

    def __init__(self):
        # Both mappings stay None until run() has built or loaded them.
        self.word2index = None
        self.label2index = None

    @property
    def vocab_size(self):
        """Number of entries in ``word2index`` (only valid after ``run``)."""
        return len(self.word2index)

    @property
    def num_classes(self):
        """Number of entries in ``label2index`` (only valid after ``run``)."""
        return len(self.label2index)

    def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10):
        """Main preprocessing pipeline.

        :param train_dev_data: three-level list, with either single label or multiple labels in a sample.
        :param test_data: three-level list, with either single label or multiple labels in a sample. (optional)
        :param pickle_path: str, the path to save the pickle files.
        :param train_dev_split: float, between [0, 1]. The ratio of training data used as validation set.
        :param cross_val: bool, whether to do cross validation.
        :param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True.
        :return results: the training data alone when nothing else was requested, otherwise a tuple
                ``(train[, dev][, test])``. With ``cross_val`` the train and dev entries are lists of folds.
        """
        self._prepare_dicts(train_dev_data, pickle_path)

        if cross_val:
            data_train, data_dev = self._prepare_cross_val(train_dev_data, pickle_path, n_fold)
        else:
            data_train, data_dev = self._prepare_train_dev(train_dev_data, pickle_path, train_dev_split)

        # prepare test data if provided
        data_test = []
        if test_data is not None:
            if pickle_exist(pickle_path, "data_test.pkl"):
                # Reuse the cached test set; previously an empty list was returned here.
                data_test = load_pickle(pickle_path, "data_test.pkl")
            else:
                data_test = self.to_index(test_data)
                save_pickle(data_test, pickle_path, "data_test.pkl")

        # return preprocessed results
        results = [data_train]
        if cross_val or train_dev_split > 0:
            results.append(data_dev)
        if test_data:
            results.append(data_test)
        if len(results) == 1:
            return results[0]
        return tuple(results)

    def _prepare_dicts(self, train_dev_data, pickle_path):
        """Load the word/label dictionaries from ``pickle_path`` or build and save them."""
        if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
            self.word2index = load_pickle(pickle_path, "word2id.pkl")
            self.label2index = load_pickle(pickle_path, "class2id.pkl")
        else:
            self.word2index, self.label2index = self.build_dict(train_dev_data)
            save_pickle(self.word2index, pickle_path, "word2id.pkl")
            save_pickle(self.label2index, pickle_path, "class2id.pkl")

        if not pickle_exist(pickle_path, "id2word.pkl"):
            index2word = self.build_reverse_dict(self.word2index)
            save_pickle(index2word, pickle_path, "id2word.pkl")

        if not pickle_exist(pickle_path, "id2class.pkl"):
            index2label = self.build_reverse_dict(self.label2index)
            save_pickle(index2label, pickle_path, "id2class.pkl")

    def _prepare_train_dev(self, train_dev_data, pickle_path, train_dev_split):
        """Return ``(data_train, data_dev)`` for the plain (non cross-validation) setting."""
        data_train = []
        data_dev = []
        if pickle_exist(pickle_path, "data_train.pkl"):
            data_train = load_pickle(pickle_path, "data_train.pkl")
            # A cached run must hand back its dev split too, not an empty list.
            if train_dev_split > 0 and pickle_exist(pickle_path, "data_dev.pkl"):
                data_dev = load_pickle(pickle_path, "data_dev.pkl")
        else:
            data_train.extend(self.to_index(train_dev_data))
            if train_dev_split > 0:
                # Always re-split together with the training set so that a stale
                # data_dev.pkl can never be paired with a freshly built data_train.pkl.
                data_train, data_dev = self.data_split(data_train, train_dev_split)
                save_pickle(data_dev, pickle_path, "data_dev.pkl")
                print("{} of the training data is split for validation. ".format(train_dev_split))
            save_pickle(data_train, pickle_path, "data_train.pkl")
        return data_train, data_dev

    def _prepare_cross_val(self, train_dev_data, pickle_path, n_fold):
        """Return ``(train_folds, dev_folds)``, each a list with one entry per fold."""
        data_train = []
        data_dev = []
        if not pickle_exist(pickle_path, "data_train_0.pkl"):
            # cross validation
            data_idx = self.to_index(train_dev_data)
            data_cv = self.cv_split(data_idx, n_fold)
            for i, (data_train_cv, data_dev_cv) in enumerate(data_cv):
                save_pickle(data_train_cv, pickle_path, "data_train_{}.pkl".format(i))
                save_pickle(data_dev_cv, pickle_path, "data_dev_{}.pkl".format(i))
                data_train.append(data_train_cv)
                data_dev.append(data_dev_cv)
            print("{}-fold cross validation.".format(n_fold))
        else:
            for i in range(n_fold):
                data_train.append(load_pickle(pickle_path, "data_train_{}.pkl".format(i)))
                data_dev.append(load_pickle(pickle_path, "data_dev_{}.pkl".format(i)))
        return data_train, data_dev

    def build_dict(self, data):
        """Build ``(word2index, label2index)`` from ``data``; implemented by subclasses."""
        raise NotImplementedError

    def to_index(self, data):
        """Replace every string in ``data`` by its index; implemented by subclasses."""
        raise NotImplementedError

    def build_reverse_dict(self, word_dict):
        """Return the inverse mapping ``{index: key}`` of ``word_dict``."""
        id2word = {word_dict[w]: w for w in word_dict}
        return id2word

    def data_split(self, data, train_dev_split):
        """Split data into train and dev set.

        The first ``int(len(data) * train_dev_split)`` samples become the dev set.

        :return: tuple ``(data_train, data_dev)``
        """
        split = int(len(data) * train_dev_split)
        data_dev = data[: split]
        data_train = data[split:]
        return data_train, data_dev

    def cv_split(self, data, n_fold):
        """Split data for cross validation.

        The data is shuffled (on a copy) and cut into ``n_fold`` contiguous folds whose
        sizes differ by at most one sample. Every fold serves once as the dev set.

        :param data: list of samples
        :param n_fold: int, number of folds
        :return: list of ``n_fold`` tuples ``(data_train, data_dev)``
        """
        data_copy = list(data)
        np.random.shuffle(data_copy)
        n_samples = len(data_copy)

        data_cv = []
        for i in range(n_fold):
            # Integer boundaries i*n/k never overshoot the data, unlike a rounded
            # fold size, so no fold is left empty while n_samples >= n_fold.
            start = i * n_samples // n_fold
            end = (i + 1) * n_samples // n_fold
            data_dev = data_copy[start:end]
            data_train = data_copy[:start] + data_copy[end:]
            data_cv.append((data_train, data_dev))

        return data_cv


class SeqLabelPreprocess(BasePreprocess):
    """Preprocessing pipeline for sequence labeling data.

    Builds the word/label dictionaries and converts samples to indices. Each sample
    carries one label per word:
        [
            [ [word_11, word_12, ...], [label_1, label_1, ...] ],
            [ [word_21, word_22, ...], [label_2, label_1, ...] ],
            ...
        ]
    """

    def __init__(self):
        super(SeqLabelPreprocess, self).__init__()

    def build_dict(self, data):
        """Collect every word and every label of ``data`` into fresh dictionaries.

        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :return word2index: dict of {str, int}
                label2index: dict of {str, int}
        """
        # Labels are seeded with the special tokens as well, because label sequences
        # are padded to the same length as word sequences in a mini-batch.
        word2index = DEFAULT_WORD_TO_INDEX.copy()
        label2index = DEFAULT_WORD_TO_INDEX.copy()
        for sample in data:
            for token, tag in zip(sample[0], sample[1]):
                # setdefault only inserts unseen keys; the next free index is the current size
                word2index.setdefault(token, len(word2index))
                label2index.setdefault(tag, len(label2index))
        return word2index, label2index

    def to_index(self, data):
        """Map the words and labels of every sample to their indices.

        Strings missing from the dictionaries are mapped to the unknown index.

        :param data: three-level list, same layout as for ``build_dict``
        :return data_index: the same shape as data, but each string is replaced by its corresponding index
        """
        unknown = DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]
        data_index = []
        for sample in data:
            pairs = list(zip(sample[0], sample[1]))
            word_ids = [self.word2index.get(token, unknown) for token, _ in pairs]
            label_ids = [self.label2index.get(tag, unknown) for _, tag in pairs]
            data_index.append([word_ids, label_ids])
        return data_index


class ClassPreprocess(BasePreprocess):
    """Preprocess pipeline for classification datasets.

    Builds the mappings from words to index and from labels/classes to index, and
    converts samples to indices. Designed for a three-level list which has a single
    label in each sample:
        [
            [ [word_11, word_12, ...], label_1 ],
            [ [word_21, word_22, ...], label_2 ],
            ...
        ]
    """

    def __init__(self):
        super(ClassPreprocess, self).__init__()

    def build_dict(self, data):
        """Build the word and label vocabularies from ``data``.

        Samples whose sentence has at most one word are ignored.

        :param data: three-level list, see the class docstring
        :return word2index: dict of {str, int}
                label2index: dict of {str, int}
        """
        # build vocabulary from scratch if nothing exists
        word2index = DEFAULT_WORD_TO_INDEX.copy()
        label2index = DEFAULT_WORD_TO_INDEX.copy()

        # collect every word and label
        for sent, label in data:
            if len(sent) <= 1:
                continue

            if label not in label2index:
                label2index[label] = len(label2index)

            for word in sent:
                if word not in word2index:
                    # Key on the whole word: this is what to_index() looks up.
                    # (Keying on word[0] stored only first characters.)
                    word2index[word] = len(word2index)
        return word2index, label2index

    def to_index(self, data):
        """Convert word strings and label strings into indices.

        Words and labels missing from the dictionaries map to the unknown index.

        :param data: three-level list
            [
                [ [word_11, word_12, ...], label_1 ],
                [ [word_21, word_22, ...], label_2 ],
                ...
            ]
        :return data_index: the same shape as data, but each string is replaced by its corresponding index
        """
        unknown = DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]
        data_index = []
        for example in data:
            # example[0] is the word list, example[1] is the single label
            word_list = [self.word2index.get(word, unknown) for word in example[0]]
            label_index = self.label2index.get(example[1], unknown)
            data_index.append([word_list, label_index])
        return data_index


def infer_preprocess(pickle_path, data):
    """Preprocess over inference data.

    Transform a two-level list of strings into that of indices, using the
    ``word2id.pkl`` dictionary stored in ``pickle_path``:
        [
            [word_11, word_12, ...],
            [word_21, word_22, ...],
            ...
        ]

    :param pickle_path: str, directory that contains ``word2id.pkl``
    :param data: list of sentences, each a list of word strings
    :return data_index: same shape as data, each word replaced by its index
    """
    word2index = load_pickle(pickle_path, "word2id.pkl")
    # Unknown words must map to the *index* of '<unk>', not to the string itself,
    # otherwise the result mixes ints and strs and cannot be turned into a tensor.
    unknown = word2index.get(DEFAULT_UNKNOWN_LABEL, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL])
    return [[word2index.get(w, unknown) for w in example] for example in data]

+ 3
- 3
fastNLP/core/tester.py View File

@@ -34,7 +34,7 @@ class BaseTester(object):
self.eval_history = []
self.batch_output = []

def test(self, network):
def test(self, network, dev_data):
if torch.cuda.is_available() and self.use_cuda:
self.model = network.cuda()
else:
@@ -45,8 +45,8 @@ class BaseTester(object):
self.eval_history.clear()
self.batch_output.clear()

dev_data = self.prepare_input(self.pickle_path)
logger.info("validation data loaded")
# dev_data = self.prepare_input(self.pickle_path)
# logger.info("validation data loaded")

iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
n_batches = len(dev_data) // self.batch_size


+ 29
- 7
fastNLP/core/trainer.py View File

@@ -1,4 +1,5 @@
import _pickle
import copy
import os
import time
from datetime import timedelta
@@ -52,9 +53,11 @@ class BaseTrainer(object):
self.loss_func = None
self.optimizer = None

def train(self, network):
def train(self, network, train_data, dev_data=None):
"""General Training Steps
:param network: a model
:param train_data: three-level list, the training set.
:param dev_data: three-level list, the validation data (optional)

The method is framework independent.
Work by calling the following methods:
@@ -73,8 +76,8 @@ class BaseTrainer(object):
else:
self.model = network

data_train = self.load_train_data(self.pickle_path)
logger.info("training data loaded")
# train_data = self.load_train_data(self.pickle_path)
# logger.info("training data loaded")

# define tester over dev data
if self.validate:
@@ -88,8 +91,7 @@ class BaseTrainer(object):
logger.info("optimizer defined as {}".format(str(self.optimizer)))

# main training epochs

n_samples = len(data_train)
n_samples = len(train_data)
n_batches = n_samples // self.batch_size
n_print = 1
start = time.time()
@@ -101,14 +103,14 @@ class BaseTrainer(object):
# turn on network training mode
self.mode(network, test=False)
# prepare mini-batch iterator
data_iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=False))
data_iterator = iter(Batchifier(RandomSampler(train_data), self.batch_size, drop_last=False))
logger.info("prepared data iterator")

self._train_step(data_iterator, network, start=start, n_print=n_print, epoch=epoch)

if self.validate:
logger.info("validation started")
validator.test(network)
validator.test(network, dev_data)

if self.save_best_dev and self.best_eval_result(validator):
self.save_model(network)
@@ -139,6 +141,26 @@ class BaseTrainer(object):
logger.info(print_output)
step += 1

def cross_validate(self, network, train_data_cv, dev_data_cv):
    """Train one fresh copy of ``network`` per cross-validation fold.

    :param network: the model; it is deep-copied for every fold and never trained itself
    :param train_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
    :param dev_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
    :raises RuntimeError: when the two arguments hold a different number of folds
    """
    n_fold = len(train_data_cv)
    n_dev_fold = len(dev_data_cv)
    if n_fold != n_dev_fold:
        logger.error("the number of folds in train and dev data unequals {}!={}".format(n_fold, n_dev_fold))
        raise RuntimeError("the number of folds in train and dev data unequals")

    logger.info("perform {} folds cross validation.".format(n_fold))
    for fold, (fold_train, fold_dev) in enumerate(zip(train_data_cv, dev_data_cv)):
        print("CV:", fold)
        logger.info("running the {} of {} folds cross validation".format(fold + 1, n_fold))
        # every fold starts from the same untrained parameters
        self.train(copy.deepcopy(network), fold_train, fold_dev)

def load_train_data(self, pickle_path):
"""
For task-specific processing.


+ 0
- 366
fastNLP/loader/preprocess.py View File

@@ -1,366 +0,0 @@
import _pickle
import os

# Special tokens; their positions below are their vocabulary indices.
DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>', '<reserved-3>', '<reserved-4>']  # dict index = 2~4

_SPECIAL_TOKENS = [DEFAULT_PADDING_LABEL, DEFAULT_UNKNOWN_LABEL] + DEFAULT_RESERVED_LABEL
DEFAULT_WORD_TO_INDEX = dict(zip(_SPECIAL_TOKENS, range(len(_SPECIAL_TOKENS))))


# Indices 0-4 are taken by the special tokens above, so the first real vocabulary entry gets index 5.

def save_pickle(obj, pickle_path, file_name):
    """Write ``obj`` as a pickle named ``file_name`` into directory ``pickle_path``.

    :param obj: the object to serialize
    :param pickle_path: str, directory that receives the file
    :param file_name: str, name of the pickle file
    """
    destination = os.path.join(pickle_path, file_name)
    with open(destination, "wb") as handle:
        _pickle.dump(obj, handle)
    print("{} saved. ".format(file_name))


def load_pickle(pickle_path, file_name):
    """Return the object stored in the pickle ``pickle_path/file_name``.

    NOTE(review): unpickling executes arbitrary code; only load trusted files.

    :param pickle_path: str, directory that contains the file
    :param file_name: str, name of the pickle file
    :return: the deserialized object
    """
    location = os.path.join(pickle_path, file_name)
    with open(location, "rb") as handle:
        restored = _pickle.load(handle)
    print("{} loaded. ".format(file_name))
    return restored


def pickle_exist(pickle_path, pickle_name):
    """Check for the pickle file ``pickle_name`` inside ``pickle_path``.

    The directory ``pickle_path`` is created first when it does not exist yet.

    :param pickle_path: the directory of target pickle file
    :param pickle_name: the filename of target pickle file
    :return: True if file exists else False
    """
    if not os.path.exists(pickle_path):
        os.makedirs(pickle_path)
    candidate = os.path.join(pickle_path, pickle_name)
    return os.path.exists(candidate)


class BasePreprocess(object):
    """Common base of the preprocessors; remembers the pickle directory."""

    def __init__(self, data, pickle_path):
        """
        :param data: accepted for interface compatibility; not stored or used here
        :param pickle_path: str, directory of the pickle files; a trailing '/' is appended when missing
        """
        super(BasePreprocess, self).__init__()
        self.pickle_path = pickle_path if pickle_path.endswith('/') else pickle_path + '/'


class POSPreprocess(BasePreprocess):
    """
    This class is used to preprocess the POS Tag datasets.

    All the work happens in the constructor, which writes word2id.pkl, class2id.pkl,
    id2word.pkl, id2class.pkl, data_train.pkl and (optionally) data_dev.pkl into
    ``pickle_path`` unless they already exist there.
    """

    def __init__(self, data, pickle_path="./", train_dev_split=0):
        """
        Preprocess pipeline, including building mapping from words to index, from index to words,
        from labels/classes to index, from index to labels/classes.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :param pickle_path: str, the directory to the pickle files. Default: "./"
        :param train_dev_split: float in [0, 1]. The ratio of dev data split from training data. Default: 0.

        """
        super(POSPreprocess, self).__init__(data, pickle_path)

        # keep the path exactly as given by the caller
        self.pickle_path = pickle_path

        if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
            self.word2index = load_pickle(self.pickle_path, "word2id.pkl")
            self.label2index = load_pickle(self.pickle_path, "class2id.pkl")
        else:
            self.word2index, self.label2index = self.build_dict(data)
            save_pickle(self.word2index, self.pickle_path, "word2id.pkl")
            save_pickle(self.label2index, self.pickle_path, "class2id.pkl")

        if not pickle_exist(pickle_path, "id2word.pkl"):
            index2word = self.build_reverse_dict(self.word2index)
            save_pickle(index2word, self.pickle_path, "id2word.pkl")

        if not pickle_exist(pickle_path, "id2class.pkl"):
            index2label = self.build_reverse_dict(self.label2index)
            save_pickle(index2label, self.pickle_path, "id2class.pkl")

        if not pickle_exist(pickle_path, "data_train.pkl"):
            data_train = self.to_index(data)
            if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"):
                # the leading part of the data becomes the dev set
                split = int(len(data_train) * train_dev_split)
                data_dev = data_train[: split]
                data_train = data_train[split:]
                save_pickle(data_dev, self.pickle_path, "data_dev.pkl")
                print("{} of the training data is split for validation. ".format(train_dev_split))
            save_pickle(data_train, self.pickle_path, "data_train.pkl")

    def build_dict(self, data):
        """
        Collect new words and new labels of ``data`` into fresh dictionaries.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :return word2index: dict of {str, int}
                label2index: dict of {str, int}
        """
        # In seq labeling, both word seq and label seq need to be padded to the same length in a mini-batch.
        label2index = DEFAULT_WORD_TO_INDEX.copy()
        word2index = DEFAULT_WORD_TO_INDEX.copy()
        for example in data:
            for word, label in zip(example[0], example[1]):
                if word not in word2index:
                    word2index[word] = len(word2index)
                if label not in label2index:
                    label2index[label] = len(label2index)
        return word2index, label2index

    def build_reverse_dict(self, word_dict):
        """Return the inverse mapping ``{index: key}`` of ``word_dict``."""
        id2word = {word_dict[w]: w for w in word_dict}
        return id2word

    def to_index(self, data):
        """
        Convert word strings and label strings into indices.

        Strings absent from the dictionaries map to the unknown index instead of
        raising KeyError; this happens when the dictionaries were loaded from
        pickles built on different data.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :return data_index: the shape of data, but each string is replaced by its corresponding index
        """
        unknown = DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]
        data_index = []
        for example in data:
            word_list = []
            label_list = []
            for word, label in zip(example[0], example[1]):
                word_list.append(self.word2index.get(word, unknown))
                label_list.append(self.label2index.get(label, unknown))
            data_index.append([word_list, label_list])
        return data_index

    @property
    def vocab_size(self):
        """Number of entries in the word dictionary."""
        return len(self.word2index)

    @property
    def num_classes(self):
        """Number of entries in the label dictionary."""
        return len(self.label2index)


class ClassPreprocess(BasePreprocess):
    """
    Pre-process the classification datasets.

    Params:
        pickle_path - directory to save result of pre-processing
    Saves:
        word2id.pkl
        id2word.pkl
        class2id.pkl
        id2class.pkl
        embedding.pkl
        data_train.pkl
        data_dev.pkl
        data_test.pkl
    """

    def __init__(self, pickle_path):
        # NOTE: the base-class initializer is not invoked here.
        self.word_dict = None
        self.label_dict = None
        self.pickle_path = pickle_path  # save directory

    def process(self, data, save_name):
        """
        Process data.

        Params:
            data - nested list, data = [sample1, sample2, ...],
                sample = [sentence, label], sentence = [word1, word2, ...]
            save_name - name of processed data, such as data_train.pkl
        Returns:
            vocab_size - vocabulary size
            n_classes - number of classes
        """
        self.build_dict(data)
        self.word2id()
        vocab_size = self.id2word()
        self.class2id()
        num_classes = self.id2class()
        self.embedding()
        self.data_generate(data, save_name)

        return vocab_size, num_classes

    def _load(self, pickle_name):
        """Read and return the object stored in ``pickle_name`` under the save directory."""
        with open(os.path.join(self.pickle_path, pickle_name), "rb") as f:
            return _pickle.load(f)

    def _dump(self, obj, pickle_name):
        """Write ``obj`` to ``pickle_name`` under the save directory."""
        with open(os.path.join(self.pickle_path, pickle_name), "wb") as f:
            _pickle.dump(obj, f)

    def build_dict(self, data):
        """Build vocabulary."""

        # just read if word2id.pkl and class2id.pkl exists
        if self.pickle_exist("word2id.pkl") and \
                self.pickle_exist("class2id.pkl"):
            self.word_dict = self._load("word2id.pkl")
            self.label_dict = self._load("class2id.pkl")
            return

        # build vocabulary from scratch if nothing exists
        self.word_dict = {
            DEFAULT_PADDING_LABEL: 0,
            DEFAULT_UNKNOWN_LABEL: 1,
            DEFAULT_RESERVED_LABEL[0]: 2,
            DEFAULT_RESERVED_LABEL[1]: 3,
            DEFAULT_RESERVED_LABEL[2]: 4}
        self.label_dict = {}

        # collect every word and label; sentences with at most one word are skipped
        for sent, label in data:
            if len(sent) <= 1:
                continue

            if label not in self.label_dict:
                self.label_dict[label] = len(self.label_dict)

            for word in sent:
                if word not in self.word_dict:
                    # Key on the whole word: data_generate() looks up whole words.
                    # (Keying on word[0] stored only first characters.)
                    self.word_dict[word] = len(self.word_dict)

    def pickle_exist(self, pickle_name):
        """
        Check whether a pickle file exists. Creates the save directory when missing.

        Params
            pickle_name: the filename of target pickle file
        Return
            True if file exists else False
        """
        if not os.path.exists(self.pickle_path):
            os.makedirs(self.pickle_path)
        return os.path.exists(os.path.join(self.pickle_path, pickle_name))

    def word2id(self):
        """Save vocabulary of {word:id} mapping format."""
        # nothing will be done if word2id.pkl exists
        if self.pickle_exist("word2id.pkl"):
            return
        self._dump(self.word_dict, "word2id.pkl")

    def id2word(self):
        """Save vocabulary of {id:word} mapping format and return its size."""
        # reuse the stored mapping if id2word.pkl exists
        if self.pickle_exist("id2word.pkl"):
            return len(self._load("id2word.pkl"))

        id2word_dict = {self.word_dict[w]: w for w in self.word_dict}
        self._dump(id2word_dict, "id2word.pkl")
        return len(id2word_dict)

    def class2id(self):
        """Save mapping of {class:id}."""
        # nothing will be done if class2id.pkl exists
        if self.pickle_exist("class2id.pkl"):
            return
        self._dump(self.label_dict, "class2id.pkl")

    def id2class(self):
        """Save mapping of {id:class} and return its size."""
        # reuse the stored mapping if id2class.pkl exists
        if self.pickle_exist("id2class.pkl"):
            return len(self._load("id2class.pkl"))

        id2class_dict = {self.label_dict[c]: c for c in self.label_dict}
        self._dump(id2class_dict, "id2class.pkl")
        return len(id2class_dict)

    def embedding(self):
        """Save embedding lookup table corresponding to vocabulary."""
        # nothing will be done if embedding.pkl exists
        if self.pickle_exist("embedding.pkl"):
            return

        # retrieve vocabulary from pre-trained embedding (not implemented)

    def data_generate(self, data_src, save_name):
        """Convert dataset from text to digit."""

        # nothing will be done if file exists
        save_path = os.path.join(self.pickle_path, save_name)
        if os.path.exists(save_path):
            return

        unknown = self.word_dict[DEFAULT_UNKNOWN_LABEL]
        data = []
        # for every sample; sentences with at most one word are skipped as in build_dict
        for sent, label in data_src:
            if len(sent) <= 1:
                continue

            label_id = self.label_dict[label]  # label id
            sent_id = [self.word_dict.get(word, unknown) for word in sent]  # sentence ids
            data.append([sent_id, label_id])

        # save data
        with open(save_path, "wb") as f:
            _pickle.dump(data, f)


class LMPreprocess(BasePreprocess):
    """Placeholder preprocessor; adds nothing to the base-class initializer."""

    def __init__(self, data, pickle_path):
        BasePreprocess.__init__(self, data, pickle_path)


def infer_preprocess(pickle_path, data):
    """Preprocess over inference data.

    Transform a two-level list of strings into that of indices, using the
    ``word2id.pkl`` dictionary stored in ``pickle_path``:
        [
            [word_11, word_12, ...],
            [word_21, word_22, ...],
            ...
        ]

    :param pickle_path: str, directory that contains ``word2id.pkl``
    :param data: list of sentences, each a list of word strings
    :return data_index: same shape as data, each word replaced by its index
    """
    word2index = load_pickle(pickle_path, "word2id.pkl")
    # Fall back to the index of '<unk>' rather than the string '<unk>' itself,
    # so the output contains integers only.
    unknown = word2index.get(DEFAULT_UNKNOWN_LABEL, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL])
    data_index = []
    for example in data:
        data_index.append([word2index.get(w, unknown) for w in example])
    return data_index

+ 3
- 4
reproduction/CNN-sentence_classification/train.py View File

@@ -1,13 +1,12 @@
import os

import
import
import torch
import torch.nn as nn
.dataset as dst
from .model import CNN_text
from torch.autograd import Variable

from . import dataset as dst
from .model import CNN_text

# Hyper Parameters
batch_size = 50
learning_rate = 0.0001


+ 13
- 9
reproduction/chinese_word_seg/cws_train.py View File

@@ -5,7 +5,7 @@ sys.path.append("..")
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
@@ -48,7 +48,7 @@ def infer():
print("Inference finished!")


def train():
def train_test():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
@@ -59,9 +59,10 @@ def train():
train_data = loader.load_pku()

# Preprocessor
p = POSPreprocess(train_data, pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes
preprocess = SeqLabelPreprocess()
data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = preprocess.vocab_size
train_args["num_classes"] = preprocess.num_classes

# Trainer
trainer = SeqLabelTrainer(train_args)
@@ -70,7 +71,7 @@ def train():
model = SeqLabeling(train_args)

# Start training
trainer.train(model)
trainer.train(model, data_train, data_dev)
print("Training finished!")

# Saver
@@ -78,8 +79,11 @@ def train():
saver.save_pytorch(model)
print("Model saved!")

# testing with validation set
test(data_dev)

def test():

def test(test_data):
# Config Loader
train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
@@ -99,7 +103,7 @@ def test():
tester = SeqLabelTester(test_args)

# Start testing
tester.test(model)
tester.test(model, test_data)

# print test results
print(tester.show_matrices())
@@ -107,4 +111,4 @@ def test():


if __name__ == "__main__":
train()
train_test()

+ 2
- 2
test/data_for_tests/config View File

@@ -95,10 +95,10 @@ num_classes = 27
[text_class]
epochs = 1
batch_size = 10
pickle_path = "./data_for_tests/"
pickle_path = "./save_path/"
validate = false
save_best_dev = false
model_saved_path = "./data_for_tests/"
model_saved_path = "./save_path/"
use_cuda = true
learn_rate = 1e-3
momentum = 0.9


+ 6
- 5
test/ner.py View File

@@ -4,9 +4,9 @@ import os
import numpy as np
import torch

from fastNLP.core.preprocess import SeqLabelPreprocess
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.models.sequence_modeling import AdvSeqLabel


@@ -114,7 +114,8 @@ emb_path = "data_for_tests/emb50.txt"
save_path = "data_for_tests/"
if __name__ == "__main__":
data = data_load(data_path)
p = POSPreprocess(data, pickle_path=pick_path, train_dev_split=0.3)
preprocess = SeqLabelPreprocess()
data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3)
# emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
emb = None
args = {"epochs": 20,
@@ -125,13 +126,13 @@ if __name__ == "__main__":
"model_saved_path": save_path,
"use_cuda": True,

"vocab_size": p.vocab_size,
"num_classes": p.num_classes,
"vocab_size": preprocess.vocab_size,
"num_classes": preprocess.num_classes,
"word_emb_dim": 50,
"rnn_hidden_units": 100
}
# emb = torch.Tensor(emb).float().cuda()
networks = AdvSeqLabel(args, emb)
trainer = MyNERTrainer(args)
trainer.train(network=networks)
trainer.train(networks, data_train, data_dev)
print("Training finished!")

+ 78
- 0
test/readme_example.py View File

@@ -0,0 +1,78 @@
# python: 3.5
# pytorch: 0.4

################
# Test cross validation.
################

# ClassPreprocess lives in fastNLP.core.preprocess; fastNLP/loader/preprocess.py no longer exists.
from fastNLP.core.preprocess import ClassPreprocess

from fastNLP.core.predictor import ClassificationInfer
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import aggregation
from fastNLP.modules import encoder


class ClassificationModel(BaseModel):
    """
    Simple text classification model based on CNN.
    """

    def __init__(self, class_num, vocab_size):
        super(ClassificationModel, self).__init__()

        self.embed = encoder.Embedding(nums=vocab_size, dims=300)
        self.conv = encoder.Conv(
            in_channels=300, out_channels=100, kernel_size=3)
        self.pool = aggregation.MaxPool()
        self.output = encoder.Linear(input_size=100, output_size=class_num)

    def forward(self, x):
        x = self.embed(x)  # [N,L] -> [N,L,C]
        x = self.conv(x)  # [N,L,C_in] -> [N,L,C_out]
        x = self.pool(x)  # [N,L,C] -> [N,C]
        x = self.output(x)  # [N,C] -> [N, N_class]
        return x


data_dir = 'data'  # directory to save data and model
train_path = 'test/data_for_tests/text_classify.txt'  # training set file

# load dataset
ds_loader = ClassDatasetLoader("train", train_path)
data = ds_loader.load()

# pre-process dataset: the preprocessor takes no constructor arguments; run() does the
# work and, with cross_val=True, returns one list of folds for train and one for dev
pre = ClassPreprocess()
data_train_cv, data_dev_cv = pre.run(data, pickle_path=data_dir, cross_val=True, n_fold=5)
# the vocabulary sizes are only available after run()
n_classes = pre.num_classes
vocab_size = pre.vocab_size

# construct model
model_args = {
    'num_classes': n_classes,
    'vocab_size': vocab_size
}
model = ClassificationModel(class_num=n_classes, vocab_size=vocab_size)

# train model
train_args = {
    "epochs": 10,
    "batch_size": 50,
    "pickle_path": data_dir,
    "validate": False,
    "save_best_dev": False,
    "model_saved_path": None,
    "use_cuda": True,
    "learn_rate": 1e-3,
    "momentum": 0.9}
trainer = ClassificationTrainer(train_args)
# cross_validate() requires the per-fold train and dev data
trainer.cross_validate(model, data_train_cv, data_dev_cv)

# predict using model
data_infer = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model, data_infer)

+ 10
- 18
test/seq_labeling.py View File

@@ -5,7 +5,7 @@ sys.path.append("..")
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
@@ -14,7 +14,7 @@ from fastNLP.core.predictor import SeqLabelInfer

data_name = "people.txt"
data_path = "data_for_tests/people.txt"
pickle_path = "data_for_tests"
pickle_path = "seq_label/"
data_infer_path = "data_for_tests/people_infer.txt"


@@ -33,21 +33,12 @@ def infer():
model = SeqLabeling(test_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
ModelLoader.load_pytorch(model, pickle_path + "saved_model.pkl")
print("model loaded!")

# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
infer_data = raw_data_loader.load_lines()
"""
Transform strings into list of list of strings.
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them.
"""

# Inference interface
infer = SeqLabelInfer(pickle_path)
@@ -68,7 +59,8 @@ def train_and_test():
train_data = pos_loader.load_lines()

# Preprocessor
p = POSPreprocess(train_data, pickle_path, train_dev_split=0.5)
p = SeqLabelPreprocess()
data_train, data_dev = p.run(train_data, pickle_path=pickle_path, train_dev_split=0.5)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes

@@ -79,11 +71,11 @@ def train_and_test():
model = SeqLabeling(train_args)

# Start training
trainer.train(model)
trainer.train(model, data_train, data_dev)
print("Training finished!")

# Saver
saver = ModelSaver("./data_for_tests/saved_model.pkl")
saver = ModelSaver(pickle_path + "saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")

@@ -93,7 +85,7 @@ def train_and_test():
model = SeqLabeling(train_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
ModelLoader.load_pytorch(model, pickle_path + "saved_model.pkl")
print("model loaded!")

# Load test configuration
@@ -103,8 +95,8 @@ def train_and_test():
# Tester
tester = SeqLabelTester(test_args)

# Start testing
tester.test(model)
# Start testing with validation data
tester.test(model, data_dev)

# print test results
print(tester.show_matrices())


+ 5
- 4
test/test_cws.py View File

@@ -5,7 +5,7 @@ sys.path.append("..")
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
@@ -68,7 +68,8 @@ def train_test():
train_data = loader.load_pku()

# Preprocessor
p = POSPreprocess(train_data, pickle_path)
p = SeqLabelPreprocess()
data_train = p.run(train_data, pickle_path=pickle_path)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes

@@ -79,7 +80,7 @@ def train_test():
model = SeqLabeling(train_args)

# Start training
trainer.train(model)
trainer.train(model, data_train)
print("Training finished!")

# Saver
@@ -104,7 +105,7 @@ def train_test():
tester = SeqLabelTester(test_args)

# Start testing
tester.test(model)
tester.test(model, data_train)

# print test results
print(tester.show_matrices())


+ 2
- 2
test/test_tester.py View File

@@ -1,7 +1,7 @@
from fastNLP.core.preprocess import SeqLabelPreprocess
from fastNLP.core.tester import SeqLabelTester
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "pku_training.utf8"
@@ -17,7 +17,7 @@ def foo():
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})

# Preprocessor
p = POSPreprocess(train_data, pickle_path)
p = SeqLabelPreprocess(train_data, pickle_path)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes



+ 13
- 12
test/text_classify.py View File

@@ -10,10 +10,11 @@ from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.loader.preprocess import ClassPreprocess
from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.saver.model_saver import ModelSaver

save_path = "./test_classification/"
data_dir = "./data_for_tests/"
train_file = 'text_classify.txt'
model_name = "model_class.pkl"
@@ -27,8 +28,8 @@ def infer():
unlabeled_data = [x[0] for x in data]

# pre-process data
pre = ClassPreprocess(data_dir)
vocab_size, n_classes = pre.process(data, "data_train.pkl")
pre = ClassPreprocess()
vocab_size, n_classes = pre.run(data, pickle_path=save_path)
print("vocabulary size:", vocab_size)
print("number of classes:", n_classes)

@@ -59,28 +60,28 @@ def train():
print(data[0])

# pre-process data
pre = ClassPreprocess(data_dir)
vocab_size, n_classes = pre.process(data, "data_train.pkl")
print("vocabulary size:", vocab_size)
print("number of classes:", n_classes)
pre = ClassPreprocess()
data_train = pre.run(data, pickle_path=save_path)
print("vocabulary size:", pre.vocab_size)
print("number of classes:", pre.num_classes)

# construct model
print("Building model...")
cnn = CNNText(model_args)
model = CNNText(model_args)

# train
print("Training...")

trainer = ClassificationTrainer(train_args)
trainer.train(cnn)
trainer.train(model, data_train)

print("Training finished!")

saver = ModelSaver("./data_for_tests/saved_model.pkl")
saver.save_pytorch(cnn)
saver.save_pytorch(model)
print("Model saved!")


if __name__ == "__main__":
# train()
infer()
train()
# infer()

Loading…
Cancel
Save