@@ -5,7 +5,7 @@ from fastNLP.core.preprocess import ClassPreprocess | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
from fastNLP.models.base_model import BaseModel | |||
from fastNLP.modules import aggregation | |||
from fastNLP.modules import aggregator | |||
from fastNLP.modules import decoder | |||
from fastNLP.modules import encoder | |||
@@ -21,7 +21,7 @@ class ClassificationModel(BaseModel): | |||
self.emb = encoder.Embedding(nums=vocab_size, dims=300) | |||
self.enc = encoder.Conv( | |||
in_channels=300, out_channels=100, kernel_size=3) | |||
self.agg = aggregation.MaxPool() | |||
self.agg = aggregator.MaxPool() | |||
self.dec = decoder.MLP(size_layer=[100, num_classes]) | |||
def forward(self, x): | |||
@@ -2,10 +2,6 @@ from collections import defaultdict | |||
import torch | |||
from fastNLP.core.dataset import DataSet | |||
from fastNLP.core.field import TextField, LabelField | |||
from fastNLP.core.instance import Instance | |||
class Batch(object): | |||
"""Batch is an iterable object which iterates over mini-batches. | |||
@@ -16,6 +12,14 @@ class Batch(object): | |||
""" | |||
def __init__(self, dataset, batch_size, sampler, use_cuda): | |||
""" | |||
:param dataset: a DataSet object | |||
:param batch_size: int, the size of the batch | |||
:param sampler: a Sampler object | |||
:param use_cuda: bool, whetjher to use GPU | |||
""" | |||
self.dataset = dataset | |||
self.batch_size = batch_size | |||
self.sampler = sampler | |||
@@ -81,46 +85,3 @@ class Batch(object): | |||
self.curidx += endidx | |||
return batch_x, batch_y | |||
if __name__ == "__main__": | |||
"""simple running example | |||
""" | |||
texts = ["i am a cat", | |||
"this is a test of new batch", | |||
"haha" | |||
] | |||
labels = [0, 1, 0] | |||
# prepare vocabulary | |||
vocab = {} | |||
for text in texts: | |||
for tokens in text.split(): | |||
if tokens not in vocab: | |||
vocab[tokens] = len(vocab) | |||
print("vocabulary: ", vocab) | |||
# prepare input dataset | |||
data = DataSet() | |||
for text, label in zip(texts, labels): | |||
x = TextField(text.split(), False) | |||
y = LabelField(label, is_target=True) | |||
ins = Instance(text=x, label=y) | |||
data.append(ins) | |||
# use vocabulary to index data | |||
data.index_field("text", vocab) | |||
# define naive sampler for batch class | |||
class SeqSampler: | |||
def __call__(self, dataset): | |||
return list(range(len(dataset))) | |||
# use batch to iterate dataset | |||
data_iterator = Batch(data, 2, SeqSampler(), False) | |||
for epoch in range(1): | |||
for batch_x, batch_y in data_iterator: | |||
print(batch_x) | |||
print(batch_y) | |||
# do stuff |
@@ -1,10 +1,10 @@ | |||
import numpy as np | |||
import torch | |||
from fastNLP.core.action import SequentialSampler | |||
from fastNLP.core.batch import Batch | |||
from fastNLP.core.dataset import create_dataset_from_lists | |||
from fastNLP.core.preprocess import load_pickle | |||
from fastNLP.core.sampler import SequentialSampler | |||
class Predictor(object): | |||
@@ -27,8 +27,8 @@ class Predictor(object): | |||
self.batch_output = [] | |||
self.pickle_path = pickle_path | |||
self._task = task # one of ("seq_label", "text_classify") | |||
self.index2label = load_pickle(self.pickle_path, "id2class.pkl") | |||
self.word2index = load_pickle(self.pickle_path, "word2id.pkl") | |||
self.label_vocab = load_pickle(self.pickle_path, "class2id.pkl") | |||
self.word_vocab = load_pickle(self.pickle_path, "word2id.pkl") | |||
def predict(self, network, data): | |||
"""Perform inference using the trained model. | |||
@@ -62,9 +62,13 @@ class Predictor(object): | |||
def data_forward(self, network, x): | |||
"""Forward through network.""" | |||
y = network(**x) | |||
if self._task == "seq_label": | |||
y = network(x["word_seq"], x["word_seq_origin_len"]) | |||
y = network.prediction(y) | |||
elif self._task == "text_classify": | |||
y = network(x["word_seq"]) | |||
else: | |||
raise NotImplementedError("Unknown task type {}.".format(self._task)) | |||
return y | |||
def prepare_input(self, data): | |||
@@ -82,7 +86,7 @@ class Predictor(object): | |||
:return data_set: a DataSet instance. | |||
""" | |||
assert isinstance(data, list) | |||
return create_dataset_from_lists(data, self.word2index, has_target=False) | |||
return create_dataset_from_lists(data, self.word_vocab, has_target=False) | |||
def prepare_output(self, data): | |||
"""Transform list of batch outputs into strings.""" | |||
@@ -97,14 +101,14 @@ class Predictor(object): | |||
results = [] | |||
for batch in batch_outputs: | |||
for example in np.array(batch): | |||
results.append([self.index2label[int(x)] for x in example]) | |||
results.append([self.label_vocab.to_word(int(x)) for x in example]) | |||
return results | |||
def _text_classify_prepare_output(self, batch_outputs): | |||
results = [] | |||
for batch_out in batch_outputs: | |||
idx = np.argmax(batch_out.detach().numpy(), axis=-1) | |||
results.extend([self.index2label[i] for i in idx]) | |||
results.extend([self.label_vocab.to_word(i) for i in idx]) | |||
return results | |||
@@ -6,16 +6,7 @@ import numpy as np | |||
from fastNLP.core.dataset import DataSet | |||
from fastNLP.core.field import TextField, LabelField | |||
from fastNLP.core.instance import Instance | |||
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0 | |||
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1 | |||
DEFAULT_RESERVED_LABEL = ['<reserved-2>', | |||
'<reserved-3>', | |||
'<reserved-4>'] # dict index = 2~4 | |||
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, | |||
DEFAULT_RESERVED_LABEL[2]: 4} | |||
from fastNLP.core.vocabulary import Vocabulary | |||
# the first vocab in dict with the index = 5 | |||
@@ -61,31 +52,36 @@ def pickle_exist(pickle_path, pickle_name): | |||
return False | |||
class BasePreprocess(object): | |||
"""Base class of all preprocessors. | |||
Preprocessors are responsible for converting data of strings into data of indices. | |||
class Preprocessor(object): | |||
"""Preprocessors are responsible for converting data of strings into data of indices. | |||
During the pre-processing, the following pickle files will be built: | |||
- "word2id.pkl", a mapping from words(tokens) to indices | |||
- "id2word.pkl", a reversed dictionary | |||
- "label2id.pkl", a dictionary on labels | |||
- "id2label.pkl", a reversed dictionary on labels | |||
- "word2id.pkl", a Vocabulary object, mapping words to indices. | |||
- "class2id.pkl", a Vocabulary object, mapping labels to indices. | |||
- "data_train.pkl", a DataSet object for training | |||
- "data_dev.pkl", a DataSet object for validation, if train_dev_split > 0. | |||
- "data_test.pkl", a DataSet object for testing, if test_data is not None. | |||
These four pickle files are expected to be saved in the given pickle directory once they are constructed. | |||
Preprocessors will check if those files are already in the directory and will reuse them in future calls. | |||
""" | |||
def __init__(self): | |||
self.word2index = None | |||
self.label2index = None | |||
def __init__(self, label_is_seq=False): | |||
""" | |||
:param label_is_seq: bool, whether label is a sequence. If True, label vocabulary will preserve | |||
several special tokens for sequence processing. | |||
""" | |||
self.data_vocab = Vocabulary() | |||
self.label_vocab = Vocabulary(need_default=label_is_seq) | |||
@property | |||
def vocab_size(self): | |||
return len(self.word2index) | |||
return len(self.data_vocab) | |||
@property | |||
def num_classes(self): | |||
return len(self.label2index) | |||
return len(self.label_vocab) | |||
def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10): | |||
"""Main pre-processing pipeline. | |||
@@ -102,20 +98,14 @@ class BasePreprocess(object): | |||
""" | |||
if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): | |||
self.word2index = load_pickle(pickle_path, "word2id.pkl") | |||
self.label2index = load_pickle(pickle_path, "class2id.pkl") | |||
self.data_vocab = load_pickle(pickle_path, "word2id.pkl") | |||
self.label_vocab = load_pickle(pickle_path, "class2id.pkl") | |||
else: | |||
self.word2index, self.label2index = self.build_dict(train_dev_data) | |||
save_pickle(self.word2index, pickle_path, "word2id.pkl") | |||
save_pickle(self.label2index, pickle_path, "class2id.pkl") | |||
if not pickle_exist(pickle_path, "id2word.pkl"): | |||
index2word = self.build_reverse_dict(self.word2index) | |||
save_pickle(index2word, pickle_path, "id2word.pkl") | |||
self.data_vocab, self.label_vocab = self.build_dict(train_dev_data) | |||
save_pickle(self.data_vocab, pickle_path, "word2id.pkl") | |||
save_pickle(self.label_vocab, pickle_path, "class2id.pkl") | |||
if not pickle_exist(pickle_path, "id2class.pkl"): | |||
index2label = self.build_reverse_dict(self.label2index) | |||
save_pickle(index2label, pickle_path, "id2class.pkl") | |||
self.build_reverse_dict() | |||
train_set = [] | |||
dev_set = [] | |||
@@ -125,13 +115,13 @@ class BasePreprocess(object): | |||
split = int(len(train_dev_data) * train_dev_split) | |||
data_dev = train_dev_data[: split] | |||
data_train = train_dev_data[split:] | |||
train_set = self.convert_to_dataset(data_train, self.word2index, self.label2index) | |||
dev_set = self.convert_to_dataset(data_dev, self.word2index, self.label2index) | |||
train_set = self.convert_to_dataset(data_train, self.data_vocab, self.label_vocab) | |||
dev_set = self.convert_to_dataset(data_dev, self.data_vocab, self.label_vocab) | |||
save_pickle(dev_set, pickle_path, "data_dev.pkl") | |||
print("{} of the training data is split for validation. ".format(train_dev_split)) | |||
else: | |||
train_set = self.convert_to_dataset(train_dev_data, self.word2index, self.label2index) | |||
train_set = self.convert_to_dataset(train_dev_data, self.data_vocab, self.label_vocab) | |||
save_pickle(train_set, pickle_path, "data_train.pkl") | |||
else: | |||
train_set = load_pickle(pickle_path, "data_train.pkl") | |||
@@ -143,8 +133,8 @@ class BasePreprocess(object): | |||
# cross validation | |||
data_cv = self.cv_split(train_dev_data, n_fold) | |||
for i, (data_train_cv, data_dev_cv) in enumerate(data_cv): | |||
data_train_cv = self.convert_to_dataset(data_train_cv, self.word2index, self.label2index) | |||
data_dev_cv = self.convert_to_dataset(data_dev_cv, self.word2index, self.label2index) | |||
data_train_cv = self.convert_to_dataset(data_train_cv, self.data_vocab, self.label_vocab) | |||
data_dev_cv = self.convert_to_dataset(data_dev_cv, self.data_vocab, self.label_vocab) | |||
save_pickle( | |||
data_train_cv, pickle_path, | |||
"data_train_{}.pkl".format(i)) | |||
@@ -165,7 +155,7 @@ class BasePreprocess(object): | |||
test_set = [] | |||
if test_data is not None: | |||
if not pickle_exist(pickle_path, "data_test.pkl"): | |||
test_set = self.convert_to_dataset(test_data, self.word2index, self.label2index) | |||
test_set = self.convert_to_dataset(test_data, self.data_vocab, self.label_vocab) | |||
save_pickle(test_set, pickle_path, "data_test.pkl") | |||
# return preprocessed results | |||
@@ -180,28 +170,15 @@ class BasePreprocess(object): | |||
return tuple(results) | |||
def build_dict(self, data): | |||
label2index = DEFAULT_WORD_TO_INDEX.copy() | |||
word2index = DEFAULT_WORD_TO_INDEX.copy() | |||
for example in data: | |||
for word in example[0]: | |||
if word not in word2index: | |||
word2index[word] = len(word2index) | |||
label = example[1] | |||
if isinstance(label, str): | |||
# label is a string | |||
if label not in label2index: | |||
label2index[label] = len(label2index) | |||
elif isinstance(label, list): | |||
# label is a list of strings | |||
for single_label in label: | |||
if single_label not in label2index: | |||
label2index[single_label] = len(label2index) | |||
return word2index, label2index | |||
def build_reverse_dict(self, word_dict): | |||
id2word = {word_dict[w]: w for w in word_dict} | |||
return id2word | |||
word, label = example | |||
self.data_vocab.update(word) | |||
self.label_vocab.update(label) | |||
return self.data_vocab, self.label_vocab | |||
def build_reverse_dict(self): | |||
self.data_vocab.build_reverse_vocab() | |||
self.label_vocab.build_reverse_vocab() | |||
def data_split(self, data, train_dev_split): | |||
"""Split data into train and dev set.""" | |||
@@ -289,20 +266,20 @@ class BasePreprocess(object): | |||
return data_set | |||
class SeqLabelPreprocess(BasePreprocess): | |||
class SeqLabelPreprocess(Preprocessor): | |||
def __init__(self): | |||
print("[FastNLP warning] SeqLabelPreprocess is about to deprecate. Please use Preprocess directly.") | |||
super(SeqLabelPreprocess, self).__init__() | |||
class ClassPreprocess(BasePreprocess): | |||
class ClassPreprocess(Preprocessor): | |||
def __init__(self): | |||
print("[FastNLP warning] ClassPreprocess is about to deprecate. Please use Preprocess directly.") | |||
super(ClassPreprocess, self).__init__() | |||
if __name__ == "__main__": | |||
p = BasePreprocess() | |||
p = Preprocessor() | |||
train_dev_data = [[["I", "am", "a", "good", "student", "."], "0"], | |||
[["You", "are", "pretty", "."], "1"] | |||
] | |||
@@ -1,5 +1,3 @@ | |||
from collections import Counter | |||
import numpy as np | |||
import torch | |||
@@ -17,6 +15,56 @@ def convert_to_torch_tensor(data_list, use_cuda): | |||
return data_list | |||
class BaseSampler(object): | |||
"""The base class of all samplers. | |||
Sub-classes must implement the __call__ method. | |||
__call__ takes a DataSet object and returns a list of int - the sampling indices. | |||
""" | |||
def __call__(self, *args, **kwargs): | |||
raise NotImplementedError | |||
class SequentialSampler(BaseSampler): | |||
"""Sample data in the original order. | |||
""" | |||
def __call__(self, data_set): | |||
return list(range(len(data_set))) | |||
class RandomSampler(BaseSampler): | |||
"""Sample data in random permutation order. | |||
""" | |||
def __call__(self, data_set): | |||
return list(np.random.permutation(len(data_set))) | |||
def simple_sort_bucketing(lengths): | |||
""" | |||
:param lengths: list of int, the lengths of all examples. | |||
:param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length | |||
threshold for each bucket (This is usually None.). | |||
:return data: 2-level list | |||
:: | |||
[ | |||
[index_11, index_12, ...], # bucket 1 | |||
[index_21, index_22, ...], # bucket 2 | |||
... | |||
] | |||
""" | |||
lengths_mapping = [(idx, length) for idx, length in enumerate(lengths)] | |||
sorted_lengths = sorted(lengths_mapping, key=lambda x: x[1]) | |||
# TODO: need to return buckets | |||
return [idx for idx, _ in sorted_lengths] | |||
def k_means_1d(x, k, max_iter=100): | |||
"""Perform k-means on 1-D data. | |||
@@ -46,18 +94,10 @@ def k_means_1d(x, k, max_iter=100): | |||
return np.array(centroids), assign | |||
def k_means_bucketing(all_inst, buckets): | |||
def k_means_bucketing(lengths, buckets): | |||
"""Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths. | |||
:param all_inst: 3-level list | |||
E.g. :: | |||
[ | |||
[[word_11, word_12, word_13], [label_11. label_12]], # sample 1 | |||
[[word_21, word_22, word_23], [label_21. label_22]], # sample 2 | |||
... | |||
] | |||
:param lengths: list of int, the length of all samples. | |||
:param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length | |||
threshold for each bucket (This is usually None.). | |||
:return data: 2-level list | |||
@@ -72,7 +112,6 @@ def k_means_bucketing(all_inst, buckets): | |||
""" | |||
bucket_data = [[] for _ in buckets] | |||
num_buckets = len(buckets) | |||
lengths = np.array([len(inst[0]) for inst in all_inst]) | |||
_, assignments = k_means_1d(lengths, num_buckets) | |||
for idx, bucket_id in enumerate(assignments): | |||
@@ -81,102 +120,33 @@ def k_means_bucketing(all_inst, buckets): | |||
return bucket_data | |||
class BaseSampler(object): | |||
"""The base class of all samplers. | |||
""" | |||
def __call__(self, *args, **kwargs): | |||
raise NotImplementedError | |||
class SequentialSampler(BaseSampler): | |||
"""Sample data in the original order. | |||
""" | |||
def __call__(self, data_set): | |||
return list(range(len(data_set))) | |||
class RandomSampler(BaseSampler): | |||
"""Sample data in random permutation order. | |||
""" | |||
def __call__(self, data_set): | |||
return list(np.random.permutation(len(data_set))) | |||
class Batchifier(object): | |||
"""Wrap random or sequential sampler to generate a mini-batch. | |||
""" | |||
def __init__(self, sampler, batch_size, drop_last=True): | |||
""" | |||
:param sampler: a Sampler object | |||
:param batch_size: int, the size of the mini-batch | |||
:param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch. | |||
""" | |||
super(Batchifier, self).__init__() | |||
self.sampler = sampler | |||
self.batch_size = batch_size | |||
self.drop_last = drop_last | |||
def __iter__(self): | |||
batch = [] | |||
for example in self.sampler: | |||
batch.append(example) | |||
if len(batch) == self.batch_size: | |||
yield batch | |||
batch = [] | |||
if 0 < len(batch) < self.batch_size and self.drop_last is False: | |||
yield batch | |||
class BucketBatchifier(Batchifier): | |||
class BucketSampler(BaseSampler): | |||
"""Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. | |||
In sampling, first random choose a bucket. Then sample data from it. | |||
The number of buckets is decided dynamically by the variance of sentence lengths. | |||
TODO: merge it into Batch | |||
""" | |||
def __init__(self, data_set, batch_size, num_buckets, drop_last=True, sampler=None): | |||
def __call__(self, data_set, batch_size, num_buckets): | |||
return self._process(data_set, batch_size, num_buckets) | |||
def _process(self, data_set, batch_size, num_buckets, use_kmeans=False): | |||
""" | |||
:param data_set: three-level list, shape [num_samples, 2] | |||
:param data_set: a DataSet object | |||
:param batch_size: int | |||
:param num_buckets: int, number of buckets for grouping these sequences. | |||
:param drop_last: bool, useless currently. | |||
:param sampler: Sampler, useless currently. | |||
:param use_kmeans: bool, whether to use k-means to create buckets. | |||
""" | |||
super(BucketBatchifier, self).__init__(sampler, batch_size, drop_last) | |||
buckets = ([None] * num_buckets) | |||
self.data = data_set | |||
self.batch_size = batch_size | |||
self.length_freq = dict(Counter([len(example) for example in data_set])) | |||
self.buckets = k_means_bucketing(data_set, buckets) | |||
def __iter__(self): | |||
"""Make a min-batch of data.""" | |||
for _ in range(len(self.data) // self.batch_size): | |||
bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))] | |||
np.random.shuffle(bucket_samples) | |||
yield [self.data[idx] for idx in bucket_samples[:batch_size]] | |||
if __name__ == "__main__": | |||
import random | |||
data = [[[y] * random.randint(0, 50), [y]] for y in range(500)] | |||
batch_size = 8 | |||
iterator = iter(BucketBatchifier(data, batch_size, num_buckets=5)) | |||
for d in iterator: | |||
print("\nbatch:") | |||
for dd in d: | |||
print(len(dd[0]), end=" ") | |||
if use_kmeans is True: | |||
buckets = k_means_bucketing(data_set, buckets) | |||
else: | |||
buckets = simple_sort_bucketing(data_set) | |||
index_list = [] | |||
for _ in range(len(data_set) // batch_size): | |||
chosen_bucket = buckets[np.random.randint(0, len(buckets))] | |||
np.random.shuffle(chosen_bucket) | |||
index_list += [idx for idx in chosen_bucket[:batch_size]] | |||
return index_list |
@@ -1,32 +1,32 @@ | |||
import numpy as np | |||
import torch | |||
from fastNLP.core.action import RandomSampler | |||
from fastNLP.core.batch import Batch | |||
from fastNLP.core.sampler import RandomSampler | |||
from fastNLP.saver.logger import create_logger | |||
logger = create_logger(__name__, "./train_test.log") | |||
class BaseTester(object): | |||
class Tester(object): | |||
"""An collection of model inference and evaluation of performance, used over validation/dev set and test set. """ | |||
def __init__(self, **kwargs): | |||
""" | |||
:param kwargs: a dict-like object that has __getitem__ method, can be accessed by "test_args["key_str"]" | |||
""" | |||
super(BaseTester, self).__init__() | |||
super(Tester, self).__init__() | |||
""" | |||
"default_args" provides default value for important settings. | |||
The initialization arguments "kwargs" with the same key (name) will override the default value. | |||
"kwargs" must have the same type as "default_args" on corresponding keys. | |||
Otherwise, error will raise. | |||
""" | |||
default_args = {"save_output": False, # collect outputs of validation set | |||
"save_loss": False, # collect losses in validation | |||
default_args = {"save_output": True, # collect outputs of validation set | |||
"save_loss": True, # collect losses in validation | |||
"save_best_dev": False, # save best model during validation | |||
"batch_size": 8, | |||
"use_cuda": True, | |||
"use_cuda": False, | |||
"pickle_path": "./save/", | |||
"model_name": "dev_best_model.pkl", | |||
"print_every_step": 1, | |||
@@ -55,7 +55,7 @@ class BaseTester(object): | |||
logger.error(msg) | |||
raise ValueError(msg) | |||
else: | |||
# BaseTester doesn't care about extra arguments | |||
# Tester doesn't care about extra arguments | |||
pass | |||
print(default_args) | |||
@@ -208,7 +208,7 @@ class BaseTester(object): | |||
return self.show_metrics() | |||
class SeqLabelTester(BaseTester): | |||
class SeqLabelTester(Tester): | |||
def __init__(self, **test_args): | |||
test_args.update({"task": "seq_label"}) | |||
print( | |||
@@ -216,9 +216,9 @@ class SeqLabelTester(BaseTester): | |||
super(SeqLabelTester, self).__init__(**test_args) | |||
class ClassificationTester(BaseTester): | |||
class ClassificationTester(Tester): | |||
def __init__(self, **test_args): | |||
test_args.update({"task": "seq_label"}) | |||
test_args.update({"task": "text_classify"}) | |||
print( | |||
"[FastNLP Warning] ClassificationTester will be deprecated. Please use Tester with argument 'task'='text_classify'.") | |||
super(ClassificationTester, self).__init__(**test_args) |
@@ -6,10 +6,10 @@ from datetime import timedelta | |||
import torch | |||
from tensorboardX import SummaryWriter | |||
from fastNLP.core.action import RandomSampler | |||
from fastNLP.core.batch import Batch | |||
from fastNLP.core.loss import Loss | |||
from fastNLP.core.optimizer import Optimizer | |||
from fastNLP.core.sampler import RandomSampler | |||
from fastNLP.core.tester import SeqLabelTester, ClassificationTester | |||
from fastNLP.saver.logger import create_logger | |||
from fastNLP.saver.model_saver import ModelSaver | |||
@@ -17,7 +17,7 @@ from fastNLP.saver.model_saver import ModelSaver | |||
logger = create_logger(__name__, "./train_test.log") | |||
class BaseTrainer(object): | |||
class Trainer(object): | |||
"""Operations of training a model, including data loading, gradient descent, and validation. | |||
""" | |||
@@ -32,7 +32,7 @@ class BaseTrainer(object): | |||
- batch_size: int | |||
- pickle_path: str, the path to pickle files for pre-processing | |||
""" | |||
super(BaseTrainer, self).__init__() | |||
super(Trainer, self).__init__() | |||
""" | |||
"default_args" provides default value for important settings. | |||
@@ -40,8 +40,8 @@ class BaseTrainer(object): | |||
"kwargs" must have the same type as "default_args" on corresponding keys. | |||
Otherwise, error will raise. | |||
""" | |||
default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/", | |||
"save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1, | |||
default_args = {"epochs": 1, "batch_size": 2, "validate": False, "use_cuda": False, "pickle_path": "./save/", | |||
"save_best_dev": False, "model_name": "default_model_name.pkl", "print_every_step": 1, | |||
"loss": Loss(None), # used to pass type check | |||
"optimizer": Optimizer("Adam", lr=0.001, weight_decay=0) | |||
} | |||
@@ -69,7 +69,7 @@ class BaseTrainer(object): | |||
logger.error(msg) | |||
raise ValueError(msg) | |||
else: | |||
# BaseTrainer doesn't care about extra arguments | |||
# Trainer doesn't care about extra arguments | |||
pass | |||
print(default_args) | |||
@@ -136,6 +136,9 @@ class BaseTrainer(object): | |||
# validation | |||
if self.validate: | |||
if dev_data is None: | |||
raise RuntimeError( | |||
"self.validate is True in trainer, but dev_data is None. Please provide the validation data.") | |||
logger.info("validation started") | |||
validator.test(network, dev_data) | |||
@@ -314,7 +317,7 @@ class BaseTrainer(object): | |||
raise NotImplementedError | |||
class SeqLabelTrainer(BaseTrainer): | |||
class SeqLabelTrainer(Trainer): | |||
"""Trainer for Sequence Labeling | |||
""" | |||
@@ -328,7 +331,7 @@ class SeqLabelTrainer(BaseTrainer): | |||
return SeqLabelTester(**valid_args) | |||
class ClassificationTrainer(BaseTrainer): | |||
class ClassificationTrainer(Trainer): | |||
"""Trainer for text classification.""" | |||
def __init__(self, **train_args): | |||
@@ -0,0 +1,124 @@ | |||
from copy import deepcopy | |||
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0 | |||
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1 | |||
DEFAULT_RESERVED_LABEL = ['<reserved-2>', | |||
'<reserved-3>', | |||
'<reserved-4>'] # dict index = 2~4 | |||
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, | |||
DEFAULT_RESERVED_LABEL[2]: 4} | |||
def isiterable(p_object): | |||
try: | |||
it = iter(p_object) | |||
except TypeError: | |||
return False | |||
return True | |||
class Vocabulary(object): | |||
"""Use for word and index one to one mapping | |||
Example:: | |||
vocab = Vocabulary() | |||
word_list = "this is a word list".split() | |||
vocab.update(word_list) | |||
vocab["word"] | |||
vocab.to_word(5) | |||
""" | |||
def __init__(self, need_default=True): | |||
""" | |||
:param bool need_default: set if the Vocabulary has default labels reserved. | |||
""" | |||
if need_default: | |||
self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX) | |||
self.padding_label = DEFAULT_PADDING_LABEL | |||
self.unknown_label = DEFAULT_UNKNOWN_LABEL | |||
else: | |||
self.word2idx = {} | |||
self.padding_label = None | |||
self.unknown_label = None | |||
self.has_default = need_default | |||
self.idx2word = None | |||
def __len__(self): | |||
return len(self.word2idx) | |||
def update(self, word): | |||
"""add word or list of words into Vocabulary | |||
:param word: a list of str or str | |||
""" | |||
if not isinstance(word, str) and isiterable(word): | |||
# it's a nested list | |||
for w in word: | |||
self.update(w) | |||
else: | |||
# it's a word to be added | |||
if word not in self.word2idx: | |||
self.word2idx[word] = len(self) | |||
if self.idx2word is not None: | |||
self.idx2word = None | |||
def __getitem__(self, w): | |||
"""To support usage like:: | |||
vocab[w] | |||
""" | |||
if w in self.word2idx: | |||
return self.word2idx[w] | |||
else: | |||
return self.word2idx[DEFAULT_UNKNOWN_LABEL] | |||
def to_index(self, w): | |||
""" like to_index(w) function, turn a word to the index | |||
if w is not in Vocabulary, return the unknown label | |||
:param str w: | |||
""" | |||
return self[w] | |||
def unknown_idx(self): | |||
if self.unknown_label is None: | |||
return None | |||
return self.word2idx[self.unknown_label] | |||
def padding_idx(self): | |||
if self.padding_label is None: | |||
return None | |||
return self.word2idx[self.padding_label] | |||
def build_reverse_vocab(self): | |||
"""build 'index to word' dict based on 'word to index' dict | |||
""" | |||
self.idx2word = {self.word2idx[w] : w for w in self.word2idx} | |||
def to_word(self, idx): | |||
"""given a word's index, return the word itself | |||
:param int idx: | |||
""" | |||
if self.idx2word is None: | |||
self.build_reverse_vocab() | |||
return self.idx2word[idx] | |||
def __getstate__(self): | |||
"""use to prepare data for pickle | |||
""" | |||
state = self.__dict__.copy() | |||
# no need to pickle idx2word as it can be constructed from word2idx | |||
del state['idx2word'] | |||
return state | |||
def __setstate__(self, state): | |||
"""use to restore state from pickle | |||
""" | |||
self.__dict__.update(state) | |||
self.idx2word = None | |||
@@ -31,7 +31,7 @@ FastNLP_MODEL_COLLECTION = { | |||
"class": "sequence_modeling.AdvSeqLabel", | |||
"pickle": "cws_basic_model_v_0.pkl", | |||
"type": "seq_label", | |||
"config_file_name": "config", | |||
"config_file_name": "cws.cfg", | |||
"config_section_name": "text_class_model" | |||
}, | |||
"pos_tag_model": { | |||
@@ -39,7 +39,7 @@ FastNLP_MODEL_COLLECTION = { | |||
"class": "sequence_modeling.AdvSeqLabel", | |||
"pickle": "pos_tag_model_v_0.pkl", | |||
"type": "seq_label", | |||
"config_file_name": "pos_tag.config", | |||
"config_file_name": "pos_tag.cfg", | |||
"config_section_name": "pos_tag_model" | |||
}, | |||
"text_classify_model": { | |||
@@ -56,21 +56,22 @@ FastNLP_MODEL_COLLECTION = { | |||
class FastNLP(object): | |||
""" | |||
High-level interface for direct model inference. | |||
Example Usage: | |||
Example Usage | |||
:: | |||
fastnlp = FastNLP() | |||
fastnlp.load("zh_pos_tag_model") | |||
text = "这是最好的基于深度学习的中文分词系统。" | |||
result = fastnlp.run(text) | |||
print(result) # ["这", "是", "最好", "的", "基于", "深度学习", "的", "中文", "分词", "系统", "。"] | |||
""" | |||
def __init__(self, model_dir="./"): | |||
""" | |||
:param model_dir: this directory should contain the following files: | |||
1. a pre-trained model | |||
2. a config file | |||
3. "id2class.pkl" | |||
4. "word2id.pkl" | |||
1. a trained model | |||
2. a config file, which is a fastNLP's configuration. | |||
3. a Vocab file, which is a pickle object of a Vocab instance. | |||
""" | |||
self.model_dir = model_dir | |||
self.model = None | |||
@@ -99,10 +100,10 @@ class FastNLP(object): | |||
print("Restore model hyper-parameters {}".format(str(model_args.data))) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(self.model_dir, "word2id.pkl") | |||
model_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(self.model_dir, "id2class.pkl") | |||
model_args["num_classes"] = len(index2label) | |||
word_vocab = load_pickle(self.model_dir, "word2id.pkl") | |||
model_args["vocab_size"] = len(word_vocab) | |||
label_vocab = load_pickle(self.model_dir, "class2id.pkl") | |||
model_args["num_classes"] = len(label_vocab) | |||
# Construct the model | |||
model = model_class(model_args) | |||
@@ -172,9 +172,8 @@ class ClassDatasetLoader(DatasetLoader): | |||
class ConllLoader(DatasetLoader): | |||
"""loader for conll format files""" | |||
def __int__(self, data_name, data_path): | |||
def __int__(self, data_path): | |||
""" | |||
:param str data_name: the name of the conll data set | |||
:param str data_path: the path to the conll data set | |||
""" | |||
super(ConllLoader, self).__init__(data_path) | |||
@@ -269,8 +268,3 @@ class PeopleDailyCorpusLoader(DatasetLoader): | |||
ner_examples.append([sent_words, sent_ner]) | |||
return pos_tag_examples, ner_examples | |||
if __name__ == "__main__": | |||
loader = PeopleDailyCorpusLoader("./") | |||
pos, ner = loader.load() | |||
print(pos[:10]) | |||
print(ner[:10]) |
@@ -1,11 +1,11 @@ | |||
from . import aggregation | |||
from . import aggregator | |||
from . import decoder | |||
from . import encoder | |||
from . import interaction | |||
from . import interactor | |||
__version__ = '0.0.0' | |||
__all__ = ['encoder', | |||
'decoder', | |||
'aggregation', | |||
'interaction'] | |||
'aggregator', | |||
'interactor'] |
@@ -1,8 +1,7 @@ | |||
import torch | |||
import torch.nn as nn | |||
from torch.autograd import Variable | |||
import torch.nn.functional as F | |||
from torch.autograd import Variable | |||
from fastNLP.modules.utils import initial_parameter | |||
@@ -1,19 +1,10 @@ | |||
""" | |||
This is borrowed from FudanParser. Not stable. Do not use !!! | |||
""" | |||
import numpy | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch.utils.data | |||
from torch import optim | |||
from torch.autograd import Function, Variable | |||
from torch.nn import Parameter | |||
from .utils import orthogonal | |||
class GroupNorm(nn.Module): | |||
def __init__(self, num_features, num_groups=20, eps=1e-5): | |||
@@ -59,15 +50,6 @@ class LayerNormalization(nn.Module): | |||
return ln_out | |||
class OrthEmbedding(nn.Embedding): | |||
def __init__(self, *args, **kwargs): | |||
super(OrthEmbedding, self).__init__(*args, **kwargs) | |||
def reset_parameters(self): | |||
self.weight = orthogonal(self.weight) | |||
nn.init.constant_(self.bias, 0.) | |||
class BiLinear(nn.Module): | |||
def __init__(self, n_left, n_right, n_out, bias=True): | |||
""" | |||
@@ -241,250 +223,3 @@ class WordDropout(nn.Module): | |||
drop_mask = drop_mask.long() | |||
output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx | |||
return output | |||
class WlossLayer(torch.nn.Module): | |||
def __init__(self, lam=100, sinkhorn_iter=50): | |||
super(WlossLayer, self).__init__() | |||
# cost = matrix M = distance matrix | |||
# lam = lambda of type float > 0 | |||
# sinkhorn_iter > 0 | |||
# diagonal cost should be 0 | |||
self.lam = lam | |||
self.sinkhorn_iter = sinkhorn_iter | |||
# self.register_buffer("K", torch.exp(-self.cost / self.lam).double()) | |||
# self.register_buffer("KM", (self.cost * self.K).double()) | |||
def forward(self, pred, target, cost): | |||
return WassersteinLossStab.apply(pred, target, | |||
cost, self.lam, self.sinkhorn_iter) | |||
class WassersteinLossStab(Function): | |||
@staticmethod | |||
def forward(ctx, pred, target, cost, lam=1e-3, sinkhorn_iter=4): | |||
"""pred: Batch * K: K = # mass points | |||
target: Batch * L: L = # mass points""" | |||
# import pdb | |||
# pdb.set_trace() | |||
eps = 1e-8 | |||
# pred = pred.gather(dim=1, index=) | |||
na = pred.size(1) | |||
nb = target.size(1) | |||
cost = cost.double() | |||
pred = pred.double() | |||
target = target.double() | |||
cost = cost[:na, :nb].double() | |||
K = torch.exp(-cost / lam).double() | |||
KM = (cost * K).double() | |||
batch_size = pred.size(0) | |||
# pdb.set_trace() | |||
log_a, log_b = torch.log(pred + eps), torch.log(target + eps) | |||
log_u = cost.new(batch_size, na).fill_(-numpy.log(na)) | |||
log_v = cost.new(batch_size, nb).fill_(-numpy.log(nb)) | |||
# import pdb | |||
# pdb.set_trace() | |||
for i in range(int(sinkhorn_iter)): | |||
log_u_max = torch.max(log_u, dim=1)[0] | |||
u_stab = torch.exp(log_u - log_u_max.unsqueeze(1) + eps) | |||
log_v = log_b - torch.log(torch.mm(K.t(), u_stab.t()).t()) - log_u_max.unsqueeze(1) | |||
log_v_max = torch.max(log_v, dim=1)[0] | |||
v_stab = torch.exp(log_v - log_v_max.unsqueeze(1)) | |||
tmp = log_u | |||
log_u = log_a - torch.log(torch.mm(K, v_stab.t()).t() + eps) - log_v_max.unsqueeze(1) | |||
# print(log_u.sum()) | |||
if torch.norm(tmp - log_u) / torch.norm(log_u) < eps: | |||
break | |||
log_v_max = torch.max(log_v, dim=1)[0] | |||
v_stab = torch.exp(log_v - log_v_max.unsqueeze(1)) | |||
logcostpart1 = torch.log(torch.mm(KM, v_stab.t()).t() + eps) + log_v_max.unsqueeze(1) | |||
wnorm = torch.exp(log_u + logcostpart1).mean(0).sum() # sum(1) for per item pair loss... | |||
grad_input = log_u * lam | |||
# print("log_u", log_u) | |||
grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1) | |||
grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1) | |||
grad_input = grad_input / batch_size | |||
ctx.save_for_backward(grad_input) | |||
# print("grad type", type(grad_input)) | |||
return pred.new((wnorm,)), grad_input | |||
@staticmethod | |||
def backward(ctx, grad_output, _): | |||
grad_input = ctx.saved_variables | |||
# print(grad) | |||
res = grad_output.clone() | |||
res.data.resize_(grad_input[0].size()).copy_(grad_input[0].data) | |||
res = res.mul_(grad_output[0]).float() | |||
# print("in backward func:\n\n", res) | |||
return res, None, None, None, None, None, None | |||
class Sinkhorn(Function): | |||
def __init__(self): | |||
super(Sinkhorn, self).__init__() | |||
def forward(ctx, a, b, M, reg, tau, warmstart, numItermax, stop): | |||
a = a.double() | |||
b = b.double() | |||
M = M.double() | |||
nbb = b.size(1) | |||
# init data | |||
na = len(a) | |||
nb = len(b) | |||
cpt = 0 | |||
# we assume that no distances are null except those of the diagonal of | |||
# distances | |||
if warmstart is None: | |||
alpha, beta = np.zeros(na), np.zeros(nb) | |||
else: | |||
alpha, beta = warmstart | |||
if nbb: | |||
u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb | |||
else: | |||
u, v = np.ones(na) / na, np.ones(nb) / nb | |||
def get_K(alpha, beta): | |||
"""log space computation""" | |||
return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg) | |||
def get_Gamma(alpha, beta, u, v): | |||
"""log space gamma computation""" | |||
return np.exp( | |||
-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log( | |||
v.reshape((1, nb)))) | |||
# print(np.min(K)) | |||
K = get_K(alpha, beta) | |||
transp = K | |||
cpt = 0 | |||
err = 1 | |||
while 1: | |||
uprev = u | |||
vprev = v | |||
# sinkhorn update | |||
v = b / (np.dot(K.T, u) + 1e-16) | |||
u = a / (np.dot(K, v) + 1e-16) | |||
# remove numerical problems and store them in K | |||
if np.abs(u).max() > tau or np.abs(v).max() > tau: | |||
if nbb: | |||
alpha, beta = alpha + reg * \ | |||
np.max(np.log(u), 1), beta + reg * np.max(np.log(v)) | |||
else: | |||
alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v) | |||
if nbb: | |||
u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb | |||
else: | |||
u, v = np.ones(na) / na, np.ones(nb) / nb | |||
K = get_K(alpha, beta) | |||
if cpt % print_period == 0: | |||
# we can speed up the process by checking for the error only all | |||
# the 10th iterations | |||
if nbb: | |||
err = np.sum((u - uprev) ** 2) / np.sum((u) ** 2) + \ | |||
np.sum((v - vprev) ** 2) / np.sum((v) ** 2) | |||
else: | |||
transp = get_Gamma(alpha, beta, u, v) | |||
err = np.linalg.norm((np.sum(transp, axis=0) - b)) ** 2 | |||
if log: | |||
log['err'].append(err) | |||
if verbose: | |||
if cpt % (print_period * 20) == 0: | |||
print( | |||
'{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) | |||
print('{:5d}|{:8e}|'.format(cpt, err)) | |||
if err <= stopThr: | |||
loop = False | |||
if cpt >= numItermax: | |||
loop = False | |||
if np.any(np.isnan(u)) or np.any(np.isnan(v)): | |||
# we have reached the machine precision | |||
# come back to previous solution and quit loop | |||
print('Warning: numerical errors at iteration', cpt) | |||
u = uprev | |||
v = vprev | |||
break | |||
cpt = cpt + 1 | |||
# print('err=',err,' cpt=',cpt) | |||
if log: | |||
log['logu'] = alpha / reg + np.log(u) | |||
log['logv'] = beta / reg + np.log(v) | |||
log['alpha'] = alpha + reg * np.log(u) | |||
log['beta'] = beta + reg * np.log(v) | |||
log['warmstart'] = (log['alpha'], log['beta']) | |||
if nbb: | |||
res = np.zeros((nbb)) | |||
for i in range(nbb): | |||
res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) | |||
return res, log | |||
else: | |||
return get_Gamma(alpha, beta, u, v), log | |||
else: | |||
if nbb: | |||
res = np.zeros((nbb)) | |||
for i in range(nbb): | |||
res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) | |||
return res | |||
else: | |||
return get_Gamma(alpha, beta, u, v) | |||
if __name__ == "__main__": | |||
cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))) # .cuda() | |||
mylayer = WlossLayer(cost) # .cuda() | |||
inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True) # .cuda() | |||
ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])) # .cuda() | |||
res, _ = mylayer(inp, ground_true) | |||
# print(inp.requires_grad, res.requires_grad) | |||
# print(res, inp) | |||
mylayer.zero_grad() | |||
res.backward() | |||
print("inp's gradient is good:") | |||
print(inp.grad) | |||
print("convert to gpu:\n", inp.cuda().grad) | |||
print("==============================================" | |||
"\n However, this does not work on pytorch when GPU is enabled") | |||
cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))).cuda() | |||
mylayer = WlossLayer(cost).cuda() | |||
inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True).cuda() | |||
ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])).cuda() | |||
opt = optim.SGD([ | |||
{'params': mylayer.parameters()}, | |||
], lr=1e-2, momentum=0.9) | |||
res, _ = mylayer(inp, ground_true) | |||
# print(inp.requires_grad, res.requires_grad) | |||
# print(res, inp) | |||
mylayer.zero_grad() | |||
res.backward() | |||
print("input's gradient is None!!!!!!!!!!!!!!!!") | |||
print(inp.grad) |
@@ -1,9 +1,8 @@ | |||
from collections import defaultdict | |||
import numpy as np | |||
import torch | |||
import torch.nn.init as init | |||
import torch.nn as nn | |||
import torch.nn.init as init | |||
def mask_softmax(matrix, mask): | |||
if mask is None: | |||
result = torch.nn.functional.softmax(matrix, dim=-1) | |||
@@ -11,13 +10,28 @@ def mask_softmax(matrix, mask): | |||
raise NotImplementedError | |||
return result | |||
def initial_parameter(net ,initial_method =None): | |||
def initial_parameter(net, initial_method=None): | |||
"""A method used to initialize the weights of PyTorch models. | |||
:param net: a PyTorch model | |||
:param initial_method: str, one of the following initializations | |||
- xavier_uniform | |||
- xavier_normal (default) | |||
- kaiming_normal, or msra | |||
- kaiming_uniform | |||
- orthogonal | |||
- sparse | |||
- normal | |||
- uniform | |||
""" | |||
if initial_method == 'xavier_uniform': | |||
init_method = init.xavier_uniform_ | |||
elif initial_method=='xavier_normal': | |||
elif initial_method == 'xavier_normal': | |||
init_method = init.xavier_normal_ | |||
elif initial_method == 'kaiming_normal' or initial_method =='msra': | |||
elif initial_method == 'kaiming_normal' or initial_method == 'msra': | |||
init_method = init.kaiming_normal | |||
elif initial_method == 'kaiming_uniform': | |||
init_method = init.kaiming_normal | |||
@@ -25,263 +39,49 @@ def initial_parameter(net ,initial_method =None): | |||
init_method = init.orthogonal_ | |||
elif initial_method == 'sparse': | |||
init_method = init.sparse_ | |||
elif initial_method =='normal': | |||
elif initial_method == 'normal': | |||
init_method = init.normal_ | |||
elif initial_method =='uniform': | |||
elif initial_method == 'uniform': | |||
initial_method = init.uniform_ | |||
else: | |||
init_method = init.xavier_normal_ | |||
def weights_init(m): | |||
# classname = m.__class__.__name__ | |||
if isinstance(m, nn.Conv2d) or isinstance(m,nn.Conv1d) or isinstance(m,nn.Conv3d): # for all the cnn | |||
if initial_method != None: | |||
if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv3d): # for all the cnn | |||
if initial_method is not None: | |||
init_method(m.weight.data) | |||
else: | |||
init.xavier_normal_(m.weight.data) | |||
init.normal_(m.bias.data) | |||
elif isinstance(m, nn.LSTM): | |||
for w in m.parameters(): | |||
if len(w.data.size())>1: | |||
if len(w.data.size()) > 1: | |||
init_method(w.data) # weight | |||
else: | |||
init.normal_(w.data) # bias | |||
elif hasattr(m, 'weight') and m.weight.requires_grad: | |||
init_method(m.weight.data) | |||
else: | |||
for w in m.parameters() : | |||
if w.requires_grad: | |||
if len(w.data.size())>1: | |||
for w in m.parameters(): | |||
if w.requires_grad: | |||
if len(w.data.size()) > 1: | |||
init_method(w.data) # weight | |||
else: | |||
init.normal_(w.data) # bias | |||
# print("init else") | |||
net.apply(weights_init) | |||
def seq_mask(seq_len, max_len): | |||
mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)] | |||
mask = torch.stack(mask, 1) | |||
return mask | |||
""" | |||
Codes from FudanParser. Not tested. Do not use !!! | |||
""" | |||
def expand_gt(gt): | |||
"""expand_gt: Expand ground truth to matrix | |||
Arguments: | |||
gt: tensor of (n, l) | |||
Return: | |||
f: ground truth matrix of (n, l), $gt[i][j] = k$ leads to $f[i][j][k] = 1$. | |||
""" | |||
n, l = gt.shape | |||
ret = torch.zeros(n, l, l).long() | |||
for i in range(n): | |||
ret[i][torch.arange(l).long(), gt[i]] = 1 | |||
return ret | |||
def greedy_decoding(arc_f): | |||
"""greedy_decoding | |||
Arguments: | |||
arc_f: a tensor in shape of (n, l+1, l+1) | |||
length of the sentence is l and index 0 is <root> | |||
Output: | |||
arc_pred: a tensor in shape of (n, l), indicating the head words | |||
""" | |||
f_arc = arc_f[:, 1:, :] # ignore the root | |||
_, arc_pred = torch.max(f_arc.data, dim=-1, keepdim=False) | |||
return arc_pred | |||
def mst_decoding(arc_f): | |||
batch_size = arc_f.shape[0] | |||
length = arc_f.shape[1] | |||
arc_score = arc_f.data.cpu() | |||
pred_collection = [] | |||
for i in range(batch_size): | |||
head = mst(arc_score[i].numpy()) | |||
pred_collection.append(head[1:].reshape((1, length - 1))) | |||
arc_pred = torch.LongTensor(np.concatenate(pred_collection, axis=0)).type_as(arc_f).long() | |||
return arc_pred | |||
def outer_product(features): | |||
"""InterProduct: Get inter sequence product of features | |||
Arguments: | |||
features: feature vectors of sequence in the shape of (n, l, h) | |||
Return: | |||
f: product result in (n, l, l, h) shape | |||
""" | |||
n, l, c = features.shape | |||
features = features.contiguous() | |||
x = features.view(n, l, 1, c) | |||
x = x.expand(n, l, l, c) | |||
y = features.view(n, 1, l, c).contiguous() | |||
y = y.expand(n, l, l, c) | |||
return x * y | |||
def outer_concat(features): | |||
"""InterProduct: Get inter sequence concatenation of features | |||
Arguments: | |||
features: feature vectors of sequence in the shape of (n, l, h) | |||
Return: | |||
f: product result in (n, l, l, h) shape | |||
""" | |||
n, l, c = features.shape | |||
x = features.contiguous().view(n, l, 1, c) | |||
x = x.expand(n, l, l, c) | |||
y = features.view(n, 1, l, c) | |||
y = y.expand(n, l, l, c) | |||
return torch.cat((x, y), dim=3) | |||
def mst(scores): | |||
""" | |||
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692 # NOQA | |||
""" | |||
length = scores.shape[0] | |||
min_score = scores.min() - 1 | |||
eye = np.eye(length) | |||
scores = scores * (1 - eye) + min_score * eye | |||
heads = np.argmax(scores, axis=1) | |||
heads[0] = 0 | |||
tokens = np.arange(1, length) | |||
roots = np.where(heads[tokens] == 0)[0] + 1 | |||
if len(roots) < 1: | |||
root_scores = scores[tokens, 0] | |||
head_scores = scores[tokens, heads[tokens]] | |||
new_root = tokens[np.argmax(root_scores / head_scores)] | |||
heads[new_root] = 0 | |||
elif len(roots) > 1: | |||
root_scores = scores[roots, 0] | |||
scores[roots, 0] = 0 | |||
new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1 | |||
new_root = roots[np.argmin( | |||
scores[roots, new_heads] / root_scores)] | |||
heads[roots] = new_heads | |||
heads[new_root] = 0 | |||
edges = defaultdict(set) | |||
vertices = set((0,)) | |||
for dep, head in enumerate(heads[tokens]): | |||
vertices.add(dep + 1) | |||
edges[head].add(dep + 1) | |||
for cycle in _find_cycle(vertices, edges): | |||
dependents = set() | |||
to_visit = set(cycle) | |||
while len(to_visit) > 0: | |||
node = to_visit.pop() | |||
if node not in dependents: | |||
dependents.add(node) | |||
to_visit.update(edges[node]) | |||
cycle = np.array(list(cycle)) | |||
old_heads = heads[cycle] | |||
old_scores = scores[cycle, old_heads] | |||
non_heads = np.array(list(dependents)) | |||
scores[np.repeat(cycle, len(non_heads)), | |||
np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score | |||
new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1 | |||
new_scores = scores[cycle, new_heads] / old_scores | |||
change = np.argmax(new_scores) | |||
changed_cycle = cycle[change] | |||
old_head = old_heads[change] | |||
new_head = new_heads[change] | |||
heads[changed_cycle] = new_head | |||
edges[new_head].add(changed_cycle) | |||
edges[old_head].remove(changed_cycle) | |||
return heads | |||
def _find_cycle(vertices, edges): | |||
""" | |||
https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA | |||
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA | |||
""" | |||
_index = 0 | |||
_stack = [] | |||
_indices = {} | |||
_lowlinks = {} | |||
_onstack = defaultdict(lambda: False) | |||
_SCCs = [] | |||
def _strongconnect(v): | |||
nonlocal _index | |||
_indices[v] = _index | |||
_lowlinks[v] = _index | |||
_index += 1 | |||
_stack.append(v) | |||
_onstack[v] = True | |||
for w in edges[v]: | |||
if w not in _indices: | |||
_strongconnect(w) | |||
_lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) | |||
elif _onstack[w]: | |||
_lowlinks[v] = min(_lowlinks[v], _indices[w]) | |||
if _lowlinks[v] == _indices[v]: | |||
SCC = set() | |||
while True: | |||
w = _stack.pop() | |||
_onstack[w] = False | |||
SCC.add(w) | |||
if not (w != v): | |||
break | |||
_SCCs.append(SCC) | |||
net.apply(weights_init) | |||
for v in vertices: | |||
if v not in _indices: | |||
_strongconnect(v) | |||
return [SCC for SCC in _SCCs if len(SCC) > 1] | |||
def seq_mask(seq_len, max_len): | |||
"""Create sequence mask. | |||
:param seq_len: list of int, the lengths of sequences in a batch. | |||
:param max_len: int, the maximum sequence length in a batch. | |||
:return mask: torch.LongTensor, [batch_size, max_len] | |||
# https://github.com/alykhantejani/nninit/blob/master/nninit.py | |||
def orthogonal(tensor, gain=1): | |||
"""Fills the input Tensor or Variable with a (semi) orthogonal matrix. The input tensor must have at least 2 dimensions, | |||
and for tensors with more than 2 dimensions the trailing dimensions are flattened. viewed as 2D representation with | |||
rows equal to the first dimension and columns equal to the product of as a sparse matrix, where the non-zero elements | |||
will be drawn from a normal distribution with mean=0 and std=`std`. | |||
Reference: "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks" - Saxe, A. et al. | |||
Args: | |||
tensor: a n-dimension torch.Tensor, where n >= 2 | |||
gain: optional gain to be applied | |||
Examples: | |||
>>> w = torch.Tensor(3, 5) | |||
>>> nninit.orthogonal(w) | |||
""" | |||
if tensor.ndimension() < 2: | |||
raise ValueError("Only tensors with 2 or more dimensions are supported.") | |||
flattened_shape = (tensor.size(0), int(np.prod(tensor.detach().numpy().shape[1:]))) | |||
flattened = torch.Tensor(flattened_shape[0], flattened_shape[1]).normal_(0, 1) | |||
u, s, v = np.linalg.svd(flattened.numpy(), full_matrices=False) | |||
if u.shape == flattened.detach().numpy().shape: | |||
tensor.view_as(flattened).copy_(torch.from_numpy(u)) | |||
else: | |||
tensor.view_as(flattened).copy_(torch.from_numpy(v)) | |||
tensor.mul_(gain) | |||
with torch.no_grad(): | |||
return tensor | |||
def generate_step_dropout(masks, hidden_dim, step_dropout, training=False): | |||
# assume batch first | |||
# import pdb | |||
# pdb.set_trace() | |||
batch, length = masks.size() | |||
if not training: | |||
return torch.ones(batch, length, hidden_dim).fill_(1 - step_dropout).cuda(masks.device) * masks.view(batch, | |||
length, 1) | |||
masked = torch.zeros(batch, 1, hidden_dim).fill_(step_dropout) | |||
masked = torch.bernoulli(masked).repeat(1, length, 1) | |||
masked = masked.cuda(masks.device) * masks.view(batch, length, 1) | |||
return masked | |||
mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)] | |||
mask = torch.stack(mask, 1) | |||
return mask |
@@ -2,16 +2,23 @@ import torch | |||
class ModelSaver(object): | |||
"""Save a models""" | |||
"""Save a model | |||
Example:: | |||
saver = ModelSaver("./save/model_ckpt_100.pkl") | |||
saver.save_pytorch(model) | |||
""" | |||
def __init__(self, save_path): | |||
""" | |||
:param save_path: str, the path to the saving directory. | |||
""" | |||
self.save_path = save_path | |||
# TODO: check whether the path exist, if not exist, create it. | |||
def save_pytorch(self, model): | |||
""" | |||
Save a pytorch model into .pkl file. | |||
"""Save a pytorch model into .pkl file. | |||
:param model: a PyTorch model | |||
:return: | |||
""" | |||
torch.save(model.state_dict(), self.save_path) |
@@ -1,23 +1,15 @@ | |||
import os | |||
import torch.nn.functional as F | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader as Dataset_loader | |||
from fastNLP.loader.embed_loader import EmbedLoader as EmbedLoader | |||
from fastNLP.loader.config_loader import ConfigSection | |||
from fastNLP.core.preprocess import ClassPreprocess as Preprocess | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.loader.config_loader import ConfigLoader | |||
from fastNLP.loader.config_loader import ConfigSection | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader as Dataset_loader | |||
from fastNLP.models.base_model import BaseModel | |||
from fastNLP.core.preprocess import ClassPreprocess as Preprocess | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.modules.aggregator.self_attention import SelfAttention | |||
from fastNLP.modules.decoder.MLP import MLP | |||
from fastNLP.modules.encoder.embedding import Embedding as Embedding | |||
from fastNLP.modules.encoder.lstm import Lstm | |||
from fastNLP.modules.aggregation.self_attention import SelfAttention | |||
from fastNLP.modules.decoder.MLP import MLP | |||
train_data_path = 'small_train_data.txt' | |||
dev_data_path = 'small_dev_data.txt' | |||
@@ -32,7 +32,7 @@ def infer(): | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
index2label = load_pickle(pickle_path, "class2id.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
@@ -105,7 +105,7 @@ def test(): | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
index2label = load_pickle(pickle_path, "class2id.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# load dev data | |||
@@ -33,7 +33,7 @@ def infer(): | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
index2label = load_pickle(pickle_path, "class2id.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# Define the same model | |||
@@ -105,7 +105,7 @@ def test(): | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
index2label = load_pickle(pickle_path, "class2id.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# load dev data | |||
@@ -4,6 +4,7 @@ import unittest | |||
from fastNLP.core.predictor import Predictor | |||
from fastNLP.core.preprocess import save_pickle | |||
from fastNLP.models.sequence_modeling import SeqLabeling | |||
from fastNLP.core.vocabulary import Vocabulary | |||
class TestPredictor(unittest.TestCase): | |||
@@ -23,10 +24,14 @@ class TestPredictor(unittest.TestCase): | |||
['a', 'b', 'c', 'd', '$'], | |||
['!', 'b', 'c', 'd', 'e'] | |||
] | |||
vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} | |||
vocab = Vocabulary() | |||
vocab.word2idx = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} | |||
class_vocab = Vocabulary() | |||
class_vocab.word2idx = {"0":0, "1":1, "2":2, "3":3, "4":4} | |||
os.system("mkdir save") | |||
save_pickle({0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}, "./save/", "id2class.pkl") | |||
save_pickle(class_vocab, "./save/", "class2id.pkl") | |||
save_pickle(vocab, "./save/", "word2id.pkl") | |||
model = SeqLabeling(model_args) | |||
@@ -0,0 +1,30 @@ | |||
import torch | |||
from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler | |||
def test_convert_to_torch_tensor(): | |||
data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 3, 4, 5, 2]] | |||
ans = convert_to_torch_tensor(data, False) | |||
assert isinstance(ans, torch.Tensor) | |||
assert tuple(ans.shape) == (3, 5) | |||
def test_sequential_sampler(): | |||
sampler = SequentialSampler() | |||
data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10] | |||
for idx, i in enumerate(sampler(data)): | |||
assert idx == i | |||
def test_random_sampler(): | |||
sampler = RandomSampler() | |||
data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10] | |||
ans = [data[i] for i in sampler(data)] | |||
assert len(ans) == len(data) | |||
for d in ans: | |||
assert d in data | |||
if __name__ == "__main__": | |||
test_sequential_sampler() |
@@ -0,0 +1,31 @@ | |||
import unittest | |||
from fastNLP.core.vocabulary import Vocabulary, DEFAULT_WORD_TO_INDEX | |||
class TestVocabulary(unittest.TestCase): | |||
def test_vocab(self): | |||
import _pickle as pickle | |||
import os | |||
vocab = Vocabulary() | |||
filename = 'vocab' | |||
vocab.update(filename) | |||
vocab.update([filename, ['a'], [['b']], ['c']]) | |||
idx = vocab[filename] | |||
before_pic = (vocab.to_word(idx), vocab[filename]) | |||
with open(filename, 'wb') as f: | |||
pickle.dump(vocab, f) | |||
with open(filename, 'rb') as f: | |||
vocab = pickle.load(f) | |||
os.remove(filename) | |||
vocab.build_reverse_vocab() | |||
after_pic = (vocab.to_word(idx), vocab[filename]) | |||
TRUE_DICT = {'vocab': 5, 'a': 6, 'b': 7, 'c': 8} | |||
TRUE_DICT.update(DEFAULT_WORD_TO_INDEX) | |||
TRUE_IDXDICT = {0: '<pad>', 1: '<unk>', 2: '<reserved-2>', 3: '<reserved-3>', 4: '<reserved-4>', 5: 'vocab', 6: 'a', 7: 'b', 8: 'c'} | |||
self.assertEqual(before_pic, after_pic) | |||
self.assertDictEqual(TRUE_DICT, vocab.word2idx) | |||
self.assertDictEqual(TRUE_IDXDICT, vocab.idx2word) | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -0,0 +1,15 @@ | |||
1 I _ PRP PRP _ 2 SUB | |||
2 solved _ VBD VBD _ 0 ROOT | |||
3 the _ DT DT _ 4 NMOD | |||
4 problem _ NN NN _ 2 OBJ | |||
5 with _ IN IN _ 2 VMOD | |||
6 statistics _ NNS NNS _ 5 PMOD | |||
7 . _ . . _ 2 P | |||
1 I _ PRP PRP _ 2 SUB | |||
2 solved _ VBD VBD _ 0 ROOT | |||
3 the _ DT DT _ 4 NMOD | |||
4 problem _ NN NN _ 2 OBJ | |||
5 with _ IN IN _ 2 VMOD | |||
6 statistics _ NNS NNS _ 5 PMOD | |||
7 . _ . . _ 2 P |
@@ -0,0 +1,27 @@ | |||
19980101-01-001-001/m 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n ——/w 一九九八年/t 新年/t 讲话/n (/w 附/v 图片/n 1/m 张/q )/w | |||
19980101-01-001-002/m 中共中央/nt 总书记/n 、/w 国家/n 主席/n 江/nr 泽民/nr | |||
19980101-01-001-003/m (/w 一九九七年/t 十二月/t 三十一日/t )/w | |||
19980101-01-001-004/m 12月/t 31日/t ,/w 中共中央/nt 总书记/n 、/w 国家/n 主席/n 江/nr 泽民/nr 发表/v 1998年/t 新年/t 讲话/n 《/w 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n 》/w 。/w (/w 新华社/nt 记者/n 兰/nr 红光/nr 摄/Vg )/w | |||
19980101-01-001-005/m 同胞/n 们/k 、/w 朋友/n 们/k 、/w 女士/n 们/k 、/w 先生/n 们/k :/w | |||
19980101-01-001-006/m 在/p 1998年/t 来临/v 之际/f ,/w 我/r 十分/m 高兴/a 地/u 通过/p [中央/n 人民/n 广播/vn 电台/n]nt 、/w [中国/ns 国际/n 广播/vn 电台/n]nt 和/c [中央/n 电视台/n]nt ,/w 向/p 全国/n 各族/r 人民/n ,/w 向/p [香港/ns 特别/a 行政区/n]ns 同胞/n 、/w 澳门/ns 和/c 台湾/ns 同胞/n 、/w 海外/s 侨胞/n ,/w 向/p 世界/n 各国/r 的/u 朋友/n 们/k ,/w 致以/v 诚挚/a 的/u 问候/vn 和/c 良好/a 的/u 祝愿/vn !/w | |||
19980101-01-001-007/m 1997年/t ,/w 是/v 中国/ns 发展/vn 历史/n 上/f 非常/d 重要/a 的/u 很/d 不/d 平凡/a 的/u 一/m 年/q 。/w 中国/ns 人民/n 决心/d 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n ,/w 继续/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 事业/n 推向/v 前进/v 。/w [中国/ns 政府/n]nt 顺利/ad 恢复/v 对/p 香港/ns 行使/v 主权/n ,/w 并/c 按照/p “/w 一国两制/j ”/w 、/w “/w 港人治港/l ”/w 、/w 高度/d 自治/v 的/u 方针/n 保持/v 香港/ns 的/u 繁荣/an 稳定/an 。/w [中国/ns 共产党/n]nt 成功/a 地/u 召开/v 了/u 第十五/m 次/q 全国/n 代表大会/n ,/w 高举/v 邓小平理论/n 伟大/a 旗帜/n ,/w 总结/v 百年/m 历史/n ,/w 展望/v 新/a 的/u 世纪/n ,/w 制定/v 了/u 中国/ns 跨/v 世纪/n 发展/v 的/u 行动/vn 纲领/n 。/w | |||
19980101-01-001-008/m 在/p 这/r 一/m 年/q 中/f ,/w 中国/ns 的/u 改革/vn 开放/vn 和/c 现代化/vn 建设/vn 继续/v 向前/v 迈进/v 。/w 国民经济/n 保持/v 了/u “/w 高/a 增长/vn 、/w 低/a 通胀/j ”/w 的/u 良好/a 发展/vn 态势/n 。/w 农业/n 生产/vn 再次/d 获得/v 好/a 的/u 收成/n ,/w 企业/n 改革/vn 继续/v 深化/v ,/w 人民/n 生活/vn 进一步/d 改善/v 。/w 对外/vn 经济/n 技术/n 合作/vn 与/c 交流/vn 不断/d 扩大/v 。/w 民主/a 法制/n 建设/vn 、/w 精神文明/n 建设/vn 和/c 其他/r 各项/r 事业/n 都/d 有/v 新/a 的/u 进展/vn 。/w 我们/r 十分/m 关注/v 最近/t 一个/m 时期/n 一些/m 国家/n 和/c 地区/n 发生/v 的/u 金融/n 风波/n ,/w 我们/r 相信/v 通过/p 这些/r 国家/n 和/c 地区/n 的/u 努力/an 以及/c 有关/v 的/u 国际/n 合作/vn ,/w 情况/n 会/v 逐步/d 得到/v 缓解/vn 。/w 总的来说/c ,/w 中国/ns 改革/v 和/c 发展/v 的/u 全局/n 继续/v 保持/v 了/u 稳定/an 。/w | |||
19980101-01-001-009/m 在/p 这/r 一/m 年/q 中/f ,/w 中国/ns 的/u 外交/n 工作/vn 取得/v 了/u 重要/a 成果/n 。/w 通过/p 高层/n 互访/v ,/w 中国/ns 与/p 美国/ns 、/w 俄罗斯/ns 、/w 法国/ns 、/w 日本/ns 等/u 大国/n 确定/v 了/u 双方/n 关系/n 未来/t 发展/v 的/u 目标/n 和/c 指导/vn 方针/n 。/w 中国/ns 与/p 周边/n 国家/n 和/c 广大/b 发展中国家/l 的/u 友好/a 合作/vn 进一步/d 加强/v 。/w 中国/ns 积极/ad 参与/v [亚/j 太/j 经合/j 组织/n]nt 的/u 活动/vn ,/w 参加/v 了/u 东盟/ns —/w 中/j 日/j 韩/j 和/c 中国/ns —/w 东盟/ns 首脑/n 非正式/b 会晤/vn 。/w 这些/r 外交/n 活动/vn ,/w 符合/v 和平/n 与/c 发展/v 的/u 时代/n 主题/n ,/w 顺应/v 世界/n 走向/v 多极化/v 的/u 趋势/n ,/w 对于/p 促进/v 国际/n 社会/n 的/u 友好/a 合作/vn 和/c 共同/b 发展/vn 作出/v 了/u 积极/a 的/u 贡献/n 。/w | |||
19980101-01-001-010/m 1998年/t ,/w 中国/ns 人民/n 将/d 满怀信心/l 地/u 开创/v 新/a 的/u 业绩/n 。/w 尽管/c 我们/r 在/p 经济/n 社会/n 发展/v 中/f 还/d 面临/v 不少/m 困难/an ,/w 但/c 我们/r 有/v 邓小平理论/n 的/u 指引/vn ,/w 有/v 改革/v 开放/v 近/a 20/m 年/q 来/f 取得/v 的/u 伟大/a 成就/n 和/c 积累/v 的/u 丰富/a 经验/n ,/w 还/d 有/v 其他/r 的/u 各种/r 有利/a 条件/n ,/w 我们/r 一定/d 能够/v 克服/v 这些/r 困难/an ,/w 继续/v 稳步前进/l 。/w 只要/c 我们/r 进一步/d 解放思想/i ,/w 实事求是/i ,/w 抓住/v 机遇/n ,/w 开拓进取/l ,/w 建设/v 有/v 中国/ns 特色/n 社会主义/n 的/u 道路/n 就/c 会/v 越/d 走/v 越/d 宽广/a 。/w | |||
19980101-01-001-011/m 实现/v 祖国/n 的/u 完全/a 统一/vn ,/w 是/v 海内外/s 全体/n 中国/ns 人/n 的/u 共同/b 心愿/n 。/w 通过/p 中/j 葡/j 双方/n 的/u 合作/vn 和/c 努力/an ,/w 按照/p “/w 一国两制/j ”/w 方针/n 和/c 澳门/ns 《/w 基本法/n 》/w ,/w 1999年/t 12月/t 澳门/ns 的/u 回归/vn 一定/d 能够/v 顺利/ad 实现/v 。/w | |||
19980101-01-001-012/m 台湾/ns 是/v 中国/ns 领土/n 不可分割/l 的/u 一/m 部分/n 。/w 完成/v 祖国/n 统一/vn ,/w 是/v 大势所趋/i ,/w 民心所向/l 。/w 任何/r 企图/v 制造/v “/w 两/m 个/q 中国/ns ”/w 、/w “/w 一中一台/j ”/w 、/w “/w 台湾/ns 独立/v ”/w 的/u 图谋/n ,/w 都/d 注定/v 要/v 失败/v 。/w 希望/v 台湾/ns 当局/n 以/p 民族/n 大义/n 为重/v ,/w 拿/v 出/v 诚意/n ,/w 采取/v 实际/a 的/u 行动/vn ,/w 推动/v 两岸/n 经济/n 文化/n 交流/vn 和/c 人员/n 往来/vn ,/w 促进/v 两岸/n 直接/ad 通邮/v 、/w 通航/v 、/w 通商/v 的/u 早日/d 实现/v ,/w 并/c 尽早/d 回应/v 我们/r 发出/v 的/u 在/p 一个/m 中国/ns 的/u 原则/n 下/f 两岸/n 进行/v 谈判/vn 的/u 郑重/a 呼吁/vn 。/w | |||
19980101-01-001-013/m 环顾/v 全球/n ,/w 日益/d 密切/a 的/u 世界/n 经济/n 联系/vn ,/w 日新月异/i 的/u 科技/n 进步/vn ,/w 正在/d 为/p 各国/r 经济/n 的/u 发展/vn 提供/v 历史/n 机遇/n 。/w 但是/c ,/w 世界/n 还/d 不/d 安宁/a 。/w 南北/f 之间/f 的/u 贫富/n 差距/n 继续/v 扩大/v ;/w 局部/n 冲突/vn 时有发生/l ;/w 不/d 公正/a 不/d 合理/a 的/u 旧/a 的/u 国际/n 政治/n 经济/n 秩序/n 还/d 没有/v 根本/a 改变/vn ;/w 发展中国家/l 在/p 激烈/a 的/u 国际/n 经济/n 竞争/vn 中/f 仍/d 处于/v 弱势/n 地位/n ;/w 人类/n 的/u 生存/vn 与/c 发展/vn 还/d 面临/v 种种/q 威胁/vn 和/c 挑战/vn 。/w 和平/n 与/c 发展/vn 的/u 前景/n 是/v 光明/a 的/u ,/w 21/m 世纪/n 将/d 是/v 充满/v 希望/n 的/u 世纪/n 。/w 但/c 前进/v 的/u 道路/n 不/d 会/v 也/d 不/d 可能/v 一帆风顺/i ,/w 关键/n 是/v 世界/n 各国/r 人民/n 要/v 进一步/d 团结/a 起来/v ,/w 共同/d 推动/v 早日/d 建立/v 公正/a 合理/a 的/u 国际/n 政治/n 经济/n 新/a 秩序/n 。/w | |||
19980101-01-001-014/m [中国/ns 政府/n]nt 将/d 继续/v 坚持/v 奉行/v 独立自主/i 的/u 和平/n 外交/n 政策/n ,/w 在/p 和平共处/l 五/m 项/q 原则/n 的/u 基础/n 上/f 努力/ad 发展/v 同/p 世界/n 各国/r 的/u 友好/a 关系/n 。/w 中国/ns 愿意/v 加强/v 同/p 联合国/nt 和/c 其他/r 国际/n 组织/n 的/u 协调/vn ,/w 促进/v 在/p 扩大/v 经贸/j 科技/n 交流/vn 、/w 保护/v 环境/n 、/w 消除/v 贫困/an 、/w 打击/v 国际/n 犯罪/vn 等/u 方面/n 的/u 国际/n 合作/vn 。/w 中国/ns 永远/d 是/v 维护/v 世界/n 和平/n 与/c 稳定/an 的/u 重要/a 力量/n 。/w 中国/ns 人民/n 愿/v 与/p 世界/n 各国/r 人民/n 一道/d ,/w 为/p 开创/v 持久/a 和平/n 、/w 共同/d 发展/v 的/u 新/a 世纪/n 而/c 不懈努力/l !/w | |||
19980101-01-001-015/m 在/p 这/r 辞旧迎新/l 的/u 美好/a 时刻/n ,/w 我/r 祝/v 大家/r 新年/t 快乐/a ,/w 家庭/n 幸福/a !/w | |||
19980101-01-001-016/m 谢谢/v !/w (/w 新华社/nt 北京/ns 12月/t 31日/t 电/n )/w | |||
19980101-01-002-001/m 在/p 十五大/j 精神/n 指引/vn 下/f 胜利/vd 前进/v ——/w 元旦/t 献辞/n | |||
19980101-01-002-002/m 我们/r 即将/d 以/p 丰收/vn 的/u 喜悦/an 送/v 走/v 牛年/t ,/w 以/p 昂扬/a 的/u 斗志/n 迎来/v 虎年/t 。/w 我们/r 伟大/a 祖国/n 在/p 新/a 的/u 一/m 年/q ,/w 将/d 是/v 充满/v 生机/n 、/w 充满/v 希望/n 的/u 一/m 年/q 。/w | |||
19980101-01-002-003/m 刚刚/d 过去/v 的/u 一/m 年/q ,/w 大气磅礴/i ,/w 波澜壮阔/i 。/w 在/p 这/r 一/m 年/q ,/w 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt ,/w 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n ,/w 高举/v 邓小平理论/n 的/u 伟大/a 旗帜/n ,/w 领导/v 全党/n 和/c 全国/n 各族/r 人民/n 坚定不移/i 地/u 沿着/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 道路/n 阔步/d 前进/v ,/w 写/v 下/v 了/u 改革/v 开放/v 和/c 社会主义/n 现代化/vn 建设/vn 的/u 辉煌/a 篇章/n 。/w 顺利/a 地/u 恢复/v 对/p 香港/ns 行使/v 主权/n ,/w 胜利/v 地/u 召开/v 党/n 的/u 第十五/m 次/q 全国/n 代表大会/n ———/w 两/m 件/q 大事/n 办/v 得/u 圆满/a 成功/a 。/w 国民经济/n 稳中求进/l ,/w 国家/n 经济/n 实力/n 进一步/d 增强/v ,/w 人民/n 生活/vn 继续/v 改善/v ,/w 对外/vn 经济/n 技术/n 交流/vn 日益/d 扩大/v 。/w 在/p 国际/n 金融/n 危机/n 的/u 风浪/n 波及/v 许多/m 国家/n 的/u 情况/n 下/f ,/w 我国/n 保持/v 了/u 金融/n 形势/n 和/c 整个/b 经济/n 形势/n 的/u 稳定/a 发展/vn 。/w 社会主义/n 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn 取得/v 新/a 的/u 成绩/n ,/w 各项/r 社会/n 事业/n 全面/ad 进步/v 。/w 外交/n 工作/vn 取得/v 可喜/a 的/u 突破/vn ,/w 我国/n 的/u 国际/n 地位/n 和/c 国际/n 威望/n 进一步/d 提高/v 。/w 实践/v 使/v 亿万/m 人民/n 对/p 邓小平理论/n 更加/d 信仰/v ,/w 对/p 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt 更加/d 信赖/v ,/w 对/p 伟大/a 祖国/n 的/u 光辉/n 前景/n 更加/d 充满/v 信心/n 。/w | |||
19980101-01-002-004/m 1998年/t ,/w 是/v 全面/ad 贯彻/v 落实/v 党/n 的/u 十五大/j 提出/v 的/u 任务/n 的/u 第一/m 年/q ,/w 各/r 条/q 战线/n 改革/v 和/c 发展/v 的/u 任务/n 都/d 十分/m 繁重/a ,/w 有/v 许多/m 深/a 层次/n 的/u 矛盾/an 和/c 问题/n 有待/v 克服/v 和/c 解决/v ,/w 特别/d 是/v 国有/vn 企业/n 改革/vn 已经/d 进入/v 攻坚/vn 阶段/n 。/w 我们/r 必须/d 进一步/d 深入/ad 学习/v 和/c 掌握/v 党/n 的/u 十五大/j 精神/n ,/w 统揽全局/l ,/w 精心/ad 部署/v ,/w 狠抓/v 落实/v ,/w 团结/a 一致/a ,/w 艰苦奋斗/i ,/w 开拓/v 前进/v ,/w 为/p 夺取/v 今年/t 改革/v 开放/v 和/c 社会主义/n 现代化/vn 建设/vn 的/u 新/a 胜利/vn 而/c 奋斗/v 。/w | |||
19980101-01-002-005/m 今年/t 是/v 党/n 的/u 十一/m 届/q 三中全会/j 召开/v 20/m 周年/q ,/w 是/v 我们/r 党/n 和/c 国家/n 实现/v 伟大/a 的/u 历史/n 转折/vn 、/w 进入/v 改革/vn 开放/vn 历史/n 新/a 时期/n 的/u 20/m 周年/q 。/w 在/p 新/a 的/u 一/m 年/q 里/f ,/w 大力/d 发扬/v 十一/m 届/q 三中全会/j 以来/f 我们/r 党/n 所/u 恢复/v 的/u 优良/z 传统/n 和/c 在/p 新/a 的/u 历史/n 条件/n 下/f 形成/v 的/u 优良/z 作风/n ,/w 对于/p 完成/v 好/a 今年/t 的/u 各项/r 任务/n 具有/v 十分/m 重要/a 的/u 意义/n 。/w | |||
19980101-01-002-006/m 我们/r 要/v 更/d 好/a 地/u 坚持/v 解放思想/i 、/w 实事求是/i 的/u 思想/n 路线/n 。/w 解放思想/i 、/w 实事求是/i ,/w 是/v 邓小平理论/n 的/u 精髓/n 。/w 实践/v 证明/v ,/w 只有/c 解放思想/i 、/w 实事求是/i ,/w 才/c 能/v 冲破/v 各种/r 不/d 切合/v 实际/n 的/u 或者/c 过时/a 的/u 观念/n 的/u 束缚/vn ,/w 真正/d 做到/v 尊重/v 、/w 认识/v 和/c 掌握/v 客观/a 规律/n ,/w 勇于/v 突破/v ,/w 勇于/v 创新/v ,/w 不断/d 开创/v 社会主义/n 现代化/vn 建设/vn 的/u 新/a 局面/n 。/w 党/n 的/u 十五大/j 是/v 我们/r 党/n 解放思想/i 、/w 实事求是/i 的/u 新/a 的/u 里程碑/n 。/w 进一步/d 认真/ad 学习/v 和/c 掌握/v 十五大/j 精神/n ,/w 解放思想/i 、/w 实事求是/i ,/w 我们/r 的/u 各项/r 事业/n 就/d 能/v 结/v 出/v 更加/d 丰硕/a 的/u 成果/n 。/w | |||
19980101-01-002-007/m 我们/r 要/v 更/d 好/a 地/u 坚持/v 以/p 经济/n 建设/vn 为/v 中心/n 。/w 各项/r 工作/vn 必须/d 以/p 经济/n 建设/vn 为/v 中心/n ,/w 是/v 邓小平理论/n 的/u 基本/a 观点/n ,/w 是/v 党/n 的/u 基本/a 路线/n 的/u 核心/n 内容/n ,/w 近/a 20/m 年/q 来/f 的/u 实践/vn 证明/v ,/w 坚持/v 这个/r 中心/n ,/w 是/v 完全/ad 正确/a 的/u 。/w 今后/t ,/w 我们/r 能否/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 伟大/a 事业/n 全面/ad 推向/v 21/m 世纪/n ,/w 关键/n 仍然/d 要/v 看/v 能否/v 把/p 经济/n 工作/vn 搞/v 上去/v 。/w 各级/r 领导/n 干部/n 要/v 切实/ad 把/p 精力/n 集中/v 到/v 贯彻/v 落实/v 好/a 中央/n 关于/p 今年/t 经济/n 工作/vn 的/u 总体/n 要求/n 和/c 各项/r 重要/a 任务/n 上/f 来/v ,/w 不断/d 提高/v 领导/v 经济/n 建设/vn 的/u 能力/n 和/c 水平/n 。/w | |||
19980101-01-002-008/m 我们/r 要/v 更/d 好/a 地/u 坚持/v “/w 两手抓/l 、/w 两手/m 都/d 要/v 硬/a ”/w 的/u 方针/n 。/w 在/p 坚持/v 以/p 经济/n 建设/vn 为/v 中心/n 的/u 同时/n ,/w 积极/ad 推进/v 社会主义/n 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn ,/w 是/v 建设/v 富强/a 、/w 民主/a 、/w 文明/a 的/u 社会主义/n 现代化/vn 国家/n 的/u 重要/a 内容/n 。/w 实践/v 证明/v ,/w 经济/n 建设/vn 的/u 顺利/a 进行/vn ,/w 离/v 不/d 开/v 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn 的/u 保证/vn 。/w 党/n 的/u 十五大/j 依据/p 邓小平理论/n 和/c 党/n 的/u 基本/a 路线/n 提出/v 的/u 党/n 在/p 社会主义/n 初级/b 阶段/n 经济/n 、/w 政治/n 、/w 文化/n 的/u 基本/a 纲领/n ,/w 为/p “/w 两手抓/l 、/w 两手/m 都/d 要/v 硬/a ”/w 提供/v 了/u 新/a 的/u 理论/n 根据/n ,/w 提出/v 了/u 更/d 高/a 要求/n ,/w 现在/t 的/u 关键/n 是/v 认真/ad 抓好/v 落实/v 。/w | |||
19980101-01-002-009/m 我们/r 要/v 更/d 好/a 地/u 发扬/v 求真务实/l 、/w 密切/ad 联系/v 群众/n 的/u 作风/n 。/w 这/r 是/v 把/p 党/n 的/u 方针/n 、/w 政策/n 落到实处/l ,/w 使/v 改革/v 和/c 建设/v 取得/v 胜利/vn 的/u 重要/a 保证/vn 。/w 在/p 当前/t 改革/v 进一步/d 深化/v ,/w 经济/n 不断/d 发展/v ,/w 同时/c 又/d 出现/v 一些/m 新/a 情况/n 、/w 新/a 问题/n 和/c 新/a 困难/an 的/u 形势/n 下/f ,/w 更/d 要/v 发扬/v 这样/r 的/u 好/a 作风/n 。/w 要/v 尊重/v 群众/n 的/u 意愿/n ,/w 重视/v 群众/n 的/u 首创/vn 精神/n ,/w 关心/v 群众/n 的/u 生活/vn 疾苦/n 。/w 江/nr 泽民/nr 同志/n 最近/t 强调/vd 指出/v ,/w 要/v 大力/d 倡导/v 说实话/l 、/w 办/v 实事/n 、/w 鼓/v 实劲/n 、/w 讲/v 实效/n 的/u 作风/n ,/w 坚决/ad 制止/v 追求/v 表面文章/i ,/w 搞/v 花架子/n 等/u 形式主义/n ,/w 坚决/ad 杜绝/v 脱离/v 群众/n 、/w 脱离/v 实际/n 、/w 浮躁/a 虚夸/v 等/u 官僚主义/n 。/w 这/r 是/v 非常/d 重要/a 的/u 。/w 因此/c ,/w 各级/r 领导/n 干部/n 务必/d 牢记/v 全心全意/i 为/p 人民/n 服务/v 的/u 宗旨/n ,/w 在/p 勤政廉政/l 、/w 艰苦奋斗/i 方面/n 以身作则/i ,/w 当/v 好/a 表率/n 。/w | |||
19980101-01-002-010/m 1998/m ,/w 瞩目/v 中华/nz 。/w 新/a 的/u 机遇/n 和/c 挑战/vn ,/w 催/v 人/n 进取/v ;/w 新/a 的/u 目标/n 和/c 征途/n ,/w 催/v 人/n 奋发/v 。/w 英雄/n 的/u 中国/ns 人民/n 在/p 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt 坚强/a 领导/vn 和/c 党/n 的/u 十五大/j 精神/n 指引/v 下/f ,/w 更/d 高/a 地/u 举起/v 邓小平理论/n 的/u 伟大/a 旗帜/n ,/w 团结/a 一致/a ,/w 扎实/ad 工作/v ,/w 奋勇/d 前进/v ,/w 一定/d 能够/v 创造/v 出/v 更加/d 辉煌/a 的/u 业绩/n !/w |
@@ -4,7 +4,6 @@ import os | |||
import unittest | |||
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader | |||
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader | |||
class TestConfigLoader(unittest.TestCase): | |||
@@ -52,21 +51,3 @@ class TestConfigLoader(unittest.TestCase): | |||
print("pass config test!") | |||
class TestDatasetLoader(unittest.TestCase): | |||
def test_case_TokenizeDatasetLoader(self): | |||
loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8") | |||
data = loader.load_pku(max_seq_len=32) | |||
print("pass TokenizeDatasetLoader test!") | |||
def test_case_POSDatasetLoader(self): | |||
loader = POSDatasetLoader("./test/data_for_tests/people.txt") | |||
data = loader.load() | |||
datas = loader.load_lines() | |||
print("pass POSDatasetLoader test!") | |||
def test_case_LMDatasetLoader(self): | |||
loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8") | |||
data = loader.load() | |||
datas = loader.load_lines() | |||
print("pass TokenizeDatasetLoader test!") |
@@ -0,0 +1,42 @@ | |||
import unittest | |||
from fastNLP.loader.dataset_loader import POSDatasetLoader, LMDatasetLoader, TokenizeDatasetLoader, \ | |||
PeopleDailyCorpusLoader, ConllLoader | |||
class TestDatasetLoader(unittest.TestCase): | |||
def test_case_1(self): | |||
data = """Tom\tT\nand\tF\nJerry\tT\n.\tF\n\nHello\tT\nworld\tF\n!\tF""" | |||
lines = data.split("\n") | |||
answer = POSDatasetLoader.parse(lines) | |||
truth = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]], [["Hello", "world", "!"], ["T", "F", "F"]]] | |||
self.assertListEqual(answer, truth, "POS Dataset Loader") | |||
def test_case_TokenizeDatasetLoader(self): | |||
loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8") | |||
data = loader.load_pku(max_seq_len=32) | |||
print("pass TokenizeDatasetLoader test!") | |||
def test_case_POSDatasetLoader(self): | |||
loader = POSDatasetLoader("./test/data_for_tests/people.txt") | |||
data = loader.load() | |||
datas = loader.load_lines() | |||
print("pass POSDatasetLoader test!") | |||
def test_case_LMDatasetLoader(self): | |||
loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8") | |||
data = loader.load() | |||
datas = loader.load_lines() | |||
print("pass TokenizeDatasetLoader test!") | |||
def test_PeopleDailyCorpusLoader(self): | |||
loader = PeopleDailyCorpusLoader("./test/data_for_tests/people_daily_raw.txt") | |||
_, _ = loader.load() | |||
def test_ConllLoader(self): | |||
loader = ConllLoader("./test/data_for_tests/conll_example.txt") | |||
_ = loader.load() | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -1,24 +0,0 @@ | |||
import unittest | |||
from fastNLP.loader.dataset_loader import POSDatasetLoader | |||
class TestPreprocess(unittest.TestCase): | |||
def test_case_1(self): | |||
data = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]], | |||
["Hello", "world", "!"], ["T", "F", "F"]] | |||
pickle_path = "./data_for_tests/" | |||
# POSPreprocess(data, pickle_path) | |||
class TestDatasetLoader(unittest.TestCase): | |||
def test_case_1(self): | |||
data = """Tom\tT\nand\tF\nJerry\tT\n.\tF\n\nHello\tT\nworld\tF\n!\tF""" | |||
lines = data.split("\n") | |||
answer = POSDatasetLoader.parse(lines) | |||
truth = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]], [["Hello", "world", "!"], ["T", "F", "F"]]] | |||
self.assertListEqual(answer, truth, "POS Dataset Loader") | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -38,7 +38,7 @@ def infer(): | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
index2label = load_pickle(pickle_path, "class2id.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# Define the same model | |||
@@ -1,74 +1,61 @@ | |||
import sys | |||
import os | |||
sys.path.append("..") | |||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
from fastNLP.core.predictor import Predictor | |||
from fastNLP.core.preprocess import Preprocessor, load_pickle | |||
from fastNLP.core.tester import SeqLabelTester | |||
from fastNLP.core.trainer import SeqLabelTrainer | |||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader | |||
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
from fastNLP.saver.model_saver import ModelSaver | |||
from fastNLP.loader.model_loader import ModelLoader | |||
from fastNLP.core.tester import SeqLabelTester | |||
from fastNLP.models.sequence_modeling import SeqLabeling | |||
from fastNLP.core.predictor import Predictor | |||
from fastNLP.saver.model_saver import ModelSaver | |||
data_name = "pku_training.utf8" | |||
# cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" | |||
cws_data_path = "data_for_tests/cws_pku_utf_8" | |||
pickle_path = "data_for_tests" | |||
data_infer_path = "data_for_tests/people_infer.txt" | |||
cws_data_path = "test/data_for_tests/cws_pku_utf_8" | |||
pickle_path = "./save/" | |||
data_infer_path = "test/data_for_tests/people_infer.txt" | |||
config_path = "test/data_for_tests/config" | |||
def infer(): | |||
# Load infer configuration, the same as test | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
index2label = load_pickle(pickle_path, "class2id.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# Define the same model | |||
model = SeqLabeling(test_args) | |||
# Dump trained parameters into the model | |||
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
print("model loaded!") | |||
# Data Loader | |||
raw_data_loader = BaseLoader(data_infer_path) | |||
infer_data = raw_data_loader.load_lines() | |||
""" | |||
Transform strings into list of list of strings. | |||
[ | |||
[word_11, word_12, ...], | |||
[word_21, word_22, ...], | |||
... | |||
] | |||
In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them. | |||
""" | |||
# Inference interface | |||
infer = Predictor(pickle_path) | |||
infer = Predictor(pickle_path, "seq_label") | |||
results = infer.predict(model, infer_data) | |||
print(results) | |||
print("Inference finished!") | |||
def train_test(): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) | |||
ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": train_args}) | |||
# Data Loader | |||
loader = TokenizeDatasetLoader(cws_data_path) | |||
train_data = loader.load_pku() | |||
# Preprocessor | |||
p = SeqLabelPreprocess() | |||
p = Preprocessor(label_is_seq=True) | |||
data_train = p.run(train_data, pickle_path=pickle_path) | |||
train_args["vocab_size"] = p.vocab_size | |||
train_args["num_classes"] = p.num_classes | |||
@@ -81,12 +68,10 @@ def train_test(): | |||
# Start training | |||
trainer.train(model, data_train) | |||
print("Training finished!") | |||
# Saver | |||
saver = ModelSaver("./data_for_tests/saved_model.pkl") | |||
saver = ModelSaver("./save/saved_model.pkl") | |||
saver.save_pytorch(model) | |||
print("Model saved!") | |||
del model, trainer, loader | |||
@@ -94,12 +79,11 @@ def train_test(): | |||
model = SeqLabeling(train_args) | |||
# Dump trained parameters into the model | |||
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
print("model loaded!") | |||
ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
# Load test configuration | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": test_args}) | |||
# Tester | |||
tester = SeqLabelTester(**test_args.data) | |||
@@ -109,7 +93,13 @@ def train_test(): | |||
# print test results | |||
print(tester.show_metrics()) | |||
print("model tested!") | |||
def test(): | |||
os.makedirs("save", exist_ok=True) | |||
train_test() | |||
infer() | |||
os.system("rm -rf save") | |||
if __name__ == "__main__": | |||
@@ -1,7 +1,6 @@ | |||
import unittest | |||
import torch | |||
import unittest | |||
from fastNLP.modules.other_modules import GroupNorm, LayerNormalization, BiLinear | |||
@@ -1,18 +1,9 @@ | |||
import torch | |||
import numpy as np | |||
import unittest | |||
import fastNLP.modules.utils as utils | |||
class TestUtils(unittest.TestCase): | |||
def test_case_1(self): | |||
a = torch.tensor([ | |||
[1, 2, 3, 4, 5], [2, 3, 4, 5, 6] | |||
]) | |||
utils.orthogonal(a) | |||
pass | |||
def test_case_2(self): | |||
a = np.random.rand(100, 100) | |||
utils.mst(a) | |||
pass |
@@ -1,16 +1,32 @@ | |||
import sys | |||
# encoding: utf-8 | |||
import os | |||
sys.path.append("..") | |||
from fastNLP.core.preprocess import save_pickle | |||
from fastNLP.core.vocabulary import Vocabulary | |||
from fastNLP.fastnlp import FastNLP | |||
from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results | |||
from fastNLP.models.cnn_text_classification import CNNText | |||
from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
from fastNLP.saver.model_saver import ModelSaver | |||
PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" | |||
PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/" | |||
PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/" | |||
def word_seg(): | |||
nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) | |||
nlp.load("cws_basic_model", config_file="cws.cfg", section_name="POS_test") | |||
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0 | |||
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1 | |||
DEFAULT_RESERVED_LABEL = ['<reserved-2>', | |||
'<reserved-3>', | |||
'<reserved-4>'] # dict index = 2~4 | |||
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, | |||
DEFAULT_RESERVED_LABEL[2]: 4} | |||
def word_seg(model_dir, config, section): | |||
nlp = FastNLP(model_dir=model_dir) | |||
nlp.load("cws_basic_model", config_file=config, section_name=section) | |||
text = ["这是最好的基于深度学习的中文分词系统。", | |||
"大王叫我来巡山。", | |||
"我党多年来致力于改善人民生活水平。"] | |||
@@ -24,38 +40,52 @@ def word_seg(): | |||
print(interpret_word_seg_results(words, labels)) | |||
def text_class(): | |||
nlp = FastNLP("./data_for_tests/") | |||
nlp.load("text_class_model") | |||
text = "这是最好的基于深度学习的中文分词系统。" | |||
result = nlp.run(text) | |||
print(result) | |||
print("FastNLP finished!") | |||
def mock_cws(): | |||
os.makedirs("mock", exist_ok=True) | |||
text = ["这是最好的基于深度学习的中文分词系统。", | |||
"大王叫我来巡山。", | |||
"我党多年来致力于改善人民生活水平。"] | |||
word2id = Vocabulary() | |||
word_list = [ch for ch in "".join(text)] | |||
word2id.update(word_list) | |||
save_pickle(word2id, "./mock/", "word2id.pkl") | |||
def test_word_seg_interpret(): | |||
foo = [[('这', 'S'), ('是', 'S'), ('最', 'S'), ('好', 'S'), ('的', 'S'), ('基', 'B'), ('于', 'E'), ('深', 'B'), ('度', 'E'), | |||
('学', 'B'), ('习', 'E'), ('的', 'S'), ('中', 'B'), ('文', 'E'), ('分', 'B'), ('词', 'E'), ('系', 'B'), ('统', 'E'), | |||
('。', 'S')]] | |||
chars = [x[0] for x in foo[0]] | |||
labels = [x[1] for x in foo[0]] | |||
print(interpret_word_seg_results(chars, labels)) | |||
class2id = Vocabulary(need_default=False) | |||
label_list = ['B', 'M', 'E', 'S'] | |||
class2id.update(label_list) | |||
save_pickle(class2id, "./mock/", "class2id.pkl") | |||
model_args = {"vocab_size": len(word2id), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(class2id)} | |||
config_file = """ | |||
[test_section] | |||
vocab_size = {} | |||
word_emb_dim = 50 | |||
rnn_hidden_units = 50 | |||
num_classes = {} | |||
""".format(len(word2id), len(class2id)) | |||
with open("mock/test.cfg", "w", encoding="utf-8") as f: | |||
f.write(config_file) | |||
def test_interpret_cws_pos_results(): | |||
foo = [ | |||
[('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'), | |||
('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'), | |||
('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')] | |||
] | |||
chars = [x[0] for x in foo[0]] | |||
labels = [x[1] for x in foo[0]] | |||
print(interpret_cws_pos_results(chars, labels)) | |||
model = AdvSeqLabel(model_args) | |||
ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model) | |||
def test_word_seg(): | |||
# fake the model and pickles | |||
print("start mocking") | |||
mock_cws() | |||
# run the inference codes | |||
print("start testing") | |||
word_seg("./mock/", "test.cfg", "test_section") | |||
# clean up environments | |||
print("clean up") | |||
os.system("rm -rf mock") | |||
def pos_tag(): | |||
nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES) | |||
nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model") | |||
def pos_tag(model_dir, config, section): | |||
nlp = FastNLP(model_dir=model_dir) | |||
nlp.load("pos_tag_model", config_file=config, section_name=section) | |||
text = ["这是最好的基于深度学习的中文分词系统。", | |||
"大王叫我来巡山。", | |||
"我党多年来致力于改善人民生活水平。"] | |||
@@ -65,21 +95,119 @@ def pos_tag(): | |||
for res in example: | |||
words.append(res[0]) | |||
labels.append(res[1]) | |||
print(interpret_cws_pos_results(words, labels)) | |||
try: | |||
print(interpret_cws_pos_results(words, labels)) | |||
except RuntimeError: | |||
print("inconsistent pos tags. this is for test only.") | |||
def mock_pos_tag(): | |||
os.makedirs("mock", exist_ok=True) | |||
text = ["这是最好的基于深度学习的中文分词系统。", | |||
"大王叫我来巡山。", | |||
"我党多年来致力于改善人民生活水平。"] | |||
vocab = Vocabulary() | |||
word_list = [ch for ch in "".join(text)] | |||
vocab.update(word_list) | |||
save_pickle(vocab, "./mock/", "word2id.pkl") | |||
idx2label = Vocabulary(need_default=False) | |||
label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv'] | |||
idx2label.update(label_list) | |||
save_pickle(idx2label, "./mock/", "class2id.pkl") | |||
model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label)} | |||
config_file = """ | |||
[test_section] | |||
vocab_size = {} | |||
word_emb_dim = 50 | |||
rnn_hidden_units = 50 | |||
num_classes = {} | |||
""".format(len(vocab), len(idx2label)) | |||
with open("mock/test.cfg", "w", encoding="utf-8") as f: | |||
f.write(config_file) | |||
def text_classify(): | |||
nlp = FastNLP(model_dir=PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES) | |||
nlp.load("text_classify_model", config_file="text_classify.cfg", section_name="model") | |||
model = AdvSeqLabel(model_args) | |||
ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model) | |||
def test_pos_tag(): | |||
mock_pos_tag() | |||
pos_tag("./mock/", "test.cfg", "test_section") | |||
os.system("rm -rf mock") | |||
def text_classify(model_dir, config, section): | |||
nlp = FastNLP(model_dir=model_dir) | |||
nlp.load("text_classify_model", config_file=config, section_name=section) | |||
text = [ | |||
"世界物联网大会明日在京召开龙头股启动在即", | |||
"乌鲁木齐市新增一处城市中心旅游目的地", | |||
"朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"] | |||
results = nlp.run(text) | |||
print(results) | |||
""" | |||
['finance', 'travel', 'history'] | |||
""" | |||
def mock_text_classify(): | |||
os.makedirs("mock", exist_ok=True) | |||
text = ["世界物联网大会明日在京召开龙头股启动在即", | |||
"乌鲁木齐市新增一处城市中心旅游目的地", | |||
"朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”" | |||
] | |||
vocab = Vocabulary() | |||
word_list = [ch for ch in "".join(text)] | |||
vocab.update(word_list) | |||
save_pickle(vocab, "./mock/", "word2id.pkl") | |||
idx2label = Vocabulary(need_default=False) | |||
label_list = ['class_A', 'class_B', 'class_C', 'class_D', 'class_E', 'class_F'] | |||
idx2label.update(label_list) | |||
save_pickle(idx2label, "./mock/", "class2id.pkl") | |||
model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label)} | |||
config_file = """ | |||
[test_section] | |||
vocab_size = {} | |||
word_emb_dim = 50 | |||
rnn_hidden_units = 50 | |||
num_classes = {} | |||
""".format(len(vocab), len(idx2label)) | |||
with open("mock/test.cfg", "w", encoding="utf-8") as f: | |||
f.write(config_file) | |||
model = CNNText(model_args) | |||
ModelSaver("mock/text_class_model_v0.pkl").save_pytorch(model) | |||
def test_text_classify(): | |||
mock_text_classify() | |||
text_classify("./mock/", "test.cfg", "test_section") | |||
os.system("rm -rf mock") | |||
def test_word_seg_interpret(): | |||
foo = [[('这', 'S'), ('是', 'S'), ('最', 'S'), ('好', 'S'), ('的', 'S'), ('基', 'B'), ('于', 'E'), ('深', 'B'), ('度', 'E'), | |||
('学', 'B'), ('习', 'E'), ('的', 'S'), ('中', 'B'), ('文', 'E'), ('分', 'B'), ('词', 'E'), ('系', 'B'), ('统', 'E'), | |||
('。', 'S')]] | |||
chars = [x[0] for x in foo[0]] | |||
labels = [x[1] for x in foo[0]] | |||
print(interpret_word_seg_results(chars, labels)) | |||
def test_interpret_cws_pos_results(): | |||
foo = [ | |||
[('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'), | |||
('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'), | |||
('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')] | |||
] | |||
chars = [x[0] for x in foo[0]] | |||
labels = [x[1] for x in foo[0]] | |||
print(interpret_cws_pos_results(chars, labels)) | |||
if __name__ == "__main__": | |||
text_classify() | |||
test_word_seg() | |||
test_pos_tag() | |||
test_text_classify() | |||
test_word_seg_interpret() | |||
test_interpret_cws_pos_results() |