From 466f3c21ec7c07de98ae9979f2261e921260867a Mon Sep 17 00:00:00 2001
From: yunfan
Date: Sat, 15 Sep 2018 19:53:48 +0800
Subject: [PATCH 1/6] add vocabulary

---
 fastNLP/data/vocabulary.py | 99 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 fastNLP/data/vocabulary.py

diff --git a/fastNLP/data/vocabulary.py b/fastNLP/data/vocabulary.py
new file mode 100644
index 00000000..3cff161b
--- /dev/null
+++ b/fastNLP/data/vocabulary.py
@@ -0,0 +1,99 @@
+from copy import deepcopy
+
+DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
+DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
+DEFAULT_RESERVED_LABEL = ['<reserved-2>',
+                          '<reserved-3>',
+                          '<reserved-4>']  # dict index = 2~4
+
+DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
+                         DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
+                         DEFAULT_RESERVED_LABEL[2]: 4}
+
+def isiterable(p_object):
+    try:
+        it = iter(p_object)
+    except TypeError:
+        return False
+    return True
+
+class Vocabulary(object):
+    def __init__(self):
+        self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX)
+        self.padding_label = DEFAULT_PADDING_LABEL
+        self.unknown_label = DEFAULT_UNKNOWN_LABEL
+        self.idx2word = None
+
+    def __len__(self):
+        return len(self.word2idx)
+
+    def update(self, word):
+        """add word or list of words into Vocabulary
+        """
+        if not isinstance(word, str) and isiterable(word):
+            # it's a nested list
+            for w in word:
+                self.update(w)
+        else:
+            # it's a word to be added
+            self.word2idx[word] = len(self)
+            if self.idx2word is not None:
+                self.idx2word = None
+
+
+    def __getitem__(self, w):
+        """ like to_index(w) function, turn a word to the index
+        if w is not in Vocabulary, return the unknown label
+        """
+        if w in self.word2idx:
+            return self.word2idx[w]
+        else:
+            return self.word2idx[DEFAULT_UNKNOWN_LABEL]
+
+    def unknown_idx(self):
+        return self.word2idx[self.unknown_label]
+
+    def padding_idx(self):
+        return self.word2idx[self.padding_label]
+
+    def build_reverse_vocab(self):
+        self.idx2word = {self.word2idx[w] : w for w in self.word2idx}
+
+    def to_word(self, idx):
+        """given a word's index, return the word itself
+        """
+        if self.idx2word is None:
+            self.build_reverse_vocab()
+        return self.idx2word[idx]
+
+    def __getstate__(self):
+        """use to prepare data for pickle
+        """
+        state = self.__dict__.copy()
+        # no need to pickle idx2word as it can be constructed from word2idx
+        del state['idx2word']
+        return state
+
+    def __setstate__(self, state):
+        """use to restore state from pickle
+        """
+        self.__dict__.update(state)
+        self.idx2word = None
+
+if __name__ == '__main__':
+    import _pickle as pickle
+    vocab = Vocabulary()
+    filename = 'vocab'
+    vocab.update(filename)
+    vocab.update([filename, ['a'], [['b']], ['c']])
+    idx = vocab[filename]
+    print('{} {}'.format(vocab.to_word(idx), vocab[filename]))
+
+    with open(filename, 'wb') as f:
+        pickle.dump(vocab, f)
+    with open(filename, 'rb') as f:
+        vocab = pickle.load(f)
+
+    print('{} {}'.format(vocab.to_word(idx), vocab[filename]))
+    print(vocab.word2idx)
+    
\ No newline at end of file

From 3f4544759ddfd4569be034de811b366f1a6bb3cf Mon Sep 17 00:00:00 2001
From: yunfan
Date: Sat, 15 Sep 2018 20:39:51 +0800
Subject: [PATCH 2/6] add unittest of data, fix bug

---
 fastNLP/{data => core}/vocabulary.py | 22 ++-------
 test/core/test_field.py              | 69 ++++++++++++++++++++++++++++
 test/core/test_vocab.py              | 35 ++++++++++++++
 3 files changed, 108 insertions(+), 18 deletions(-)
 rename fastNLP/{data => core}/vocabulary.py (79%)
 create mode 100644 test/core/test_field.py
 create mode 100644 test/core/test_vocab.py
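The Vocabulary introduced in PATCH 1/6 seeds word2idx with the default padding/unknown/reserved entries, flattens arbitrarily nested lists in update(), and falls back to the unknown index when an out-of-vocabulary word is looked up; the diff below (PATCH 2/6) additionally keeps update() from re-assigning the index of a word that is already present. A minimal usage sketch, assuming the post-rename import path fastNLP.core.vocabulary (under PATCH 1/6 alone it would be fastNLP.data.vocabulary) and purely illustrative words:

    # Usage sketch for the Vocabulary class; the words are illustrative only.
    from fastNLP.core.vocabulary import Vocabulary

    vocab = Vocabulary()
    vocab.update("apple")                 # a single word
    vocab.update(["banana", ["cherry"]])  # nested lists are flattened recursively

    print(len(vocab))        # 8: five default entries plus three new words
    print(vocab["banana"])   # index assigned at insertion time (here 6)
    print(vocab["durian"])   # an unseen word maps to the unknown index, 1
    print(vocab.to_word(vocab["apple"]))  # 'apple'; the reverse map is built lazily
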
diff --git a/fastNLP/data/vocabulary.py b/fastNLP/core/vocabulary.py similarity index 79% rename from fastNLP/data/vocabulary.py rename to fastNLP/core/vocabulary.py index 3cff161b..baae3753 100644 --- a/fastNLP/data/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -36,9 +36,10 @@ class Vocabulary(object): self.update(w) else: # it's a word to be added - self.word2idx[word] = len(self) - if self.idx2word is not None: - self.idx2word = None + if word not in self.word2idx: + self.word2idx[word] = len(self) + if self.idx2word is not None: + self.idx2word = None def __getitem__(self, w): @@ -80,20 +81,5 @@ class Vocabulary(object): self.__dict__.update(state) self.idx2word = None -if __name__ == '__main__': - import _pickle as pickle - vocab = Vocabulary() - filename = 'vocab' - vocab.update(filename) - vocab.update([filename, ['a'], [['b']], ['c']]) - idx = vocab[filename] - print('{} {}'.format(vocab.to_word(idx), vocab[filename])) - with open(filename, 'wb') as f: - pickle.dump(vocab, f) - with open(filename, 'rb') as f: - vocab = pickle.load(f) - - print('{} {}'.format(vocab.to_word(idx), vocab[filename])) - print(vocab.word2idx) \ No newline at end of file diff --git a/test/core/test_field.py b/test/core/test_field.py new file mode 100644 index 00000000..7c1b6343 --- /dev/null +++ b/test/core/test_field.py @@ -0,0 +1,69 @@ +import os +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) + +import unittest +import torch +from fastNLP.data.field import TextField, LabelField +from fastNLP.data.instance import Instance +from fastNLP.data.dataset import DataSet +from fastNLP.data.batch import Batch + + + +class TestField(unittest.TestCase): + def check_batched_data_equal(self, data1, data2): + self.assertEqual(len(data1), len(data2)) + for i in range(len(data1)): + self.assertTrue(data1[i].keys(), data2[i].keys()) + for i in range(len(data1)): + for t1, t2 in zip(data1[i].values(), data2[i].values()): + self.assertTrue(torch.equal(t1, t2)) + + def test_batchiter(self): + texts = [ + "i am a cat", + "this is a test of new batch", + "haha" + ] + labels = [0, 1, 0] + + # prepare vocabulary + vocab = {} + for text in texts: + for tokens in text.split(): + if tokens not in vocab: + vocab[tokens] = len(vocab) + + # prepare input dataset + data = DataSet() + for text, label in zip(texts, labels): + x = TextField(text.split(), False) + y = LabelField(label, is_target=True) + ins = Instance(text=x, label=y) + data.append(ins) + + # use vocabulary to index data + data.index_field("text", vocab) + + # define naive sampler for batch class + class SeqSampler: + def __call__(self, dataset): + return list(range(len(dataset))) + + # use bacth to iterate dataset + batcher = Batch(data, SeqSampler(), 2) + TRUE_X = [{'text': torch.tensor([[0, 1, 2, 3, 0, 0, 0], [4, 5, 2, 6, 7, 8, 9]])}, {'text': torch.tensor([[10]])}] + TRUE_Y = [{'label': torch.tensor([[0], [1]])}, {'label': torch.tensor([[0]])}] + for epoch in range(3): + test_x, test_y = [], [] + for batch_x, batch_y in batcher: + test_x.append(batch_x) + test_y.append(batch_y) + self.check_batched_data_equal(TRUE_X, test_x) + self.check_batched_data_equal(TRUE_Y, test_y) + + +if __name__ == "__main__": + unittest.main() + \ No newline at end of file diff --git a/test/core/test_vocab.py b/test/core/test_vocab.py new file mode 100644 index 00000000..dd51c197 --- /dev/null +++ b/test/core/test_vocab.py @@ -0,0 +1,35 @@ +import os +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) + +import unittest +from 
fastNLP.data.vocabulary import Vocabulary, DEFAULT_WORD_TO_INDEX + +class TestVocabulary(unittest.TestCase): + def test_vocab(self): + import _pickle as pickle + import os + vocab = Vocabulary() + filename = 'vocab' + vocab.update(filename) + vocab.update([filename, ['a'], [['b']], ['c']]) + idx = vocab[filename] + before_pic = (vocab.to_word(idx), vocab[filename]) + + with open(filename, 'wb') as f: + pickle.dump(vocab, f) + with open(filename, 'rb') as f: + vocab = pickle.load(f) + os.remove(filename) + + vocab.build_reverse_vocab() + after_pic = (vocab.to_word(idx), vocab[filename]) + TRUE_DICT = {'vocab': 5, 'a': 6, 'b': 7, 'c': 8} + TRUE_DICT.update(DEFAULT_WORD_TO_INDEX) + TRUE_IDXDICT = {0: '', 1: '', 2: '', 3: '', 4: '', 5: 'vocab', 6: 'a', 7: 'b', 8: 'c'} + self.assertEqual(before_pic, after_pic) + self.assertDictEqual(TRUE_DICT, vocab.word2idx) + self.assertDictEqual(TRUE_IDXDICT, vocab.idx2word) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 9c7f3cf26125234a785d9383839df1ee6779905f Mon Sep 17 00:00:00 2001 From: yunfan Date: Tue, 18 Sep 2018 16:43:56 +0800 Subject: [PATCH 3/6] add vocabulary into preprocessor --- fastNLP/core/preprocess.py | 80 ++++++++++++-------------------------- fastNLP/core/vocabulary.py | 29 +++++++++++++- test/core/test_field.py | 69 -------------------------------- test/core/test_vocab.py | 6 +-- 4 files changed, 53 insertions(+), 131 deletions(-) delete mode 100644 test/core/test_field.py diff --git a/fastNLP/core/preprocess.py b/fastNLP/core/preprocess.py index b5d348e6..2671b4f4 100644 --- a/fastNLP/core/preprocess.py +++ b/fastNLP/core/preprocess.py @@ -6,16 +6,7 @@ import numpy as np from fastNLP.core.dataset import DataSet from fastNLP.core.field import TextField, LabelField from fastNLP.core.instance import Instance - -DEFAULT_PADDING_LABEL = '' # dict index = 0 -DEFAULT_UNKNOWN_LABEL = '' # dict index = 1 -DEFAULT_RESERVED_LABEL = ['', - '', - ''] # dict index = 2~4 - -DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, - DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, - DEFAULT_RESERVED_LABEL[2]: 4} +from fastNLP.core.vocabulary import Vocabulary # the first vocab in dict with the index = 5 @@ -68,24 +59,22 @@ class BasePreprocess(object): - "word2id.pkl", a mapping from words(tokens) to indices - "id2word.pkl", a reversed dictionary - - "label2id.pkl", a dictionary on labels - - "id2label.pkl", a reversed dictionary on labels These four pickle files are expected to be saved in the given pickle directory once they are constructed. Preprocessors will check if those files are already in the directory and will reuse them in future calls. """ def __init__(self): - self.word2index = None - self.label2index = None + self.data_vocab = Vocabulary() + self.label_vocab = Vocabulary() @property def vocab_size(self): - return len(self.word2index) + return len(self.data_vocab) @property def num_classes(self): - return len(self.label2index) + return len(self.label_vocab) def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10): """Main pre-processing pipeline. 
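The unit test above exercises the part of Vocabulary that PATCH 3/6 relies on when it pickles the vocabularies as word2id.pkl and class2id.pkl: __getstate__ drops the derived idx2word map before pickling and __setstate__ restores it as None, so the reverse map is rebuilt lazily after loading. A small sketch of that round trip (the file name and words are illustrative):

    # Pickle round trip for Vocabulary, mirroring test_vocab.py above.
    import pickle
    from fastNLP.core.vocabulary import Vocabulary

    vocab = Vocabulary()
    vocab.update(["rose", "lily", "tulip"])
    idx = vocab["lily"]

    with open("vocab.pkl", "wb") as f:
        pickle.dump(vocab, f)      # __getstate__ removes idx2word from the state
    with open("vocab.pkl", "rb") as f:
        restored = pickle.load(f)  # __setstate__ sets idx2word back to None

    assert restored.idx2word is None          # the reverse map is not stored on disk
    assert restored.to_word(idx) == "lily"    # rebuilt on the first to_word() call
    assert restored["lily"] == idx
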
@@ -102,20 +91,14 @@ class BasePreprocess(object): """ if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): - self.word2index = load_pickle(pickle_path, "word2id.pkl") - self.label2index = load_pickle(pickle_path, "class2id.pkl") + self.data_vocab = load_pickle(pickle_path, "word2id.pkl") + self.label_vocab = load_pickle(pickle_path, "class2id.pkl") else: - self.word2index, self.label2index = self.build_dict(train_dev_data) - save_pickle(self.word2index, pickle_path, "word2id.pkl") - save_pickle(self.label2index, pickle_path, "class2id.pkl") - - if not pickle_exist(pickle_path, "id2word.pkl"): - index2word = self.build_reverse_dict(self.word2index) - save_pickle(index2word, pickle_path, "id2word.pkl") + self.data_vocab, self.label_vocab = self.build_dict(train_dev_data) + save_pickle(self.data_vocab, pickle_path, "word2id.pkl") + save_pickle(self.label_vocab, pickle_path, "class2id.pkl") - if not pickle_exist(pickle_path, "id2class.pkl"): - index2label = self.build_reverse_dict(self.label2index) - save_pickle(index2label, pickle_path, "id2class.pkl") + self.build_reverse_dict() train_set = [] dev_set = [] @@ -125,13 +108,13 @@ class BasePreprocess(object): split = int(len(train_dev_data) * train_dev_split) data_dev = train_dev_data[: split] data_train = train_dev_data[split:] - train_set = self.convert_to_dataset(data_train, self.word2index, self.label2index) - dev_set = self.convert_to_dataset(data_dev, self.word2index, self.label2index) + train_set = self.convert_to_dataset(data_train, self.data_vocab, self.label_vocab) + dev_set = self.convert_to_dataset(data_dev, self.data_vocab, self.label_vocab) save_pickle(dev_set, pickle_path, "data_dev.pkl") print("{} of the training data is split for validation. ".format(train_dev_split)) else: - train_set = self.convert_to_dataset(train_dev_data, self.word2index, self.label2index) + train_set = self.convert_to_dataset(train_dev_data, self.data_vocab, self.label_vocab) save_pickle(train_set, pickle_path, "data_train.pkl") else: train_set = load_pickle(pickle_path, "data_train.pkl") @@ -143,8 +126,8 @@ class BasePreprocess(object): # cross validation data_cv = self.cv_split(train_dev_data, n_fold) for i, (data_train_cv, data_dev_cv) in enumerate(data_cv): - data_train_cv = self.convert_to_dataset(data_train_cv, self.word2index, self.label2index) - data_dev_cv = self.convert_to_dataset(data_dev_cv, self.word2index, self.label2index) + data_train_cv = self.convert_to_dataset(data_train_cv, self.data_vocab, self.label_vocab) + data_dev_cv = self.convert_to_dataset(data_dev_cv, self.data_vocab, self.label_vocab) save_pickle( data_train_cv, pickle_path, "data_train_{}.pkl".format(i)) @@ -165,7 +148,7 @@ class BasePreprocess(object): test_set = [] if test_data is not None: if not pickle_exist(pickle_path, "data_test.pkl"): - test_set = self.convert_to_dataset(test_data, self.word2index, self.label2index) + test_set = self.convert_to_dataset(test_data, self.data_vocab, self.label_vocab) save_pickle(test_set, pickle_path, "data_test.pkl") # return preprocessed results @@ -180,28 +163,15 @@ class BasePreprocess(object): return tuple(results) def build_dict(self, data): - label2index = DEFAULT_WORD_TO_INDEX.copy() - word2index = DEFAULT_WORD_TO_INDEX.copy() for example in data: - for word in example[0]: - if word not in word2index: - word2index[word] = len(word2index) - label = example[1] - if isinstance(label, str): - # label is a string - if label not in label2index: - label2index[label] = len(label2index) - elif 
isinstance(label, list): - # label is a list of strings - for single_label in label: - if single_label not in label2index: - label2index[single_label] = len(label2index) - return word2index, label2index - - - def build_reverse_dict(self, word_dict): - id2word = {word_dict[w]: w for w in word_dict} - return id2word + word, label = example + self.data_vocab.update(word) + self.label_vocab.update(label) + return self.data_vocab, self.label_vocab + + def build_reverse_dict(self): + self.data_vocab.build_reverse_vocab() + self.label_vocab.build_reverse_vocab() def data_split(self, data, train_dev_split): """Split data into train and dev set.""" diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index baae3753..79b70939 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -18,6 +18,16 @@ def isiterable(p_object): return True class Vocabulary(object): + """Use for word and index one to one mapping + + Example:: + + vocab = Vocabulary() + word_list = "this is a word list".split() + vocab.update(word_list) + vocab["word"] + vocab.to_word(5) + """ def __init__(self): self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX) self.padding_label = DEFAULT_PADDING_LABEL @@ -29,6 +39,8 @@ class Vocabulary(object): def update(self, word): """add word or list of words into Vocabulary + + :param word: a list of str or str """ if not isinstance(word, str) and isiterable(word): # it's a nested list @@ -43,13 +55,22 @@ class Vocabulary(object): def __getitem__(self, w): - """ like to_index(w) function, turn a word to the index - if w is not in Vocabulary, return the unknown label + """To support usage like:: + + vocab[w] """ if w in self.word2idx: return self.word2idx[w] else: return self.word2idx[DEFAULT_UNKNOWN_LABEL] + + def to_index(self, w): + """ like to_index(w) function, turn a word to the index + if w is not in Vocabulary, return the unknown label + + :param str w: + """ + return self[w] def unknown_idx(self): return self.word2idx[self.unknown_label] @@ -58,10 +79,14 @@ class Vocabulary(object): return self.word2idx[self.padding_label] def build_reverse_vocab(self): + """build 'index to word' dict based on 'word to index' dict + """ self.idx2word = {self.word2idx[w] : w for w in self.word2idx} def to_word(self, idx): """given a word's index, return the word itself + + :param int idx: """ if self.idx2word is None: self.build_reverse_vocab() diff --git a/test/core/test_field.py b/test/core/test_field.py deleted file mode 100644 index 7c1b6343..00000000 --- a/test/core/test_field.py +++ /dev/null @@ -1,69 +0,0 @@ -import os -import sys -sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) - -import unittest -import torch -from fastNLP.data.field import TextField, LabelField -from fastNLP.data.instance import Instance -from fastNLP.data.dataset import DataSet -from fastNLP.data.batch import Batch - - - -class TestField(unittest.TestCase): - def check_batched_data_equal(self, data1, data2): - self.assertEqual(len(data1), len(data2)) - for i in range(len(data1)): - self.assertTrue(data1[i].keys(), data2[i].keys()) - for i in range(len(data1)): - for t1, t2 in zip(data1[i].values(), data2[i].values()): - self.assertTrue(torch.equal(t1, t2)) - - def test_batchiter(self): - texts = [ - "i am a cat", - "this is a test of new batch", - "haha" - ] - labels = [0, 1, 0] - - # prepare vocabulary - vocab = {} - for text in texts: - for tokens in text.split(): - if tokens not in vocab: - vocab[tokens] = len(vocab) - - # prepare input dataset - data = DataSet() - for text, 
label in zip(texts, labels): - x = TextField(text.split(), False) - y = LabelField(label, is_target=True) - ins = Instance(text=x, label=y) - data.append(ins) - - # use vocabulary to index data - data.index_field("text", vocab) - - # define naive sampler for batch class - class SeqSampler: - def __call__(self, dataset): - return list(range(len(dataset))) - - # use bacth to iterate dataset - batcher = Batch(data, SeqSampler(), 2) - TRUE_X = [{'text': torch.tensor([[0, 1, 2, 3, 0, 0, 0], [4, 5, 2, 6, 7, 8, 9]])}, {'text': torch.tensor([[10]])}] - TRUE_Y = [{'label': torch.tensor([[0], [1]])}, {'label': torch.tensor([[0]])}] - for epoch in range(3): - test_x, test_y = [], [] - for batch_x, batch_y in batcher: - test_x.append(batch_x) - test_y.append(batch_y) - self.check_batched_data_equal(TRUE_X, test_x) - self.check_batched_data_equal(TRUE_Y, test_y) - - -if __name__ == "__main__": - unittest.main() - \ No newline at end of file diff --git a/test/core/test_vocab.py b/test/core/test_vocab.py index dd51c197..89b0691a 100644 --- a/test/core/test_vocab.py +++ b/test/core/test_vocab.py @@ -1,9 +1,5 @@ -import os -import sys -sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) - import unittest -from fastNLP.data.vocabulary import Vocabulary, DEFAULT_WORD_TO_INDEX +from fastNLP.core.vocabulary import Vocabulary, DEFAULT_WORD_TO_INDEX class TestVocabulary(unittest.TestCase): def test_vocab(self): From 819c8f05bed3d47d1c85ae4e44643ab24432f240 Mon Sep 17 00:00:00 2001 From: yunfan Date: Wed, 19 Sep 2018 14:49:10 +0800 Subject: [PATCH 4/6] fix vocab --- fastNLP/core/predictor.py | 10 +++++----- fastNLP/fastnlp.py | 10 +++++----- reproduction/chinese_word_segment/run.py | 4 ++-- reproduction/pos_tag_model/train_pos_tag.py | 4 ++-- test/core/test_predictor.py | 9 +++++++-- test/model/seq_labeling.py | 2 +- test/model/test_cws.py | 2 +- 7 files changed, 23 insertions(+), 18 deletions(-) diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 802661ef..c83b2069 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -27,8 +27,8 @@ class Predictor(object): self.batch_output = [] self.pickle_path = pickle_path self._task = task # one of ("seq_label", "text_classify") - self.index2label = load_pickle(self.pickle_path, "id2class.pkl") - self.word2index = load_pickle(self.pickle_path, "word2id.pkl") + self.label_vocab = load_pickle(self.pickle_path, "class2id.pkl") + self.word_vocab = load_pickle(self.pickle_path, "word2id.pkl") def predict(self, network, data): """Perform inference using the trained model. @@ -82,7 +82,7 @@ class Predictor(object): :return data_set: a DataSet instance. 
""" assert isinstance(data, list) - return create_dataset_from_lists(data, self.word2index, has_target=False) + return create_dataset_from_lists(data, self.word_vocab, has_target=False) def prepare_output(self, data): """Transform list of batch outputs into strings.""" @@ -97,14 +97,14 @@ class Predictor(object): results = [] for batch in batch_outputs: for example in np.array(batch): - results.append([self.index2label[int(x)] for x in example]) + results.append([self.label_vocab.to_word(int(x)) for x in example]) return results def _text_classify_prepare_output(self, batch_outputs): results = [] for batch_out in batch_outputs: idx = np.argmax(batch_out.detach().numpy(), axis=-1) - results.extend([self.index2label[i] for i in idx]) + results.extend([self.label_vocab.to_word(i) for i in idx]) return results diff --git a/fastNLP/fastnlp.py b/fastNLP/fastnlp.py index c76e6681..e683950d 100644 --- a/fastNLP/fastnlp.py +++ b/fastNLP/fastnlp.py @@ -69,7 +69,7 @@ class FastNLP(object): :param model_dir: this directory should contain the following files: 1. a pre-trained model 2. a config file - 3. "id2class.pkl" + 3. "class2id.pkl" 4. "word2id.pkl" """ self.model_dir = model_dir @@ -99,10 +99,10 @@ class FastNLP(object): print("Restore model hyper-parameters {}".format(str(model_args.data))) # fetch dictionary size and number of labels from pickle files - word2index = load_pickle(self.model_dir, "word2id.pkl") - model_args["vocab_size"] = len(word2index) - index2label = load_pickle(self.model_dir, "id2class.pkl") - model_args["num_classes"] = len(index2label) + word_vocab = load_pickle(self.model_dir, "word2id.pkl") + model_args["vocab_size"] = len(word_vocab) + label_vocab = load_pickle(self.model_dir, "class2id.pkl") + model_args["num_classes"] = len(label_vocab) # Construct the model model = model_class(model_args) diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py index d0a22e84..0d5ae8c1 100644 --- a/reproduction/chinese_word_segment/run.py +++ b/reproduction/chinese_word_segment/run.py @@ -32,7 +32,7 @@ def infer(): # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") + index2label = load_pickle(pickle_path, "class2id.pkl") test_args["num_classes"] = len(index2label) @@ -105,7 +105,7 @@ def test(): # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") + index2label = load_pickle(pickle_path, "class2id.pkl") test_args["num_classes"] = len(index2label) # load dev data diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/pos_tag_model/train_pos_tag.py index 87a9f7e8..15164130 100644 --- a/reproduction/pos_tag_model/train_pos_tag.py +++ b/reproduction/pos_tag_model/train_pos_tag.py @@ -33,7 +33,7 @@ def infer(): # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") + index2label = load_pickle(pickle_path, "class2id.pkl") test_args["num_classes"] = len(index2label) # Define the same model @@ -105,7 +105,7 @@ def test(): # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] 
= len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") + index2label = load_pickle(pickle_path, "class2id.pkl") test_args["num_classes"] = len(index2label) # load dev data diff --git a/test/core/test_predictor.py b/test/core/test_predictor.py index c7ad65d7..411f636e 100644 --- a/test/core/test_predictor.py +++ b/test/core/test_predictor.py @@ -4,6 +4,7 @@ import unittest from fastNLP.core.predictor import Predictor from fastNLP.core.preprocess import save_pickle from fastNLP.models.sequence_modeling import SeqLabeling +from fastNLP.core.vocabulary import Vocabulary class TestPredictor(unittest.TestCase): @@ -23,10 +24,14 @@ class TestPredictor(unittest.TestCase): ['a', 'b', 'c', 'd', '$'], ['!', 'b', 'c', 'd', 'e'] ] - vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} + + vocab = Vocabulary() + vocab.word2idx = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} + class_vocab = Vocabulary() + class_vocab.word2idx = {"0":0, "1":1, "2":2, "3":3, "4":4} os.system("mkdir save") - save_pickle({0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}, "./save/", "id2class.pkl") + save_pickle(class_vocab, "./save/", "class2id.pkl") save_pickle(vocab, "./save/", "word2id.pkl") model = SeqLabeling(model_args) diff --git a/test/model/seq_labeling.py b/test/model/seq_labeling.py index d7750b17..cd011c0d 100644 --- a/test/model/seq_labeling.py +++ b/test/model/seq_labeling.py @@ -38,7 +38,7 @@ def infer(): # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") + index2label = load_pickle(pickle_path, "class2id.pkl") test_args["num_classes"] = len(index2label) # Define the same model diff --git a/test/model/test_cws.py b/test/model/test_cws.py index 802d97ba..70716c3a 100644 --- a/test/model/test_cws.py +++ b/test/model/test_cws.py @@ -27,7 +27,7 @@ def infer(): # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") + index2label = load_pickle(pickle_path, "class2id.pkl") test_args["num_classes"] = len(index2label) # Define the same model From e8cc702737f93afc285a90051f29788b40847523 Mon Sep 17 00:00:00 2001 From: yunfan Date: Thu, 20 Sep 2018 15:11:01 +0800 Subject: [PATCH 5/6] add default switch --- fastNLP/core/vocabulary.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 79b70939..ad618ff9 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -28,15 +28,25 @@ class Vocabulary(object): vocab["word"] vocab.to_word(5) """ - def __init__(self): - self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX) - self.padding_label = DEFAULT_PADDING_LABEL - self.unknown_label = DEFAULT_UNKNOWN_LABEL + def __init__(self, need_default=True): + """ + :param bool need_default: set if the Vocabulary has default labels reserved. 
+ """ + if need_default: + self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX) + self.padding_label = DEFAULT_PADDING_LABEL + self.unknown_label = DEFAULT_UNKNOWN_LABEL + else: + self.word2idx = {} + self.padding_label = None + self.unknown_label = None + + self.has_default = need_default self.idx2word = None def __len__(self): return len(self.word2idx) - + def update(self, word): """add word or list of words into Vocabulary @@ -73,9 +83,13 @@ class Vocabulary(object): return self[w] def unknown_idx(self): + if self.unknown_label is None: + return None return self.word2idx[self.unknown_label] def padding_idx(self): + if self.padding_label is None: + return None return self.word2idx[self.padding_label] def build_reverse_vocab(self): From 28a06838530645ede29ef1cf1f62dc30ef292b7c Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 22 Sep 2018 15:33:52 +0800 Subject: [PATCH 6/6] 1. add tests in test_fastNLP.py & test_sampler.py; increase test coverage to 81% 2. changes of names: aggregation ----> aggregator interaction ----> interactor action.py ----> sampler.py BasePreprocess ---> Preprocessor BaseTester ----> Tester BaseTrainer ----> Trainer 3. add more code comments 4. fix bugs in predictor's data_forward 5. in sampler.py, remove Bachifier, fix some codes. but not test 6. remove unused codes in other_modules.py & utils.py 7. update fastnlp.py with new config file names and code comments 8. add data examples in data_for_tests/ --- examples/readme_example.py | 4 +- fastNLP/core/batch.py | 55 +--- fastNLP/core/predictor.py | 8 +- fastNLP/core/preprocess.py | 31 +- fastNLP/core/{action.py => sampler.py} | 170 +++++------ fastNLP/core/tester.py | 20 +- fastNLP/core/trainer.py | 19 +- fastNLP/fastnlp.py | 15 +- fastNLP/loader/dataset_loader.py | 8 +- fastNLP/modules/__init__.py | 8 +- .../{aggregation => aggregator}/__init__.py | 0 .../{aggregation => aggregator}/attention.py | 0 .../{aggregation => aggregator}/avg_pool.py | 0 .../{aggregation => aggregator}/kmax_pool.py | 0 .../{aggregation => aggregator}/max_pool.py | 0 .../self_attention.py | 3 +- .../{interaction => interactor}/__init__.py | 0 fastNLP/modules/other_modules.py | 265 ----------------- fastNLP/modules/utils.py | 278 +++--------------- fastNLP/saver/model_saver.py | 17 +- .../main.py | 20 +- test/core/test_sampler.py | 30 ++ test/data_for_tests/conll_example.txt | 15 + test/data_for_tests/people_daily_raw.txt | 27 ++ .../{test_loader.py => test_config_loader.py} | 19 -- test/loader/test_dataset_loader.py | 42 +++ test/loader/test_loader2.py | 24 -- test/model/test_cws.py | 60 ++-- test/modules/test_other_modules.py | 3 +- test/modules/test_utils.py | 13 +- test/test_fastNLP.py | 206 ++++++++++--- 31 files changed, 506 insertions(+), 854 deletions(-) rename fastNLP/core/{action.py => sampler.py} (58%) rename fastNLP/modules/{aggregation => aggregator}/__init__.py (100%) rename fastNLP/modules/{aggregation => aggregator}/attention.py (100%) rename fastNLP/modules/{aggregation => aggregator}/avg_pool.py (100%) rename fastNLP/modules/{aggregation => aggregator}/kmax_pool.py (100%) rename fastNLP/modules/{aggregation => aggregator}/max_pool.py (100%) rename fastNLP/modules/{aggregation => aggregator}/self_attention.py (99%) rename fastNLP/modules/{interaction => interactor}/__init__.py (100%) create mode 100644 test/core/test_sampler.py create mode 100644 test/data_for_tests/conll_example.txt create mode 100644 test/data_for_tests/people_daily_raw.txt rename test/loader/{test_loader.py => test_config_loader.py} (69%) create mode 100644 
test/loader/test_dataset_loader.py delete mode 100644 test/loader/test_loader2.py diff --git a/examples/readme_example.py b/examples/readme_example.py index bc50c48b..74e20c57 100644 --- a/examples/readme_example.py +++ b/examples/readme_example.py @@ -5,7 +5,7 @@ from fastNLP.core.preprocess import ClassPreprocess from fastNLP.core.trainer import ClassificationTrainer from fastNLP.loader.dataset_loader import ClassDatasetLoader from fastNLP.models.base_model import BaseModel -from fastNLP.modules import aggregation +from fastNLP.modules import aggregator from fastNLP.modules import decoder from fastNLP.modules import encoder @@ -21,7 +21,7 @@ class ClassificationModel(BaseModel): self.emb = encoder.Embedding(nums=vocab_size, dims=300) self.enc = encoder.Conv( in_channels=300, out_channels=100, kernel_size=3) - self.agg = aggregation.MaxPool() + self.agg = aggregator.MaxPool() self.dec = decoder.MLP(size_layer=[100, num_classes]) def forward(self, x): diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 0a5e9712..8a73b132 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -2,10 +2,6 @@ from collections import defaultdict import torch -from fastNLP.core.dataset import DataSet -from fastNLP.core.field import TextField, LabelField -from fastNLP.core.instance import Instance - class Batch(object): """Batch is an iterable object which iterates over mini-batches. @@ -16,6 +12,14 @@ class Batch(object): """ def __init__(self, dataset, batch_size, sampler, use_cuda): + """ + + :param dataset: a DataSet object + :param batch_size: int, the size of the batch + :param sampler: a Sampler object + :param use_cuda: bool, whetjher to use GPU + + """ self.dataset = dataset self.batch_size = batch_size self.sampler = sampler @@ -81,46 +85,3 @@ class Batch(object): self.curidx += endidx return batch_x, batch_y - -if __name__ == "__main__": - """simple running example - """ - texts = ["i am a cat", - "this is a test of new batch", - "haha" - ] - labels = [0, 1, 0] - - # prepare vocabulary - vocab = {} - for text in texts: - for tokens in text.split(): - if tokens not in vocab: - vocab[tokens] = len(vocab) - print("vocabulary: ", vocab) - - # prepare input dataset - data = DataSet() - for text, label in zip(texts, labels): - x = TextField(text.split(), False) - y = LabelField(label, is_target=True) - ins = Instance(text=x, label=y) - data.append(ins) - - # use vocabulary to index data - data.index_field("text", vocab) - - - # define naive sampler for batch class - class SeqSampler: - def __call__(self, dataset): - return list(range(len(dataset))) - - - # use batch to iterate dataset - data_iterator = Batch(data, 2, SeqSampler(), False) - for epoch in range(1): - for batch_x, batch_y in data_iterator: - print(batch_x) - print(batch_y) - # do stuff diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index c83b2069..6bbb1bee 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -1,10 +1,10 @@ import numpy as np import torch -from fastNLP.core.action import SequentialSampler from fastNLP.core.batch import Batch from fastNLP.core.dataset import create_dataset_from_lists from fastNLP.core.preprocess import load_pickle +from fastNLP.core.sampler import SequentialSampler class Predictor(object): @@ -62,9 +62,13 @@ class Predictor(object): def data_forward(self, network, x): """Forward through network.""" - y = network(**x) if self._task == "seq_label": + y = network(x["word_seq"], x["word_seq_origin_len"]) y = network.prediction(y) + elif self._task == 
"text_classify": + y = network(x["word_seq"]) + else: + raise NotImplementedError("Unknown task type {}.".format(self._task)) return y def prepare_input(self, data): diff --git a/fastNLP/core/preprocess.py b/fastNLP/core/preprocess.py index 2671b4f4..619d3967 100644 --- a/fastNLP/core/preprocess.py +++ b/fastNLP/core/preprocess.py @@ -52,21 +52,28 @@ def pickle_exist(pickle_path, pickle_name): return False -class BasePreprocess(object): - """Base class of all preprocessors. - Preprocessors are responsible for converting data of strings into data of indices. +class Preprocessor(object): + """Preprocessors are responsible for converting data of strings into data of indices. During the pre-processing, the following pickle files will be built: - - "word2id.pkl", a mapping from words(tokens) to indices - - "id2word.pkl", a reversed dictionary + - "word2id.pkl", a Vocabulary object, mapping words to indices. + - "class2id.pkl", a Vocabulary object, mapping labels to indices. + - "data_train.pkl", a DataSet object for training + - "data_dev.pkl", a DataSet object for validation, if train_dev_split > 0. + - "data_test.pkl", a DataSet object for testing, if test_data is not None. These four pickle files are expected to be saved in the given pickle directory once they are constructed. Preprocessors will check if those files are already in the directory and will reuse them in future calls. """ - def __init__(self): + def __init__(self, label_is_seq=False): + """ + + :param label_is_seq: bool, whether label is a sequence. If True, label vocabulary will preserve + several special tokens for sequence processing. + """ self.data_vocab = Vocabulary() - self.label_vocab = Vocabulary() + self.label_vocab = Vocabulary(need_default=label_is_seq) @property def vocab_size(self): @@ -259,20 +266,20 @@ class BasePreprocess(object): return data_set -class SeqLabelPreprocess(BasePreprocess): +class SeqLabelPreprocess(Preprocessor): def __init__(self): - + print("[FastNLP warning] SeqLabelPreprocess is about to deprecate. Please use Preprocess directly.") super(SeqLabelPreprocess, self).__init__() - -class ClassPreprocess(BasePreprocess): +class ClassPreprocess(Preprocessor): def __init__(self): + print("[FastNLP warning] ClassPreprocess is about to deprecate. Please use Preprocess directly.") super(ClassPreprocess, self).__init__() if __name__ == "__main__": - p = BasePreprocess() + p = Preprocessor() train_dev_data = [[["I", "am", "a", "good", "student", "."], "0"], [["You", "are", "pretty", "."], "1"] ] diff --git a/fastNLP/core/action.py b/fastNLP/core/sampler.py similarity index 58% rename from fastNLP/core/action.py rename to fastNLP/core/sampler.py index c6cf2d63..79dd56c0 100644 --- a/fastNLP/core/action.py +++ b/fastNLP/core/sampler.py @@ -1,5 +1,3 @@ -from collections import Counter - import numpy as np import torch @@ -17,6 +15,56 @@ def convert_to_torch_tensor(data_list, use_cuda): return data_list +class BaseSampler(object): + """The base class of all samplers. + + Sub-classes must implement the __call__ method. + __call__ takes a DataSet object and returns a list of int - the sampling indices. + """ + + def __call__(self, *args, **kwargs): + raise NotImplementedError + + +class SequentialSampler(BaseSampler): + """Sample data in the original order. + + """ + + def __call__(self, data_set): + return list(range(len(data_set))) + + +class RandomSampler(BaseSampler): + """Sample data in random permutation order. 
+ + """ + + def __call__(self, data_set): + return list(np.random.permutation(len(data_set))) + + +def simple_sort_bucketing(lengths): + """ + + :param lengths: list of int, the lengths of all examples. + :param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length + threshold for each bucket (This is usually None.). + :return data: 2-level list + :: + + [ + [index_11, index_12, ...], # bucket 1 + [index_21, index_22, ...], # bucket 2 + ... + ] + + """ + lengths_mapping = [(idx, length) for idx, length in enumerate(lengths)] + sorted_lengths = sorted(lengths_mapping, key=lambda x: x[1]) + # TODO: need to return buckets + return [idx for idx, _ in sorted_lengths] + def k_means_1d(x, k, max_iter=100): """Perform k-means on 1-D data. @@ -46,18 +94,10 @@ def k_means_1d(x, k, max_iter=100): return np.array(centroids), assign -def k_means_bucketing(all_inst, buckets): +def k_means_bucketing(lengths, buckets): """Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths. - :param all_inst: 3-level list - E.g. :: - - [ - [[word_11, word_12, word_13], [label_11. label_12]], # sample 1 - [[word_21, word_22, word_23], [label_21. label_22]], # sample 2 - ... - ] - + :param lengths: list of int, the length of all samples. :param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length threshold for each bucket (This is usually None.). :return data: 2-level list @@ -72,7 +112,6 @@ def k_means_bucketing(all_inst, buckets): """ bucket_data = [[] for _ in buckets] num_buckets = len(buckets) - lengths = np.array([len(inst[0]) for inst in all_inst]) _, assignments = k_means_1d(lengths, num_buckets) for idx, bucket_id in enumerate(assignments): @@ -81,102 +120,33 @@ def k_means_bucketing(all_inst, buckets): return bucket_data -class BaseSampler(object): - """The base class of all samplers. - - """ - - def __call__(self, *args, **kwargs): - raise NotImplementedError - - -class SequentialSampler(BaseSampler): - """Sample data in the original order. - - """ - - def __call__(self, data_set): - return list(range(len(data_set))) - - -class RandomSampler(BaseSampler): - """Sample data in random permutation order. - - """ - - def __call__(self, data_set): - return list(np.random.permutation(len(data_set))) - - - -class Batchifier(object): - """Wrap random or sequential sampler to generate a mini-batch. - - """ - - def __init__(self, sampler, batch_size, drop_last=True): - """ - - :param sampler: a Sampler object - :param batch_size: int, the size of the mini-batch - :param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch. - - """ - super(Batchifier, self).__init__() - self.sampler = sampler - self.batch_size = batch_size - self.drop_last = drop_last - - def __iter__(self): - batch = [] - for example in self.sampler: - batch.append(example) - if len(batch) == self.batch_size: - yield batch - batch = [] - if 0 < len(batch) < self.batch_size and self.drop_last is False: - yield batch - - -class BucketBatchifier(Batchifier): +class BucketSampler(BaseSampler): """Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. In sampling, first random choose a bucket. Then sample data from it. The number of buckets is decided dynamically by the variance of sentence lengths. 
- TODO: merge it into Batch + """ - def __init__(self, data_set, batch_size, num_buckets, drop_last=True, sampler=None): + def __call__(self, data_set, batch_size, num_buckets): + return self._process(data_set, batch_size, num_buckets) + + def _process(self, data_set, batch_size, num_buckets, use_kmeans=False): """ - :param data_set: three-level list, shape [num_samples, 2] + :param data_set: a DataSet object :param batch_size: int :param num_buckets: int, number of buckets for grouping these sequences. - :param drop_last: bool, useless currently. - :param sampler: Sampler, useless currently. + :param use_kmeans: bool, whether to use k-means to create buckets. """ - super(BucketBatchifier, self).__init__(sampler, batch_size, drop_last) buckets = ([None] * num_buckets) - self.data = data_set - self.batch_size = batch_size - self.length_freq = dict(Counter([len(example) for example in data_set])) - self.buckets = k_means_bucketing(data_set, buckets) - - def __iter__(self): - """Make a min-batch of data.""" - for _ in range(len(self.data) // self.batch_size): - bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))] - np.random.shuffle(bucket_samples) - yield [self.data[idx] for idx in bucket_samples[:batch_size]] - - -if __name__ == "__main__": - import random - - data = [[[y] * random.randint(0, 50), [y]] for y in range(500)] - batch_size = 8 - iterator = iter(BucketBatchifier(data, batch_size, num_buckets=5)) - for d in iterator: - print("\nbatch:") - for dd in d: - print(len(dd[0]), end=" ") + if use_kmeans is True: + buckets = k_means_bucketing(data_set, buckets) + else: + buckets = simple_sort_bucketing(data_set) + index_list = [] + for _ in range(len(data_set) // batch_size): + chosen_bucket = buckets[np.random.randint(0, len(buckets))] + np.random.shuffle(chosen_bucket) + index_list += [idx for idx in chosen_bucket[:batch_size]] + return index_list diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index aaa96283..0a75f46a 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -1,32 +1,32 @@ import numpy as np import torch -from fastNLP.core.action import RandomSampler from fastNLP.core.batch import Batch +from fastNLP.core.sampler import RandomSampler from fastNLP.saver.logger import create_logger logger = create_logger(__name__, "./train_test.log") -class BaseTester(object): +class Tester(object): """An collection of model inference and evaluation of performance, used over validation/dev set and test set. """ def __init__(self, **kwargs): """ :param kwargs: a dict-like object that has __getitem__ method, can be accessed by "test_args["key_str"]" """ - super(BaseTester, self).__init__() + super(Tester, self).__init__() """ "default_args" provides default value for important settings. The initialization arguments "kwargs" with the same key (name) will override the default value. "kwargs" must have the same type as "default_args" on corresponding keys. Otherwise, error will raise. 
""" - default_args = {"save_output": False, # collect outputs of validation set - "save_loss": False, # collect losses in validation + default_args = {"save_output": True, # collect outputs of validation set + "save_loss": True, # collect losses in validation "save_best_dev": False, # save best model during validation "batch_size": 8, - "use_cuda": True, + "use_cuda": False, "pickle_path": "./save/", "model_name": "dev_best_model.pkl", "print_every_step": 1, @@ -55,7 +55,7 @@ class BaseTester(object): logger.error(msg) raise ValueError(msg) else: - # BaseTester doesn't care about extra arguments + # Tester doesn't care about extra arguments pass print(default_args) @@ -208,7 +208,7 @@ class BaseTester(object): return self.show_metrics() -class SeqLabelTester(BaseTester): +class SeqLabelTester(Tester): def __init__(self, **test_args): test_args.update({"task": "seq_label"}) print( @@ -216,9 +216,9 @@ class SeqLabelTester(BaseTester): super(SeqLabelTester, self).__init__(**test_args) -class ClassificationTester(BaseTester): +class ClassificationTester(Tester): def __init__(self, **test_args): - test_args.update({"task": "seq_label"}) + test_args.update({"task": "text_classify"}) print( "[FastNLP Warning] ClassificationTester will be deprecated. Please use Tester with argument 'task'='text_classify'.") super(ClassificationTester, self).__init__(**test_args) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index e638fdde..a73229b2 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -6,10 +6,10 @@ from datetime import timedelta import torch from tensorboardX import SummaryWriter -from fastNLP.core.action import RandomSampler from fastNLP.core.batch import Batch from fastNLP.core.loss import Loss from fastNLP.core.optimizer import Optimizer +from fastNLP.core.sampler import RandomSampler from fastNLP.core.tester import SeqLabelTester, ClassificationTester from fastNLP.saver.logger import create_logger from fastNLP.saver.model_saver import ModelSaver @@ -17,7 +17,7 @@ from fastNLP.saver.model_saver import ModelSaver logger = create_logger(__name__, "./train_test.log") -class BaseTrainer(object): +class Trainer(object): """Operations of training a model, including data loading, gradient descent, and validation. """ @@ -32,7 +32,7 @@ class BaseTrainer(object): - batch_size: int - pickle_path: str, the path to pickle files for pre-processing """ - super(BaseTrainer, self).__init__() + super(Trainer, self).__init__() """ "default_args" provides default value for important settings. @@ -40,8 +40,8 @@ class BaseTrainer(object): "kwargs" must have the same type as "default_args" on corresponding keys. Otherwise, error will raise. 
""" - default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/", - "save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1, + default_args = {"epochs": 1, "batch_size": 2, "validate": False, "use_cuda": False, "pickle_path": "./save/", + "save_best_dev": False, "model_name": "default_model_name.pkl", "print_every_step": 1, "loss": Loss(None), # used to pass type check "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0) } @@ -69,7 +69,7 @@ class BaseTrainer(object): logger.error(msg) raise ValueError(msg) else: - # BaseTrainer doesn't care about extra arguments + # Trainer doesn't care about extra arguments pass print(default_args) @@ -136,6 +136,9 @@ class BaseTrainer(object): # validation if self.validate: + if dev_data is None: + raise RuntimeError( + "self.validate is True in trainer, but dev_data is None. Please provide the validation data.") logger.info("validation started") validator.test(network, dev_data) @@ -314,7 +317,7 @@ class BaseTrainer(object): raise NotImplementedError -class SeqLabelTrainer(BaseTrainer): +class SeqLabelTrainer(Trainer): """Trainer for Sequence Labeling """ @@ -328,7 +331,7 @@ class SeqLabelTrainer(BaseTrainer): return SeqLabelTester(**valid_args) -class ClassificationTrainer(BaseTrainer): +class ClassificationTrainer(Trainer): """Trainer for text classification.""" def __init__(self, **train_args): diff --git a/fastNLP/fastnlp.py b/fastNLP/fastnlp.py index e683950d..4643c247 100644 --- a/fastNLP/fastnlp.py +++ b/fastNLP/fastnlp.py @@ -31,7 +31,7 @@ FastNLP_MODEL_COLLECTION = { "class": "sequence_modeling.AdvSeqLabel", "pickle": "cws_basic_model_v_0.pkl", "type": "seq_label", - "config_file_name": "config", + "config_file_name": "cws.cfg", "config_section_name": "text_class_model" }, "pos_tag_model": { @@ -39,7 +39,7 @@ FastNLP_MODEL_COLLECTION = { "class": "sequence_modeling.AdvSeqLabel", "pickle": "pos_tag_model_v_0.pkl", "type": "seq_label", - "config_file_name": "pos_tag.config", + "config_file_name": "pos_tag.cfg", "config_section_name": "pos_tag_model" }, "text_classify_model": { @@ -56,21 +56,22 @@ FastNLP_MODEL_COLLECTION = { class FastNLP(object): """ High-level interface for direct model inference. - Example Usage: + Example Usage + :: fastnlp = FastNLP() fastnlp.load("zh_pos_tag_model") text = "这是最好的基于深度学习的中文分词系统。" result = fastnlp.run(text) print(result) # ["这", "是", "最好", "的", "基于", "深度学习", "的", "中文", "分词", "系统", "。"] + """ def __init__(self, model_dir="./"): """ :param model_dir: this directory should contain the following files: - 1. a pre-trained model - 2. a config file - 3. "class2id.pkl" - 4. "word2id.pkl" + 1. a trained model + 2. a config file, which is a fastNLP's configuration. + 3. a Vocab file, which is a pickle object of a Vocab instance. 
""" self.model_dir = model_dir self.model = None diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index 2f03bd8a..72da209c 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -172,9 +172,8 @@ class ClassDatasetLoader(DatasetLoader): class ConllLoader(DatasetLoader): """loader for conll format files""" - def __int__(self, data_name, data_path): + def __int__(self, data_path): """ - :param str data_name: the name of the conll data set :param str data_path: the path to the conll data set """ super(ConllLoader, self).__init__(data_path) @@ -269,8 +268,3 @@ class PeopleDailyCorpusLoader(DatasetLoader): ner_examples.append([sent_words, sent_ner]) return pos_tag_examples, ner_examples -if __name__ == "__main__": - loader = PeopleDailyCorpusLoader("./") - pos, ner = loader.load() - print(pos[:10]) - print(ner[:10]) diff --git a/fastNLP/modules/__init__.py b/fastNLP/modules/__init__.py index 01b9f8af..21cb2886 100644 --- a/fastNLP/modules/__init__.py +++ b/fastNLP/modules/__init__.py @@ -1,11 +1,11 @@ -from . import aggregation +from . import aggregator from . import decoder from . import encoder -from . import interaction +from . import interactor __version__ = '0.0.0' __all__ = ['encoder', 'decoder', - 'aggregation', - 'interaction'] + 'aggregator', + 'interactor'] diff --git a/fastNLP/modules/aggregation/__init__.py b/fastNLP/modules/aggregator/__init__.py similarity index 100% rename from fastNLP/modules/aggregation/__init__.py rename to fastNLP/modules/aggregator/__init__.py diff --git a/fastNLP/modules/aggregation/attention.py b/fastNLP/modules/aggregator/attention.py similarity index 100% rename from fastNLP/modules/aggregation/attention.py rename to fastNLP/modules/aggregator/attention.py diff --git a/fastNLP/modules/aggregation/avg_pool.py b/fastNLP/modules/aggregator/avg_pool.py similarity index 100% rename from fastNLP/modules/aggregation/avg_pool.py rename to fastNLP/modules/aggregator/avg_pool.py diff --git a/fastNLP/modules/aggregation/kmax_pool.py b/fastNLP/modules/aggregator/kmax_pool.py similarity index 100% rename from fastNLP/modules/aggregation/kmax_pool.py rename to fastNLP/modules/aggregator/kmax_pool.py diff --git a/fastNLP/modules/aggregation/max_pool.py b/fastNLP/modules/aggregator/max_pool.py similarity index 100% rename from fastNLP/modules/aggregation/max_pool.py rename to fastNLP/modules/aggregator/max_pool.py diff --git a/fastNLP/modules/aggregation/self_attention.py b/fastNLP/modules/aggregator/self_attention.py similarity index 99% rename from fastNLP/modules/aggregation/self_attention.py rename to fastNLP/modules/aggregator/self_attention.py index d750ea54..b56e869b 100644 --- a/fastNLP/modules/aggregation/self_attention.py +++ b/fastNLP/modules/aggregator/self_attention.py @@ -1,8 +1,7 @@ import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F - +from torch.autograd import Variable from fastNLP.modules.utils import initial_parameter diff --git a/fastNLP/modules/interaction/__init__.py b/fastNLP/modules/interactor/__init__.py similarity index 100% rename from fastNLP/modules/interaction/__init__.py rename to fastNLP/modules/interactor/__init__.py diff --git a/fastNLP/modules/other_modules.py b/fastNLP/modules/other_modules.py index cd4b225f..0cd32d3b 100644 --- a/fastNLP/modules/other_modules.py +++ b/fastNLP/modules/other_modules.py @@ -1,19 +1,10 @@ -""" -This is borrowed from FudanParser. Not stable. Do not use !!! 
- -""" -import numpy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.data -from torch import optim -from torch.autograd import Function, Variable from torch.nn import Parameter -from .utils import orthogonal - class GroupNorm(nn.Module): def __init__(self, num_features, num_groups=20, eps=1e-5): @@ -59,15 +50,6 @@ class LayerNormalization(nn.Module): return ln_out -class OrthEmbedding(nn.Embedding): - def __init__(self, *args, **kwargs): - super(OrthEmbedding, self).__init__(*args, **kwargs) - - def reset_parameters(self): - self.weight = orthogonal(self.weight) - nn.init.constant_(self.bias, 0.) - - class BiLinear(nn.Module): def __init__(self, n_left, n_right, n_out, bias=True): """ @@ -241,250 +223,3 @@ class WordDropout(nn.Module): drop_mask = drop_mask.long() output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx return output - - -class WlossLayer(torch.nn.Module): - def __init__(self, lam=100, sinkhorn_iter=50): - super(WlossLayer, self).__init__() - - # cost = matrix M = distance matrix - # lam = lambda of type float > 0 - # sinkhorn_iter > 0 - # diagonal cost should be 0 - self.lam = lam - self.sinkhorn_iter = sinkhorn_iter - # self.register_buffer("K", torch.exp(-self.cost / self.lam).double()) - # self.register_buffer("KM", (self.cost * self.K).double()) - - def forward(self, pred, target, cost): - return WassersteinLossStab.apply(pred, target, - cost, self.lam, self.sinkhorn_iter) - - -class WassersteinLossStab(Function): - @staticmethod - def forward(ctx, pred, target, cost, lam=1e-3, sinkhorn_iter=4): - """pred: Batch * K: K = # mass points - target: Batch * L: L = # mass points""" - # import pdb - # pdb.set_trace() - eps = 1e-8 - - # pred = pred.gather(dim=1, index=) - na = pred.size(1) - nb = target.size(1) - - cost = cost.double() - pred = pred.double() - target = target.double() - - cost = cost[:na, :nb].double() - K = torch.exp(-cost / lam).double() - KM = (cost * K).double() - - batch_size = pred.size(0) - - # pdb.set_trace() - log_a, log_b = torch.log(pred + eps), torch.log(target + eps) - log_u = cost.new(batch_size, na).fill_(-numpy.log(na)) - log_v = cost.new(batch_size, nb).fill_(-numpy.log(nb)) - # import pdb - # pdb.set_trace() - for i in range(int(sinkhorn_iter)): - log_u_max = torch.max(log_u, dim=1)[0] - u_stab = torch.exp(log_u - log_u_max.unsqueeze(1) + eps) - log_v = log_b - torch.log(torch.mm(K.t(), u_stab.t()).t()) - log_u_max.unsqueeze(1) - log_v_max = torch.max(log_v, dim=1)[0] - v_stab = torch.exp(log_v - log_v_max.unsqueeze(1)) - tmp = log_u - log_u = log_a - torch.log(torch.mm(K, v_stab.t()).t() + eps) - log_v_max.unsqueeze(1) - # print(log_u.sum()) - if torch.norm(tmp - log_u) / torch.norm(log_u) < eps: - break - - log_v_max = torch.max(log_v, dim=1)[0] - v_stab = torch.exp(log_v - log_v_max.unsqueeze(1)) - logcostpart1 = torch.log(torch.mm(KM, v_stab.t()).t() + eps) + log_v_max.unsqueeze(1) - wnorm = torch.exp(log_u + logcostpart1).mean(0).sum() # sum(1) for per item pair loss... 
- grad_input = log_u * lam - # print("log_u", log_u) - grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1) - grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1) - grad_input = grad_input / batch_size - - ctx.save_for_backward(grad_input) - # print("grad type", type(grad_input)) - - return pred.new((wnorm,)), grad_input - - @staticmethod - def backward(ctx, grad_output, _): - grad_input = ctx.saved_variables - # print(grad) - res = grad_output.clone() - res.data.resize_(grad_input[0].size()).copy_(grad_input[0].data) - res = res.mul_(grad_output[0]).float() - # print("in backward func:\n\n", res) - return res, None, None, None, None, None, None - - -class Sinkhorn(Function): - def __init__(self): - super(Sinkhorn, self).__init__() - - def forward(ctx, a, b, M, reg, tau, warmstart, numItermax, stop): - a = a.double() - b = b.double() - M = M.double() - - nbb = b.size(1) - - # init data - na = len(a) - nb = len(b) - - cpt = 0 - - # we assume that no distances are null except those of the diagonal of - # distances - if warmstart is None: - alpha, beta = np.zeros(na), np.zeros(nb) - else: - alpha, beta = warmstart - - if nbb: - u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb - else: - u, v = np.ones(na) / na, np.ones(nb) / nb - - def get_K(alpha, beta): - """log space computation""" - return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg) - - def get_Gamma(alpha, beta, u, v): - """log space gamma computation""" - return np.exp( - -(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log( - v.reshape((1, nb)))) - - # print(np.min(K)) - - K = get_K(alpha, beta) - transp = K - cpt = 0 - err = 1 - while 1: - - uprev = u - vprev = v - - # sinkhorn update - v = b / (np.dot(K.T, u) + 1e-16) - u = a / (np.dot(K, v) + 1e-16) - - # remove numerical problems and store them in K - if np.abs(u).max() > tau or np.abs(v).max() > tau: - if nbb: - alpha, beta = alpha + reg * \ - np.max(np.log(u), 1), beta + reg * np.max(np.log(v)) - else: - alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v) - if nbb: - u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb - else: - u, v = np.ones(na) / na, np.ones(nb) / nb - K = get_K(alpha, beta) - - if cpt % print_period == 0: - # we can speed up the process by checking for the error only all - # the 10th iterations - if nbb: - err = np.sum((u - uprev) ** 2) / np.sum((u) ** 2) + \ - np.sum((v - vprev) ** 2) / np.sum((v) ** 2) - else: - transp = get_Gamma(alpha, beta, u, v) - err = np.linalg.norm((np.sum(transp, axis=0) - b)) ** 2 - if log: - log['err'].append(err) - - if verbose: - if cpt % (print_period * 20) == 0: - print( - '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) - print('{:5d}|{:8e}|'.format(cpt, err)) - - if err <= stopThr: - loop = False - - if cpt >= numItermax: - loop = False - - if np.any(np.isnan(u)) or np.any(np.isnan(v)): - # we have reached the machine precision - # come back to previous solution and quit loop - print('Warning: numerical errors at iteration', cpt) - u = uprev - v = vprev - break - - cpt = cpt + 1 - - # print('err=',err,' cpt=',cpt) - if log: - log['logu'] = alpha / reg + np.log(u) - log['logv'] = beta / reg + np.log(v) - log['alpha'] = alpha + reg * np.log(u) - log['beta'] = beta + reg * np.log(v) - log['warmstart'] = (log['alpha'], log['beta']) - if nbb: - res = np.zeros((nbb)) - for i in range(nbb): - res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) - return res, log - - else: - return get_Gamma(alpha, 
beta, u, v), log - else: - if nbb: - res = np.zeros((nbb)) - for i in range(nbb): - res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) - return res - else: - return get_Gamma(alpha, beta, u, v) - - -if __name__ == "__main__": - cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))) # .cuda() - mylayer = WlossLayer(cost) # .cuda() - inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True) # .cuda() - ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])) # .cuda() - - res, _ = mylayer(inp, ground_true) - # print(inp.requires_grad, res.requires_grad) - # print(res, inp) - mylayer.zero_grad() - res.backward() - print("inp's gradient is good:") - print(inp.grad) - - print("convert to gpu:\n", inp.cuda().grad) - print("==============================================" - "\n However, this does not work on pytorch when GPU is enabled") - - cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))).cuda() - mylayer = WlossLayer(cost).cuda() - inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True).cuda() - ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])).cuda() - - opt = optim.SGD([ - {'params': mylayer.parameters()}, - ], lr=1e-2, momentum=0.9) - - res, _ = mylayer(inp, ground_true) - # print(inp.requires_grad, res.requires_grad) - # print(res, inp) - mylayer.zero_grad() - res.backward() - print("input's gradient is None!!!!!!!!!!!!!!!!") - print(inp.grad) diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index 22139668..12efe1c8 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -1,9 +1,8 @@ -from collections import defaultdict - -import numpy as np import torch -import torch.nn.init as init import torch.nn as nn +import torch.nn.init as init + + def mask_softmax(matrix, mask): if mask is None: result = torch.nn.functional.softmax(matrix, dim=-1) @@ -11,13 +10,28 @@ def mask_softmax(matrix, mask): raise NotImplementedError return result -def initial_parameter(net ,initial_method =None): +def initial_parameter(net, initial_method=None): + """A method used to initialize the weights of PyTorch models. 
+ + :param net: a PyTorch model + :param initial_method: str, one of the following initializations + + - xavier_uniform + - xavier_normal (default) + - kaiming_normal, or msra + - kaiming_uniform + - orthogonal + - sparse + - normal + - uniform + + """ if initial_method == 'xavier_uniform': init_method = init.xavier_uniform_ - elif initial_method=='xavier_normal': + elif initial_method == 'xavier_normal': init_method = init.xavier_normal_ - elif initial_method == 'kaiming_normal' or initial_method =='msra': + elif initial_method == 'kaiming_normal' or initial_method == 'msra': init_method = init.kaiming_normal elif initial_method == 'kaiming_uniform': init_method = init.kaiming_normal @@ -25,263 +39,49 @@ def initial_parameter(net ,initial_method =None): init_method = init.orthogonal_ elif initial_method == 'sparse': init_method = init.sparse_ - elif initial_method =='normal': + elif initial_method == 'normal': init_method = init.normal_ - elif initial_method =='uniform': + elif initial_method == 'uniform': initial_method = init.uniform_ else: init_method = init.xavier_normal_ + def weights_init(m): # classname = m.__class__.__name__ - if isinstance(m, nn.Conv2d) or isinstance(m,nn.Conv1d) or isinstance(m,nn.Conv3d): # for all the cnn - if initial_method != None: + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv3d): # for all the cnn + if initial_method is not None: init_method(m.weight.data) else: init.xavier_normal_(m.weight.data) init.normal_(m.bias.data) elif isinstance(m, nn.LSTM): for w in m.parameters(): - if len(w.data.size())>1: + if len(w.data.size()) > 1: init_method(w.data) # weight else: init.normal_(w.data) # bias elif hasattr(m, 'weight') and m.weight.requires_grad: init_method(m.weight.data) else: - for w in m.parameters() : - if w.requires_grad: - if len(w.data.size())>1: + for w in m.parameters(): + if w.requires_grad: + if len(w.data.size()) > 1: init_method(w.data) # weight else: init.normal_(w.data) # bias # print("init else") - net.apply(weights_init) - -def seq_mask(seq_len, max_len): - mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)] - mask = torch.stack(mask, 1) - return mask - - -""" - Codes from FudanParser. Not tested. Do not use !!! -""" - - -def expand_gt(gt): - """expand_gt: Expand ground truth to matrix - Arguments: - gt: tensor of (n, l) - Return: - f: ground truth matrix of (n, l), $gt[i][j] = k$ leads to $f[i][j][k] = 1$. 
- """ - n, l = gt.shape - ret = torch.zeros(n, l, l).long() - for i in range(n): - ret[i][torch.arange(l).long(), gt[i]] = 1 - return ret - - -def greedy_decoding(arc_f): - """greedy_decoding - Arguments: - arc_f: a tensor in shape of (n, l+1, l+1) - length of the sentence is l and index 0 is - Output: - arc_pred: a tensor in shape of (n, l), indicating the head words - """ - - f_arc = arc_f[:, 1:, :] # ignore the root - _, arc_pred = torch.max(f_arc.data, dim=-1, keepdim=False) - return arc_pred - - -def mst_decoding(arc_f): - batch_size = arc_f.shape[0] - length = arc_f.shape[1] - arc_score = arc_f.data.cpu() - pred_collection = [] - for i in range(batch_size): - head = mst(arc_score[i].numpy()) - pred_collection.append(head[1:].reshape((1, length - 1))) - arc_pred = torch.LongTensor(np.concatenate(pred_collection, axis=0)).type_as(arc_f).long() - return arc_pred - -def outer_product(features): - """InterProduct: Get inter sequence product of features - Arguments: - features: feature vectors of sequence in the shape of (n, l, h) - Return: - f: product result in (n, l, l, h) shape - """ - n, l, c = features.shape - features = features.contiguous() - x = features.view(n, l, 1, c) - x = x.expand(n, l, l, c) - y = features.view(n, 1, l, c).contiguous() - y = y.expand(n, l, l, c) - return x * y - - -def outer_concat(features): - """InterProduct: Get inter sequence concatenation of features - Arguments: - features: feature vectors of sequence in the shape of (n, l, h) - Return: - f: product result in (n, l, l, h) shape - """ - n, l, c = features.shape - x = features.contiguous().view(n, l, 1, c) - x = x.expand(n, l, l, c) - y = features.view(n, 1, l, c) - y = y.expand(n, l, l, c) - return torch.cat((x, y), dim=3) - - -def mst(scores): - """ - https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692 # NOQA - """ - length = scores.shape[0] - min_score = scores.min() - 1 - eye = np.eye(length) - scores = scores * (1 - eye) + min_score * eye - heads = np.argmax(scores, axis=1) - heads[0] = 0 - tokens = np.arange(1, length) - roots = np.where(heads[tokens] == 0)[0] + 1 - if len(roots) < 1: - root_scores = scores[tokens, 0] - head_scores = scores[tokens, heads[tokens]] - new_root = tokens[np.argmax(root_scores / head_scores)] - heads[new_root] = 0 - elif len(roots) > 1: - root_scores = scores[roots, 0] - scores[roots, 0] = 0 - new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1 - new_root = roots[np.argmin( - scores[roots, new_heads] / root_scores)] - heads[roots] = new_heads - heads[new_root] = 0 - - edges = defaultdict(set) - vertices = set((0,)) - for dep, head in enumerate(heads[tokens]): - vertices.add(dep + 1) - edges[head].add(dep + 1) - for cycle in _find_cycle(vertices, edges): - dependents = set() - to_visit = set(cycle) - while len(to_visit) > 0: - node = to_visit.pop() - if node not in dependents: - dependents.add(node) - to_visit.update(edges[node]) - cycle = np.array(list(cycle)) - old_heads = heads[cycle] - old_scores = scores[cycle, old_heads] - non_heads = np.array(list(dependents)) - scores[np.repeat(cycle, len(non_heads)), - np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score - new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1 - new_scores = scores[cycle, new_heads] / old_scores - change = np.argmax(new_scores) - changed_cycle = cycle[change] - old_head = old_heads[change] - new_head = new_heads[change] - heads[changed_cycle] = new_head - edges[new_head].add(changed_cycle) - 
edges[old_head].remove(changed_cycle) - - return heads - - -def _find_cycle(vertices, edges): - """ - https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA - https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA - """ - _index = 0 - _stack = [] - _indices = {} - _lowlinks = {} - _onstack = defaultdict(lambda: False) - _SCCs = [] - - def _strongconnect(v): - nonlocal _index - _indices[v] = _index - _lowlinks[v] = _index - _index += 1 - _stack.append(v) - _onstack[v] = True - - for w in edges[v]: - if w not in _indices: - _strongconnect(w) - _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) - elif _onstack[w]: - _lowlinks[v] = min(_lowlinks[v], _indices[w]) - - if _lowlinks[v] == _indices[v]: - SCC = set() - while True: - w = _stack.pop() - _onstack[w] = False - SCC.add(w) - if not (w != v): - break - _SCCs.append(SCC) + net.apply(weights_init) - for v in vertices: - if v not in _indices: - _strongconnect(v) - return [SCC for SCC in _SCCs if len(SCC) > 1] +def seq_mask(seq_len, max_len): + """Create sequence mask. + :param seq_len: list of int, the lengths of sequences in a batch. + :param max_len: int, the maximum sequence length in a batch. + :return mask: torch.LongTensor, [batch_size, max_len] -# https://github.com/alykhantejani/nninit/blob/master/nninit.py -def orthogonal(tensor, gain=1): - """Fills the input Tensor or Variable with a (semi) orthogonal matrix. The input tensor must have at least 2 dimensions, - and for tensors with more than 2 dimensions the trailing dimensions are flattened. viewed as 2D representation with - rows equal to the first dimension and columns equal to the product of as a sparse matrix, where the non-zero elements - will be drawn from a normal distribution with mean=0 and std=`std`. - Reference: "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks" - Saxe, A. et al. 
- Args: - tensor: a n-dimension torch.Tensor, where n >= 2 - gain: optional gain to be applied - Examples: - >>> w = torch.Tensor(3, 5) - >>> nninit.orthogonal(w) """ - if tensor.ndimension() < 2: - raise ValueError("Only tensors with 2 or more dimensions are supported.") - - flattened_shape = (tensor.size(0), int(np.prod(tensor.detach().numpy().shape[1:]))) - flattened = torch.Tensor(flattened_shape[0], flattened_shape[1]).normal_(0, 1) - - u, s, v = np.linalg.svd(flattened.numpy(), full_matrices=False) - if u.shape == flattened.detach().numpy().shape: - tensor.view_as(flattened).copy_(torch.from_numpy(u)) - else: - tensor.view_as(flattened).copy_(torch.from_numpy(v)) - - tensor.mul_(gain) - with torch.no_grad(): - return tensor - - -def generate_step_dropout(masks, hidden_dim, step_dropout, training=False): - # assume batch first - # import pdb - # pdb.set_trace() - - batch, length = masks.size() - if not training: - return torch.ones(batch, length, hidden_dim).fill_(1 - step_dropout).cuda(masks.device) * masks.view(batch, - length, 1) - masked = torch.zeros(batch, 1, hidden_dim).fill_(step_dropout) - masked = torch.bernoulli(masked).repeat(1, length, 1) - masked = masked.cuda(masks.device) * masks.view(batch, length, 1) - return masked + mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)] + mask = torch.stack(mask, 1) + return mask diff --git a/fastNLP/saver/model_saver.py b/fastNLP/saver/model_saver.py index 81690740..74518a44 100644 --- a/fastNLP/saver/model_saver.py +++ b/fastNLP/saver/model_saver.py @@ -2,16 +2,23 @@ import torch class ModelSaver(object): - """Save a models""" + """Save a model + Example:: + saver = ModelSaver("./save/model_ckpt_100.pkl") + saver.save_pytorch(model) + """ def __init__(self, save_path): + """ + + :param save_path: str, the path to the saving directory. + """ self.save_path = save_path - # TODO: check whether the path exist, if not exist, create it. def save_pytorch(self, model): - """ - Save a pytorch model into .pkl file. + """Save a pytorch model into .pkl file. 
+ :param model: a PyTorch model - :return: + """ torch.save(model.state_dict(), self.save_path) diff --git a/reproduction/LSTM+self_attention_sentiment_analysis/main.py b/reproduction/LSTM+self_attention_sentiment_analysis/main.py index 3b11f6be..b69a64ba 100644 --- a/reproduction/LSTM+self_attention_sentiment_analysis/main.py +++ b/reproduction/LSTM+self_attention_sentiment_analysis/main.py @@ -1,23 +1,15 @@ - -import os - import torch.nn.functional as F -from fastNLP.loader.dataset_loader import ClassDatasetLoader as Dataset_loader -from fastNLP.loader.embed_loader import EmbedLoader as EmbedLoader -from fastNLP.loader.config_loader import ConfigSection +from fastNLP.core.preprocess import ClassPreprocess as Preprocess +from fastNLP.core.trainer import ClassificationTrainer from fastNLP.loader.config_loader import ConfigLoader - +from fastNLP.loader.config_loader import ConfigSection +from fastNLP.loader.dataset_loader import ClassDatasetLoader as Dataset_loader from fastNLP.models.base_model import BaseModel - -from fastNLP.core.preprocess import ClassPreprocess as Preprocess -from fastNLP.core.trainer import ClassificationTrainer - +from fastNLP.modules.aggregator.self_attention import SelfAttention +from fastNLP.modules.decoder.MLP import MLP from fastNLP.modules.encoder.embedding import Embedding as Embedding from fastNLP.modules.encoder.lstm import Lstm -from fastNLP.modules.aggregation.self_attention import SelfAttention -from fastNLP.modules.decoder.MLP import MLP - train_data_path = 'small_train_data.txt' dev_data_path = 'small_dev_data.txt' diff --git a/test/core/test_sampler.py b/test/core/test_sampler.py new file mode 100644 index 00000000..179d20d7 --- /dev/null +++ b/test/core/test_sampler.py @@ -0,0 +1,30 @@ +import torch + +from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler + + +def test_convert_to_torch_tensor(): + data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 3, 4, 5, 2]] + ans = convert_to_torch_tensor(data, False) + assert isinstance(ans, torch.Tensor) + assert tuple(ans.shape) == (3, 5) + + +def test_sequential_sampler(): + sampler = SequentialSampler() + data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10] + for idx, i in enumerate(sampler(data)): + assert idx == i + + +def test_random_sampler(): + sampler = RandomSampler() + data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10] + ans = [data[i] for i in sampler(data)] + assert len(ans) == len(data) + for d in ans: + assert d in data + + +if __name__ == "__main__": + test_sequential_sampler() diff --git a/test/data_for_tests/conll_example.txt b/test/data_for_tests/conll_example.txt new file mode 100644 index 00000000..14fac0ad --- /dev/null +++ b/test/data_for_tests/conll_example.txt @@ -0,0 +1,15 @@ +1 I _ PRP PRP _ 2 SUB +2 solved _ VBD VBD _ 0 ROOT +3 the _ DT DT _ 4 NMOD +4 problem _ NN NN _ 2 OBJ +5 with _ IN IN _ 2 VMOD +6 statistics _ NNS NNS _ 5 PMOD +7 . _ . . _ 2 P + +1 I _ PRP PRP _ 2 SUB +2 solved _ VBD VBD _ 0 ROOT +3 the _ DT DT _ 4 NMOD +4 problem _ NN NN _ 2 OBJ +5 with _ IN IN _ 2 VMOD +6 statistics _ NNS NNS _ 5 PMOD +7 . _ . . 
_ 2 P diff --git a/test/data_for_tests/people_daily_raw.txt b/test/data_for_tests/people_daily_raw.txt new file mode 100644 index 00000000..8255edb6 --- /dev/null +++ b/test/data_for_tests/people_daily_raw.txt @@ -0,0 +1,27 @@ +19980101-01-001-001/m 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n ——/w 一九九八年/t 新年/t 讲话/n (/w 附/v 图片/n 1/m 张/q )/w +19980101-01-001-002/m 中共中央/nt 总书记/n 、/w 国家/n 主席/n 江/nr 泽民/nr +19980101-01-001-003/m (/w 一九九七年/t 十二月/t 三十一日/t )/w +19980101-01-001-004/m 12月/t 31日/t ,/w 中共中央/nt 总书记/n 、/w 国家/n 主席/n 江/nr 泽民/nr 发表/v 1998年/t 新年/t 讲话/n 《/w 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n 》/w 。/w (/w 新华社/nt 记者/n 兰/nr 红光/nr 摄/Vg )/w +19980101-01-001-005/m 同胞/n 们/k 、/w 朋友/n 们/k 、/w 女士/n 们/k 、/w 先生/n 们/k :/w +19980101-01-001-006/m 在/p 1998年/t 来临/v 之际/f ,/w 我/r 十分/m 高兴/a 地/u 通过/p [中央/n 人民/n 广播/vn 电台/n]nt 、/w [中国/ns 国际/n 广播/vn 电台/n]nt 和/c [中央/n 电视台/n]nt ,/w 向/p 全国/n 各族/r 人民/n ,/w 向/p [香港/ns 特别/a 行政区/n]ns 同胞/n 、/w 澳门/ns 和/c 台湾/ns 同胞/n 、/w 海外/s 侨胞/n ,/w 向/p 世界/n 各国/r 的/u 朋友/n 们/k ,/w 致以/v 诚挚/a 的/u 问候/vn 和/c 良好/a 的/u 祝愿/vn !/w +19980101-01-001-007/m 1997年/t ,/w 是/v 中国/ns 发展/vn 历史/n 上/f 非常/d 重要/a 的/u 很/d 不/d 平凡/a 的/u 一/m 年/q 。/w 中国/ns 人民/n 决心/d 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n ,/w 继续/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 事业/n 推向/v 前进/v 。/w [中国/ns 政府/n]nt 顺利/ad 恢复/v 对/p 香港/ns 行使/v 主权/n ,/w 并/c 按照/p “/w 一国两制/j ”/w 、/w “/w 港人治港/l ”/w 、/w 高度/d 自治/v 的/u 方针/n 保持/v 香港/ns 的/u 繁荣/an 稳定/an 。/w [中国/ns 共产党/n]nt 成功/a 地/u 召开/v 了/u 第十五/m 次/q 全国/n 代表大会/n ,/w 高举/v 邓小平理论/n 伟大/a 旗帜/n ,/w 总结/v 百年/m 历史/n ,/w 展望/v 新/a 的/u 世纪/n ,/w 制定/v 了/u 中国/ns 跨/v 世纪/n 发展/v 的/u 行动/vn 纲领/n 。/w +19980101-01-001-008/m 在/p 这/r 一/m 年/q 中/f ,/w 中国/ns 的/u 改革/vn 开放/vn 和/c 现代化/vn 建设/vn 继续/v 向前/v 迈进/v 。/w 国民经济/n 保持/v 了/u “/w 高/a 增长/vn 、/w 低/a 通胀/j ”/w 的/u 良好/a 发展/vn 态势/n 。/w 农业/n 生产/vn 再次/d 获得/v 好/a 的/u 收成/n ,/w 企业/n 改革/vn 继续/v 深化/v ,/w 人民/n 生活/vn 进一步/d 改善/v 。/w 对外/vn 经济/n 技术/n 合作/vn 与/c 交流/vn 不断/d 扩大/v 。/w 民主/a 法制/n 建设/vn 、/w 精神文明/n 建设/vn 和/c 其他/r 各项/r 事业/n 都/d 有/v 新/a 的/u 进展/vn 。/w 我们/r 十分/m 关注/v 最近/t 一个/m 时期/n 一些/m 国家/n 和/c 地区/n 发生/v 的/u 金融/n 风波/n ,/w 我们/r 相信/v 通过/p 这些/r 国家/n 和/c 地区/n 的/u 努力/an 以及/c 有关/v 的/u 国际/n 合作/vn ,/w 情况/n 会/v 逐步/d 得到/v 缓解/vn 。/w 总的来说/c ,/w 中国/ns 改革/v 和/c 发展/v 的/u 全局/n 继续/v 保持/v 了/u 稳定/an 。/w +19980101-01-001-009/m 在/p 这/r 一/m 年/q 中/f ,/w 中国/ns 的/u 外交/n 工作/vn 取得/v 了/u 重要/a 成果/n 。/w 通过/p 高层/n 互访/v ,/w 中国/ns 与/p 美国/ns 、/w 俄罗斯/ns 、/w 法国/ns 、/w 日本/ns 等/u 大国/n 确定/v 了/u 双方/n 关系/n 未来/t 发展/v 的/u 目标/n 和/c 指导/vn 方针/n 。/w 中国/ns 与/p 周边/n 国家/n 和/c 广大/b 发展中国家/l 的/u 友好/a 合作/vn 进一步/d 加强/v 。/w 中国/ns 积极/ad 参与/v [亚/j 太/j 经合/j 组织/n]nt 的/u 活动/vn ,/w 参加/v 了/u 东盟/ns —/w 中/j 日/j 韩/j 和/c 中国/ns —/w 东盟/ns 首脑/n 非正式/b 会晤/vn 。/w 这些/r 外交/n 活动/vn ,/w 符合/v 和平/n 与/c 发展/v 的/u 时代/n 主题/n ,/w 顺应/v 世界/n 走向/v 多极化/v 的/u 趋势/n ,/w 对于/p 促进/v 国际/n 社会/n 的/u 友好/a 合作/vn 和/c 共同/b 发展/vn 作出/v 了/u 积极/a 的/u 贡献/n 。/w +19980101-01-001-010/m 1998年/t ,/w 中国/ns 人民/n 将/d 满怀信心/l 地/u 开创/v 新/a 的/u 业绩/n 。/w 尽管/c 我们/r 在/p 经济/n 社会/n 发展/v 中/f 还/d 面临/v 不少/m 困难/an ,/w 但/c 我们/r 有/v 邓小平理论/n 的/u 指引/vn ,/w 有/v 改革/v 开放/v 近/a 20/m 年/q 来/f 取得/v 的/u 伟大/a 成就/n 和/c 积累/v 的/u 丰富/a 经验/n ,/w 还/d 有/v 其他/r 的/u 各种/r 有利/a 条件/n ,/w 我们/r 一定/d 能够/v 克服/v 这些/r 困难/an ,/w 继续/v 稳步前进/l 。/w 只要/c 我们/r 进一步/d 解放思想/i ,/w 实事求是/i ,/w 抓住/v 机遇/n ,/w 开拓进取/l ,/w 建设/v 有/v 中国/ns 特色/n 社会主义/n 的/u 道路/n 就/c 会/v 越/d 走/v 越/d 宽广/a 。/w +19980101-01-001-011/m 实现/v 祖国/n 的/u 完全/a 统一/vn ,/w 是/v 海内外/s 全体/n 中国/ns 人/n 的/u 共同/b 心愿/n 。/w 通过/p 中/j 葡/j 双方/n 的/u 合作/vn 和/c 努力/an ,/w 按照/p “/w 一国两制/j ”/w 方针/n 和/c 澳门/ns 《/w 基本法/n 》/w ,/w 1999年/t 12月/t 澳门/ns 的/u 回归/vn 一定/d 能够/v 顺利/ad 实现/v 。/w +19980101-01-001-012/m 台湾/ns 是/v 中国/ns 领土/n 不可分割/l 的/u 一/m 部分/n 。/w 完成/v 祖国/n 统一/vn ,/w 
是/v 大势所趋/i ,/w 民心所向/l 。/w 任何/r 企图/v 制造/v “/w 两/m 个/q 中国/ns ”/w 、/w “/w 一中一台/j ”/w 、/w “/w 台湾/ns 独立/v ”/w 的/u 图谋/n ,/w 都/d 注定/v 要/v 失败/v 。/w 希望/v 台湾/ns 当局/n 以/p 民族/n 大义/n 为重/v ,/w 拿/v 出/v 诚意/n ,/w 采取/v 实际/a 的/u 行动/vn ,/w 推动/v 两岸/n 经济/n 文化/n 交流/vn 和/c 人员/n 往来/vn ,/w 促进/v 两岸/n 直接/ad 通邮/v 、/w 通航/v 、/w 通商/v 的/u 早日/d 实现/v ,/w 并/c 尽早/d 回应/v 我们/r 发出/v 的/u 在/p 一个/m 中国/ns 的/u 原则/n 下/f 两岸/n 进行/v 谈判/vn 的/u 郑重/a 呼吁/vn 。/w +19980101-01-001-013/m 环顾/v 全球/n ,/w 日益/d 密切/a 的/u 世界/n 经济/n 联系/vn ,/w 日新月异/i 的/u 科技/n 进步/vn ,/w 正在/d 为/p 各国/r 经济/n 的/u 发展/vn 提供/v 历史/n 机遇/n 。/w 但是/c ,/w 世界/n 还/d 不/d 安宁/a 。/w 南北/f 之间/f 的/u 贫富/n 差距/n 继续/v 扩大/v ;/w 局部/n 冲突/vn 时有发生/l ;/w 不/d 公正/a 不/d 合理/a 的/u 旧/a 的/u 国际/n 政治/n 经济/n 秩序/n 还/d 没有/v 根本/a 改变/vn ;/w 发展中国家/l 在/p 激烈/a 的/u 国际/n 经济/n 竞争/vn 中/f 仍/d 处于/v 弱势/n 地位/n ;/w 人类/n 的/u 生存/vn 与/c 发展/vn 还/d 面临/v 种种/q 威胁/vn 和/c 挑战/vn 。/w 和平/n 与/c 发展/vn 的/u 前景/n 是/v 光明/a 的/u ,/w 21/m 世纪/n 将/d 是/v 充满/v 希望/n 的/u 世纪/n 。/w 但/c 前进/v 的/u 道路/n 不/d 会/v 也/d 不/d 可能/v 一帆风顺/i ,/w 关键/n 是/v 世界/n 各国/r 人民/n 要/v 进一步/d 团结/a 起来/v ,/w 共同/d 推动/v 早日/d 建立/v 公正/a 合理/a 的/u 国际/n 政治/n 经济/n 新/a 秩序/n 。/w +19980101-01-001-014/m [中国/ns 政府/n]nt 将/d 继续/v 坚持/v 奉行/v 独立自主/i 的/u 和平/n 外交/n 政策/n ,/w 在/p 和平共处/l 五/m 项/q 原则/n 的/u 基础/n 上/f 努力/ad 发展/v 同/p 世界/n 各国/r 的/u 友好/a 关系/n 。/w 中国/ns 愿意/v 加强/v 同/p 联合国/nt 和/c 其他/r 国际/n 组织/n 的/u 协调/vn ,/w 促进/v 在/p 扩大/v 经贸/j 科技/n 交流/vn 、/w 保护/v 环境/n 、/w 消除/v 贫困/an 、/w 打击/v 国际/n 犯罪/vn 等/u 方面/n 的/u 国际/n 合作/vn 。/w 中国/ns 永远/d 是/v 维护/v 世界/n 和平/n 与/c 稳定/an 的/u 重要/a 力量/n 。/w 中国/ns 人民/n 愿/v 与/p 世界/n 各国/r 人民/n 一道/d ,/w 为/p 开创/v 持久/a 和平/n 、/w 共同/d 发展/v 的/u 新/a 世纪/n 而/c 不懈努力/l !/w +19980101-01-001-015/m 在/p 这/r 辞旧迎新/l 的/u 美好/a 时刻/n ,/w 我/r 祝/v 大家/r 新年/t 快乐/a ,/w 家庭/n 幸福/a !/w +19980101-01-001-016/m 谢谢/v !/w (/w 新华社/nt 北京/ns 12月/t 31日/t 电/n )/w + +19980101-01-002-001/m 在/p 十五大/j 精神/n 指引/vn 下/f 胜利/vd 前进/v ——/w 元旦/t 献辞/n +19980101-01-002-002/m 我们/r 即将/d 以/p 丰收/vn 的/u 喜悦/an 送/v 走/v 牛年/t ,/w 以/p 昂扬/a 的/u 斗志/n 迎来/v 虎年/t 。/w 我们/r 伟大/a 祖国/n 在/p 新/a 的/u 一/m 年/q ,/w 将/d 是/v 充满/v 生机/n 、/w 充满/v 希望/n 的/u 一/m 年/q 。/w +19980101-01-002-003/m 刚刚/d 过去/v 的/u 一/m 年/q ,/w 大气磅礴/i ,/w 波澜壮阔/i 。/w 在/p 这/r 一/m 年/q ,/w 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt ,/w 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n ,/w 高举/v 邓小平理论/n 的/u 伟大/a 旗帜/n ,/w 领导/v 全党/n 和/c 全国/n 各族/r 人民/n 坚定不移/i 地/u 沿着/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 道路/n 阔步/d 前进/v ,/w 写/v 下/v 了/u 改革/v 开放/v 和/c 社会主义/n 现代化/vn 建设/vn 的/u 辉煌/a 篇章/n 。/w 顺利/a 地/u 恢复/v 对/p 香港/ns 行使/v 主权/n ,/w 胜利/v 地/u 召开/v 党/n 的/u 第十五/m 次/q 全国/n 代表大会/n ———/w 两/m 件/q 大事/n 办/v 得/u 圆满/a 成功/a 。/w 国民经济/n 稳中求进/l ,/w 国家/n 经济/n 实力/n 进一步/d 增强/v ,/w 人民/n 生活/vn 继续/v 改善/v ,/w 对外/vn 经济/n 技术/n 交流/vn 日益/d 扩大/v 。/w 在/p 国际/n 金融/n 危机/n 的/u 风浪/n 波及/v 许多/m 国家/n 的/u 情况/n 下/f ,/w 我国/n 保持/v 了/u 金融/n 形势/n 和/c 整个/b 经济/n 形势/n 的/u 稳定/a 发展/vn 。/w 社会主义/n 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn 取得/v 新/a 的/u 成绩/n ,/w 各项/r 社会/n 事业/n 全面/ad 进步/v 。/w 外交/n 工作/vn 取得/v 可喜/a 的/u 突破/vn ,/w 我国/n 的/u 国际/n 地位/n 和/c 国际/n 威望/n 进一步/d 提高/v 。/w 实践/v 使/v 亿万/m 人民/n 对/p 邓小平理论/n 更加/d 信仰/v ,/w 对/p 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt 更加/d 信赖/v ,/w 对/p 伟大/a 祖国/n 的/u 光辉/n 前景/n 更加/d 充满/v 信心/n 。/w +19980101-01-002-004/m 1998年/t ,/w 是/v 全面/ad 贯彻/v 落实/v 党/n 的/u 十五大/j 提出/v 的/u 任务/n 的/u 第一/m 年/q ,/w 各/r 条/q 战线/n 改革/v 和/c 发展/v 的/u 任务/n 都/d 十分/m 繁重/a ,/w 有/v 许多/m 深/a 层次/n 的/u 矛盾/an 和/c 问题/n 有待/v 克服/v 和/c 解决/v ,/w 特别/d 是/v 国有/vn 企业/n 改革/vn 已经/d 进入/v 攻坚/vn 阶段/n 。/w 我们/r 必须/d 进一步/d 深入/ad 学习/v 和/c 掌握/v 党/n 的/u 十五大/j 精神/n ,/w 统揽全局/l ,/w 精心/ad 部署/v ,/w 狠抓/v 落实/v ,/w 团结/a 一致/a ,/w 艰苦奋斗/i ,/w 开拓/v 前进/v ,/w 为/p 夺取/v 今年/t 改革/v 开放/v 和/c 社会主义/n 现代化/vn 建设/vn 的/u 新/a 胜利/vn 而/c 奋斗/v 。/w +19980101-01-002-005/m 今年/t 
是/v 党/n 的/u 十一/m 届/q 三中全会/j 召开/v 20/m 周年/q ,/w 是/v 我们/r 党/n 和/c 国家/n 实现/v 伟大/a 的/u 历史/n 转折/vn 、/w 进入/v 改革/vn 开放/vn 历史/n 新/a 时期/n 的/u 20/m 周年/q 。/w 在/p 新/a 的/u 一/m 年/q 里/f ,/w 大力/d 发扬/v 十一/m 届/q 三中全会/j 以来/f 我们/r 党/n 所/u 恢复/v 的/u 优良/z 传统/n 和/c 在/p 新/a 的/u 历史/n 条件/n 下/f 形成/v 的/u 优良/z 作风/n ,/w 对于/p 完成/v 好/a 今年/t 的/u 各项/r 任务/n 具有/v 十分/m 重要/a 的/u 意义/n 。/w +19980101-01-002-006/m 我们/r 要/v 更/d 好/a 地/u 坚持/v 解放思想/i 、/w 实事求是/i 的/u 思想/n 路线/n 。/w 解放思想/i 、/w 实事求是/i ,/w 是/v 邓小平理论/n 的/u 精髓/n 。/w 实践/v 证明/v ,/w 只有/c 解放思想/i 、/w 实事求是/i ,/w 才/c 能/v 冲破/v 各种/r 不/d 切合/v 实际/n 的/u 或者/c 过时/a 的/u 观念/n 的/u 束缚/vn ,/w 真正/d 做到/v 尊重/v 、/w 认识/v 和/c 掌握/v 客观/a 规律/n ,/w 勇于/v 突破/v ,/w 勇于/v 创新/v ,/w 不断/d 开创/v 社会主义/n 现代化/vn 建设/vn 的/u 新/a 局面/n 。/w 党/n 的/u 十五大/j 是/v 我们/r 党/n 解放思想/i 、/w 实事求是/i 的/u 新/a 的/u 里程碑/n 。/w 进一步/d 认真/ad 学习/v 和/c 掌握/v 十五大/j 精神/n ,/w 解放思想/i 、/w 实事求是/i ,/w 我们/r 的/u 各项/r 事业/n 就/d 能/v 结/v 出/v 更加/d 丰硕/a 的/u 成果/n 。/w +19980101-01-002-007/m 我们/r 要/v 更/d 好/a 地/u 坚持/v 以/p 经济/n 建设/vn 为/v 中心/n 。/w 各项/r 工作/vn 必须/d 以/p 经济/n 建设/vn 为/v 中心/n ,/w 是/v 邓小平理论/n 的/u 基本/a 观点/n ,/w 是/v 党/n 的/u 基本/a 路线/n 的/u 核心/n 内容/n ,/w 近/a 20/m 年/q 来/f 的/u 实践/vn 证明/v ,/w 坚持/v 这个/r 中心/n ,/w 是/v 完全/ad 正确/a 的/u 。/w 今后/t ,/w 我们/r 能否/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 伟大/a 事业/n 全面/ad 推向/v 21/m 世纪/n ,/w 关键/n 仍然/d 要/v 看/v 能否/v 把/p 经济/n 工作/vn 搞/v 上去/v 。/w 各级/r 领导/n 干部/n 要/v 切实/ad 把/p 精力/n 集中/v 到/v 贯彻/v 落实/v 好/a 中央/n 关于/p 今年/t 经济/n 工作/vn 的/u 总体/n 要求/n 和/c 各项/r 重要/a 任务/n 上/f 来/v ,/w 不断/d 提高/v 领导/v 经济/n 建设/vn 的/u 能力/n 和/c 水平/n 。/w +19980101-01-002-008/m 我们/r 要/v 更/d 好/a 地/u 坚持/v “/w 两手抓/l 、/w 两手/m 都/d 要/v 硬/a ”/w 的/u 方针/n 。/w 在/p 坚持/v 以/p 经济/n 建设/vn 为/v 中心/n 的/u 同时/n ,/w 积极/ad 推进/v 社会主义/n 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn ,/w 是/v 建设/v 富强/a 、/w 民主/a 、/w 文明/a 的/u 社会主义/n 现代化/vn 国家/n 的/u 重要/a 内容/n 。/w 实践/v 证明/v ,/w 经济/n 建设/vn 的/u 顺利/a 进行/vn ,/w 离/v 不/d 开/v 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn 的/u 保证/vn 。/w 党/n 的/u 十五大/j 依据/p 邓小平理论/n 和/c 党/n 的/u 基本/a 路线/n 提出/v 的/u 党/n 在/p 社会主义/n 初级/b 阶段/n 经济/n 、/w 政治/n 、/w 文化/n 的/u 基本/a 纲领/n ,/w 为/p “/w 两手抓/l 、/w 两手/m 都/d 要/v 硬/a ”/w 提供/v 了/u 新/a 的/u 理论/n 根据/n ,/w 提出/v 了/u 更/d 高/a 要求/n ,/w 现在/t 的/u 关键/n 是/v 认真/ad 抓好/v 落实/v 。/w +19980101-01-002-009/m 我们/r 要/v 更/d 好/a 地/u 发扬/v 求真务实/l 、/w 密切/ad 联系/v 群众/n 的/u 作风/n 。/w 这/r 是/v 把/p 党/n 的/u 方针/n 、/w 政策/n 落到实处/l ,/w 使/v 改革/v 和/c 建设/v 取得/v 胜利/vn 的/u 重要/a 保证/vn 。/w 在/p 当前/t 改革/v 进一步/d 深化/v ,/w 经济/n 不断/d 发展/v ,/w 同时/c 又/d 出现/v 一些/m 新/a 情况/n 、/w 新/a 问题/n 和/c 新/a 困难/an 的/u 形势/n 下/f ,/w 更/d 要/v 发扬/v 这样/r 的/u 好/a 作风/n 。/w 要/v 尊重/v 群众/n 的/u 意愿/n ,/w 重视/v 群众/n 的/u 首创/vn 精神/n ,/w 关心/v 群众/n 的/u 生活/vn 疾苦/n 。/w 江/nr 泽民/nr 同志/n 最近/t 强调/vd 指出/v ,/w 要/v 大力/d 倡导/v 说实话/l 、/w 办/v 实事/n 、/w 鼓/v 实劲/n 、/w 讲/v 实效/n 的/u 作风/n ,/w 坚决/ad 制止/v 追求/v 表面文章/i ,/w 搞/v 花架子/n 等/u 形式主义/n ,/w 坚决/ad 杜绝/v 脱离/v 群众/n 、/w 脱离/v 实际/n 、/w 浮躁/a 虚夸/v 等/u 官僚主义/n 。/w 这/r 是/v 非常/d 重要/a 的/u 。/w 因此/c ,/w 各级/r 领导/n 干部/n 务必/d 牢记/v 全心全意/i 为/p 人民/n 服务/v 的/u 宗旨/n ,/w 在/p 勤政廉政/l 、/w 艰苦奋斗/i 方面/n 以身作则/i ,/w 当/v 好/a 表率/n 。/w +19980101-01-002-010/m 1998/m ,/w 瞩目/v 中华/nz 。/w 新/a 的/u 机遇/n 和/c 挑战/vn ,/w 催/v 人/n 进取/v ;/w 新/a 的/u 目标/n 和/c 征途/n ,/w 催/v 人/n 奋发/v 。/w 英雄/n 的/u 中国/ns 人民/n 在/p 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt 坚强/a 领导/vn 和/c 党/n 的/u 十五大/j 精神/n 指引/v 下/f ,/w 更/d 高/a 地/u 举起/v 邓小平理论/n 的/u 伟大/a 旗帜/n ,/w 团结/a 一致/a ,/w 扎实/ad 工作/v ,/w 奋勇/d 前进/v ,/w 一定/d 能够/v 创造/v 出/v 更加/d 辉煌/a 的/u 业绩/n !/w diff --git a/test/loader/test_loader.py b/test/loader/test_config_loader.py similarity index 69% rename from test/loader/test_loader.py rename to test/loader/test_config_loader.py index 740ff952..485eed3c 100644 --- a/test/loader/test_loader.py +++ 
b/test/loader/test_config_loader.py @@ -4,7 +4,6 @@ import os import unittest from fastNLP.loader.config_loader import ConfigSection, ConfigLoader -from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader class TestConfigLoader(unittest.TestCase): @@ -52,21 +51,3 @@ class TestConfigLoader(unittest.TestCase): print("pass config test!") - -class TestDatasetLoader(unittest.TestCase): - def test_case_TokenizeDatasetLoader(self): - loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8") - data = loader.load_pku(max_seq_len=32) - print("pass TokenizeDatasetLoader test!") - - def test_case_POSDatasetLoader(self): - loader = POSDatasetLoader("./test/data_for_tests/people.txt") - data = loader.load() - datas = loader.load_lines() - print("pass POSDatasetLoader test!") - - def test_case_LMDatasetLoader(self): - loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8") - data = loader.load() - datas = loader.load_lines() - print("pass TokenizeDatasetLoader test!") \ No newline at end of file diff --git a/test/loader/test_dataset_loader.py b/test/loader/test_dataset_loader.py new file mode 100644 index 00000000..4dfe2020 --- /dev/null +++ b/test/loader/test_dataset_loader.py @@ -0,0 +1,42 @@ +import unittest + +from fastNLP.loader.dataset_loader import POSDatasetLoader, LMDatasetLoader, TokenizeDatasetLoader, \ + PeopleDailyCorpusLoader, ConllLoader + + +class TestDatasetLoader(unittest.TestCase): + def test_case_1(self): + data = """Tom\tT\nand\tF\nJerry\tT\n.\tF\n\nHello\tT\nworld\tF\n!\tF""" + lines = data.split("\n") + answer = POSDatasetLoader.parse(lines) + truth = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]], [["Hello", "world", "!"], ["T", "F", "F"]]] + self.assertListEqual(answer, truth, "POS Dataset Loader") + + def test_case_TokenizeDatasetLoader(self): + loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8") + data = loader.load_pku(max_seq_len=32) + print("pass TokenizeDatasetLoader test!") + + def test_case_POSDatasetLoader(self): + loader = POSDatasetLoader("./test/data_for_tests/people.txt") + data = loader.load() + datas = loader.load_lines() + print("pass POSDatasetLoader test!") + + def test_case_LMDatasetLoader(self): + loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8") + data = loader.load() + datas = loader.load_lines() + print("pass TokenizeDatasetLoader test!") + + def test_PeopleDailyCorpusLoader(self): + loader = PeopleDailyCorpusLoader("./test/data_for_tests/people_daily_raw.txt") + _, _ = loader.load() + + def test_ConllLoader(self): + loader = ConllLoader("./test/data_for_tests/conll_example.txt") + _ = loader.load() + + +if __name__ == '__main__': + unittest.main() diff --git a/test/loader/test_loader2.py b/test/loader/test_loader2.py deleted file mode 100644 index b18a2fcf..00000000 --- a/test/loader/test_loader2.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest - -from fastNLP.loader.dataset_loader import POSDatasetLoader - - -class TestPreprocess(unittest.TestCase): - def test_case_1(self): - data = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]], - ["Hello", "world", "!"], ["T", "F", "F"]] - pickle_path = "./data_for_tests/" - # POSPreprocess(data, pickle_path) - - -class TestDatasetLoader(unittest.TestCase): - def test_case_1(self): - data = """Tom\tT\nand\tF\nJerry\tT\n.\tF\n\nHello\tT\nworld\tF\n!\tF""" - lines = data.split("\n") - answer = POSDatasetLoader.parse(lines) - truth = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]], [["Hello", "world", "!"], 
["T", "F", "F"]]] - self.assertListEqual(answer, truth, "POS Dataset Loader") - - -if __name__ == '__main__': - unittest.main() diff --git a/test/model/test_cws.py b/test/model/test_cws.py index 70716c3a..94437bb2 100644 --- a/test/model/test_cws.py +++ b/test/model/test_cws.py @@ -1,28 +1,25 @@ -import sys +import os -sys.path.append("..") - -from fastNLP.loader.config_loader import ConfigLoader, ConfigSection +from fastNLP.core.predictor import Predictor +from fastNLP.core.preprocess import Preprocessor, load_pickle +from fastNLP.core.tester import SeqLabelTester from fastNLP.core.trainer import SeqLabelTrainer +from fastNLP.loader.config_loader import ConfigLoader, ConfigSection from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader -from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle -from fastNLP.saver.model_saver import ModelSaver from fastNLP.loader.model_loader import ModelLoader -from fastNLP.core.tester import SeqLabelTester from fastNLP.models.sequence_modeling import SeqLabeling -from fastNLP.core.predictor import Predictor +from fastNLP.saver.model_saver import ModelSaver data_name = "pku_training.utf8" -# cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" -cws_data_path = "data_for_tests/cws_pku_utf_8" -pickle_path = "data_for_tests" -data_infer_path = "data_for_tests/people_infer.txt" - +cws_data_path = "test/data_for_tests/cws_pku_utf_8" +pickle_path = "./save/" +data_infer_path = "test/data_for_tests/people_infer.txt" +config_path = "test/data_for_tests/config" def infer(): # Load infer configuration, the same as test test_args = ConfigSection() - ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) + ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": test_args}) # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") @@ -34,41 +31,31 @@ def infer(): model = SeqLabeling(test_args) # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") + ModelLoader.load_pytorch(model, "./save/saved_model.pkl") print("model loaded!") # Data Loader raw_data_loader = BaseLoader(data_infer_path) infer_data = raw_data_loader.load_lines() - """ - Transform strings into list of list of strings. - [ - [word_11, word_12, ...], - [word_21, word_22, ...], - ... - ] - In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them. 
- """ # Inference interface - infer = Predictor(pickle_path) + infer = Predictor(pickle_path, "seq_label") results = infer.predict(model, infer_data) print(results) - print("Inference finished!") def train_test(): # Config Loader train_args = ConfigSection() - ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) + ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": train_args}) # Data Loader loader = TokenizeDatasetLoader(cws_data_path) train_data = loader.load_pku() # Preprocessor - p = SeqLabelPreprocess() + p = Preprocessor(label_is_seq=True) data_train = p.run(train_data, pickle_path=pickle_path) train_args["vocab_size"] = p.vocab_size train_args["num_classes"] = p.num_classes @@ -81,12 +68,10 @@ def train_test(): # Start training trainer.train(model, data_train) - print("Training finished!") # Saver - saver = ModelSaver("./data_for_tests/saved_model.pkl") + saver = ModelSaver("./save/saved_model.pkl") saver.save_pytorch(model) - print("Model saved!") del model, trainer, loader @@ -94,12 +79,11 @@ def train_test(): model = SeqLabeling(train_args) # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") - print("model loaded!") + ModelLoader.load_pytorch(model, "./save/saved_model.pkl") # Load test configuration test_args = ConfigSection() - ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) + ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": test_args}) # Tester tester = SeqLabelTester(**test_args.data) @@ -109,7 +93,13 @@ def train_test(): # print test results print(tester.show_metrics()) - print("model tested!") + + +def test(): + os.makedirs("save", exist_ok=True) + train_test() + infer() + os.system("rm -rf save") if __name__ == "__main__": diff --git a/test/modules/test_other_modules.py b/test/modules/test_other_modules.py index 141535a0..7e2491c4 100644 --- a/test/modules/test_other_modules.py +++ b/test/modules/test_other_modules.py @@ -1,7 +1,6 @@ - +import unittest import torch -import unittest from fastNLP.modules.other_modules import GroupNorm, LayerNormalization, BiLinear diff --git a/test/modules/test_utils.py b/test/modules/test_utils.py index 1cfc6aa1..1d3cfcac 100644 --- a/test/modules/test_utils.py +++ b/test/modules/test_utils.py @@ -1,18 +1,9 @@ - -import torch -import numpy as np import unittest -import fastNLP.modules.utils as utils class TestUtils(unittest.TestCase): def test_case_1(self): - a = torch.tensor([ - [1, 2, 3, 4, 5], [2, 3, 4, 5, 6] - ]) - utils.orthogonal(a) + pass def test_case_2(self): - a = np.random.rand(100, 100) - utils.mst(a) - + pass diff --git a/test/test_fastNLP.py b/test/test_fastNLP.py index 92bc894f..a40a0cf4 100644 --- a/test/test_fastNLP.py +++ b/test/test_fastNLP.py @@ -1,16 +1,32 @@ -import sys +# encoding: utf-8 +import os -sys.path.append("..") +from fastNLP.core.preprocess import save_pickle +from fastNLP.core.vocabulary import Vocabulary from fastNLP.fastnlp import FastNLP from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results +from fastNLP.models.cnn_text_classification import CNNText +from fastNLP.models.sequence_modeling import AdvSeqLabel +from fastNLP.saver.model_saver import ModelSaver PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/" PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/" -def word_seg(): - nlp = 
FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) - nlp.load("cws_basic_model", config_file="cws.cfg", section_name="POS_test") +DEFAULT_PADDING_LABEL = '' # dict index = 0 +DEFAULT_UNKNOWN_LABEL = '' # dict index = 1 +DEFAULT_RESERVED_LABEL = ['', + '', + ''] # dict index = 2~4 + +DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, + DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, + DEFAULT_RESERVED_LABEL[2]: 4} + + +def word_seg(model_dir, config, section): + nlp = FastNLP(model_dir=model_dir) + nlp.load("cws_basic_model", config_file=config, section_name=section) text = ["这是最好的基于深度学习的中文分词系统。", "大王叫我来巡山。", "我党多年来致力于改善人民生活水平。"] @@ -24,38 +40,52 @@ def word_seg(): print(interpret_word_seg_results(words, labels)) -def text_class(): - nlp = FastNLP("./data_for_tests/") - nlp.load("text_class_model") - text = "这是最好的基于深度学习的中文分词系统。" - result = nlp.run(text) - print(result) - print("FastNLP finished!") +def mock_cws(): + os.makedirs("mock", exist_ok=True) + text = ["这是最好的基于深度学习的中文分词系统。", + "大王叫我来巡山。", + "我党多年来致力于改善人民生活水平。"] + word2id = Vocabulary() + word_list = [ch for ch in "".join(text)] + word2id.update(word_list) + save_pickle(word2id, "./mock/", "word2id.pkl") -def test_word_seg_interpret(): - foo = [[('这', 'S'), ('是', 'S'), ('最', 'S'), ('好', 'S'), ('的', 'S'), ('基', 'B'), ('于', 'E'), ('深', 'B'), ('度', 'E'), - ('学', 'B'), ('习', 'E'), ('的', 'S'), ('中', 'B'), ('文', 'E'), ('分', 'B'), ('词', 'E'), ('系', 'B'), ('统', 'E'), - ('。', 'S')]] - chars = [x[0] for x in foo[0]] - labels = [x[1] for x in foo[0]] - print(interpret_word_seg_results(chars, labels)) + class2id = Vocabulary(need_default=False) + label_list = ['B', 'M', 'E', 'S'] + class2id.update(label_list) + save_pickle(class2id, "./mock/", "class2id.pkl") + model_args = {"vocab_size": len(word2id), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(class2id)} + config_file = """ + [test_section] + vocab_size = {} + word_emb_dim = 50 + rnn_hidden_units = 50 + num_classes = {} + """.format(len(word2id), len(class2id)) + with open("mock/test.cfg", "w", encoding="utf-8") as f: + f.write(config_file) -def test_interpret_cws_pos_results(): - foo = [ - [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'), - ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'), - ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')] - ] - chars = [x[0] for x in foo[0]] - labels = [x[1] for x in foo[0]] - print(interpret_cws_pos_results(chars, labels)) + model = AdvSeqLabel(model_args) + ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model) + + +def test_word_seg(): + # fake the model and pickles + print("start mocking") + mock_cws() + # run the inference codes + print("start testing") + word_seg("./mock/", "test.cfg", "test_section") + # clean up environments + print("clean up") + os.system("rm -rf mock") -def pos_tag(): - nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES) - nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model") +def pos_tag(model_dir, config, section): + nlp = FastNLP(model_dir=model_dir) + nlp.load("pos_tag_model", config_file=config, section_name=section) text = ["这是最好的基于深度学习的中文分词系统。", "大王叫我来巡山。", "我党多年来致力于改善人民生活水平。"] @@ -65,21 +95,119 @@ def pos_tag(): for res in example: words.append(res[0]) labels.append(res[1]) - print(interpret_cws_pos_results(words, labels)) + try: + print(interpret_cws_pos_results(words, labels)) + except RuntimeError: + 
print("inconsistent pos tags. this is for test only.") + + +def mock_pos_tag(): + os.makedirs("mock", exist_ok=True) + text = ["这是最好的基于深度学习的中文分词系统。", + "大王叫我来巡山。", + "我党多年来致力于改善人民生活水平。"] + + vocab = Vocabulary() + word_list = [ch for ch in "".join(text)] + vocab.update(word_list) + save_pickle(vocab, "./mock/", "word2id.pkl") + + idx2label = Vocabulary(need_default=False) + label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv'] + idx2label.update(label_list) + save_pickle(idx2label, "./mock/", "class2id.pkl") + model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label)} + config_file = """ + [test_section] + vocab_size = {} + word_emb_dim = 50 + rnn_hidden_units = 50 + num_classes = {} + """.format(len(vocab), len(idx2label)) + with open("mock/test.cfg", "w", encoding="utf-8") as f: + f.write(config_file) -def text_classify(): - nlp = FastNLP(model_dir=PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES) - nlp.load("text_classify_model", config_file="text_classify.cfg", section_name="model") + model = AdvSeqLabel(model_args) + ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model) + + +def test_pos_tag(): + mock_pos_tag() + pos_tag("./mock/", "test.cfg", "test_section") + os.system("rm -rf mock") + + +def text_classify(model_dir, config, section): + nlp = FastNLP(model_dir=model_dir) + nlp.load("text_classify_model", config_file=config, section_name=section) text = [ "世界物联网大会明日在京召开龙头股启动在即", "乌鲁木齐市新增一处城市中心旅游目的地", "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"] results = nlp.run(text) print(results) - """ - ['finance', 'travel', 'history'] - """ + + +def mock_text_classify(): + os.makedirs("mock", exist_ok=True) + text = ["世界物联网大会明日在京召开龙头股启动在即", + "乌鲁木齐市新增一处城市中心旅游目的地", + "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”" + ] + vocab = Vocabulary() + word_list = [ch for ch in "".join(text)] + vocab.update(word_list) + save_pickle(vocab, "./mock/", "word2id.pkl") + + idx2label = Vocabulary(need_default=False) + label_list = ['class_A', 'class_B', 'class_C', 'class_D', 'class_E', 'class_F'] + idx2label.update(label_list) + save_pickle(idx2label, "./mock/", "class2id.pkl") + + model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label)} + config_file = """ + [test_section] + vocab_size = {} + word_emb_dim = 50 + rnn_hidden_units = 50 + num_classes = {} + """.format(len(vocab), len(idx2label)) + with open("mock/test.cfg", "w", encoding="utf-8") as f: + f.write(config_file) + + model = CNNText(model_args) + ModelSaver("mock/text_class_model_v0.pkl").save_pytorch(model) + + +def test_text_classify(): + mock_text_classify() + text_classify("./mock/", "test.cfg", "test_section") + os.system("rm -rf mock") + + +def test_word_seg_interpret(): + foo = [[('这', 'S'), ('是', 'S'), ('最', 'S'), ('好', 'S'), ('的', 'S'), ('基', 'B'), ('于', 'E'), ('深', 'B'), ('度', 'E'), + ('学', 'B'), ('习', 'E'), ('的', 'S'), ('中', 'B'), ('文', 'E'), ('分', 'B'), ('词', 'E'), ('系', 'B'), ('统', 'E'), + ('。', 'S')]] + chars = [x[0] for x in foo[0]] + labels = [x[1] for x in foo[0]] + print(interpret_word_seg_results(chars, labels)) + + +def test_interpret_cws_pos_results(): + foo = [ + [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'), + ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'), + ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')] + ] + chars = [x[0] for x in foo[0]] + labels = [x[1] for x in foo[0]] + 
print(interpret_cws_pos_results(chars, labels)) if __name__ == "__main__": - text_classify() + test_word_seg() + test_pos_tag() + test_text_classify() + test_word_seg_interpret() + test_interpret_cws_pos_results()
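
A minimal usage sketch for the initial_parameter helper whose docstring is added in this patch. The toy model and the chosen method string are illustrative assumptions; only the call signature shown in the patch is relied on.

import torch.nn as nn

from fastNLP.modules.utils import initial_parameter

# a throwaway model just to demonstrate the call (an assumption, not from the patch)
net = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 2))

# initialize the 2-d weights with xavier_uniform; 1-d tensors fall back to normal_
initial_parameter(net, initial_method="xavier_uniform")

# omitting initial_method uses xavier_normal_ as the default, per the docstring
initial_parameter(net)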
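
A short sketch of the seq_mask behaviour described by its new docstring. The example lengths are assumptions, and the mask dtype may be uint8 or bool depending on the PyTorch version, since torch.ge performs the comparison.

from fastNLP.modules.utils import seq_mask

# three sequences of lengths 3, 1 and 2, padded to max_len = 4
mask = seq_mask([3, 1, 2], max_len=4)
print(mask.shape)  # torch.Size([3, 4])
# row i holds seq_len[i] leading ones followed by zeros, e.g. [1, 1, 1, 0] for length 3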
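
The new tests build a scratch directory and tear it down with os.system("rm -rf ..."). A portable alternative, sketched here as a suggestion under the same assumptions rather than as part of the patch, is to use tempfile and shutil so cleanup also runs when a test step raises:

import os
import shutil
import tempfile

# create an isolated working directory for model checkpoints and pickles
work_dir = tempfile.mkdtemp(prefix="fastnlp_test_")
try:
    os.makedirs(os.path.join(work_dir, "save"), exist_ok=True)
    # ... run the train/save/infer steps as in test/model/test_cws.py ...
finally:
    # remove the scratch directory even if a test step fails
    shutil.rmtree(work_dir, ignore_errors=True)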