diff --git a/fastNLP/core/preprocess.py b/fastNLP/core/preprocess.py index 5e77649e..12a7a987 100644 --- a/fastNLP/core/preprocess.py +++ b/fastNLP/core/preprocess.py @@ -1,13 +1,6 @@ import _pickle import os -import numpy as np - -from fastNLP.core.dataset import DataSet -from fastNLP.core.field import TextField, LabelField -from fastNLP.core.instance import Instance -from fastNLP.core.vocabulary import Vocabulary - # the first vocab in dict with the index = 5 @@ -53,258 +46,3 @@ def pickle_exist(pickle_path, pickle_name): return True else: return False - - -class Preprocessor(object): - """Preprocessors are responsible for converting data of strings into data of indices. - During the pre-processing, the following pickle files will be built: - - - "word2id.pkl", a Vocabulary object, mapping words to indices. - - "class2id.pkl", a Vocabulary object, mapping labels to indices. - - "data_train.pkl", a DataSet object for training - - "data_dev.pkl", a DataSet object for validation, if train_dev_split > 0. - - "data_test.pkl", a DataSet object for testing, if test_data is not None. - - These four pickle files are expected to be saved in the given pickle directory once they are constructed. - Preprocessors will check if those files are already in the directory and will reuse them in future calls. - """ - - def __init__(self, label_is_seq=False, share_vocab=False, add_char_field=False): - """ - - :param label_is_seq: bool, whether label is a sequence. If True, label vocabulary will preserve - several special tokens for sequence processing. - :param share_vocab: bool, whether word sequence and label sequence share the same vocabulary. Typically, this - is only available when label_is_seq is True. Default: False. - :param add_char_field: bool, whether to add character representations to all TextFields. Default: False. - """ - print("Preprocessor is about to deprecate. Please use DataSet class.") - self.data_vocab = Vocabulary() - if label_is_seq is True: - if share_vocab is True: - self.label_vocab = self.data_vocab - else: - self.label_vocab = Vocabulary() - else: - self.label_vocab = Vocabulary(need_default=False) - - self.character_vocab = Vocabulary(need_default=False) - self.add_char_field = add_char_field - - @property - def vocab_size(self): - return len(self.data_vocab) - - @property - def num_classes(self): - return len(self.label_vocab) - - @property - def char_vocab_size(self): - if self.character_vocab is None: - self.build_char_dict() - return len(self.character_vocab) - - def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10): - """Main pre-processing pipeline. - - :param train_dev_data: three-level list, with either single label or multiple labels in a sample. - :param test_data: three-level list, with either single label or multiple labels in a sample. (optional) - :param pickle_path: str, the path to save the pickle files. - :param train_dev_split: float, between [0, 1]. The ratio of training data used as validation set. - :param cross_val: bool, whether to do cross validation. - :param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True. - :return results: multiple datasets after pre-processing. If test_data is provided, return one more dataset. - If train_dev_split > 0, return one more dataset - the dev set. If cross_val is True, each dataset - is a list of DataSet objects; Otherwise, each dataset is a DataSet object. 
- """ - if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): - self.data_vocab = load_pickle(pickle_path, "word2id.pkl") - self.label_vocab = load_pickle(pickle_path, "class2id.pkl") - else: - self.data_vocab, self.label_vocab = self.build_dict(train_dev_data) - save_pickle(self.data_vocab, pickle_path, "word2id.pkl") - save_pickle(self.label_vocab, pickle_path, "class2id.pkl") - - self.build_reverse_dict() - - train_set = [] - dev_set = [] - if not cross_val: - if not pickle_exist(pickle_path, "data_train.pkl"): - if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"): - split = int(len(train_dev_data) * train_dev_split) - data_dev = train_dev_data[: split] - data_train = train_dev_data[split:] - train_set = self.convert_to_dataset(data_train, self.data_vocab, self.label_vocab) - dev_set = self.convert_to_dataset(data_dev, self.data_vocab, self.label_vocab) - - save_pickle(dev_set, pickle_path, "data_dev.pkl") - print("{} of the training data is split for validation. ".format(train_dev_split)) - else: - train_set = self.convert_to_dataset(train_dev_data, self.data_vocab, self.label_vocab) - save_pickle(train_set, pickle_path, "data_train.pkl") - else: - train_set = load_pickle(pickle_path, "data_train.pkl") - if pickle_exist(pickle_path, "data_dev.pkl"): - dev_set = load_pickle(pickle_path, "data_dev.pkl") - else: - # cross_val is True - if not pickle_exist(pickle_path, "data_train_0.pkl"): - # cross validation - data_cv = self.cv_split(train_dev_data, n_fold) - for i, (data_train_cv, data_dev_cv) in enumerate(data_cv): - data_train_cv = self.convert_to_dataset(data_train_cv, self.data_vocab, self.label_vocab) - data_dev_cv = self.convert_to_dataset(data_dev_cv, self.data_vocab, self.label_vocab) - save_pickle( - data_train_cv, pickle_path, - "data_train_{}.pkl".format(i)) - save_pickle( - data_dev_cv, pickle_path, - "data_dev_{}.pkl".format(i)) - train_set.append(data_train_cv) - dev_set.append(data_dev_cv) - print("{}-fold cross validation.".format(n_fold)) - else: - for i in range(n_fold): - data_train_cv = load_pickle(pickle_path, "data_train_{}.pkl".format(i)) - data_dev_cv = load_pickle(pickle_path, "data_dev_{}.pkl".format(i)) - train_set.append(data_train_cv) - dev_set.append(data_dev_cv) - - # prepare test data if provided - test_set = [] - if test_data is not None: - if not pickle_exist(pickle_path, "data_test.pkl"): - test_set = self.convert_to_dataset(test_data, self.data_vocab, self.label_vocab) - save_pickle(test_set, pickle_path, "data_test.pkl") - - # return preprocessed results - results = [train_set] - if cross_val or train_dev_split > 0: - results.append(dev_set) - if test_data: - results.append(test_set) - if len(results) == 1: - return results[0] - else: - return tuple(results) - - def build_dict(self, data): - for example in data: - word, label = example - self.data_vocab.update(word) - self.label_vocab.update(label) - return self.data_vocab, self.label_vocab - - def build_char_dict(self): - char_collection = set() - for word in self.data_vocab.word2idx: - if len(word) == 0: - continue - for ch in word: - if ch not in char_collection: - char_collection.add(ch) - self.character_vocab.update(list(char_collection)) - - def build_reverse_dict(self): - self.data_vocab.build_reverse_vocab() - self.label_vocab.build_reverse_vocab() - - def data_split(self, data, train_dev_split): - """Split data into train and dev set.""" - split = int(len(data) * train_dev_split) - data_dev = data[: split] - data_train = data[split:] 
- return data_train, data_dev - - def cv_split(self, data, n_fold): - """Split data for cross validation. - - :param data: list of string - :param n_fold: int - :return data_cv: - - :: - [ - (data_train, data_dev), # 1st fold - (data_train, data_dev), # 2nd fold - ... - ] - - """ - data_copy = data.copy() - np.random.shuffle(data_copy) - fold_size = round(len(data_copy) / n_fold) - data_cv = [] - for i in range(n_fold - 1): - start = i * fold_size - end = (i + 1) * fold_size - data_dev = data_copy[start:end] - data_train = data_copy[:start] + data_copy[end:] - data_cv.append((data_train, data_dev)) - start = (n_fold - 1) * fold_size - data_dev = data_copy[start:] - data_train = data_copy[:start] - data_cv.append((data_train, data_dev)) - return data_cv - - def convert_to_dataset(self, data, vocab, label_vocab): - """Convert list of indices into a DataSet object. - - :param data: list. Entries are strings. - :param vocab: a dict, mapping string (token) to index (int). - :param label_vocab: a dict, mapping string (label) to index (int). - :return data_set: a DataSet object - """ - use_word_seq = False - use_label_seq = False - use_label_str = False - - # construct a DataSet object and fill it with Instances - data_set = DataSet() - for example in data: - words, label = example[0], example[1] - instance = Instance() - - if isinstance(words, list): - x = TextField(words, is_target=False) - instance.add_field("word_seq", x) - use_word_seq = True - else: - raise NotImplementedError("words is a {}".format(type(words))) - - if isinstance(label, list): - y = TextField(label, is_target=True) - instance.add_field("label_seq", y) - use_label_seq = True - elif isinstance(label, str): - y = LabelField(label, is_target=True) - instance.add_field("label", y) - use_label_str = True - else: - raise NotImplementedError("label is a {}".format(type(label))) - data_set.append(instance) - - # convert strings to indices - if use_word_seq: - data_set.index_field("word_seq", vocab) - if use_label_seq: - data_set.index_field("label_seq", label_vocab) - if use_label_str: - data_set.index_field("label", label_vocab) - - return data_set - - -class SeqLabelPreprocess(Preprocessor): - def __init__(self): - print("[FastNLP warning] SeqLabelPreprocess is about to deprecate. Please use Preprocess directly.") - super(SeqLabelPreprocess, self).__init__() - - -class ClassPreprocess(Preprocessor): - def __init__(self): - print("[FastNLP warning] ClassPreprocess is about to deprecate. Please use Preprocess directly.") - super(ClassPreprocess, self).__init__() - diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index b1ae828f..c73bdfd9 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -13,69 +13,3 @@ class BaseModel(torch.nn.Module): def fit(self, train_data, dev_data=None, **train_args): trainer = Trainer(**train_args) trainer.train(self, train_data, dev_data) - - -class Vocabulary(object): - """A look-up table that allows you to access `Lexeme` objects. The `Vocab` - instance also provides access to the `StringStore`, and owns underlying - data that is shared between `Doc` objects. - """ - - def __init__(self): - """Create the vocabulary. - RETURNS (Vocab): The newly constructed object. - """ - self.data_frame = None - - -class Document(object): - """A sequence of Token objects. Access sentences and named entities, export - annotations to numpy arrays, losslessly serialize to compressed binary - strings. The `Doc` object holds an array of `Token` objects. 
The - Python-level `Token` and `Span` objects are views of this array, i.e. - they don't own the data themselves. -- spacy - """ - - def __init__(self, vocab, words=None, spaces=None): - """Create a Doc object. - vocab (Vocab): A vocabulary object, which must match any models you - want to use (e.g. tokenizer, parser, entity recognizer). - words (list or None): A list of unicode strings, to add to the document - as words. If `None`, defaults to empty list. - spaces (list or None): A list of boolean values, of the same length as - words. True means that the word is followed by a space, False means - it is not. If `None`, defaults to `[True]*len(words)` - user_data (dict or None): Optional extra data to attach to the Doc. - RETURNS (Doc): The newly constructed object. - """ - self.vocab = vocab - self.spaces = spaces - self.words = words - if spaces is None: - self.spaces = [True] * len(self.words) - elif len(spaces) != len(self.words): - raise ValueError("dismatch spaces and words") - - def get_chunker(self, vocab): - return None - - def push_back(self, vocab): - pass - - -class Token(object): - """An individual token – i.e. a word, punctuation symbol, whitespace, - etc. - """ - - def __init__(self, vocab, doc, offset): - """Construct a `Token` object. - vocab (Vocabulary): A storage container for lexical types. - doc (Document): The parent document. - offset (int): The index of the token within the document. - """ - self.vocab = vocab - self.doc = doc - self.token = doc[offset] - self.i = offset - diff --git a/fastNLP/models/char_language_model.py b/fastNLP/models/char_language_model.py index 2ad49abe..5fbde3cc 100644 --- a/fastNLP/models/char_language_model.py +++ b/fastNLP/models/char_language_model.py @@ -103,7 +103,7 @@ class CharLM(nn.Module): x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1) # [num_seq, seq_len, total_num_filters] - x, hidden = self.lstm(x) + x = self.lstm(x) # [seq_len, num_seq, hidden_size] x = self.dropout(x) diff --git a/fastNLP/modules/encoder/char_embedding.py b/fastNLP/modules/encoder/char_embedding.py index 1da63947..1ca3b5ba 100644 --- a/fastNLP/modules/encoder/char_embedding.py +++ b/fastNLP/modules/encoder/char_embedding.py @@ -1,12 +1,14 @@ import torch import torch.nn.functional as F from torch import nn -# from torch.nn.init import xavier_uniform from fastNLP.modules.utils import initial_parameter + + +# from torch.nn.init import xavier_uniform class ConvCharEmbedding(nn.Module): - def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5),initial_method = None): + def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None): """ Character Level Word Embedding :param char_emb_size: the size of character level embedding. Default: 50 @@ -21,7 +23,7 @@ class ConvCharEmbedding(nn.Module): nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4)) for i in range(len(kernels))]) - initial_parameter(self,initial_method) + initial_parameter(self, initial_method) def forward(self, x): """ @@ -56,7 +58,7 @@ class LSTMCharEmbedding(nn.Module): :param hidden_size: int, the number of hidden units. Default: equal to char_emb_size. 
""" - def __init__(self, char_emb_size=50, hidden_size=None , initial_method= None): + def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None): super(LSTMCharEmbedding, self).__init__() self.hidden_size = char_emb_size if hidden_size is None else hidden_size @@ -66,6 +68,7 @@ class LSTMCharEmbedding(nn.Module): bias=True, batch_first=True) initial_parameter(self, initial_method) + def forward(self, x): """ :param x:[ n_batch*n_word, word_length, char_emb_size] @@ -79,20 +82,3 @@ class LSTMCharEmbedding(nn.Module): _, hidden = self.lstm(x, (h0, c0)) return hidden[0].squeeze().unsqueeze(2) - - -if __name__ == "__main__": - batch_size = 128 - char_emb = 100 - word_length = 1 - x = torch.Tensor(batch_size, char_emb, word_length) - x = x.transpose(1, 2) - cce = ConvCharEmbedding(char_emb) - y = cce(x) - print("CNN Char Emb input: ", x.shape) - print("CNN Char Emb output: ", y.shape) # [128, 100] - - lce = LSTMCharEmbedding(char_emb) - o = lce(x) - print("LSTM Char Emb input: ", x.shape) - print("LSTM Char Emb size: ", o.shape) diff --git a/reproduction/Char-aware_NLM/main.py b/reproduction/Char-aware_NLM/main.py index 03810650..6467d98d 100644 --- a/reproduction/Char-aware_NLM/main.py +++ b/reproduction/Char-aware_NLM/main.py @@ -1,24 +1,8 @@ -from fastNLP.core.loss import Loss -from fastNLP.core.preprocess import Preprocessor -from fastNLP.core.trainer import Trainer -from fastNLP.loader.dataset_loader import LMDataSetLoader -from fastNLP.models.char_language_model import CharLM - PICKLE = "./save/" def train(): - loader = LMDataSetLoader() - train_data = loader.load() - - pre = Preprocessor(label_is_seq=True, share_vocab=True) - train_set = pre.run(train_data, pickle_path=PICKLE) - - model = CharLM(50, 50, pre.vocab_size, pre.char_vocab_size) - - trainer = Trainer(task="language_model", loss=Loss("cross_entropy")) - - trainer.train(model, train_set) + pass if __name__ == "__main__": diff --git a/test/core/test_preprocess.py b/test/core/test_preprocess.py deleted file mode 100644 index 05c04ce9..00000000 --- a/test/core/test_preprocess.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import unittest - -from fastNLP.core.dataset import DataSet -from fastNLP.core.preprocess import SeqLabelPreprocess - -data = [ - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], -] - - -class TestCase1(unittest.TestCase): - def test(self): - if os.path.exists("./save"): - for root, dirs, files in os.walk("./save", topdown=False): - for name in files: - os.remove(os.path.join(root, name)) - for name in dirs: - os.rmdir(os.path.join(root, name)) - result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4, - pickle_path="./save") - self.assertEqual(len(result), 2) - self.assertEqual(type(result[0]), DataSet) - self.assertEqual(type(result[1]), DataSet) - - os.system("rm -rf save") - print("pickle path deleted") - - -class TestCase2(unittest.TestCase): - def test(self): - if os.path.exists("./save"): - for root, dirs, files in os.walk("./save", topdown=False): - for name in files: - os.remove(os.path.join(root, name)) - for name in 
dirs: - os.rmdir(os.path.join(root, name)) - result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data, - pickle_path="./save", train_dev_split=0.4, - cross_val=False) - self.assertEqual(len(result), 3) - self.assertEqual(type(result[0]), DataSet) - self.assertEqual(type(result[1]), DataSet) - self.assertEqual(type(result[2]), DataSet) - - os.system("rm -rf save") - print("pickle path deleted") - - -class TestCase3(unittest.TestCase): - def test(self): - num_folds = 2 - result = SeqLabelPreprocess().run(test_data=None, train_dev_data=data, - pickle_path="./save", train_dev_split=0.4, - cross_val=True, n_fold=num_folds) - self.assertEqual(len(result), 2) - self.assertEqual(len(result[0]), num_folds) - self.assertEqual(len(result[1]), num_folds) - for data_set in result[0] + result[1]: - self.assertEqual(type(data_set), DataSet) - - os.system("rm -rf save") - print("pickle path deleted") diff --git a/test/model/test_char_language_model.py b/test/model/test_char_language_model.py new file mode 100644 index 00000000..5a7bc835 --- /dev/null +++ b/test/model/test_char_language_model.py @@ -0,0 +1,25 @@ +import unittest + +import numpy as np +import torch + +from fastNLP.models.char_language_model import CharLM + + +class TestCharLM(unittest.TestCase): + def test_case_1(self): + char_emb_dim = 50 + word_emb_dim = 50 + vocab_size = 1000 + num_char = 24 + max_word_len = 21 + num_seq = 64 + seq_len = 32 + + model = CharLM(char_emb_dim, word_emb_dim, vocab_size, num_char) + + x = torch.from_numpy(np.random.randint(0, num_char, size=(num_seq, seq_len, max_word_len + 2))) + + self.assertEqual(tuple(x.shape), (num_seq, seq_len, max_word_len + 2)) + y = model(x) + self.assertEqual(tuple(y.shape), (num_seq * seq_len, vocab_size)) diff --git a/test/modules/test_char_embedding.py b/test/modules/test_char_embedding.py new file mode 100644 index 00000000..07def64a --- /dev/null +++ b/test/modules/test_char_embedding.py @@ -0,0 +1,28 @@ +import unittest + +import torch + +from fastNLP.modules.encoder.char_embedding import ConvCharEmbedding, LSTMCharEmbedding + + +class TestCharEmbed(unittest.TestCase): + def test_case_1(self): + batch_size = 128 + char_emb = 100 + word_length = 1 + x = torch.Tensor(batch_size, char_emb, word_length) + x = x.transpose(1, 2) + + cce = ConvCharEmbedding(char_emb) + y = cce(x) + self.assertEqual(tuple(x.shape), (batch_size, word_length, char_emb)) + print("CNN Char Emb input: ", x.shape) + self.assertEqual(tuple(y.shape), (batch_size, char_emb, 1)) + print("CNN Char Emb output: ", y.shape) # [128, 100] + + lce = LSTMCharEmbedding(char_emb) + o = lce(x) + self.assertEqual(tuple(x.shape), (batch_size, word_length, char_emb)) + print("LSTM Char Emb input: ", x.shape) + self.assertEqual(tuple(o.shape), (batch_size, char_emb, 1)) + print("LSTM Char Emb size: ", o.shape) diff --git a/test/modules/test_variational_rnn.py b/test/modules/test_variational_rnn.py index cd265109..b182fa1a 100644 --- a/test/modules/test_variational_rnn.py +++ b/test/modules/test_variational_rnn.py @@ -1,9 +1,11 @@ +import unittest +import numpy as np import torch -import unittest from fastNLP.modules.encoder.variational_rnn import VarMaskedFastLSTM + class TestMaskedRnn(unittest.TestCase): def test_case_1(self): masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=True, batch_first=True) @@ -16,13 +18,20 @@ class TestMaskedRnn(unittest.TestCase): y = masked_rnn(x, mask=mask) def test_case_2(self): - masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, 
bidirectional=False, batch_first=True) - x = torch.tensor([[[1.0], [2.0]]]) - print(x.size()) - y = masked_rnn(x) - mask = torch.tensor([[[1], [1]]]) - y = masked_rnn(x, mask=mask) - xx = torch.tensor([[[1.0]]]) - #y, hidden = masked_rnn.step(xx) - #step() still has a bug - #y, hidden = masked_rnn.step(xx, mask=mask) \ No newline at end of file + input_size = 12 + batch = 16 + hidden = 10 + masked_rnn = VarMaskedFastLSTM(input_size=input_size, hidden_size=hidden, bidirectional=False, batch_first=True) + + x = torch.randn((batch, input_size)) + output, _ = masked_rnn.step(x) + self.assertEqual(tuple(output.shape), (batch, hidden)) + + xx = torch.randn((batch, 32, input_size)) + y, _ = masked_rnn(xx) + self.assertEqual(tuple(y.shape), (batch, 32, hidden)) + + xx = torch.randn((batch, 32, input_size)) + mask = torch.from_numpy(np.random.randint(0, 2, size=(batch, 32))).to(xx) + y, _ = masked_rnn(xx, mask=mask) + self.assertEqual(tuple(y.shape), (batch, 32, hidden))
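
A quick way to sanity-check the refactored character embeddings outside the test runner is a standalone snippet mirroring `test/modules/test_char_embedding.py`. This is only a sketch of the shape check the new test performs; the batch size, embedding size, and word length are simply the values that test happens to use:

    import torch

    from fastNLP.modules.encoder.char_embedding import ConvCharEmbedding, LSTMCharEmbedding

    batch_size, char_emb, word_length = 128, 100, 1

    # [batch, word_length, char_emb_size] -- the layout fed to both modules in the new test
    x = torch.randn(batch_size, word_length, char_emb)

    conv_emb = ConvCharEmbedding(char_emb_size=char_emb)
    print(conv_emb(x).shape)  # expected: torch.Size([128, 100, 1])

    lstm_emb = LSTMCharEmbedding(char_emb_size=char_emb)
    print(lstm_emb(x).shape)  # expected: torch.Size([128, 100, 1])

This recreates the quick manual check that used to live in the deleted `__main__` block of `char_embedding.py`; the same shape assertions now run as part of the unit tests added in this diff.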