From 32a036e8e6a2f38a6368189607063543c940ded7 Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Sat, 1 Sep 2018 21:33:28 +0800
Subject: [PATCH 1/4] [fix] drop "data" in Tester.make_batch; correct spelling
 of "show_metrics" [add] PeopleDailyCorpusLoader to parse the People's Daily
 corpus [update] add a CWS + POS-tag interface to FastNLP; see the example in
 test_fastNLP.py [update] bring README.md and readme_example.py up to date
 with the latest API.

---
 README.md                                   |  50 +++---
 fastNLP/core/tester.py                      |  12 +-
 fastNLP/core/trainer.py                     |   2 +-
 fastNLP/fastnlp.py                          |  44 ++++++
 fastNLP/loader/dataset_loader.py            |  58 ++++++-
 fastNLP/modules/decoder/__init__.py         |   3 +-
 reproduction/chinese_word_seg/cws_train.py  | 114 --------------
 reproduction/chinese_word_segment/cws.cfg   |  12 ++
 reproduction/chinese_word_segment/run.py    |   2 +-
 .../cws.cfg => pos_tag_model/pos_tag.cfg}   |  28 ++--
 reproduction/pos_tag_model/train_pos_tag.py | 146 ++++++++++++++++++
 test/ner.py                                 |   2 +-
 test/readme_example.py                      |  48 +++---
 test/seq_labeling.py                        |   2 +-
 test/test_cws.py                            |   2 +-
 test/test_fastNLP.py                        |  33 +++-
 test/test_tester.py                         |   6 +-
 17 files changed, 370 insertions(+), 194 deletions(-)
 delete mode 100644 reproduction/chinese_word_seg/cws_train.py
 rename reproduction/{chinese_word_seg/cws.cfg => pos_tag_model/pos_tag.cfg} (59%)
 create mode 100644 reproduction/pos_tag_model/train_pos_tag.py

diff --git a/README.md b/README.md
index a38771ee..b0ac20db 100644
--- a/README.md
+++ b/README.md
@@ -27,15 +27,16 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa
 A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model.
 ```python
+from fastNLP.core.preprocess import ClassPreprocess
+from fastNLP.core.predictor import ClassificationInfer
+from fastNLP.core.trainer import ClassificationTrainer
+from fastNLP.loader.dataset_loader import ClassDatasetLoader
 from fastNLP.models.base_model import BaseModel
-from fastNLP.modules import encoder
 from fastNLP.modules import aggregation
+from fastNLP.modules import encoder
 from fastNLP.modules import decoder
-
-from fastNLP.loader.dataset_loader import ClassDatasetLoader
-from fastNLP.loader.preprocess import ClassPreprocess
-from fastNLP.core.trainer import ClassificationTrainer
-from fastNLP.core.inference import ClassificationInfer
+from fastNLP.core.loss import Loss
+from fastNLP.core.optimizer import Optimizer
 
 
 class ClassificationModel(BaseModel):
@@ -50,7 +51,7 @@ class ClassificationModel(BaseModel):
         self.enc = encoder.Conv(
             in_channels=300, out_channels=100, kernel_size=3)
         self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
+        self.dec = decoder.MLP(size_layer=[100, num_classes])
 
     def forward(self, x):
         x = self.emb(x)  # [N,L] -> [N,L,C]
@@ -60,16 +61,17 @@ class ClassificationModel(BaseModel):
         return x
 
 
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+data_dir = 'save/'  # directory to save data and model
+train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
 ds_loader = ClassDatasetLoader("train", train_path)
 data = ds_loader.load()
 
 # pre-process dataset
-pre = ClassPreprocess(data_dir)
-vocab_size, n_classes = pre.process(data, "data_train.pkl")
+pre = ClassPreprocess()
+train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
+n_classes, vocab_size = pre.num_classes, pre.vocab_size
 
 # construct model
 model_args = {
@@ -78,28 +80,36 @@ model_args = {
 }
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
 
-# train model
+# construct trainer
 train_args = {
-    "epochs": 20,
-    "batch_size": 50,
+    "epochs": 3,
+    "batch_size": 16,
     "pickle_path": data_dir,
     "validate": False,
     "save_best_dev": False,
     "model_saved_path": None,
     "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
-trainer.train(model)
+    "loss": Loss("cross_entropy"),
+    "optimizer": Optimizer("Adam", lr=0.001)
+}
+trainer = ClassificationTrainer(**train_args)
+
+# start training
+trainer.train(model, train_data=train_set, dev_data=dev_set)
 
 # predict using model
-seqs = [x[0] for x in data]
+data_infer = [x[0] for x in data]
 infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, seqs)
+labels_pred = infer.predict(model.cpu(), data_infer)
+print(labels_pred)
 ```
 
 ## Installation
+Run the following command to install the fastNLP package.
+```shell
+pip install fastNLP
+```
 
 ### Cloning From GitHub
 
diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py
index c168822e..c819069f 100644
--- a/fastNLP/core/tester.py
+++ b/fastNLP/core/tester.py
@@ -86,7 +86,7 @@ class BaseTester(object):
         iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
 
         step = 0
-        for batch_x, batch_y in self.make_batch(iterator, dev_data):
+        for batch_x, batch_y in self.make_batch(iterator):
             with torch.no_grad():
                 prediction = self.data_forward(network, batch_x)
             eval_results = self.evaluate(prediction, batch_y)
@@ -123,14 +123,14 @@ class BaseTester(object):
         """Return a list of metrics. """
         raise NotImplementedError
 
-    def show_matrices(self):
+    def show_metrics(self):
         """This is called by Trainer to print evaluation results on dev set during training.
 
         :return print_str: str
         """
         raise NotImplementedError
 
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         raise NotImplementedError
 
 
@@ -194,7 +194,7 @@ class SeqLabelTester(BaseTester):
         batch_accuracy = np.mean([x[1] for x in self.eval_history])
         return batch_loss, batch_accuracy
 
-    def show_matrices(self):
+    def show_metrics(self):
         """
         This is called by Trainer to print evaluation on dev set.
:return print_str: str @@ -202,7 +202,7 @@ class SeqLabelTester(BaseTester): loss, accuracy = self.metrics() return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy) - def make_batch(self, iterator, data): + def make_batch(self, iterator): return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True) @@ -216,7 +216,7 @@ class ClassificationTester(BaseTester): """ super(ClassificationTester, self).__init__(**test_args) - def make_batch(self, iterator, data, max_len=None): + def make_batch(self, iterator, max_len=None): return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len) def data_forward(self, network, x): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 7fc34da0..ebba7975 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -144,7 +144,7 @@ class BaseTrainer(object): print("Saved better model selected by validation.") logger.info("Saved better model selected by validation.") - valid_results = validator.show_matrices() + valid_results = validator.show_metrics() print("[epoch {}] {}".format(epoch, valid_results)) logger.info("[epoch {}] {}".format(epoch, valid_results)) diff --git a/fastNLP/fastnlp.py b/fastNLP/fastnlp.py index 67204161..4a697e9a 100644 --- a/fastNLP/fastnlp.py +++ b/fastNLP/fastnlp.py @@ -31,7 +31,16 @@ FastNLP_MODEL_COLLECTION = { "type": "seq_label", "config_file_name": "config", "config_section_name": "text_class_model" + }, + "pos_tag_model": { + "url": "", + "class": "sequence_modeling.AdvSeqLabel", + "pickle": "pos_tag_model_v_0.pkl", + "type": "seq_label", + "config_file_name": "pos_tag.config", + "config_section_name": "pos_tag_model" } + } @@ -259,3 +268,38 @@ def interpret_word_seg_results(char_seq, label_seq): else: raise ValueError("invalid label {}".format(label[0])) return words + + +def interpret_cws_pos_results(char_seq, label_seq): + """Transform model output into user-friendly contents. + + :param char_seq: list of string + :param label_seq: list of string, the same length as char_seq. + :return outputs: list of tuple (words, pos_tag): + """ + + def pos_tag_check(seq): + """check whether all entries are the same """ + return len(set(seq)) <= 1 + + word = [] + word_pos = [] + outputs = [] + for char, label in zip(char_seq, label_seq): + tmp = label.split("-") + cws_label, pos_tag = tmp[0], tmp[1] + + if cws_label == "B" or cws_label == "M": + word.append(char) + word_pos.append(pos_tag) + elif cws_label == "E": + word.append(char) + word_pos.append(pos_tag) + if not pos_tag_check(word_pos): + raise RuntimeError("character-wise pos tags inconsistent. 
") + outputs.append(("".join(word), word_pos[0])) + word.clear() + word_pos.clear() + elif cws_label == "S": + outputs.append((char, pos_tag)) + return outputs diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index 13a96030..a2f42d19 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader): return text.strip().split() -if __name__ == "__main__": +class PeopleDailyCorpusLoader(DatasetLoader): """ - data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines() - for example in data: - for w, l in zip(example[0], example[1]): - print(w, l) + People Daily Corpus: Chinese word segmentation, POS tag, NER """ - ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku() - print(ans) + def __init__(self, data_path): + super(PeopleDailyCorpusLoader, self).__init__("people_daily_corpus", data_path) + + def load(self): + with open(self.data_path, "r", encoding="utf-8") as f: + sents = f.readlines() + + pos_tag_examples = [] + ner_examples = [] + for sent in sents: + inside_ne = False + sent_pos_tag = [] + sent_words = [] + sent_ner = [] + words = sent.strip().split()[1:] + for word in words: + if "[" in word and "]" in word: + ner_tag = "U" + print(word) + elif "[" in word: + inside_ne = True + ner_tag = "B" + word = word[1:] + elif "]" in word: + ner_tag = "L" + word = word[:word.index("]")] + if inside_ne is True: + inside_ne = False + else: + raise RuntimeError("only ] appears!") + else: + if inside_ne is True: + ner_tag = "I" + else: + ner_tag = "O" + tmp = word.split("/") + token, pos = tmp[0], tmp[1] + sent_ner.append(ner_tag) + sent_pos_tag.append(pos) + sent_words.append(token) + pos_tag_examples.append([sent_words, sent_pos_tag]) + ner_examples.append([sent_words, sent_ner]) + return pos_tag_examples, ner_examples + +if __name__ == "__main__": + loader = PeopleDailyCorpusLoader("/home/zyfeng/data/CWS_POS_TAG_NER_people_daily.txt") + pos, ner = loader.load() + print(pos[:10]) + print(ner[:10]) diff --git a/fastNLP/modules/decoder/__init__.py b/fastNLP/modules/decoder/__init__.py index 6c0e5141..7b8b2814 100644 --- a/fastNLP/modules/decoder/__init__.py +++ b/fastNLP/modules/decoder/__init__.py @@ -1,3 +1,4 @@ from .CRF import ConditionalRandomField +from .MLP import MLP -__all__ = ["ConditionalRandomField"] +__all__ = ["ConditionalRandomField", "MLP"] diff --git a/reproduction/chinese_word_seg/cws_train.py b/reproduction/chinese_word_seg/cws_train.py deleted file mode 100644 index b63a9401..00000000 --- a/reproduction/chinese_word_seg/cws_train.py +++ /dev/null @@ -1,114 +0,0 @@ -import sys - -sys.path.append("..") - -from fastNLP.loader.config_loader import ConfigLoader, ConfigSection -from fastNLP.core.trainer import SeqLabelTrainer -from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader -from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle -from fastNLP.saver.model_saver import ModelSaver -from fastNLP.loader.model_loader import ModelLoader -from fastNLP.core.tester import SeqLabelTester -from fastNLP.models.sequence_modeling import SeqLabeling -from fastNLP.core.predictor import Predictor - -data_name = "pku_training.utf8" -cws_data_path = "/home/zyfeng/data/pku_training.utf8" -pickle_path = "./save/" -data_infer_path = "/home/zyfeng/data/pku_test.utf8" - - -def infer(): - # Load infer configuration, the same as test - test_args = ConfigSection() - ConfigLoader("config.cfg", 
"").load_config("./data_for_tests/config", {"POS_test": test_args}) - - # fetch dictionary size and number of labels from pickle files - word2index = load_pickle(pickle_path, "word2id.pkl") - test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") - test_args["num_classes"] = len(index2label) - - # Define the same model - model = SeqLabeling(test_args) - - # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") - print("model loaded!") - - # Data Loader - raw_data_loader = BaseLoader(data_name, data_infer_path) - infer_data = raw_data_loader.load_lines() - - # Inference interface - infer = Predictor(pickle_path) - results = infer.predict(model, infer_data) - - print(results) - print("Inference finished!") - - -def train_test(): - # Config Loader - train_args = ConfigSection() - test_args = ConfigSection() - ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args}) - - # Data Loader - loader = TokenizeDatasetLoader(data_name, cws_data_path) - train_data = loader.load_pku() - - # Preprocessor - preprocess = SeqLabelPreprocess() - data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) - train_args["vocab_size"] = preprocess.vocab_size - train_args["num_classes"] = preprocess.num_classes - - # Trainer - trainer = SeqLabelTrainer(train_args) - - # Model - model = SeqLabeling(train_args) - - # Start training - trainer.train(model, data_train, data_dev) - print("Training finished!") - - # Saver - saver = ModelSaver("./save/saved_model.pkl") - saver.save_pytorch(model) - print("Model saved!") - - # testing with validation set - test(data_dev) - - -def test(test_data): - # Config Loader - train_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) - - # Define the same model - model = SeqLabeling(train_args) - - # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") - print("model loaded!") - - # Load test configuration - test_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) - - # Tester - tester = SeqLabelTester(test_args) - - # Start testing - tester.test(model, test_data) - - # print test results - print(tester.show_matrices()) - print("model tested!") - - -if __name__ == "__main__": - train_test() diff --git a/reproduction/chinese_word_segment/cws.cfg b/reproduction/chinese_word_segment/cws.cfg index ab799428..033d3967 100644 --- a/reproduction/chinese_word_segment/cws.cfg +++ b/reproduction/chinese_word_segment/cws.cfg @@ -31,4 +31,16 @@ pickle_path = "./save/" use_crf = true use_cuda = true rnn_hidden_units = 100 +word_emb_dim = 100 + +[model] +save_output = true +validate_in_training = true +save_dev_input = false +save_loss = true +batch_size = 640 +pickle_path = "./save/" +use_crf = true +use_cuda = true +rnn_hidden_units = 100 word_emb_dim = 100 \ No newline at end of file diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py index 66d01038..d08c9315 100644 --- a/reproduction/chinese_word_segment/run.py +++ b/reproduction/chinese_word_segment/run.py @@ -125,7 +125,7 @@ def test(): tester.test(model, dev_data) # print test results - print(tester.show_matrices()) + print(tester.show_metrics()) print("model tested!") diff --git a/reproduction/chinese_word_seg/cws.cfg 
b/reproduction/pos_tag_model/pos_tag.cfg similarity index 59% rename from reproduction/chinese_word_seg/cws.cfg rename to reproduction/pos_tag_model/pos_tag.cfg index cdcb4496..eb5e315d 100644 --- a/reproduction/chinese_word_seg/cws.cfg +++ b/reproduction/pos_tag_model/pos_tag.cfg @@ -1,29 +1,35 @@ [train] -epochs = 10 -batch_size = 32 +epochs = 30 +batch_size = 64 pickle_path = "./save/" validate = true save_best_dev = true model_saved_path = "./save/" rnn_hidden_units = 100 -rnn_layers = 2 -rnn_bi_direction = true word_emb_dim = 100 -dropout = 0.5 use_crf = true use_cuda = true +print_every_step = 10 [test] save_output = true validate_in_training = true save_dev_input = false save_loss = true -batch_size = 64 +batch_size = 640 +pickle_path = "./save/" +use_crf = true +use_cuda = true + + +[POS_test] +save_output = true +validate_in_training = true +save_dev_input = false +save_loss = true +batch_size = 640 pickle_path = "./save/" -rnn_hidden_units = 100 -rnn_layers = 1 -rnn_bi_direction = true -word_emb_dim = 100 -dropout = 0.5 use_crf = true use_cuda = true +rnn_hidden_units = 100 +word_emb_dim = 100 \ No newline at end of file diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/pos_tag_model/train_pos_tag.py new file mode 100644 index 00000000..822cba78 --- /dev/null +++ b/reproduction/pos_tag_model/train_pos_tag.py @@ -0,0 +1,146 @@ +import os +import sys + +sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) + +from fastNLP.loader.config_loader import ConfigLoader, ConfigSection +from fastNLP.core.trainer import SeqLabelTrainer +from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader +from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle +from fastNLP.saver.model_saver import ModelSaver +from fastNLP.loader.model_loader import ModelLoader +from fastNLP.core.tester import SeqLabelTester +from fastNLP.models.sequence_modeling import AdvSeqLabel +from fastNLP.core.predictor import SeqLabelInfer + +# not in the file's dir +if len(os.path.dirname(__file__)) != 0: + os.chdir(os.path.dirname(__file__)) +datadir = "/home/zyfeng/data/" +cfgfile = './pos_tag.cfg' +data_name = "CWS_POS_TAG_NER_people_daily.txt" + +pos_tag_data_path = os.path.join(datadir, data_name) +pickle_path = "save" +data_infer_path = os.path.join(datadir, "infer.utf8") + + +def infer(): + # Config Loader + test_args = ConfigSection() + ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) + + # fetch dictionary size and number of labels from pickle files + word2index = load_pickle(pickle_path, "word2id.pkl") + test_args["vocab_size"] = len(word2index) + index2label = load_pickle(pickle_path, "id2class.pkl") + test_args["num_classes"] = len(index2label) + + # Define the same model + model = AdvSeqLabel(test_args) + + try: + ModelLoader.load_pytorch(model, "./save/saved_model.pkl") + print('model loaded!') + except Exception as e: + print('cannot load model!') + raise + + # Data Loader + raw_data_loader = BaseLoader(data_name, data_infer_path) + infer_data = raw_data_loader.load_lines() + print('data loaded') + + # Inference interface + infer = SeqLabelInfer(pickle_path) + results = infer.predict(model, infer_data) + + print(results) + print("Inference finished!") + + +def train(): + # Config Loader + train_args = ConfigSection() + test_args = ConfigSection() + ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) + + # Data Loader + loader = PeopleDailyCorpusLoader(pos_tag_data_path) + 
train_data, _ = loader.load()
+
+    # Preprocessor
+    preprocessor = SeqLabelPreprocess()
+    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
+    train_args["vocab_size"] = preprocessor.vocab_size
+    train_args["num_classes"] = preprocessor.num_classes
+
+    # Trainer
+    trainer = SeqLabelTrainer(**train_args.data)
+
+    # Model
+    model = AdvSeqLabel(train_args)
+    try:
+        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+        print('model parameter loaded!')
+    except Exception as e:
+        print("No saved model. Continue.")
+        pass
+
+    # Start training
+    trainer.train(model, data_train, data_dev)
+    print("Training finished!")
+
+    # Saver
+    saver = ModelSaver("./save/saved_model.pkl")
+    saver.save_pytorch(model)
+    print("Model saved!")
+
+
+def test():
+    # Config Loader
+    test_args = ConfigSection()
+    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+
+    # fetch dictionary size and number of labels from pickle files
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+
+    # load dev data
+    dev_data = load_pickle(pickle_path, "data_dev.pkl")
+
+    # Define the same model
+    model = AdvSeqLabel(test_args)
+
+    # Dump trained parameters into the model
+    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+    print("model loaded!")
+
+    # Tester
+    tester = SeqLabelTester(**test_args.data)
+
+    # Start testing
+    tester.test(model, dev_data)
+
+    # print test results
+    print(tester.show_metrics())
+    print("model tested!")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Run a Chinese POS tagging model')
+    parser.add_argument('--mode', help='set the model\'s mode', choices=['train', 'test', 'infer'])
+    args = parser.parse_args()
+    if args.mode == 'train':
+        train()
+    elif args.mode == 'test':
+        test()
+    elif args.mode == 'infer':
+        infer()
+    else:
+        print('no mode specified for model!')
+        parser.print_help()
diff --git a/test/ner.py b/test/ner.py
index 150bd8c7..a310b6cf 100644
--- a/test/ner.py
+++ b/test/ner.py
@@ -68,7 +68,7 @@ class MyNERTester(SeqLabelTester):
     def metrics(self):
         return np.mean(self.eval_history)
 
-    def show_matrices(self):
+    def show_metrics(self):
         return "dev accuracy={:.2f}".format(float(self.metrics()))
 
 
diff --git a/test/readme_example.py b/test/readme_example.py
index 17ac92c2..a644b4e4 100644
--- a/test/readme_example.py
+++ b/test/readme_example.py
@@ -1,19 +1,13 @@
-# python: 3.5
-# pytorch: 0.4
-
-################
-# Test cross validation.
-################ - -from fastNLP.loader.preprocess import ClassPreprocess - +from fastNLP.core.loss import Loss +from fastNLP.core.optimizer import Optimizer from fastNLP.core.predictor import ClassificationInfer +from fastNLP.core.preprocess import ClassPreprocess from fastNLP.core.trainer import ClassificationTrainer from fastNLP.loader.dataset_loader import ClassDatasetLoader from fastNLP.models.base_model import BaseModel from fastNLP.modules import aggregation -from fastNLP.modules import encoder from fastNLP.modules import decoder +from fastNLP.modules import encoder class ClassificationModel(BaseModel): @@ -28,7 +22,7 @@ class ClassificationModel(BaseModel): self.enc = encoder.Conv( in_channels=300, out_channels=100, kernel_size=3) self.agg = aggregation.MaxPool() - self.dec = decoder.MLP(100, num_classes=num_classes) + self.dec = decoder.MLP(size_layer=[100, num_classes]) def forward(self, x): x = self.emb(x) # [N,L] -> [N,L,C] @@ -38,18 +32,17 @@ class ClassificationModel(BaseModel): return x -data_dir = 'data' # directory to save data and model -train_path = 'test/data_for_tests/text_classify.txt' # training set file +data_dir = 'save/' # directory to save data and model +train_path = './data_for_tests/text_classify.txt' # training set file # load dataset ds_loader = ClassDatasetLoader("train", train_path) data = ds_loader.load() # pre-process dataset -pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5) -# pre = ClassPreprocess(data, data_dir) -n_classes = pre.num_classes -vocab_size = pre.vocab_size +pre = ClassPreprocess() +train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir) +n_classes, vocab_size = pre.num_classes, pre.vocab_size # construct model model_args = { @@ -58,22 +51,25 @@ model_args = { } model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) -# train model +# construct trainer train_args = { - "epochs": 10, - "batch_size": 50, + "epochs": 3, + "batch_size": 16, "pickle_path": data_dir, "validate": False, "save_best_dev": False, "model_saved_path": None, "use_cuda": True, - "learn_rate": 1e-3, - "momentum": 0.9} -trainer = ClassificationTrainer(train_args) -# trainer.train(model, ['data_train.pkl', 'data_dev.pkl']) -trainer.cross_validate(model) + "loss": Loss("cross_entropy"), + "optimizer": Optimizer("Adam", lr=0.001) +} +trainer = ClassificationTrainer(**train_args) + +# start training +trainer.train(model, train_data=train_set, dev_data=dev_set) # predict using model data_infer = [x[0] for x in data] infer = ClassificationInfer(data_dir) -labels_pred = infer.predict(model, data_infer) \ No newline at end of file +labels_pred = infer.predict(model.cpu(), data_infer) +print(labels_pred) diff --git a/test/seq_labeling.py b/test/seq_labeling.py index a9488834..b1a2657d 100644 --- a/test/seq_labeling.py +++ b/test/seq_labeling.py @@ -134,7 +134,7 @@ def train_and_test(): tester.test(model, data_dev) # print test results - print(tester.show_matrices()) + print(tester.show_metrics()) print("model tested!") diff --git a/test/test_cws.py b/test/test_cws.py index bbbef67f..79911eeb 100644 --- a/test/test_cws.py +++ b/test/test_cws.py @@ -108,7 +108,7 @@ def train_test(): tester.test(model, data_train) # print test results - print(tester.show_matrices()) + print(tester.show_metrics()) print("model tested!") diff --git a/test/test_fastNLP.py b/test/test_fastNLP.py index 0776109a..467c51b4 100644 --- a/test/test_fastNLP.py +++ b/test/test_fastNLP.py @@ -1,9 +1,12 @@ import sys + sys.path.append("..") from 
fastNLP.fastnlp import FastNLP
-from fastNLP.fastnlp import interpret_word_seg_results
+from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results
 
 PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
+PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
+
 
 def word_seg():
     nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)
@@ -39,5 +42,33 @@ def test_word_seg_interpret():
     print(interpret_word_seg_results(chars, labels))
 
 
+def test_interpret_cws_pos_results():
+    foo = [
+        [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'),
+         ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'),
+         ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')]
+    ]
+    chars = [x[0] for x in foo[0]]
+    labels = [x[1] for x in foo[0]]
+    print(interpret_cws_pos_results(chars, labels))
+
+
+def test_pos_tag():
+    nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
+    nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
+    text = ["这是最好的基于深度学习的中文分词系统。",
+            "大王叫我来巡山。",
+            "我党多年来致力于改善人民生活水平。"]
+    results = nlp.run(text)
+    for example in results:
+        words, labels = [], []
+        for res in example:
+            words.append(res[0])
+            labels.append(res[1])
+        print(interpret_cws_pos_results(words, labels))
+
+
+
+
 if __name__ == "__main__":
     word_seg()
diff --git a/test/test_tester.py b/test/test_tester.py
index 1c2658ef..83f73790 100644
--- a/test/test_tester.py
+++ b/test/test_tester.py
@@ -5,7 +5,6 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
 from fastNLP.models.sequence_modeling import SeqLabeling
 
 data_name = "pku_training.utf8"
-cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
 pickle_path = "data_for_tests"
 
 
@@ -17,7 +16,8 @@ def foo():
     ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
 
     # Preprocessor
-    p = SeqLabelPreprocess(train_data, pickle_path)
+    p = SeqLabelPreprocess()
+    p.run(train_data)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
 
@@ -30,7 +30,7 @@ def foo():
     print("start validation.")
     validator.test(model)
-    print(validator.show_matrices())
+    print(validator.show_metrics())
 
 
 if __name__ == "__main__":

From 57911f771a5f703c795571b3fafe14ba598e7e9d Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Sun, 2 Sep 2018 13:32:57 +0800
Subject: [PATCH 2/4] - clean up unused code - improve code comments -
 BaseLoader & its subclasses no longer need a data name - update file tree -
 add setup.py

---
 README.md                                   |  66 ++--------
 fastNLP/core/loss.py                        |   2 +-
 fastNLP/core/predictor.py                   |   2 +-
 fastNLP/core/tester.py                      |  40 ++++--
 fastNLP/core/trainer.py                     |  27 +---
 fastNLP/loader/base_loader.py               |   7 +-
 fastNLP/loader/config_loader.py             |   4 +-
 fastNLP/loader/dataset_loader.py            |  26 ++--
 fastNLP/loader/embed_loader.py              |  46 ++++++-
 fastNLP/loader/model_loader.py              |   4 +-
 reproduction/chinese_word_segment/run.py    |  10 +-
 reproduction/pos_tag_model/train_pos_tag.py |   8 +-
 setup.py                                    |  24 ++++
 test/loader/test_loader.py                  |  15 +--
 test/ner.py                                 | 138 --------------------
 test/ner_decode.py                          | 129 ------------------
 test/readme_example.py                      |   2 +-
 test/seq_labeling.py                        |  14 +-
 test/test_cws.py                            |  14 +-
 test/test_tester.py                         |  10 +-
 test/text_classify.py                       |   4 +-
 21 files changed, 173 insertions(+), 419 deletions(-)
 delete mode 100644 test/ner.py
 delete mode 100644 test/ner_decode.py

diff --git a/README.md b/README.md
index b0ac20db..a9c4874b 100644
--- a/README.md
+++ 
b/README.md @@ -65,7 +65,7 @@ data_dir = 'save/' # directory to save data and model train_path = './data_for_tests/text_classify.txt' # training set file # load dataset -ds_loader = ClassDatasetLoader("train", train_path) +ds_loader = ClassDatasetLoader(train_path) data = ds_loader.load() # pre-process dataset @@ -135,14 +135,15 @@ pip3 install torch torchvision ``` FastNLP ├── docs -│   └── quick_tutorial.md ├── fastNLP -│   ├── action +│   ├── core │   │   ├── action.py -│   │   ├── inference.py │   │   ├── __init__.py +│   │   ├── loss.py │   │   ├── metrics.py │   │   ├── optimizer.py +│   │   ├── predictor.py +│   │   ├── preprocess.py │   │   ├── README.md │   │   ├── tester.py │   │   └── trainer.py @@ -154,71 +155,28 @@ FastNLP │   │   ├── dataset_loader.py │   │   ├── embed_loader.py │   │   ├── __init__.py -│   │   ├── model_loader.py -│   │   └── preprocess.py +│   │   └── model_loader.py │   ├── models -│   │   ├── base_model.py -│   │   ├── char_language_model.py -│   │   ├── cnn_text_classification.py -│   │   ├── __init__.py -│   │   └── sequence_modeling.py │   ├── modules │   │   ├── aggregation -│   │   │   ├── attention.py -│   │   │   ├── avg_pool.py -│   │   │   ├── __init__.py -│   │   │   ├── kmax_pool.py -│   │   │   ├── max_pool.py -│   │   │   └── self_attention.py │   │   ├── decoder -│   │   │   ├── CRF.py -│   │   │   └── __init__.py │   │   ├── encoder -│   │   │   ├── char_embedding.py -│   │   │   ├── conv_maxpool.py -│   │   │   ├── conv.py -│   │   │   ├── embedding.py -│   │   │   ├── __init__.py -│   │   │   ├── linear.py -│   │   │   ├── lstm.py -│   │   │   ├── masked_rnn.py -│   │   │   └── variational_rnn.py │   │   ├── __init__.py │   │   ├── interaction -│   │   │   └── __init__.py │   │   ├── other_modules.py │   │   └── utils.py │   └── saver -│   ├── base_saver.py -│   ├── __init__.py -│   ├── logger.py -│   └── model_saver.py ├── LICENSE ├── README.md ├── reproduction -│   ├── Char-aware_NLM -│   │   -│   ├── CNN-sentence_classification -│   │   -│   ├── HAN-document_classification -│   │   -│   └── LSTM+self_attention_sentiment_analysis -| ├── requirements.txt ├── setup.py └── test + ├── core ├── data_for_tests - │   ├── charlm.txt - │   ├── config - │   ├── cws_test - │   ├── cws_train - │   ├── people_infer.txt - │   └── people.txt - ├── test_charlm.py - ├── test_cws.py - ├── test_fastNLP.py - ├── test_loader.py - ├── test_seq_labeling.py - ├── test_tester.py - └── test_trainer.py + ├── __init__.py + ├── loader + ├── modules + └── readme_example.py + ``` diff --git a/fastNLP/core/loss.py b/fastNLP/core/loss.py index f83b4959..6a5cb349 100644 --- a/fastNLP/core/loss.py +++ b/fastNLP/core/loss.py @@ -9,7 +9,7 @@ class Loss(object): def __init__(self, args): if args is None: - # this is useful when + # this is useful when Trainer.__init__ performs type check self._loss = None elif isinstance(args, str): self._loss = self._borrow_from_pytorch(args) diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 03a6e43c..d04a6ef0 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -70,7 +70,7 @@ class Predictor(object): def predict(self, network, data): """Perform inference using the trained model. 
- :param network: a PyTorch model + :param network: a PyTorch model (cpu) :param data: list of list of strings :return: list of list of strings, [num_examples, tag_seq_length] """ diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index c819069f..c085f7a4 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -38,7 +38,7 @@ class BaseTester(object): Obviously, "required_args" is the subset of "default_args". The value in "default_args" to the keys in "required_args" is simply for type check. """ - # TODO: required arguments + # add required arguments here required_args = {} for req_key in required_args: @@ -56,7 +56,7 @@ class BaseTester(object): logger.error(msg) raise ValueError(msg) else: - # BeseTester doesn't care about extra arguments + # BaseTester doesn't care about extra arguments pass print(default_args) @@ -69,8 +69,8 @@ class BaseTester(object): self.print_every_step = default_args["print_every_step"] self._model = None - self.eval_history = [] - self.batch_output = [] + self.eval_history = [] # evaluation results of all batches + self.batch_output = [] # outputs of all batches def test(self, network, dev_data): if torch.cuda.is_available() and self.use_cuda: @@ -83,7 +83,7 @@ class BaseTester(object): self.eval_history.clear() self.batch_output.clear() - iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) + iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=False)) step = 0 for batch_x, batch_y in self.make_batch(iterator): @@ -99,7 +99,7 @@ class BaseTester(object): print_output = "[test step {}] {}".format(step, eval_results) logger.info(print_output) if self.print_every_step > 0 and step % self.print_every_step == 0: - print(print_output) + print(self.make_eval_output(prediction, eval_results)) step += 1 def mode(self, model, test): @@ -115,16 +115,28 @@ class BaseTester(object): raise NotImplementedError def evaluate(self, predict, truth): - """Compute evaluation metrics for the model. """ + """Compute evaluation metrics. + + :param predict: Tensor + :param truth: Tensor + :return eval_results: can be anything. It will be stored in self.eval_history + """ raise NotImplementedError @property def metrics(self): - """Return a list of metrics. """ + """Compute and return metrics. + Use self.eval_history to compute metrics over the whole dev set. + Please refer to metrics.py for common metric functions. + + :return : variable number of outputs + """ raise NotImplementedError def show_metrics(self): - """This is called by Trainer to print evaluation results on dev set during training. + """Customize evaluation outputs in Trainer. + Called by Trainer to print evaluation results on dev set during training. + Use self.metrics to fetch available metrics. :return print_str: str """ @@ -133,6 +145,14 @@ class BaseTester(object): def make_batch(self, iterator): raise NotImplementedError + def make_eval_output(self, predictions, eval_results): + """Customize Tester outputs. + + :param predictions: Tensor + :param eval_results: Tensor + :return: str, to be printed. + """ + raise NotImplementedError class SeqLabelTester(BaseTester): """ @@ -211,7 +231,7 @@ class ClassificationTester(BaseTester): def __init__(self, **test_args): """ - :param test_args: a dict-like object that has __getitem__ method, \ + :param test_args: a dict-like object that has __getitem__ method. 
can be accessed by "test_args["key_str"]" """ super(ClassificationTester, self).__init__(**test_args) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index ebba7975..5fb5b0dc 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -1,6 +1,4 @@ -import _pickle import copy -import os import time from datetime import timedelta @@ -15,16 +13,12 @@ from fastNLP.modules import utils from fastNLP.saver.logger import create_logger from fastNLP.saver.model_saver import ModelSaver -DEFAULT_QUEUE_SIZE = 300 logger = create_logger(__name__, "./train_test.log") class BaseTrainer(object): - """Operations to train a model, including data loading, SGD, and validation. + """Operations of training a model, including data loading, gradient descent, and validation. - Subclasses must implement the following abstract methods: - - grad_backward - - get_loss """ def __init__(self, **kwargs): @@ -47,7 +41,7 @@ class BaseTrainer(object): """ default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/", "save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1, - "loss": Loss(None), + "loss": Loss(None), # used to pass type check "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0) } """ @@ -56,7 +50,7 @@ class BaseTrainer(object): Obviously, "required_args" is the subset of "default_args". The value in "default_args" to the keys in "required_args" is simply for type check. """ - # TODO: required arguments + # add required arguments here required_args = {} for req_key in required_args: @@ -198,21 +192,6 @@ class BaseTrainer(object): network_copy = copy.deepcopy(network) self.train(network_copy, train_data_cv[i], dev_data_cv[i]) - def load_train_data(self, pickle_path): - """ - For task-specific processing. - :param pickle_path: - :return data_train - """ - file_path = os.path.join(pickle_path, "data_train.pkl") - if os.path.exists(file_path): - with open(file_path, 'rb') as f: - data = _pickle.load(f) - else: - logger.error("cannot find training data {}. 
invalid input path for training data.".format(file_path)) - raise RuntimeError("cannot find training data {}".format(file_path)) - return data - def make_batch(self, iterator): raise NotImplementedError diff --git a/fastNLP/loader/base_loader.py b/fastNLP/loader/base_loader.py index 45a379c1..808567fb 100644 --- a/fastNLP/loader/base_loader.py +++ b/fastNLP/loader/base_loader.py @@ -1,9 +1,8 @@ class BaseLoader(object): """docstring for BaseLoader""" - def __init__(self, data_name, data_path): + def __init__(self, data_path): super(BaseLoader, self).__init__() - self.data_name = data_name self.data_path = data_path def load(self): @@ -25,8 +24,8 @@ class ToyLoader0(BaseLoader): For charLM """ - def __init__(self, name, path): - super(ToyLoader0, self).__init__(name, path) + def __init__(self, data_path): + super(ToyLoader0, self).__init__(data_path) def load(self): with open(self.data_path, 'r') as f: diff --git a/fastNLP/loader/config_loader.py b/fastNLP/loader/config_loader.py index 9e3ebc1c..20d791c4 100644 --- a/fastNLP/loader/config_loader.py +++ b/fastNLP/loader/config_loader.py @@ -9,7 +9,7 @@ class ConfigLoader(BaseLoader): """loader for configuration files""" def __int__(self, data_name, data_path): - super(ConfigLoader, self).__init__(data_name, data_path) + super(ConfigLoader, self).__init__(data_path) self.config = self.parse(super(ConfigLoader, self).load()) @staticmethod @@ -100,7 +100,7 @@ class ConfigSection(object): if __name__ == "__main__": - config = ConfigLoader('configLoader', 'there is no data') + config = ConfigLoader('there is no data') section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()} """ diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index a2f42d19..2f03bd8a 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -6,8 +6,8 @@ from fastNLP.loader.base_loader import BaseLoader class DatasetLoader(BaseLoader): """"loader for data sets""" - def __init__(self, data_name, data_path): - super(DatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(DatasetLoader, self).__init__(data_path) class POSDatasetLoader(DatasetLoader): @@ -31,8 +31,8 @@ class POSDatasetLoader(DatasetLoader): to label5. 
""" - def __init__(self, data_name, data_path): - super(POSDatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(POSDatasetLoader, self).__init__(data_path) def load(self): assert os.path.exists(self.data_path) @@ -84,8 +84,8 @@ class TokenizeDatasetLoader(DatasetLoader): Data set loader for tokenization data sets """ - def __init__(self, data_name, data_path): - super(TokenizeDatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(TokenizeDatasetLoader, self).__init__(data_path) def load_pku(self, max_seq_len=32): """ @@ -138,8 +138,8 @@ class TokenizeDatasetLoader(DatasetLoader): class ClassDatasetLoader(DatasetLoader): """Loader for classification data sets""" - def __init__(self, data_name, data_path): - super(ClassDatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(ClassDatasetLoader, self).__init__(data_path) def load(self): assert os.path.exists(self.data_path) @@ -177,7 +177,7 @@ class ConllLoader(DatasetLoader): :param str data_name: the name of the conll data set :param str data_path: the path to the conll data set """ - super(ConllLoader, self).__init__(data_name, data_path) + super(ConllLoader, self).__init__(data_path) self.data_set = self.parse(self.load()) def load(self): @@ -209,8 +209,8 @@ class ConllLoader(DatasetLoader): class LMDatasetLoader(DatasetLoader): - def __init__(self, data_name, data_path): - super(LMDatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(LMDatasetLoader, self).__init__(data_path) def load(self): if not os.path.exists(self.data_path): @@ -226,7 +226,7 @@ class PeopleDailyCorpusLoader(DatasetLoader): """ def __init__(self, data_path): - super(PeopleDailyCorpusLoader, self).__init__("people_daily_corpus", data_path) + super(PeopleDailyCorpusLoader, self).__init__(data_path) def load(self): with open(self.data_path, "r", encoding="utf-8") as f: @@ -270,7 +270,7 @@ class PeopleDailyCorpusLoader(DatasetLoader): return pos_tag_examples, ner_examples if __name__ == "__main__": - loader = PeopleDailyCorpusLoader("/home/zyfeng/data/CWS_POS_TAG_NER_people_daily.txt") + loader = PeopleDailyCorpusLoader("./") pos, ner = loader.load() print(pos[:10]) print(ner[:10]) diff --git a/fastNLP/loader/embed_loader.py b/fastNLP/loader/embed_loader.py index 4b70dd0b..a84f6335 100644 --- a/fastNLP/loader/embed_loader.py +++ b/fastNLP/loader/embed_loader.py @@ -1,8 +1,50 @@ +import _pickle +import os + +import numpy as np + from fastNLP.loader.base_loader import BaseLoader class EmbedLoader(BaseLoader): """docstring for EmbedLoader""" - def __init__(self, data_name, data_path): - super(EmbedLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(EmbedLoader, self).__init__(data_path) + + @staticmethod + def load_embedding(emb_dim, emb_file, word_dict, emb_pkl): + """Load the pre-trained embedding and combine with the given dictionary. + + :param emb_file: str, the pre-trained embedding. + The embedding file should have the following format: + Each line is a word embedding, where a word string is followed by multiple floats. + Floats are separated by space. The word and the first float are separated by space. + :param word_dict: dict, a mapping from word to index. + :param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding. + :param emb_pkl: str, the embedding pickle file. 
+ :return embedding_np: numpy array of shape (len(word_dict), emb_dim) + + TODO: fragile code + """ + # If the embedding pickle exists, load it and return. + if os.path.exists(emb_pkl): + with open(emb_pkl, "rb") as f: + embedding_np = _pickle.load(f) + return embedding_np + # Otherwise, load the pre-trained embedding. + with open(emb_file, "r", encoding="utf-8") as f: + # begin with a random embedding + embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim)) + for line in f: + line = line.strip().split() + if len(line) != emb_dim + 1: + # skip this line if two embedding dimension not match + continue + if line[0] in word_dict: + # find the word and replace its embedding with a pre-trained one + embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]] + # save and return the result + with open(emb_pkl, "wb") as f: + _pickle.dump(embedding_np, f) + return embedding_np diff --git a/fastNLP/loader/model_loader.py b/fastNLP/loader/model_loader.py index 1e1d4f8f..c07576b8 100644 --- a/fastNLP/loader/model_loader.py +++ b/fastNLP/loader/model_loader.py @@ -8,8 +8,8 @@ class ModelLoader(BaseLoader): Loader for models. """ - def __init__(self, data_name, data_path): - super(ModelLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(ModelLoader, self).__init__(data_path) @staticmethod def load_pytorch(empty_model, model_path): diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py index d08c9315..d0a22e84 100644 --- a/reproduction/chinese_word_segment/run.py +++ b/reproduction/chinese_word_segment/run.py @@ -27,7 +27,7 @@ data_infer_path = os.path.join(datadir, "infer.utf8") def infer(): # Config Loader test_args = ConfigSection() - ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) + ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") @@ -47,7 +47,7 @@ def infer(): raise # Data Loader - raw_data_loader = BaseLoader(data_name, data_infer_path) + raw_data_loader = BaseLoader(data_infer_path) infer_data = raw_data_loader.load_lines() print('data loaded') @@ -63,10 +63,10 @@ def train(): # Config Loader train_args = ConfigSection() test_args = ConfigSection() - ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) + ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) # Data Loader - loader = TokenizeDatasetLoader(data_name, cws_data_path) + loader = TokenizeDatasetLoader(cws_data_path) train_data = loader.load_pku() # Preprocessor @@ -100,7 +100,7 @@ def train(): def test(): # Config Loader test_args = ConfigSection() - ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) + ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/pos_tag_model/train_pos_tag.py index 822cba78..87a9f7e8 100644 --- a/reproduction/pos_tag_model/train_pos_tag.py +++ b/reproduction/pos_tag_model/train_pos_tag.py @@ -28,7 +28,7 @@ data_infer_path = os.path.join(datadir, "infer.utf8") def infer(): # Config Loader test_args = ConfigSection() - ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) + ConfigLoader("config").load_config(cfgfile, {"POS_test": 
test_args}) # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") @@ -47,7 +47,7 @@ def infer(): raise # Data Loader - raw_data_loader = BaseLoader(data_name, data_infer_path) + raw_data_loader = BaseLoader(data_infer_path) infer_data = raw_data_loader.load_lines() print('data loaded') @@ -63,7 +63,7 @@ def train(): # Config Loader train_args = ConfigSection() test_args = ConfigSection() - ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) + ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args}) # Data Loader loader = PeopleDailyCorpusLoader(pos_tag_data_path) @@ -100,7 +100,7 @@ def train(): def test(): # Config Loader test_args = ConfigSection() - ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) + ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") diff --git a/setup.py b/setup.py index e69de29b..64e72c15 100644 --- a/setup.py +++ b/setup.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# coding=utf-8 +from setuptools import setup, find_packages + +with open('README.md') as f: + readme = f.read() + +with open('LICENSE') as f: + license = f.read() + +with open('requirements.txt') as f: + reqs = f.read() + +setup( + name='fastNLP', + version='1.0', + description=('fudan fastNLP '), + long_description=readme, + license=license, + author='fudanNLP', + python_requires='>=3.5', + packages=find_packages(), + install_requires=reqs.strip().split('\n'), +) diff --git a/test/loader/test_loader.py b/test/loader/test_loader.py index fe826a6f..d2f22166 100644 --- a/test/loader/test_loader.py +++ b/test/loader/test_loader.py @@ -1,13 +1,12 @@ -import os import configparser - import json +import os import unittest - from fastNLP.loader.config_loader import ConfigSection, ConfigLoader from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader + class TestConfigLoader(unittest.TestCase): def test_case_ConfigLoader(self): @@ -33,8 +32,8 @@ class TestConfigLoader(unittest.TestCase): return dict test_arg = ConfigSection() - ConfigLoader("config", "").load_config(os.path.join("./test/loader", "config"), {"test": test_arg}) - #ConfigLoader("config", "").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", + ConfigLoader("config").load_config(os.path.join("./test/loader", "config"), {"test": test_arg}) + # ConfigLoader("config").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", # {"test": test_arg}) #dict = read_section_from_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", "test") @@ -58,18 +57,18 @@ class TestConfigLoader(unittest.TestCase): class TestDatasetLoader(unittest.TestCase): def test_case_TokenizeDatasetLoader(self): - loader = TokenizeDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8") + loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8") data = loader.load_pku(max_seq_len=32) print("pass TokenizeDatasetLoader test!") def test_case_POSDatasetLoader(self): - loader = POSDatasetLoader("people", "./test/data_for_tests/people.txt") + loader = POSDatasetLoader("./test/data_for_tests/people.txt") data = loader.load() datas = loader.load_lines() print("pass POSDatasetLoader test!") def test_case_LMDatasetLoader(self): - loader = LMDatasetLoader("cws_pku_utf_8", 
"./test/data_for_tests/cws_pku_utf_8") + loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8") data = loader.load() datas = loader.load_lines() print("pass TokenizeDatasetLoader test!") diff --git a/test/ner.py b/test/ner.py deleted file mode 100644 index a310b6cf..00000000 --- a/test/ner.py +++ /dev/null @@ -1,138 +0,0 @@ -import _pickle -import os - -import numpy as np -import torch - -from fastNLP.core.preprocess import SeqLabelPreprocess -from fastNLP.core.tester import SeqLabelTester -from fastNLP.core.trainer import SeqLabelTrainer -from fastNLP.models.sequence_modeling import AdvSeqLabel - - -class MyNERTrainer(SeqLabelTrainer): - def __init__(self, train_args): - super(MyNERTrainer, self).__init__(train_args) - self.scheduler = None - - def define_optimizer(self): - """ - override - :return: - """ - self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001) - self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5) - - def update(self): - """ - override - :return: - """ - self.optimizer.step() - self.scheduler.step() - - def _create_validator(self, valid_args): - return MyNERTester(valid_args) - - def best_eval_result(self, validator): - accuracy = validator.metrics() - if accuracy > self.best_accuracy: - self.best_accuracy = accuracy - return True - else: - return False - - -class MyNERTester(SeqLabelTester): - def __init__(self, test_args): - super(MyNERTester, self).__init__(test_args) - - def _evaluate(self, prediction, batch_y, seq_len): - """ - :param prediction: [batch_size, seq_len, num_classes] - :param batch_y: [batch_size, seq_len] - :param seq_len: [batch_size] - :return: - """ - summ = 0 - correct = 0 - _, indices = torch.max(prediction, 2) - for p, y, l in zip(indices, batch_y, seq_len): - summ += l - correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy()) - return float(correct / summ) - - def evaluate(self, predict, truth): - return self._evaluate(predict, truth, self.seq_len) - - def metrics(self): - return np.mean(self.eval_history) - - def show_metrics(self): - return "dev accuracy={:.2f}".format(float(self.metrics())) - - -def embedding_process(emb_file, word_dict, emb_dim, emb_pkl): - if os.path.exists(emb_pkl): - with open(emb_pkl, "rb") as f: - embedding_np = _pickle.load(f) - return embedding_np - with open(emb_file, "r", encoding="utf-8") as f: - embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim)) - for line in f: - line = line.strip().split() - if len(line) != emb_dim + 1: - continue - if line[0] in word_dict: - embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]] - with open(emb_pkl, "wb") as f: - _pickle.dump(embedding_np, f) - return embedding_np - - -def data_load(data_file): - with open(data_file, "r", encoding="utf-8") as f: - all_data = [] - sent = [] - label = [] - for line in f: - line = line.strip().split() - - if not len(line) <= 1: - sent.append(line[0]) - label.append(line[1]) - else: - all_data.append([sent, label]) - sent = [] - label = [] - return all_data - - -data_path = "data_for_tests/people.txt" -pick_path = "data_for_tests/" -emb_path = "data_for_tests/emb50.txt" -save_path = "data_for_tests/" -if __name__ == "__main__": - data = data_load(data_path) - preprocess = SeqLabelPreprocess() - data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3) - # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl")) - emb = None - args = {"epochs": 20, - "batch_size": 1, - "pickle_path": 
pick_path, - "validate": True, - "save_best_dev": True, - "model_saved_path": save_path, - "use_cuda": True, - - "vocab_size": preprocess.vocab_size, - "num_classes": preprocess.num_classes, - "word_emb_dim": 50, - "rnn_hidden_units": 100 - } - # emb = torch.Tensor(emb).float().cuda() - networks = AdvSeqLabel(args, emb) - trainer = MyNERTrainer(args) - trainer.train(networks, data_train, data_dev) - print("Training finished!") diff --git a/test/ner_decode.py b/test/ner_decode.py deleted file mode 100644 index 5c09cbd2..00000000 --- a/test/ner_decode.py +++ /dev/null @@ -1,129 +0,0 @@ -import _pickle -import os - -import torch - -from fastNLP.core.predictor import SeqLabelInfer -from fastNLP.core.trainer import SeqLabelTrainer -from fastNLP.loader.model_loader import ModelLoader -from fastNLP.models.sequence_modeling import AdvSeqLabel - - -class Decode(SeqLabelTrainer): - def __init__(self, args): - super(Decode, self).__init__(args) - - def decoder(self, network, sents, model_path): - self.model = network - self.model.load_state_dict(torch.load(model_path)) - out_put = [] - self.mode(network, test=True) - for batch_x in sents: - prediction = self.data_forward(self.model, batch_x) - - seq_tag = self.model.prediction(prediction, batch_x[1]) - - out_put.append(list(seq_tag)[0]) - return out_put - - -def process_sent(sents, word2id): - sents_num = [] - for s in sents: - sent_num = [] - for c in s: - if c in word2id: - sent_num.append(word2id[c]) - else: - sent_num.append(word2id[""]) - sents_num.append(([sent_num], [len(sent_num)])) # batch_size is 1 - - return sents_num - - -def process_tag(sents, tags, id2class): - Tags = [] - for ttt in tags: - Tags.append([id2class[t] for t in ttt]) - - Segs = [] - PosNers = [] - for sent, tag in zip(sents, tags): - word__ = [] - lll__ = [] - for c, t in zip(sent, tag): - - t = id2class[t] - l = t.split("-") - split_ = l[0] - pn = l[1] - - if split_ == "S": - word__.append(c) - lll__.append(pn) - word_1 = "" - elif split_ == "E": - word_1 += c - word__.append(word_1) - lll__.append(pn) - word_1 = "" - elif split_ == "B": - word_1 = "" - word_1 += c - else: - word_1 += c - Segs.append(word__) - PosNers.append(lll__) - return Segs, PosNers - - -pickle_path = "data_for_tests/" -model_path = "data_for_tests/model_best_dev.pkl" -if __name__ == "__main__": - - with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f: - id2word = _pickle.load(f) - with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f: - word2id = _pickle.load(f) - with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f: - id2class = _pickle.load(f) - - sent = ["中共中央总书记、国家主席江泽民", - "逆向处理输入序列并返回逆序后的序列"] # here is input - - args = {"epochs": 1, - "batch_size": 1, - "pickle_path": "data_for_tests/", - "validate": True, - "save_best_dev": True, - "model_saved_path": "data_for_tests/", - "use_cuda": False, - - "vocab_size": len(word2id), - "num_classes": len(id2class), - "word_emb_dim": 50, - "rnn_hidden_units": 100, - } - """ - network = AdvSeqLabel(args, None) - decoder_ = Decode(args) - tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path) - output_seg, output_pn = process_tag(sent, tags_num, id2class) # here is output - print(output_seg) - print(output_pn) - """ - # Define the same model - model = AdvSeqLabel(args, None) - - # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl") - print("model loaded!") - - # Inference interface - infer = SeqLabelInfer(pickle_path) - sent = [[ch for 
-    results = infer.predict(model, sent)
-
-    for res in results:
-        print(res)
-    print("Inference finished!")
diff --git a/test/readme_example.py b/test/readme_example.py
index a644b4e4..bc50c48b 100644
--- a/test/readme_example.py
+++ b/test/readme_example.py
@@ -36,7 +36,7 @@ data_dir = 'save/' # directory to save data and model
 train_path = './data_for_tests/text_classify.txt' # training set file
 
 # load dataset
-ds_loader = ClassDatasetLoader("train", train_path)
+ds_loader = ClassDatasetLoader(train_path)
 data = ds_loader.load()
 
 # pre-process dataset
diff --git a/test/seq_labeling.py b/test/seq_labeling.py
index b1a2657d..0f7a072b 100644
--- a/test/seq_labeling.py
+++ b/test/seq_labeling.py
@@ -33,7 +33,7 @@ data_infer_path = args.infer
 def infer():
     # Load infer configuration, the same as test
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args})
+    ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args})
 
     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -49,7 +49,7 @@ def infer():
     print("model loaded!")
 
     # Data Loader
-    raw_data_loader = BaseLoader("xxx", data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
 
     # Inference interface
@@ -65,11 +65,11 @@ def infer():
 def train_and_test():
     # Config Loader
     trainer_args = ConfigSection()
     model_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {
+    ConfigLoader("config.cfg").load_config(config_dir, {
        "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})
 
     # Data Loader
-    pos_loader = POSDatasetLoader("xxx", data_path)
+    pos_loader = POSDatasetLoader(data_path)
     train_data = pos_loader.load_lines()
 
     # Preprocessor
@@ -117,7 +117,7 @@ def train_and_test():
 
     # Load test configuration
     tester_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args})
+    ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args})
 
     # Tester
     tester = SeqLabelTester(save_output=False,
@@ -139,5 +139,5 @@ def train_and_test():
 
 
 if __name__ == "__main__":
-    train_and_test()
-    # infer()
+    # train_and_test()
+    infer()
diff --git a/test/test_cws.py b/test/test_cws.py
index 79911eeb..802d97ba 100644
--- a/test/test_cws.py
+++ b/test/test_cws.py
@@ -22,7 +22,7 @@ data_infer_path = "data_for_tests/people_infer.txt"
 def infer():
     # Load infer configuration, the same as test
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})
 
     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -38,7 +38,7 @@ def infer():
     print("model loaded!")
 
     # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
     """
        Transform strings into list of list of strings.
@@ -61,10 +61,10 @@ def infer():
 def train_test():
     # Config Loader
     train_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})
 
     # Data Loader
-    loader = TokenizeDatasetLoader(data_name, cws_data_path)
+    loader = TokenizeDatasetLoader(cws_data_path)
     train_data = loader.load_pku()
 
     # Preprocessor
@@ -74,7 +74,7 @@ def train_test():
     train_args["num_classes"] = p.num_classes
 
     # Trainer
-    trainer = SeqLabelTrainer(train_args)
+    trainer = SeqLabelTrainer(**train_args.data)
 
     # Model
     model = SeqLabeling(train_args)
@@ -99,10 +99,10 @@ def train_test():
 
     # Load test configuration
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})
 
     # Tester
-    tester = SeqLabelTester(test_args)
+    tester = SeqLabelTester(**test_args.data)
 
     # Start testing
     tester.test(model, data_train)
diff --git a/test/test_tester.py b/test/test_tester.py
index 83f73790..e4ccf536 100644
--- a/test/test_tester.py
+++ b/test/test_tester.py
@@ -9,15 +9,15 @@ pickle_path = "data_for_tests"
 
 
 def foo():
-    loader = TokenizeDatasetLoader(data_name, "./data_for_tests/cws_pku_utf_8")
+    loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8")
     train_data = loader.load_pku()
 
     train_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})
 
     # Preprocessor
     p = SeqLabelPreprocess()
-    p.run(train_data)
+    train_data = p.run(train_data)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
 
@@ -26,10 +26,10 @@ def foo():
 
     valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
                   "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/",
                   "use_cuda": True}
-    validator = SeqLabelTester(valid_args)
+    validator = SeqLabelTester(**valid_args)
 
     print("start validation.")
-    validator.test(model)
+    validator.test(model, train_data)
     print(validator.show_metrics())
diff --git a/test/text_classify.py b/test/text_classify.py
index 64294d37..6ff3c059 100644
--- a/test/text_classify.py
+++ b/test/text_classify.py
@@ -34,7 +34,7 @@ config_dir = args.config
 def infer():
     # load dataset
     print("Loading data...")
-    ds_loader = ClassDatasetLoader("train", train_data_dir)
+    ds_loader = ClassDatasetLoader(train_data_dir)
     data = ds_loader.load()
     unlabeled_data = [x[0] for x in data]
 
@@ -69,7 +69,7 @@ def train():
 
     # load dataset
     print("Loading data...")
-    ds_loader = ClassDatasetLoader("train", train_data_dir)
+    ds_loader = ClassDatasetLoader(train_data_dir)
     data = ds_loader.load()
     print(data[0])
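The test updates in this patch all track the same two API changes: dataset loaders and `ConfigLoader` drop their unused name argument, and trainers/testers are now built from keyword arguments (`**config_section.data`) instead of a positional `ConfigSection`. Below is a minimal sketch of the new calling convention; the import paths are assumed from the modules these tests exercise, and the sample files are the ones under `test/data_for_tests/`:

```python
from fastNLP.core.preprocess import SeqLabelPreprocess
from fastNLP.core.tester import SeqLabelTester
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader

# Loaders now take only the data path; the old dataset-name argument is gone.
loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8")
train_data = loader.load_pku()

# ConfigLoader likewise loses its second positional argument.
test_args = ConfigSection()
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

# Preprocessing now returns the indexed data rather than mutating it in place.
preprocessor = SeqLabelPreprocess()
data_train = preprocessor.run(train_data)
test_args["vocab_size"] = preprocessor.vocab_size
test_args["num_classes"] = preprocessor.num_classes

# Testers unpack a plain dict of settings, and test() takes the data explicitly.
tester = SeqLabelTester(**test_args.data)
# tester.test(model, data_train)  # model: a trained SeqLabeling instance
```

Unpacking the config dict keeps the `.cfg` file as the single source of settings while making each one an explicit keyword argument at the call site.
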
["这是最好的基于深度学习的中文分词系统。", @@ -71,4 +71,4 @@ def test_pos_tag(): if __name__ == "__main__": - word_seg() + pos_tag() From 31eac4a795f90e87bfe5cff2df87eadeceffd688 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sun, 2 Sep 2018 14:33:23 +0800 Subject: [PATCH 4/4] fix bug in preprocessor: reported in issue 47 --- fastNLP/core/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/preprocess.py b/fastNLP/core/preprocess.py index 99bf45ba..1c419ce9 100644 --- a/fastNLP/core/preprocess.py +++ b/fastNLP/core/preprocess.py @@ -268,7 +268,7 @@ class ClassPreprocess(BasePreprocess): for word in sent: if word not in word2index: - word2index[word[0]] = len(word2index) + word2index[word] = len(word2index) return word2index, label2index def to_index(self, data):