From 32a036e8e6a2f38a6368189607063543c940ded7 Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Sat, 1 Sep 2018 21:33:28 +0800
Subject: [PATCH] [fix] drop the "data" argument from Tester.make_batch; rename
 "show_matrices" to "show_metrics"
[add] PeopleDailyCorpusLoader, to parse the People's Daily corpus
[update] add a CWS + POS-tag interface to FastNLP; see the example in
 test_fastNLP.py
[update] update README.md and readme_example.py to match the latest API.
---
 README.md                                   |  50 +++---
 fastNLP/core/tester.py                      |  12 +-
 fastNLP/core/trainer.py                     |   2 +-
 fastNLP/fastnlp.py                          |  44 ++++++
 fastNLP/loader/dataset_loader.py            |  58 ++++++-
 fastNLP/modules/decoder/__init__.py         |   3 +-
 reproduction/chinese_word_seg/cws_train.py  | 114 --------------
 reproduction/chinese_word_segment/cws.cfg   |  12 ++
 reproduction/chinese_word_segment/run.py    |   2 +-
 .../cws.cfg => pos_tag_model/pos_tag.cfg}   |  28 ++--
 reproduction/pos_tag_model/train_pos_tag.py | 146 ++++++++++++++++++
 test/ner.py                                 |   2 +-
 test/readme_example.py                      |  48 +++---
 test/seq_labeling.py                        |   2 +-
 test/test_cws.py                            |   2 +-
 test/test_fastNLP.py                        |  33 +++-
 test/test_tester.py                         |   6 +-
 17 files changed, 370 insertions(+), 194 deletions(-)
 delete mode 100644 reproduction/chinese_word_seg/cws_train.py
 rename reproduction/{chinese_word_seg/cws.cfg => pos_tag_model/pos_tag.cfg} (59%)
 create mode 100644 reproduction/pos_tag_model/train_pos_tag.py

diff --git a/README.md b/README.md
index a38771ee..b0ac20db 100644
--- a/README.md
+++ b/README.md
@@ -27,15 +27,16 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa
 A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model.
 ```python
+from fastNLP.core.preprocess import ClassPreprocess
+from fastNLP.core.predictor import ClassificationInfer
+from fastNLP.core.trainer import ClassificationTrainer
+from fastNLP.loader.dataset_loader import ClassDatasetLoader
 from fastNLP.models.base_model import BaseModel
-from fastNLP.modules import encoder
 from fastNLP.modules import aggregation
+from fastNLP.modules import encoder
 from fastNLP.modules import decoder
-
-from fastNLP.loader.dataset_loader import ClassDatasetLoader
-from fastNLP.loader.preprocess import ClassPreprocess
-from fastNLP.core.trainer import ClassificationTrainer
-from fastNLP.core.inference import ClassificationInfer
+from fastNLP.core.loss import Loss
+from fastNLP.core.optimizer import Optimizer
 
 
 class ClassificationModel(BaseModel):
@@ -50,7 +51,7 @@ class ClassificationModel(BaseModel):
         self.enc = encoder.Conv(
             in_channels=300, out_channels=100, kernel_size=3)
         self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
+        self.dec = decoder.MLP(size_layer=[100, num_classes])
 
     def forward(self, x):
         x = self.emb(x)  # [N,L] -> [N,L,C]
@@ -60,16 +61,17 @@ class ClassificationModel(BaseModel):
         return x
 
 
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+data_dir = 'save/'  # directory to save data and model
+train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
 ds_loader = ClassDatasetLoader("train", train_path)
 data = ds_loader.load()
 
 # pre-process dataset
-pre = ClassPreprocess(data_dir)
-vocab_size, n_classes = pre.process(data, "data_train.pkl")
+pre = ClassPreprocess()
+train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
+n_classes, vocab_size = pre.num_classes, pre.vocab_size
 
 # construct model
 model_args = {
     'num_classes': n_classes,
     'vocab_size': vocab_size
 }
 model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
 
-# train model
+# construct trainer
 train_args = {
-    "epochs": 20,
-    "batch_size": 50,
+    "epochs": 3,
+    "batch_size": 16,
     "pickle_path": data_dir,
     "validate": False,
     "save_best_dev": False,
     "model_saved_path": None,
     "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
-trainer.train(model)
+    "loss": Loss("cross_entropy"),
+    "optimizer": Optimizer("Adam", lr=0.001)
+}
+trainer = ClassificationTrainer(**train_args)
+
+# start training
+trainer.train(model, train_data=train_set, dev_data=dev_set)
 
 # predict using model
-seqs = [x[0] for x in data]
+data_infer = [x[0] for x in data]
 infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, seqs)
+labels_pred = infer.predict(model.cpu(), data_infer)
+print(labels_pred)
 ```
 
 ## Installation
+Run the following command to install the fastNLP package.
+```shell
+pip install fastNLP
+```
 
 ### Cloning From GitHub

diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py
index c168822e..c819069f 100644
--- a/fastNLP/core/tester.py
+++ b/fastNLP/core/tester.py
@@ -86,7 +86,7 @@ class BaseTester(object):
         iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
 
         step = 0
-        for batch_x, batch_y in self.make_batch(iterator, dev_data):
+        for batch_x, batch_y in self.make_batch(iterator):
             with torch.no_grad():
                 prediction = self.data_forward(network, batch_x)
                 eval_results = self.evaluate(prediction, batch_y)
@@ -123,14 +123,14 @@ class BaseTester(object):
         """Return a list of metrics. """
         raise NotImplementedError
 
-    def show_matrices(self):
+    def show_metrics(self):
         """This is called by Trainer to print evaluation results on dev set during training.
 
         :return print_str: str
         """
         raise NotImplementedError
 
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         raise NotImplementedError
 
 
@@ -194,7 +194,7 @@ class SeqLabelTester(BaseTester):
         batch_accuracy = np.mean([x[1] for x in self.eval_history])
         return batch_loss, batch_accuracy
 
-    def show_matrices(self):
+    def show_metrics(self):
         """
         This is called by Trainer to print evaluation on dev set.
         :return print_str: str
@@ -202,7 +202,7 @@ class SeqLabelTester(BaseTester):
         loss, accuracy = self.metrics()
         return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy)
 
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True)
 
 
@@ -216,7 +216,7 @@ class ClassificationTester(BaseTester):
         """
         super(ClassificationTester, self).__init__(**test_args)
 
-    def make_batch(self, iterator, data, max_len=None):
+    def make_batch(self, iterator, max_len=None):
         return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len)
 
     def data_forward(self, network, x):
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index 7fc34da0..ebba7975 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -144,7 +144,7 @@ class BaseTrainer(object):
                     print("Saved better model selected by validation.")
                     logger.info("Saved better model selected by validation.")
 
-                valid_results = validator.show_matrices()
+                valid_results = validator.show_metrics()
                 print("[epoch {}] {}".format(epoch, valid_results))
                 logger.info("[epoch {}] {}".format(epoch, valid_results))
 
diff --git a/fastNLP/fastnlp.py b/fastNLP/fastnlp.py
index 67204161..4a697e9a 100644
--- a/fastNLP/fastnlp.py
+++ b/fastNLP/fastnlp.py
@@ -31,7 +31,16 @@ FastNLP_MODEL_COLLECTION = {
         "type": "seq_label",
         "config_file_name": "config",
         "config_section_name": "text_class_model"
+    },
+    "pos_tag_model": {
+        "url": "",
+        "class": "sequence_modeling.AdvSeqLabel",
+        "pickle": "pos_tag_model_v_0.pkl",
+        "type": "seq_label",
+        "config_file_name": "pos_tag.config",
+        "config_section_name": "pos_tag_model"
     }
+
 }
 
 
@@ -259,3 +268,38 @@ def interpret_word_seg_results(char_seq, label_seq):
         else:
             raise ValueError("invalid label {}".format(label[0]))
     return words
+
+
+def interpret_cws_pos_results(char_seq, label_seq):
+    """Transform model output into user-friendly contents.
+
+    :param char_seq: list of string
+    :param label_seq: list of string, the same length as char_seq.
+    :return outputs: list of (word, pos_tag) tuples
+    """
+
+    def pos_tag_check(seq):
+        """Check whether all entries are the same."""
+        return len(set(seq)) <= 1
+
+    word = []
+    word_pos = []
+    outputs = []
+    for char, label in zip(char_seq, label_seq):
+        tmp = label.split("-")
+        cws_label, pos_tag = tmp[0], tmp[1]
+
+        if cws_label == "B" or cws_label == "M":
+            word.append(char)
+            word_pos.append(pos_tag)
+        elif cws_label == "E":
+            word.append(char)
+            word_pos.append(pos_tag)
+            if not pos_tag_check(word_pos):
+                raise RuntimeError("character-wise pos tags inconsistent within a word")
+            outputs.append(("".join(word), word_pos[0]))
+            word.clear()
+            word_pos.clear()
+        elif cws_label == "S":
+            outputs.append((char, pos_tag))
+    return outputs
diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index 13a96030..a2f42d19 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader):
         return text.strip().split()
 
 
-if __name__ == "__main__":
+class PeopleDailyCorpusLoader(DatasetLoader):
     """
-    data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
-    for example in data:
-        for w, l in zip(example[0], example[1]):
-            print(w, l)
+    People's Daily corpus: Chinese word segmentation, POS tagging and NER annotations
     """
-    ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
-    print(ans)
+
+    def __init__(self, data_path):
+        super(PeopleDailyCorpusLoader, self).__init__("people_daily_corpus", data_path)
+
+    def load(self):
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            sents = f.readlines()
+
+        pos_tag_examples = []
+        ner_examples = []
+        for sent in sents:
+            inside_ne = False
+            sent_pos_tag = []
+            sent_words = []
+            sent_ner = []
+            words = sent.strip().split()[1:]
+            for word in words:
+                if "[" in word and "]" in word:
+                    ner_tag = "U"
+                    word = word[1:word.index("]")]  # strip the NE brackets
+                elif "[" in word:
+                    inside_ne = True
+                    ner_tag = "B"
+                    word = word[1:]
+                elif "]" in word:
+                    ner_tag = "L"
+                    word = word[:word.index("]")]
+                    if inside_ne is True:
+                        inside_ne = False
+                    else:
+                        raise RuntimeError("found a closing \"]\" without a matching \"[\"")
+                else:
+                    if inside_ne is True:
+                        ner_tag = "I"
+                    else:
+                        ner_tag = "O"
+                tmp = word.split("/")
+                token, pos = tmp[0], tmp[1]
+                sent_ner.append(ner_tag)
+                sent_pos_tag.append(pos)
+                sent_words.append(token)
+            pos_tag_examples.append([sent_words, sent_pos_tag])
+            ner_examples.append([sent_words, sent_ner])
+        return pos_tag_examples, ner_examples
+
+if __name__ == "__main__":
+    loader = PeopleDailyCorpusLoader("/home/zyfeng/data/CWS_POS_TAG_NER_people_daily.txt")
+    pos, ner = loader.load()
+    print(pos[:10])
+    print(ner[:10])
diff --git a/fastNLP/modules/decoder/__init__.py b/fastNLP/modules/decoder/__init__.py
index 6c0e5141..7b8b2814 100644
--- a/fastNLP/modules/decoder/__init__.py
+++ b/fastNLP/modules/decoder/__init__.py
@@ -1,3 +1,4 @@
 from .CRF import ConditionalRandomField
+from .MLP import MLP
 
-__all__ = ["ConditionalRandomField"]
+__all__ = ["ConditionalRandomField", "MLP"]
diff --git a/reproduction/chinese_word_seg/cws_train.py b/reproduction/chinese_word_seg/cws_train.py
deleted file mode 100644
index b63a9401..00000000
--- a/reproduction/chinese_word_seg/cws_train.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import sys
-
-sys.path.append("..")
-
-from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
-from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
-from fastNLP.saver.model_saver import ModelSaver
-from fastNLP.loader.model_loader import ModelLoader
-from fastNLP.core.tester import SeqLabelTester
-from fastNLP.models.sequence_modeling import SeqLabeling
-from fastNLP.core.predictor import Predictor
-
-data_name = "pku_training.utf8"
-cws_data_path = "/home/zyfeng/data/pku_training.utf8"
-pickle_path = "./save/"
-data_infer_path = "/home/zyfeng/data/pku_test.utf8"
-
-
-def infer():
-    # Load infer configuration, the same as test
-    test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
"").load_config("./data_for_tests/config", {"POS_test": test_args}) - - # fetch dictionary size and number of labels from pickle files - word2index = load_pickle(pickle_path, "word2id.pkl") - test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") - test_args["num_classes"] = len(index2label) - - # Define the same model - model = SeqLabeling(test_args) - - # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") - print("model loaded!") - - # Data Loader - raw_data_loader = BaseLoader(data_name, data_infer_path) - infer_data = raw_data_loader.load_lines() - - # Inference interface - infer = Predictor(pickle_path) - results = infer.predict(model, infer_data) - - print(results) - print("Inference finished!") - - -def train_test(): - # Config Loader - train_args = ConfigSection() - test_args = ConfigSection() - ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args}) - - # Data Loader - loader = TokenizeDatasetLoader(data_name, cws_data_path) - train_data = loader.load_pku() - - # Preprocessor - preprocess = SeqLabelPreprocess() - data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) - train_args["vocab_size"] = preprocess.vocab_size - train_args["num_classes"] = preprocess.num_classes - - # Trainer - trainer = SeqLabelTrainer(train_args) - - # Model - model = SeqLabeling(train_args) - - # Start training - trainer.train(model, data_train, data_dev) - print("Training finished!") - - # Saver - saver = ModelSaver("./save/saved_model.pkl") - saver.save_pytorch(model) - print("Model saved!") - - # testing with validation set - test(data_dev) - - -def test(test_data): - # Config Loader - train_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) - - # Define the same model - model = SeqLabeling(train_args) - - # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") - print("model loaded!") - - # Load test configuration - test_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) - - # Tester - tester = SeqLabelTester(test_args) - - # Start testing - tester.test(model, test_data) - - # print test results - print(tester.show_matrices()) - print("model tested!") - - -if __name__ == "__main__": - train_test() diff --git a/reproduction/chinese_word_segment/cws.cfg b/reproduction/chinese_word_segment/cws.cfg index ab799428..033d3967 100644 --- a/reproduction/chinese_word_segment/cws.cfg +++ b/reproduction/chinese_word_segment/cws.cfg @@ -31,4 +31,16 @@ pickle_path = "./save/" use_crf = true use_cuda = true rnn_hidden_units = 100 +word_emb_dim = 100 + +[model] +save_output = true +validate_in_training = true +save_dev_input = false +save_loss = true +batch_size = 640 +pickle_path = "./save/" +use_crf = true +use_cuda = true +rnn_hidden_units = 100 word_emb_dim = 100 \ No newline at end of file diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py index 66d01038..d08c9315 100644 --- a/reproduction/chinese_word_segment/run.py +++ b/reproduction/chinese_word_segment/run.py @@ -125,7 +125,7 @@ def test(): tester.test(model, dev_data) # print test results - print(tester.show_matrices()) + print(tester.show_metrics()) print("model tested!") diff --git a/reproduction/chinese_word_seg/cws.cfg 
b/reproduction/pos_tag_model/pos_tag.cfg similarity index 59% rename from reproduction/chinese_word_seg/cws.cfg rename to reproduction/pos_tag_model/pos_tag.cfg index cdcb4496..eb5e315d 100644 --- a/reproduction/chinese_word_seg/cws.cfg +++ b/reproduction/pos_tag_model/pos_tag.cfg @@ -1,29 +1,35 @@ [train] -epochs = 10 -batch_size = 32 +epochs = 30 +batch_size = 64 pickle_path = "./save/" validate = true save_best_dev = true model_saved_path = "./save/" rnn_hidden_units = 100 -rnn_layers = 2 -rnn_bi_direction = true word_emb_dim = 100 -dropout = 0.5 use_crf = true use_cuda = true +print_every_step = 10 [test] save_output = true validate_in_training = true save_dev_input = false save_loss = true -batch_size = 64 +batch_size = 640 +pickle_path = "./save/" +use_crf = true +use_cuda = true + + +[POS_test] +save_output = true +validate_in_training = true +save_dev_input = false +save_loss = true +batch_size = 640 pickle_path = "./save/" -rnn_hidden_units = 100 -rnn_layers = 1 -rnn_bi_direction = true -word_emb_dim = 100 -dropout = 0.5 use_crf = true use_cuda = true +rnn_hidden_units = 100 +word_emb_dim = 100 \ No newline at end of file diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/pos_tag_model/train_pos_tag.py new file mode 100644 index 00000000..822cba78 --- /dev/null +++ b/reproduction/pos_tag_model/train_pos_tag.py @@ -0,0 +1,146 @@ +import os +import sys + +sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) + +from fastNLP.loader.config_loader import ConfigLoader, ConfigSection +from fastNLP.core.trainer import SeqLabelTrainer +from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader +from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle +from fastNLP.saver.model_saver import ModelSaver +from fastNLP.loader.model_loader import ModelLoader +from fastNLP.core.tester import SeqLabelTester +from fastNLP.models.sequence_modeling import AdvSeqLabel +from fastNLP.core.predictor import SeqLabelInfer + +# not in the file's dir +if len(os.path.dirname(__file__)) != 0: + os.chdir(os.path.dirname(__file__)) +datadir = "/home/zyfeng/data/" +cfgfile = './pos_tag.cfg' +data_name = "CWS_POS_TAG_NER_people_daily.txt" + +pos_tag_data_path = os.path.join(datadir, data_name) +pickle_path = "save" +data_infer_path = os.path.join(datadir, "infer.utf8") + + +def infer(): + # Config Loader + test_args = ConfigSection() + ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) + + # fetch dictionary size and number of labels from pickle files + word2index = load_pickle(pickle_path, "word2id.pkl") + test_args["vocab_size"] = len(word2index) + index2label = load_pickle(pickle_path, "id2class.pkl") + test_args["num_classes"] = len(index2label) + + # Define the same model + model = AdvSeqLabel(test_args) + + try: + ModelLoader.load_pytorch(model, "./save/saved_model.pkl") + print('model loaded!') + except Exception as e: + print('cannot load model!') + raise + + # Data Loader + raw_data_loader = BaseLoader(data_name, data_infer_path) + infer_data = raw_data_loader.load_lines() + print('data loaded') + + # Inference interface + infer = SeqLabelInfer(pickle_path) + results = infer.predict(model, infer_data) + + print(results) + print("Inference finished!") + + +def train(): + # Config Loader + train_args = ConfigSection() + test_args = ConfigSection() + ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) + + # Data Loader + loader = PeopleDailyCorpusLoader(pos_tag_data_path) + 
+
+    # Preprocessor
+    preprocessor = SeqLabelPreprocess()
+    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
+    train_args["vocab_size"] = preprocessor.vocab_size
+    train_args["num_classes"] = preprocessor.num_classes
+
+    # Trainer
+    trainer = SeqLabelTrainer(**train_args.data)
+
+    # Model
+    model = AdvSeqLabel(train_args)
+    try:
+        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+        print('model parameter loaded!')
+    except Exception:
+        print("No saved model. Continue.")
+        pass
+
+    # Start training
+    trainer.train(model, data_train, data_dev)
+    print("Training finished!")
+
+    # Saver
+    saver = ModelSaver("./save/saved_model.pkl")
+    saver.save_pytorch(model)
+    print("Model saved!")
+
+
+def test():
+    # Config Loader
+    test_args = ConfigSection()
+    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+
+    # fetch dictionary size and number of labels from pickle files
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+
+    # load dev data
+    dev_data = load_pickle(pickle_path, "data_dev.pkl")
+
+    # Define the same model
+    model = AdvSeqLabel(test_args)
+
+    # Dump trained parameters into the model
+    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+    print("model loaded!")
+
+    # Tester
+    tester = SeqLabelTester(**test_args.data)
+
+    # Start testing
+    tester.test(model, dev_data)
+
+    # print test results
+    print(tester.show_metrics())
+    print("model tested!")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Run a Chinese POS-tagging model')
+    parser.add_argument('--mode', help='set the program mode', choices=['train', 'test', 'infer'])
+    args = parser.parse_args()
+    if args.mode == 'train':
+        train()
+    elif args.mode == 'test':
+        test()
+    elif args.mode == 'infer':
+        infer()
+    else:
+        print('no mode specified!')
+        parser.print_help()
diff --git a/test/ner.py b/test/ner.py
index 150bd8c7..a310b6cf 100644
--- a/test/ner.py
+++ b/test/ner.py
@@ -68,7 +68,7 @@ class MyNERTester(SeqLabelTester):
     def metrics(self):
         return np.mean(self.eval_history)
 
-    def show_matrices(self):
+    def show_metrics(self):
         return "dev accuracy={:.2f}".format(float(self.metrics()))
 
 
diff --git a/test/readme_example.py b/test/readme_example.py
index 17ac92c2..a644b4e4 100644
--- a/test/readme_example.py
+++ b/test/readme_example.py
@@ -1,19 +1,13 @@
-# python: 3.5
-# pytorch: 0.4
-
-################
-# Test cross validation.
-################
-
-from fastNLP.loader.preprocess import ClassPreprocess
-
+from fastNLP.core.loss import Loss
+from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.predictor import ClassificationInfer
+from fastNLP.core.preprocess import ClassPreprocess
 from fastNLP.core.trainer import ClassificationTrainer
 from fastNLP.loader.dataset_loader import ClassDatasetLoader
 from fastNLP.models.base_model import BaseModel
 from fastNLP.modules import aggregation
-from fastNLP.modules import encoder
 from fastNLP.modules import decoder
+from fastNLP.modules import encoder
 
 
 class ClassificationModel(BaseModel):
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel):
         self.enc = encoder.Conv(
             in_channels=300, out_channels=100, kernel_size=3)
         self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
+        self.dec = decoder.MLP(size_layer=[100, num_classes])
 
     def forward(self, x):
         x = self.emb(x)  # [N,L] -> [N,L,C]
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel):
         return x
 
 
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+data_dir = 'save/'  # directory to save data and model
+train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
 ds_loader = ClassDatasetLoader("train", train_path)
 data = ds_loader.load()
 
 # pre-process dataset
-pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5)
-# pre = ClassPreprocess(data, data_dir)
-n_classes = pre.num_classes
-vocab_size = pre.vocab_size
+pre = ClassPreprocess()
+train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
+n_classes, vocab_size = pre.num_classes, pre.vocab_size
 
 # construct model
 model_args = {
@@ -58,22 +51,25 @@ model_args = {
 }
 model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
 
-# train model
+# construct trainer
 train_args = {
-    "epochs": 10,
-    "batch_size": 50,
+    "epochs": 3,
+    "batch_size": 16,
     "pickle_path": data_dir,
     "validate": False,
     "save_best_dev": False,
     "model_saved_path": None,
     "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
-# trainer.train(model, ['data_train.pkl', 'data_dev.pkl'])
-trainer.cross_validate(model)
+    "loss": Loss("cross_entropy"),
+    "optimizer": Optimizer("Adam", lr=0.001)
+}
+trainer = ClassificationTrainer(**train_args)
+
+# start training
+trainer.train(model, train_data=train_set, dev_data=dev_set)
 
 # predict using model
 data_infer = [x[0] for x in data]
 infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, data_infer)
\ No newline at end of file
+labels_pred = infer.predict(model.cpu(), data_infer)
+print(labels_pred)
diff --git a/test/seq_labeling.py b/test/seq_labeling.py
index a9488834..b1a2657d 100644
--- a/test/seq_labeling.py
+++ b/test/seq_labeling.py
@@ -134,7 +134,7 @@ def train_and_test():
     tester.test(model, data_dev)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
 
 
diff --git a/test/test_cws.py b/test/test_cws.py
index bbbef67f..79911eeb 100644
--- a/test/test_cws.py
+++ b/test/test_cws.py
@@ -108,7 +108,7 @@ def train_test():
     tester.test(model, data_train)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
 
 
diff --git a/test/test_fastNLP.py b/test/test_fastNLP.py
index 0776109a..467c51b4 100644
--- a/test/test_fastNLP.py
+++ b/test/test_fastNLP.py
@@ -1,9 +1,12 @@
 import sys
+
 sys.path.append("..")
 from fastNLP.fastnlp import FastNLP
-from fastNLP.fastnlp import interpret_word_seg_results
+from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results
 
 PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
+PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
+
 
 def word_seg():
     nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)
@@ -39,5 +42,33 @@ def test_word_seg_interpret():
     print(interpret_word_seg_results(chars, labels))
 
 
+def test_interpret_cws_pos_results():
+    foo = [
+        [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'),
+         ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'),
+         ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')]
+    ]
+    chars = [x[0] for x in foo[0]]
+    labels = [x[1] for x in foo[0]]
+    print(interpret_cws_pos_results(chars, labels))
+
+
+def test_pos_tag():
+    nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
+    nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
+    text = ["这是最好的基于深度学习的中文分词系统。",
+            "大王叫我来巡山。",
+            "我党多年来致力于改善人民生活水平。"]
+    results = nlp.run(text)
+    for example in results:
+        words, labels = [], []
+        for res in example:
+            words.append(res[0])
+            labels.append(res[1])
+        print(interpret_cws_pos_results(words, labels))
+
+
+
+
 if __name__ == "__main__":
     word_seg()
diff --git a/test/test_tester.py b/test/test_tester.py
index 1c2658ef..83f73790 100644
--- a/test/test_tester.py
+++ b/test/test_tester.py
@@ -5,7 +5,6 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
 from fastNLP.models.sequence_modeling import SeqLabeling
 
 data_name = "pku_training.utf8"
-cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
 pickle_path = "data_for_tests"
 
 
@@ -17,7 +16,8 @@ def foo():
     ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
 
     # Preprocessor
-    p = SeqLabelPreprocess(train_data, pickle_path)
+    p = SeqLabelPreprocess()
+    p.run(train_data)
    train_args["vocab_size"] = p.vocab_size
    train_args["num_classes"] = p.num_classes
 
@@ -30,7 +30,7 @@ def foo():
     print("start validation.")
     validator.test(model)
 
-    print(validator.show_matrices())
+    print(validator.show_metrics())
 
 
 if __name__ == "__main__":
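
Reviewer aid (not part of the patch): a minimal usage sketch of the new `interpret_cws_pos_results` helper added to fastNLP/fastnlp.py above. It only assumes this repository is on the Python path; no pickled model is needed because the function is pure. The labels follow the joint BMES + POS scheme exercised in test/test_fastNLP.py.

```python
# Usage sketch: decode joint CWS + POS labels into (word, pos_tag) pairs.
# Assumes the fastNLP package from this repository is importable.
from fastNLP.fastnlp import interpret_cws_pos_results

# One label per character: "S-x" is a single-character word with POS tag "x";
# "B-x" / "E-x" mark the beginning / end of a multi-character word.
chars = ["这", "是", "中", "文"]
labels = ["S-r", "S-v", "B-nz", "E-nz"]

# Prints [('这', 'r'), ('是', 'v'), ('中文', 'nz')]
print(interpret_cws_pos_results(chars, labels))
```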