From c83008add9068a9afbbb931a2579055692b2ef58 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Thu, 26 Jul 2018 22:19:30 +0800 Subject: [PATCH] fastnlp.py works, see test/test_fastNLP.py for high-level API --- fastNLP/action/inference.py | 10 +- fastNLP/{fastNLP.py => fastnlp.py} | 93 ++++++++++++-- fastNLP/loader/config_loader.py | 6 +- fastNLP/loader/dataset_loader.py | 51 ++++++++ test/data_for_tests/config | 13 +- test/test_cws.py | 115 ++++++++++++++++++ test/test_fastNLP.py | 14 +++ test/test_keras_like.py | 28 ----- ...t_POS_pipeline.py => test_seq_labeling.py} | 11 +- 9 files changed, 289 insertions(+), 52 deletions(-) rename fastNLP/{fastNLP.py => fastnlp.py} (51%) create mode 100644 test/test_cws.py create mode 100644 test/test_fastNLP.py delete mode 100644 test/test_keras_like.py rename test/{test_POS_pipeline.py => test_seq_labeling.py} (90%) diff --git a/fastNLP/action/inference.py b/fastNLP/action/inference.py index c0692f28..ce6a8b62 100644 --- a/fastNLP/action/inference.py +++ b/fastNLP/action/inference.py @@ -38,7 +38,7 @@ class Inference(object): num_iter = len(data) // self.batch_size for step in range(num_iter): - batch_x = self.batchify(data) + batch_x = self.make_batch(data) prediction = self.data_forward(network, batch_x) @@ -68,10 +68,11 @@ class Inference(object): results = torch.Tensor(prediction).view(-1, ) return list(results.data) - def batchify(self, data): + def make_batch(self, data): indices = next(self.iterator) batch_x = [data[idx] for idx in indices] - batch_x = self.pad(batch_x) + if self.batch_size > 1: + batch_x = self.pad(batch_x) return batch_x @staticmethod @@ -98,6 +99,7 @@ class Inference(object): ... ] """ + assert isinstance(data, list) data_index = [] default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL] for example in data: @@ -107,7 +109,7 @@ class Inference(object): def prepare_output(self, batch_outputs): """ Transform list of batch outputs into strings. - :param batch_outputs: list of list [num_batch, tag_seq_length] + :param batch_outputs: list of list, of shape [num_batch, tag_seq_length]. Element type is Tensor. :return: """ results = [] diff --git a/fastNLP/fastNLP.py b/fastNLP/fastnlp.py similarity index 51% rename from fastNLP/fastNLP.py rename to fastNLP/fastnlp.py index cfda830c..cb97aa53 100644 --- a/fastNLP/fastNLP.py +++ b/fastNLP/fastnlp.py @@ -3,14 +3,14 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection from fastNLP.loader.model_loader import ModelLoader """ -mapping from model name to [URL, file_name.class_name] +mapping from model name to [URL, file_name.class_name, model_pickle_name] Notice that the class of the model should be in "models" directory. Example: - "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"] + "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"] """ FastNLP_MODEL_COLLECTION = { - "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"] + "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"] } @@ -26,6 +26,13 @@ class FastNLP(object): """ def __init__(self, model_dir="./"): + """ + :param model_dir: this directory should contain the following files: + 1. a pre-trained model + 2. a config file + 3. "id2class.pkl" + 4. "word2id.pkl" + """ self.model_dir = model_dir self.model = None @@ -45,27 +52,32 @@ class FastNLP(object): model_args = ConfigSection() # To do: customized config file for model init parameters - ConfigLoader.load_config(self.model_dir + "default.cfg", model_args) + ConfigLoader.load_config(self.model_dir + "config", {"POS_infer": model_args}) + # Construct the model model = model_class(model_args) # To do: framework independent - ModelLoader.load_pytorch(model, self.model_dir + model_name) + ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name][2]) self.model = model print("Model loaded. ") - def run(self, infer_input): + def run(self, raw_input): """ Perform inference over given input using the loaded model. - :param infer_input: str, raw text + :param raw_input: str, raw text :return results: """ - infer = Inference() - data = infer.prepare_input(infer_input) - results = infer.predict(self.model, data) - return results + + infer = Inference(self.model_dir) + infer_input = self.string_to_list(raw_input) + + results = infer.predict(self.model, infer_input) + + outputs = self.make_output(results) + return outputs @staticmethod def _get_model_class(file_class_name): @@ -101,4 +113,61 @@ class FastNLP(object): Check whether the desired model is already in the directory. :param model_dir: """ - pass + return True + + def string_to_list(self, text, delimiter="\n"): + """ + For word seg only, currently. + This function is used to transform raw input to lists, which is done by DatasetLoader in training. + Split text string into three-level lists. + [ + [word_11, word_12, ...], + [word_21, word_22, ...], + ... + ] + :param text: string + :param delimiter: str, character used to split text into sentences. + :return data: three-level lists + """ + data = [] + sents = text.strip().split(delimiter) + for sent in sents: + characters = [] + for ch in sent: + characters.append(ch) + data.append(characters) + # To refactor: this is used in make_output + self.data = data + return data + + def make_output(self, results): + """ + Transform model output into user-friendly contents. + Example: In CWS, convert labeling into segmented text. + :param results: + :return: + """ + outputs = [] + for sent_char, sent_label in zip(self.data, results): + words = [] + word = "" + for char, label in zip(sent_char, sent_label): + if label[0] == "B": + if word != "": + words.append(word) + word = char + elif label[0] == "M": + word += char + elif label[0] == "E": + word += char + words.append(word) + word = "" + elif label[0] == "S": + if word != "": + words.append(word) + word = "" + words.append(char) + else: + raise ValueError("invalid label") + outputs.append(" ".join(words)) + return outputs diff --git a/fastNLP/loader/config_loader.py b/fastNLP/loader/config_loader.py index e3a856d9..d348e75e 100644 --- a/fastNLP/loader/config_loader.py +++ b/fastNLP/loader/config_loader.py @@ -20,9 +20,13 @@ class ConfigLoader(BaseLoader): def load_config(file_path, sections): """ :param file_path: the path of config file - :param sections: the dict of sections + :param sections: the dict of {section_name(string): Section instance} + Example: + test_args = ConfigSection() + ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) :return: """ + assert isinstance(sections, dict) cfg = configparser.ConfigParser() if not os.path.exists(file_path): raise FileNotFoundError("config file {} not found. ".format(file_path)) diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index dc5640f1..88ff151d 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -22,6 +22,7 @@ class POSDatasetLoader(DatasetLoader): and label2 Jerry label1 . label3 + (separated by an empty line) Hello label4 world label5 ! label3 @@ -77,6 +78,51 @@ class POSDatasetLoader(DatasetLoader): return data +class TokenizeDatasetLoader(DatasetLoader): + """ + Data set loader for tokenization data sets + """ + + def __init__(self, data_name, data_path): + super(TokenizeDatasetLoader, self).__init__(data_name, data_path) + + def load_pku(self): + """ + load pku dataset for Chinese word segmentation + CWS (Chinese Word Segmentation) pku training dataset format: + 1. Each line is a sentence. + 2. Each word in a sentence is separated by space. + This function convert the pku dataset into three-level lists with labels . + B: beginning of a word + M: middle of a word + E: ending of a word + S: single character + + :return: three-level lists + """ + with open(self.data_path, "r", encoding="utf-8") as f: + sentences = f.readlines() + data = [] + for sent in sentences: + words = [] + labels = [] + tokens = sent.strip().split() + for token in tokens: + if len(token) == 1: + words.append(token) + labels.append("S") + else: + words.append(token[0]) + labels.append("B") + for idx in range(1, len(token) - 1): + words.append(token[idx]) + labels.append("M") + words.append(token[-1]) + labels.append("E") + data.append([words, labels]) + return data + + class ClassDatasetLoader(DatasetLoader): """Loader for classification data sets""" @@ -163,7 +209,12 @@ class LMDatasetLoader(DatasetLoader): if __name__ == "__main__": + """ data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines() for example in data: for w, l in zip(example[0], example[1]): print(w, l) + """ + + ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku() + print(ans) diff --git a/test/data_for_tests/config b/test/data_for_tests/config index fad9d876..243ad1ff 100644 --- a/test/data_for_tests/config +++ b/test/data_for_tests/config @@ -54,8 +54,8 @@ test = 5 new_attr = 40 [POS] -epochs = 20 -batch_size = 1 +epochs = 1 +batch_size = 32 pickle_path = "./data_for_tests/" validate = true save_best_dev = true @@ -80,3 +80,12 @@ rnn_bi_direction = true word_emb_dim = 100 dropout = 0.5 use_crf = true + +[POS_infer] +pickle_path = "./data_for_tests/" +rnn_hidden_units = 100 +rnn_layers = 1 +rnn_bi_direction = true +word_emb_dim = 100 +vocab_size = 52 +num_classes = 22 \ No newline at end of file diff --git a/test/test_cws.py b/test/test_cws.py new file mode 100644 index 00000000..8cee7177 --- /dev/null +++ b/test/test_cws.py @@ -0,0 +1,115 @@ +import sys + +sys.path.append("..") + +from fastNLP.loader.config_loader import ConfigLoader, ConfigSection +from fastNLP.action.trainer import POSTrainer +from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader +from fastNLP.loader.preprocess import POSPreprocess, load_pickle +from fastNLP.saver.model_saver import ModelSaver +from fastNLP.loader.model_loader import ModelLoader +from fastNLP.action.tester import POSTester +from fastNLP.models.sequence_modeling import SeqLabeling +from fastNLP.action.inference import Inference + +data_name = "pku_training.utf8" +cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" +pickle_path = "data_for_tests" +data_infer_path = "data_for_tests/people_infer.txt" + + +def infer(): + # Load infer configuration, the same as test + test_args = ConfigSection() + ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) + + # fetch dictionary size and number of labels from pickle files + word2index = load_pickle(pickle_path, "word2id.pkl") + test_args["vocab_size"] = len(word2index) + index2label = load_pickle(pickle_path, "id2class.pkl") + test_args["num_classes"] = len(index2label) + + # Define the same model + model = SeqLabeling(test_args) + + # Dump trained parameters into the model + ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") + print("model loaded!") + + # Data Loader + raw_data_loader = BaseLoader(data_name, data_infer_path) + infer_data = raw_data_loader.load_lines() + """ + Transform strings into list of list of strings. + [ + [word_11, word_12, ...], + [word_21, word_22, ...], + ... + ] + In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them. + """ + + # Inference interface + infer = Inference(pickle_path) + results = infer.predict(model, infer_data) + + print(results) + print("Inference finished!") + + +def train_test(): + # Config Loader + train_args = ConfigSection() + ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) + + # Data Loader + loader = TokenizeDatasetLoader(data_name, cws_data_path) + train_data = loader.load_pku() + + # Preprocessor + p = POSPreprocess(train_data, pickle_path) + train_args["vocab_size"] = p.vocab_size + train_args["num_classes"] = p.num_classes + + # Trainer + trainer = POSTrainer(train_args) + + # Model + model = SeqLabeling(train_args) + + # Start training + trainer.train(model) + print("Training finished!") + + # Saver + saver = ModelSaver("./data_for_tests/saved_model.pkl") + saver.save_pytorch(model) + print("Model saved!") + + del model, trainer, loader + + # Define the same model + model = SeqLabeling(train_args) + + # Dump trained parameters into the model + ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") + print("model loaded!") + + # Load test configuration + test_args = ConfigSection() + ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) + + # Tester + tester = POSTester(test_args) + + # Start testing + tester.test(model) + + # print test results + print(tester.show_matrices()) + print("model tested!") + + +if __name__ == "__main__": + train_test() + # infer() diff --git a/test/test_fastNLP.py b/test/test_fastNLP.py new file mode 100644 index 00000000..35bac153 --- /dev/null +++ b/test/test_fastNLP.py @@ -0,0 +1,14 @@ +from fastNLP.fastnlp import FastNLP + + +def foo(): + nlp = FastNLP("./data_for_tests/") + nlp.load("zh_pos_tag_model") + text = "这是最好的基于深度学习的中文分词系统。" + result = nlp.run(text) + print(result) + print("FastNLP finished!") + + +if __name__ == "__main__": + foo() diff --git a/test/test_keras_like.py b/test/test_keras_like.py deleted file mode 100644 index 08f7d6ae..00000000 --- a/test/test_keras_like.py +++ /dev/null @@ -1,28 +0,0 @@ -import aggregation -import decoder -import encoder - - -class Input(object): - def __init__(self): - pass - - -class Trainer(object): - def __init__(self, input, target, truth): - pass - - def train(self): - pass - - -def test_keras_like(): - data_train, label_train = dataLoader("./data_path") - - x = Input() - x = encoder.LSTM(input=x) - x = aggregation.max_pool(input=x) - y = decoder.CRF(input=x) - - trainer = Trainer(input=data_train, target=y, truth=label_train) - trainer.train() diff --git a/test/test_POS_pipeline.py b/test/test_seq_labeling.py similarity index 90% rename from test/test_POS_pipeline.py rename to test/test_seq_labeling.py index fdf5de3e..9a5fa711 100644 --- a/test/test_POS_pipeline.py +++ b/test/test_seq_labeling.py @@ -23,7 +23,7 @@ def infer(): test_args = ConfigSection() ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) - # fetch dictinary size and number of labels from pickle files + # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] = len(word2index) index2label = load_pickle(pickle_path, "id2class.pkl") @@ -33,7 +33,7 @@ def infer(): model = SeqLabeling(test_args) # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./saved_model.pkl") + ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") print("model loaded!") # Data Loader @@ -82,7 +82,7 @@ def train_test(): print("Training finished!") # Saver - saver = ModelSaver("./saved_model.pkl") + saver = ModelSaver("./data_for_tests/saved_model.pkl") saver.save_pytorch(model) print("Model saved!") @@ -92,7 +92,7 @@ def train_test(): model = SeqLabeling(train_args) # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./saved_model.pkl") + ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") print("model loaded!") # Load test configuration @@ -111,4 +111,5 @@ def train_test(): if __name__ == "__main__": - infer() + train_test() + # infer()