@@ -23,9 +23,11 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa
 - [Documentation](https://fastnlp.readthedocs.io/en/latest/)
 - [Source Code](https://github.com/fastnlp/fastNLP)
 ## Installation
+Run the following commands to install fastNLP package.
+```shell
+pip install fastNLP
+```
 ### Cloning From GitHub
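As a quick sanity check after installing (a hypothetical snippet, not part of this diff):

```python
# Should succeed without errors once `pip install fastNLP` has finished.
import fastNLP
```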
@@ -51,14 +53,15 @@ pip3 install torch torchvision
 ```
 FastNLP
 ├── docs
+│   └── quick_tutorial.md
 ├── fastNLP
-│   ├── action
+│   ├── core
 │   │   ├── action.py
-│   │   ├── inference.py
 │   │   ├── __init__.py
+│   │   ├── loss.py
 │   │   ├── metrics.py
 │   │   ├── optimizer.py
+│   │   ├── predictor.py
+│   │   ├── preprocess.py
 │   │   ├── README.md
 │   │   ├── tester.py
 │   │   └── trainer.py
@@ -70,71 +73,28 @@ FastNLP
 │   │   ├── dataset_loader.py
 │   │   ├── embed_loader.py
 │   │   ├── __init__.py
-│   │   ├── model_loader.py
-│   │   └── preprocess.py
+│   │   └── model_loader.py
 │   ├── models
-│   │   ├── base_model.py
-│   │   ├── char_language_model.py
-│   │   ├── cnn_text_classification.py
-│   │   ├── __init__.py
-│   │   └── sequence_modeling.py
 │   ├── modules
 │   │   ├── aggregation
-│   │   │   ├── attention.py
-│   │   │   ├── avg_pool.py
-│   │   │   ├── __init__.py
-│   │   │   ├── kmax_pool.py
-│   │   │   ├── max_pool.py
-│   │   │   └── self_attention.py
 │   │   ├── decoder
-│   │   │   ├── CRF.py
-│   │   │   └── __init__.py
 │   │   ├── encoder
-│   │   │   ├── char_embedding.py
-│   │   │   ├── conv_maxpool.py
-│   │   │   ├── conv.py
-│   │   │   ├── embedding.py
-│   │   │   ├── __init__.py
-│   │   │   ├── linear.py
-│   │   │   ├── lstm.py
-│   │   │   ├── masked_rnn.py
-│   │   │   └── variational_rnn.py
 │   │   ├── __init__.py
 │   │   ├── interaction
-│   │   │   └── __init__.py
 │   │   ├── other_modules.py
 │   │   └── utils.py
 │   └── saver
-│       ├── base_saver.py
-│       ├── __init__.py
-│       ├── logger.py
-│       └── model_saver.py
 ├── LICENSE
 ├── README.md
 ├── reproduction
-│   ├── Char-aware_NLM
-│   │
-│   ├── CNN-sentence_classification
-│   │
-│   ├── HAN-document_classification
-│   │
-│   └── LSTM+self_attention_sentiment_analysis
-|
 ├── requirements.txt
 ├── setup.py
 └── test
+    ├── core
     ├── data_for_tests
-    │   ├── charlm.txt
-    │   ├── config
-    │   ├── cws_test
-    │   ├── cws_train
-    │   ├── people_infer.txt
-    │   └── people.txt
-    ├── test_charlm.py
-    ├── test_cws.py
-    ├── test_fastNLP.py
-    ├── test_loader.py
-    ├── test_seq_labeling.py
-    ├── test_tester.py
-    └── test_trainer.py
+    ├── __init__.py
+    ├── loader
+    ├── modules
+    └── readme_example.py
 ```
@@ -9,7 +9,7 @@ class Loss(object):
     def __init__(self, args):
         if args is None:
-            # this is useful when
+            # this is useful when Trainer.__init__ performs type check
            self._loss = None
         elif isinstance(args, str):
            self._loss = self._borrow_from_pytorch(args)
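For orientation, the two construction modes this comment refers to look like this in practice; a minimal sketch consistent with how `Loss` is used elsewhere in this changeset:

```python
from fastNLP.core.loss import Loss

# Placeholder: Trainer.__init__ only inspects its type; no loss is computed.
placeholder = Loss(None)

# Borrow a loss function from PyTorch by name; "cross_entropy" is the name
# used in the updated readme example later in this changeset.
cross_entropy = Loss("cross_entropy")
```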
@@ -70,7 +70,7 @@ class Predictor(object):
     def predict(self, network, data):
         """Perform inference using the trained model.
-        :param network: a PyTorch model
+        :param network: a PyTorch model (cpu)
         :param data: list of list of strings
         :return: list of list of strings, [num_examples, tag_seq_length]
         """
@@ -38,7 +38,7 @@ class BaseTester(object):
         Obviously, "required_args" is the subset of "default_args".
         The value in "default_args" to the keys in "required_args" is simply for type check.
         """
-        # TODO: required arguments
+        # add required arguments here
         required_args = {}
         for req_key in required_args:

@@ -56,7 +56,7 @@ class BaseTester(object):
                    logger.error(msg)
                    raise ValueError(msg)
             else:
-                # BeseTester doesn't care about extra arguments
+                # BaseTester doesn't care about extra arguments
                pass
         print(default_args)

@@ -69,8 +69,8 @@ class BaseTester(object):
         self.print_every_step = default_args["print_every_step"]
         self._model = None
-        self.eval_history = []
-        self.batch_output = []
+        self.eval_history = []  # evaluation results of all batches
+        self.batch_output = []  # outputs of all batches
     def test(self, network, dev_data):
         if torch.cuda.is_available() and self.use_cuda:

@@ -83,10 +83,10 @@ class BaseTester(object):
         self.eval_history.clear()
         self.batch_output.clear()
-        iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
+        iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=False))
         step = 0
-        for batch_x, batch_y in self.make_batch(iterator, dev_data):
+        for batch_x, batch_y in self.make_batch(iterator):
             with torch.no_grad():
                prediction = self.data_forward(network, batch_x)
                eval_results = self.evaluate(prediction, batch_y)

@@ -99,7 +99,7 @@ class BaseTester(object):
             print_output = "[test step {}] {}".format(step, eval_results)
             logger.info(print_output)
             if self.print_every_step > 0 and step % self.print_every_step == 0:
-                print(print_output)
+                print(self.make_eval_output(prediction, eval_results))
             step += 1
     def mode(self, model, test):

@@ -115,24 +115,44 @@ class BaseTester(object):
         raise NotImplementedError
     def evaluate(self, predict, truth):
-        """Compute evaluation metrics for the model. """
+        """Compute evaluation metrics.
+        :param predict: Tensor
+        :param truth: Tensor
+        :return eval_results: can be anything. It will be stored in self.eval_history
+        """
         raise NotImplementedError
     @property
     def metrics(self):
-        """Return a list of metrics. """
+        """Compute and return metrics.
+        Use self.eval_history to compute metrics over the whole dev set.
+        Please refer to metrics.py for common metric functions.
+        :return : variable number of outputs
+        """
         raise NotImplementedError
-    def show_matrices(self):
-        """This is called by Trainer to print evaluation results on dev set during training.
+    def show_metrics(self):
+        """Customize evaluation outputs in Trainer.
+        Called by Trainer to print evaluation results on dev set during training.
+        Use self.metrics to fetch available metrics.
         :return print_str: str
         """
         raise NotImplementedError
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         raise NotImplementedError
+    def make_eval_output(self, predictions, eval_results):
+        """Customize Tester outputs.
+        :param predictions: Tensor
+        :param eval_results: Tensor
+        :return: str, to be printed.
+        """
+        raise NotImplementedError
 class SeqLabelTester(BaseTester):
     """
@@ -194,7 +214,7 @@ class SeqLabelTester(BaseTester):
         batch_accuracy = np.mean([x[1] for x in self.eval_history])
         return batch_loss, batch_accuracy
-    def show_matrices(self):
+    def show_metrics(self):
         """
         This is called by Trainer to print evaluation on dev set.
         :return print_str: str

@@ -202,7 +222,7 @@ class SeqLabelTester(BaseTester):
         loss, accuracy = self.metrics()
         return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy)
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True)

@@ -211,12 +231,12 @@ class ClassificationTester(BaseTester):
     def __init__(self, **test_args):
         """
-        :param test_args: a dict-like object that has __getitem__ method, \
+        :param test_args: a dict-like object that has __getitem__ method.
            can be accessed by "test_args["key_str"]"
         """
         super(ClassificationTester, self).__init__(**test_args)
-    def make_batch(self, iterator, data, max_len=None):
+    def make_batch(self, iterator, max_len=None):
         return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len)
     def data_forward(self, network, x):
@@ -1,6 +1,4 @@
-import _pickle
 import copy
-import os
 import time
 from datetime import timedelta

@@ -15,16 +13,12 @@ from fastNLP.modules import utils
 from fastNLP.saver.logger import create_logger
 from fastNLP.saver.model_saver import ModelSaver
-DEFAULT_QUEUE_SIZE = 300
 logger = create_logger(__name__, "./train_test.log")
 class BaseTrainer(object):
-    """Operations to train a model, including data loading, SGD, and validation.
+    """Operations of training a model, including data loading, gradient descent, and validation.
-    Subclasses must implement the following abstract methods:
-    - grad_backward
-    - get_loss
     """
     def __init__(self, **kwargs):

@@ -47,7 +41,7 @@ class BaseTrainer(object):
         """
         default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/",
                         "save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1,
-                        "loss": Loss(None),
+                        "loss": Loss(None),  # used to pass type check
                         "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0)
                         }
         """

@@ -56,7 +50,7 @@ class BaseTrainer(object):
         Obviously, "required_args" is the subset of "default_args".
         The value in "default_args" to the keys in "required_args" is simply for type check.
         """
-        # TODO: required arguments
+        # add required arguments here
         required_args = {}
         for req_key in required_args:

@@ -144,7 +138,7 @@ class BaseTrainer(object):
                    print("Saved better model selected by validation.")
                    logger.info("Saved better model selected by validation.")
-                valid_results = validator.show_matrices()
+                valid_results = validator.show_metrics()
                print("[epoch {}] {}".format(epoch, valid_results))
                logger.info("[epoch {}] {}".format(epoch, valid_results))

@@ -198,21 +192,6 @@ class BaseTrainer(object):
            network_copy = copy.deepcopy(network)
            self.train(network_copy, train_data_cv[i], dev_data_cv[i])
-    def load_train_data(self, pickle_path):
-        """
-        For task-specific processing.
-        :param pickle_path:
-        :return data_train
-        """
-        file_path = os.path.join(pickle_path, "data_train.pkl")
-        if os.path.exists(file_path):
-            with open(file_path, 'rb') as f:
-                data = _pickle.load(f)
-        else:
-            logger.error("cannot find training data {}. invalid input path for training data.".format(file_path))
-            raise RuntimeError("cannot find training data {}".format(file_path))
-        return data
     def make_batch(self, iterator):
         raise NotImplementedError
@@ -31,7 +31,16 @@ FastNLP_MODEL_COLLECTION = {
         "type": "seq_label",
         "config_file_name": "config",
         "config_section_name": "text_class_model"
+    },
+    "pos_tag_model": {
+        "url": "",
+        "class": "sequence_modeling.AdvSeqLabel",
+        "pickle": "pos_tag_model_v_0.pkl",
+        "type": "seq_label",
+        "config_file_name": "pos_tag.config",
+        "config_section_name": "pos_tag_model"
     }
 }
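The new registry entry is plain data; a hypothetical lookup, just to show its shape (the surrounding API that consumes the collection is unchanged here):

```python
entry = FastNLP_MODEL_COLLECTION["pos_tag_model"]
print(entry["class"])             # "sequence_modeling.AdvSeqLabel"
print(entry["config_file_name"])  # "pos_tag.config"
```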
@@ -259,3 +268,38 @@ def interpret_word_seg_results(char_seq, label_seq):
         else:
            raise ValueError("invalid label {}".format(label[0]))
     return words
+def interpret_cws_pos_results(char_seq, label_seq):
+    """Transform model output into user-friendly contents.
+    :param char_seq: list of string
+    :param label_seq: list of string, the same length as char_seq.
+    :return outputs: list of tuple (words, pos_tag):
+    """
+    def pos_tag_check(seq):
+        """check whether all entries are the same """
+        return len(set(seq)) <= 1
+    word = []
+    word_pos = []
+    outputs = []
+    for char, label in zip(char_seq, label_seq):
+        tmp = label.split("-")
+        cws_label, pos_tag = tmp[0], tmp[1]
+        if cws_label == "B" or cws_label == "M":
+            word.append(char)
+            word_pos.append(pos_tag)
+        elif cws_label == "E":
+            word.append(char)
+            word_pos.append(pos_tag)
+            if not pos_tag_check(word_pos):
+                raise RuntimeError("character-wise pos tags inconsistent. ")
+            outputs.append(("".join(word), word_pos[0]))
+            word.clear()
+            word_pos.clear()
+        elif cws_label == "S":
+            outputs.append((char, pos_tag))
+    return outputs
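To illustrate the label format this function expects (`<cws_tag>-<pos_tag>`, with BMES segmentation tags), a hypothetical call:

```python
chars = ["迈", "向", "新", "年"]         # "迈向" and "新年" as two words
labels = ["B-v", "E-v", "B-t", "E-t"]    # illustrative labels
print(interpret_cws_pos_results(chars, labels))
# [('迈向', 'v'), ('新年', 't')]
```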
@@ -1,9 +1,8 @@
 class BaseLoader(object):
     """docstring for BaseLoader"""
-    def __init__(self, data_name, data_path):
+    def __init__(self, data_path):
         super(BaseLoader, self).__init__()
-        self.data_name = data_name
         self.data_path = data_path
     def load(self):

@@ -25,8 +24,8 @@ class ToyLoader0(BaseLoader):
         For charLM
     """
-    def __init__(self, name, path):
-        super(ToyLoader0, self).__init__(name, path)
+    def __init__(self, data_path):
+        super(ToyLoader0, self).__init__(data_path)
     def load(self):
         with open(self.data_path, 'r') as f:

@@ -9,7 +9,7 @@ class ConfigLoader(BaseLoader):
     """loader for configuration files"""
     def __int__(self, data_name, data_path):
-        super(ConfigLoader, self).__init__(data_name, data_path)
+        super(ConfigLoader, self).__init__(data_path)
         self.config = self.parse(super(ConfigLoader, self).load())
     @staticmethod

@@ -100,7 +100,7 @@ class ConfigSection(object):
 if __name__ == "__main__":
-    config = ConfigLoader('configLoader', 'there is no data')
+    config = ConfigLoader('there is no data')
     section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()}
     """

@@ -6,8 +6,8 @@ from fastNLP.loader.base_loader import BaseLoader
 class DatasetLoader(BaseLoader):
     """"loader for data sets"""
-    def __init__(self, data_name, data_path):
-        super(DatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(DatasetLoader, self).__init__(data_path)
 class POSDatasetLoader(DatasetLoader):

@@ -31,8 +31,8 @@ class POSDatasetLoader(DatasetLoader):
         to label5.
     """
-    def __init__(self, data_name, data_path):
-        super(POSDatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(POSDatasetLoader, self).__init__(data_path)
     def load(self):
         assert os.path.exists(self.data_path)

@@ -84,8 +84,8 @@ class TokenizeDatasetLoader(DatasetLoader):
     Data set loader for tokenization data sets
     """
-    def __init__(self, data_name, data_path):
-        super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(TokenizeDatasetLoader, self).__init__(data_path)
     def load_pku(self, max_seq_len=32):
         """

@@ -138,8 +138,8 @@ class TokenizeDatasetLoader(DatasetLoader):
 class ClassDatasetLoader(DatasetLoader):
     """Loader for classification data sets"""
-    def __init__(self, data_name, data_path):
-        super(ClassDatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(ClassDatasetLoader, self).__init__(data_path)
     def load(self):
         assert os.path.exists(self.data_path)

@@ -177,7 +177,7 @@ class ConllLoader(DatasetLoader):
         :param str data_name: the name of the conll data set
         :param str data_path: the path to the conll data set
         """
-        super(ConllLoader, self).__init__(data_name, data_path)
+        super(ConllLoader, self).__init__(data_path)
         self.data_set = self.parse(self.load())
     def load(self):

@@ -209,8 +209,8 @@ class ConllLoader(DatasetLoader):
 class LMDatasetLoader(DatasetLoader):
-    def __init__(self, data_name, data_path):
-        super(LMDatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(LMDatasetLoader, self).__init__(data_path)
     def load(self):
         if not os.path.exists(self.data_path):

@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader):
         return text.strip().split()
-if __name__ == "__main__":
+class PeopleDailyCorpusLoader(DatasetLoader):
     """
-    data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
-    for example in data:
-        for w, l in zip(example[0], example[1]):
-            print(w, l)
+    People Daily Corpus: Chinese word segmentation, POS tag, NER
     """
-    ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
-    print(ans)
+    def __init__(self, data_path):
+        super(PeopleDailyCorpusLoader, self).__init__(data_path)
+    def load(self):
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            sents = f.readlines()
+        pos_tag_examples = []
+        ner_examples = []
+        for sent in sents:
+            inside_ne = False
+            sent_pos_tag = []
+            sent_words = []
+            sent_ner = []
+            words = sent.strip().split()[1:]
+            for word in words:
+                if "[" in word and "]" in word:
+                    ner_tag = "U"
+                    print(word)
+                elif "[" in word:
+                    inside_ne = True
+                    ner_tag = "B"
+                    word = word[1:]
+                elif "]" in word:
+                    ner_tag = "L"
+                    word = word[:word.index("]")]
+                    if inside_ne is True:
+                        inside_ne = False
+                    else:
+                        raise RuntimeError("only ] appears!")
+                else:
+                    if inside_ne is True:
+                        ner_tag = "I"
+                    else:
+                        ner_tag = "O"
+                tmp = word.split("/")
+                token, pos = tmp[0], tmp[1]
+                sent_ner.append(ner_tag)
+                sent_pos_tag.append(pos)
+                sent_words.append(token)
+            pos_tag_examples.append([sent_words, sent_pos_tag])
+            ner_examples.append([sent_words, sent_ner])
+        return pos_tag_examples, ner_examples
+if __name__ == "__main__":
+    loader = PeopleDailyCorpusLoader("./")
+    pos, ner = loader.load()
+    print(pos[:10])
+    print(ner[:10])
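For reference, an illustrative input line in People Daily format (an ID token, then `word/POS` tokens, with a bracketed named entity) and what `load()` makes of it; the sample line and path are hypothetical:

```python
# Input line: "19980101-01-001-001/m  迈向/v  [中央/n  人民/n  广播/vn  电台/n]nt"
loader = PeopleDailyCorpusLoader("people_daily_sample.txt")  # path is illustrative
pos, ner = loader.load()
# pos[0] == [["迈向", "中央", "人民", "广播", "电台"], ["v", "n", "n", "vn", "n"]]
# ner[0] == [["迈向", "中央", "人民", "广播", "电台"], ["O", "B", "I", "I", "L"]]
```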
@@ -1,8 +1,50 @@
+import _pickle
+import os
+import numpy as np
 from fastNLP.loader.base_loader import BaseLoader
 class EmbedLoader(BaseLoader):
     """docstring for EmbedLoader"""
-    def __init__(self, data_name, data_path):
-        super(EmbedLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(EmbedLoader, self).__init__(data_path)
+    @staticmethod
+    def load_embedding(emb_dim, emb_file, word_dict, emb_pkl):
+        """Load the pre-trained embedding and combine with the given dictionary.
+        :param emb_file: str, the pre-trained embedding.
+        The embedding file should have the following format:
+        Each line is a word embedding, where a word string is followed by multiple floats.
+        Floats are separated by space. The word and the first float are separated by space.
+        :param word_dict: dict, a mapping from word to index.
+        :param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding.
+        :param emb_pkl: str, the embedding pickle file.
+        :return embedding_np: numpy array of shape (len(word_dict), emb_dim)
+        TODO: fragile code
+        """
+        # If the embedding pickle exists, load it and return.
+        if os.path.exists(emb_pkl):
+            with open(emb_pkl, "rb") as f:
+                embedding_np = _pickle.load(f)
+            return embedding_np
+        # Otherwise, load the pre-trained embedding.
+        with open(emb_file, "r", encoding="utf-8") as f:
+            # begin with a random embedding
+            embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
+            for line in f:
+                line = line.strip().split()
+                if len(line) != emb_dim + 1:
+                    # skip this line if two embedding dimension not match
+                    continue
+                if line[0] in word_dict:
+                    # find the word and replace its embedding with a pre-trained one
+                    embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
+        # save and return the result
+        with open(emb_pkl, "wb") as f:
+            _pickle.dump(embedding_np, f)
+        return embedding_np
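A usage sketch for the new static method (file names are illustrative):

```python
word_dict = {"the": 0, "cat": 1, "<unk>": 2}
emb = EmbedLoader.load_embedding(
    emb_dim=50,
    emb_file="./embeddings_50d.txt",  # one "word f1 ... f50" per line
    word_dict=word_dict,
    emb_pkl="./save/embedding.pkl",   # cached on first call, reused afterwards
)
print(emb.shape)  # (3, 50)
```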
@@ -8,8 +8,8 @@ class ModelLoader(BaseLoader):
     Loader for models.
     """
-    def __init__(self, data_name, data_path):
-        super(ModelLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(ModelLoader, self).__init__(data_path)
     @staticmethod
     def load_pytorch(empty_model, model_path):

@@ -1,3 +1,4 @@
 from .CRF import ConditionalRandomField
+from .MLP import MLP
-__all__ = ["ConditionalRandomField"]
+__all__ = ["ConditionalRandomField", "MLP"]
@@ -1,114 +0,0 @@
-import sys
-sys.path.append("..")
-from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
-from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
-from fastNLP.saver.model_saver import ModelSaver
-from fastNLP.loader.model_loader import ModelLoader
-from fastNLP.core.tester import SeqLabelTester
-from fastNLP.models.sequence_modeling import SeqLabeling
-from fastNLP.core.predictor import Predictor
-data_name = "pku_training.utf8"
-cws_data_path = "/home/zyfeng/data/pku_training.utf8"
-pickle_path = "./save/"
-data_infer_path = "/home/zyfeng/data/pku_test.utf8"
-def infer():
-    # Load infer configuration, the same as test
-    test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
-    # fetch dictionary size and number of labels from pickle files
-    word2index = load_pickle(pickle_path, "word2id.pkl")
-    test_args["vocab_size"] = len(word2index)
-    index2label = load_pickle(pickle_path, "id2class.pkl")
-    test_args["num_classes"] = len(index2label)
-    # Define the same model
-    model = SeqLabeling(test_args)
-    # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
-    print("model loaded!")
-    # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
-    infer_data = raw_data_loader.load_lines()
-    # Inference interface
-    infer = Predictor(pickle_path)
-    results = infer.predict(model, infer_data)
-    print(results)
-    print("Inference finished!")
-def train_test():
-    # Config Loader
-    train_args = ConfigSection()
-    test_args = ConfigSection()
-    ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args})
-    # Data Loader
-    loader = TokenizeDatasetLoader(data_name, cws_data_path)
-    train_data = loader.load_pku()
-    # Preprocessor
-    preprocess = SeqLabelPreprocess()
-    data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
-    train_args["vocab_size"] = preprocess.vocab_size
-    train_args["num_classes"] = preprocess.num_classes
-    # Trainer
-    trainer = SeqLabelTrainer(train_args)
-    # Model
-    model = SeqLabeling(train_args)
-    # Start training
-    trainer.train(model, data_train, data_dev)
-    print("Training finished!")
-    # Saver
-    saver = ModelSaver("./save/saved_model.pkl")
-    saver.save_pytorch(model)
-    print("Model saved!")
-    # testing with validation set
-    test(data_dev)
-def test(test_data):
-    # Config Loader
-    train_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
-    # Define the same model
-    model = SeqLabeling(train_args)
-    # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
-    print("model loaded!")
-    # Load test configuration
-    test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
-    # Tester
-    tester = SeqLabelTester(test_args)
-    # Start testing
-    tester.test(model, test_data)
-    # print test results
-    print(tester.show_matrices())
-    print("model tested!")
-if __name__ == "__main__":
-    train_test()
@@ -31,4 +31,16 @@ pickle_path = "./save/"
 use_crf = true
 use_cuda = true
 rnn_hidden_units = 100
+word_emb_dim = 100
+[model]
+save_output = true
+validate_in_training = true
+save_dev_input = false
+save_loss = true
+batch_size = 640
+pickle_path = "./save/"
+use_crf = true
+use_cuda = true
+rnn_hidden_units = 100
 word_emb_dim = 100
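Config sections like the new `[model]` block are read through `ConfigSection`/`ConfigLoader`, as elsewhere in this changeset; a sketch with an illustrative file name:

```python
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

model_args = ConfigSection()
# Populate model_args from the new [model] section of the config file.
ConfigLoader("config").load_config("./pos_tag.cfg", {"model": model_args})
```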
@@ -27,7 +27,7 @@ data_infer_path = os.path.join(datadir, "infer.utf8")
 def infer():
     # Config Loader
     test_args = ConfigSection()
-    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")

@@ -47,7 +47,7 @@ def infer():
         raise
     # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
     print('data loaded')

@@ -63,10 +63,10 @@ def train():
     # Config Loader
     train_args = ConfigSection()
     test_args = ConfigSection()
-    ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
+    ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
     # Data Loader
-    loader = TokenizeDatasetLoader(data_name, cws_data_path)
+    loader = TokenizeDatasetLoader(cws_data_path)
     train_data = loader.load_pku()
     # Preprocessor

@@ -100,7 +100,7 @@ def train():
 def test():
     # Config Loader
     test_args = ConfigSection()
-    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")

@@ -125,7 +125,7 @@ def test():
     tester.test(model, dev_data)
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
@@ -1,29 +1,35 @@
 [train]
-epochs = 10
-batch_size = 32
+epochs = 30
+batch_size = 64
 pickle_path = "./save/"
 validate = true
 save_best_dev = true
 model_saved_path = "./save/"
 rnn_hidden_units = 100
-rnn_layers = 2
-rnn_bi_direction = true
 word_emb_dim = 100
-dropout = 0.5
 use_crf = true
 use_cuda = true
+print_every_step = 10
 [test]
 save_output = true
 validate_in_training = true
 save_dev_input = false
 save_loss = true
-batch_size = 64
+batch_size = 640
+pickle_path = "./save/"
+use_crf = true
+use_cuda = true
+[POS_test]
+save_output = true
+validate_in_training = true
+save_dev_input = false
+save_loss = true
+batch_size = 640
 pickle_path = "./save/"
-rnn_hidden_units = 100
-rnn_layers = 1
-rnn_bi_direction = true
-word_emb_dim = 100
-dropout = 0.5
 use_crf = true
 use_cuda = true
+rnn_hidden_units = 100
+word_emb_dim = 100
@@ -0,0 +1,146 @@
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
+from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader
+from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
+from fastNLP.saver.model_saver import ModelSaver
+from fastNLP.loader.model_loader import ModelLoader
+from fastNLP.core.tester import SeqLabelTester
+from fastNLP.models.sequence_modeling import AdvSeqLabel
+from fastNLP.core.predictor import SeqLabelInfer
+# not in the file's dir
+if len(os.path.dirname(__file__)) != 0:
+    os.chdir(os.path.dirname(__file__))
+datadir = "/home/zyfeng/data/"
+cfgfile = './pos_tag.cfg'
+data_name = "CWS_POS_TAG_NER_people_daily.txt"
+pos_tag_data_path = os.path.join(datadir, data_name)
+pickle_path = "save"
+data_infer_path = os.path.join(datadir, "infer.utf8")
+def infer():
+    # Config Loader
+    test_args = ConfigSection()
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
+    # fetch dictionary size and number of labels from pickle files
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+    # Define the same model
+    model = AdvSeqLabel(test_args)
+    try:
+        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+        print('model loaded!')
+    except Exception as e:
+        print('cannot load model!')
+        raise
+    # Data Loader
+    raw_data_loader = BaseLoader(data_infer_path)
+    infer_data = raw_data_loader.load_lines()
+    print('data loaded')
+    # Inference interface
+    infer = SeqLabelInfer(pickle_path)
+    results = infer.predict(model, infer_data)
+    print(results)
+    print("Inference finished!")
+def train():
+    # Config Loader
+    train_args = ConfigSection()
+    test_args = ConfigSection()
+    ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args})
+    # Data Loader
+    loader = PeopleDailyCorpusLoader(pos_tag_data_path)
+    train_data, _ = loader.load()
+    # Preprocessor
+    preprocessor = SeqLabelPreprocess()
+    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
+    train_args["vocab_size"] = preprocessor.vocab_size
+    train_args["num_classes"] = preprocessor.num_classes
+    # Trainer
+    trainer = SeqLabelTrainer(**train_args.data)
+    # Model
+    model = AdvSeqLabel(train_args)
+    try:
+        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+        print('model parameter loaded!')
+    except Exception as e:
+        print("No saved model. Continue.")
+        pass
+    # Start training
+    trainer.train(model, data_train, data_dev)
+    print("Training finished!")
+    # Saver
+    saver = ModelSaver("./save/saved_model.pkl")
+    saver.save_pytorch(model)
+    print("Model saved!")
+def test():
+    # Config Loader
+    test_args = ConfigSection()
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
+    # fetch dictionary size and number of labels from pickle files
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+    # load dev data
+    dev_data = load_pickle(pickle_path, "data_dev.pkl")
+    # Define the same model
+    model = AdvSeqLabel(test_args)
+    # Dump trained parameters into the model
+    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+    print("model loaded!")
+    # Tester
+    tester = SeqLabelTester(**test_args.data)
+    # Start testing
+    tester.test(model, dev_data)
+    # print test results
+    print(tester.show_metrics())
+    print("model tested!")
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
+    parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
+    args = parser.parse_args()
+    if args.mode == 'train':
+        train()
+    elif args.mode == 'test':
+        test()
+    elif args.mode == 'infer':
+        infer()
+    else:
+        print('no mode specified for model!')
+        parser.print_help()
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# coding=utf-8
+from setuptools import setup, find_packages
+with open('README.md') as f:
+    readme = f.read()
+with open('LICENSE') as f:
+    license = f.read()
+with open('requirements.txt') as f:
+    reqs = f.read()
+setup(
+    name='fastNLP',
+    version='1.0',
+    description=('fudan fastNLP '),
+    long_description=readme,
+    license=license,
+    author='fudanNLP',
+    python_requires='>=3.5',
+    packages=find_packages(),
+    install_requires=reqs.strip().split('\n'),
+)
@@ -1,13 +1,12 @@
-import os
 import configparser
 import json
+import os
 import unittest
 from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
 from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader
 class TestConfigLoader(unittest.TestCase):
     def test_case_ConfigLoader(self):

@@ -33,8 +32,8 @@ class TestConfigLoader(unittest.TestCase):
            return dict
         test_arg = ConfigSection()
-        ConfigLoader("config", "").load_config(os.path.join("./test/loader", "config"), {"test": test_arg})
-        #ConfigLoader("config", "").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config",
+        ConfigLoader("config").load_config(os.path.join("./test/loader", "config"), {"test": test_arg})
+        # ConfigLoader("config").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config",
         #                                   {"test": test_arg})
         #dict = read_section_from_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", "test")

@@ -58,18 +57,18 @@ class TestConfigLoader(unittest.TestCase):
 class TestDatasetLoader(unittest.TestCase):
     def test_case_TokenizeDatasetLoader(self):
-        loader = TokenizeDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8")
+        loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
         data = loader.load_pku(max_seq_len=32)
         print("pass TokenizeDatasetLoader test!")
     def test_case_POSDatasetLoader(self):
-        loader = POSDatasetLoader("people", "./test/data_for_tests/people.txt")
+        loader = POSDatasetLoader("./test/data_for_tests/people.txt")
         data = loader.load()
         datas = loader.load_lines()
         print("pass POSDatasetLoader test!")
     def test_case_LMDatasetLoader(self):
-        loader = LMDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8")
+        loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
         data = loader.load()
         datas = loader.load_lines()
         print("pass TokenizeDatasetLoader test!")
@@ -1,138 +0,0 @@
-import _pickle
-import os
-import numpy as np
-import torch
-from fastNLP.core.preprocess import SeqLabelPreprocess
-from fastNLP.core.tester import SeqLabelTester
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.models.sequence_modeling import AdvSeqLabel
-class MyNERTrainer(SeqLabelTrainer):
-    def __init__(self, train_args):
-        super(MyNERTrainer, self).__init__(train_args)
-        self.scheduler = None
-    def define_optimizer(self):
-        """
-        override
-        :return:
-        """
-        self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001)
-        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5)
-    def update(self):
-        """
-        override
-        :return:
-        """
-        self.optimizer.step()
-        self.scheduler.step()
-    def _create_validator(self, valid_args):
-        return MyNERTester(valid_args)
-    def best_eval_result(self, validator):
-        accuracy = validator.metrics()
-        if accuracy > self.best_accuracy:
-            self.best_accuracy = accuracy
-            return True
-        else:
-            return False
-class MyNERTester(SeqLabelTester):
-    def __init__(self, test_args):
-        super(MyNERTester, self).__init__(test_args)
-    def _evaluate(self, prediction, batch_y, seq_len):
-        """
-        :param prediction: [batch_size, seq_len, num_classes]
-        :param batch_y: [batch_size, seq_len]
-        :param seq_len: [batch_size]
-        :return:
-        """
-        summ = 0
-        correct = 0
-        _, indices = torch.max(prediction, 2)
-        for p, y, l in zip(indices, batch_y, seq_len):
-            summ += l
-            correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy())
-        return float(correct / summ)
-    def evaluate(self, predict, truth):
-        return self._evaluate(predict, truth, self.seq_len)
-    def metrics(self):
-        return np.mean(self.eval_history)
-    def show_matrices(self):
-        return "dev accuracy={:.2f}".format(float(self.metrics()))
-def embedding_process(emb_file, word_dict, emb_dim, emb_pkl):
-    if os.path.exists(emb_pkl):
-        with open(emb_pkl, "rb") as f:
-            embedding_np = _pickle.load(f)
-        return embedding_np
-    with open(emb_file, "r", encoding="utf-8") as f:
-        embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
-        for line in f:
-            line = line.strip().split()
-            if len(line) != emb_dim + 1:
-                continue
-            if line[0] in word_dict:
-                embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
-    with open(emb_pkl, "wb") as f:
-        _pickle.dump(embedding_np, f)
-    return embedding_np
-def data_load(data_file):
-    with open(data_file, "r", encoding="utf-8") as f:
-        all_data = []
-        sent = []
-        label = []
-        for line in f:
-            line = line.strip().split()
-            if not len(line) <= 1:
-                sent.append(line[0])
-                label.append(line[1])
-            else:
-                all_data.append([sent, label])
-                sent = []
-                label = []
-    return all_data
-data_path = "data_for_tests/people.txt"
-pick_path = "data_for_tests/"
-emb_path = "data_for_tests/emb50.txt"
-save_path = "data_for_tests/"
-if __name__ == "__main__":
-    data = data_load(data_path)
-    preprocess = SeqLabelPreprocess()
-    data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3)
-    # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
-    emb = None
-    args = {"epochs": 20,
-            "batch_size": 1,
-            "pickle_path": pick_path,
-            "validate": True,
-            "save_best_dev": True,
-            "model_saved_path": save_path,
-            "use_cuda": True,
-            "vocab_size": preprocess.vocab_size,
-            "num_classes": preprocess.num_classes,
-            "word_emb_dim": 50,
-            "rnn_hidden_units": 100
-            }
-    # emb = torch.Tensor(emb).float().cuda()
-    networks = AdvSeqLabel(args, emb)
-    trainer = MyNERTrainer(args)
-    trainer.train(networks, data_train, data_dev)
-    print("Training finished!")
@@ -1,129 +0,0 @@
-import _pickle
-import os
-import torch
-from fastNLP.core.predictor import SeqLabelInfer
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.loader.model_loader import ModelLoader
-from fastNLP.models.sequence_modeling import AdvSeqLabel
-class Decode(SeqLabelTrainer):
-    def __init__(self, args):
-        super(Decode, self).__init__(args)
-    def decoder(self, network, sents, model_path):
-        self.model = network
-        self.model.load_state_dict(torch.load(model_path))
-        out_put = []
-        self.mode(network, test=True)
-        for batch_x in sents:
-            prediction = self.data_forward(self.model, batch_x)
-            seq_tag = self.model.prediction(prediction, batch_x[1])
-            out_put.append(list(seq_tag)[0])
-        return out_put
-def process_sent(sents, word2id):
-    sents_num = []
-    for s in sents:
-        sent_num = []
-        for c in s:
-            if c in word2id:
-                sent_num.append(word2id[c])
-            else:
-                sent_num.append(word2id["<unk>"])
-        sents_num.append(([sent_num], [len(sent_num)]))  # batch_size is 1
-    return sents_num
-def process_tag(sents, tags, id2class):
-    Tags = []
-    for ttt in tags:
-        Tags.append([id2class[t] for t in ttt])
-    Segs = []
-    PosNers = []
-    for sent, tag in zip(sents, tags):
-        word__ = []
-        lll__ = []
-        for c, t in zip(sent, tag):
-            t = id2class[t]
-            l = t.split("-")
-            split_ = l[0]
-            pn = l[1]
-            if split_ == "S":
-                word__.append(c)
-                lll__.append(pn)
-                word_1 = ""
-            elif split_ == "E":
-                word_1 += c
-                word__.append(word_1)
-                lll__.append(pn)
-                word_1 = ""
-            elif split_ == "B":
-                word_1 = ""
-                word_1 += c
-            else:
-                word_1 += c
-        Segs.append(word__)
-        PosNers.append(lll__)
-    return Segs, PosNers
-pickle_path = "data_for_tests/"
-model_path = "data_for_tests/model_best_dev.pkl"
-if __name__ == "__main__":
-    with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f:
-        id2word = _pickle.load(f)
-    with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f:
-        word2id = _pickle.load(f)
-    with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f:
-        id2class = _pickle.load(f)
-    sent = ["中共中央总书记、国家主席江泽民",
-            "逆向处理输入序列并返回逆序后的序列"]  # here is input
-    args = {"epochs": 1,
-            "batch_size": 1,
-            "pickle_path": "data_for_tests/",
-            "validate": True,
-            "save_best_dev": True,
-            "model_saved_path": "data_for_tests/",
-            "use_cuda": False,
-            "vocab_size": len(word2id),
-            "num_classes": len(id2class),
-            "word_emb_dim": 50,
-            "rnn_hidden_units": 100,
-            }
-    """
-    network = AdvSeqLabel(args, None)
-    decoder_ = Decode(args)
-    tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path)
-    output_seg, output_pn = process_tag(sent, tags_num, id2class)  # here is output
-    print(output_seg)
-    print(output_pn)
-    """
-    # Define the same model
-    model = AdvSeqLabel(args, None)
-    # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl")
-    print("model loaded!")
-    # Inference interface
-    infer = SeqLabelInfer(pickle_path)
-    sent = [[ch for ch in s] for s in sent]
-    results = infer.predict(model, sent)
-    for res in results:
-        print(res)
-    print("Inference finished!")
| @@ -1,19 +1,13 @@ | |||||
| # python: 3.5 | |||||
| # pytorch: 0.4 | |||||
| ################ | |||||
| # Test cross validation. | |||||
| ################ | |||||
| from fastNLP.loader.preprocess import ClassPreprocess | |||||
| from fastNLP.core.loss import Loss | |||||
| from fastNLP.core.optimizer import Optimizer | |||||
| from fastNLP.core.predictor import ClassificationInfer | from fastNLP.core.predictor import ClassificationInfer | ||||
| from fastNLP.core.preprocess import ClassPreprocess | |||||
| from fastNLP.core.trainer import ClassificationTrainer | from fastNLP.core.trainer import ClassificationTrainer | ||||
| from fastNLP.loader.dataset_loader import ClassDatasetLoader | from fastNLP.loader.dataset_loader import ClassDatasetLoader | ||||
| from fastNLP.models.base_model import BaseModel | from fastNLP.models.base_model import BaseModel | ||||
| from fastNLP.modules import aggregation | from fastNLP.modules import aggregation | ||||
| from fastNLP.modules import encoder | |||||
| from fastNLP.modules import decoder | from fastNLP.modules import decoder | ||||
| from fastNLP.modules import encoder | |||||
| class ClassificationModel(BaseModel): | class ClassificationModel(BaseModel): | ||||
| @@ -28,7 +22,7 @@ class ClassificationModel(BaseModel): | |||||
| self.enc = encoder.Conv( | self.enc = encoder.Conv( | ||||
| in_channels=300, out_channels=100, kernel_size=3) | in_channels=300, out_channels=100, kernel_size=3) | ||||
| self.agg = aggregation.MaxPool() | self.agg = aggregation.MaxPool() | ||||
| self.dec = decoder.MLP(100, num_classes=num_classes) | |||||
| self.dec = decoder.MLP(size_layer=[100, num_classes]) | |||||
| def forward(self, x): | def forward(self, x): | ||||
| x = self.emb(x) # [N,L] -> [N,L,C] | x = self.emb(x) # [N,L] -> [N,L,C] | ||||
| @@ -38,18 +32,17 @@ class ClassificationModel(BaseModel): | |||||
| return x | return x | ||||
| data_dir = 'data' # directory to save data and model | |||||
| train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||||
| data_dir = 'save/' # directory to save data and model | |||||
| train_path = './data_for_tests/text_classify.txt' # training set file | |||||
| # load dataset | # load dataset | ||||
| ds_loader = ClassDatasetLoader("train", train_path) | |||||
| ds_loader = ClassDatasetLoader(train_path) | |||||
| data = ds_loader.load() | data = ds_loader.load() | ||||
| # pre-process dataset | # pre-process dataset | ||||
| pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5) | |||||
| # pre = ClassPreprocess(data, data_dir) | |||||
| n_classes = pre.num_classes | |||||
| vocab_size = pre.vocab_size | |||||
| pre = ClassPreprocess() | |||||
| train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir) | |||||
| n_classes, vocab_size = pre.num_classes, pre.vocab_size | |||||
| # construct model | # construct model | ||||
| model_args = { | model_args = { | ||||
| @@ -58,22 +51,25 @@ model_args = { | |||||
| } | } | ||||
| model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | ||||
| # train model | |||||
| # construct trainer | |||||
| train_args = { | train_args = { | ||||
| "epochs": 10, | |||||
| "batch_size": 50, | |||||
| "epochs": 3, | |||||
| "batch_size": 16, | |||||
| "pickle_path": data_dir, | "pickle_path": data_dir, | ||||
| "validate": False, | "validate": False, | ||||
| "save_best_dev": False, | "save_best_dev": False, | ||||
| "model_saved_path": None, | "model_saved_path": None, | ||||
| "use_cuda": True, | "use_cuda": True, | ||||
| "learn_rate": 1e-3, | |||||
| "momentum": 0.9} | |||||
| trainer = ClassificationTrainer(train_args) | |||||
| # trainer.train(model, ['data_train.pkl', 'data_dev.pkl']) | |||||
| trainer.cross_validate(model) | |||||
| "loss": Loss("cross_entropy"), | |||||
| "optimizer": Optimizer("Adam", lr=0.001) | |||||
| } | |||||
| trainer = ClassificationTrainer(**train_args) | |||||
| # start training | |||||
| trainer.train(model, train_data=train_set, dev_data=dev_set) | |||||
| # predict using model | # predict using model | ||||
| data_infer = [x[0] for x in data] | data_infer = [x[0] for x in data] | ||||
| infer = ClassificationInfer(data_dir) | infer = ClassificationInfer(data_dir) | ||||
| labels_pred = infer.predict(model, data_infer) | |||||
| labels_pred = infer.predict(model.cpu(), data_infer) | |||||
| print(labels_pred) | |||||
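The thrust of this rewrite is the new training wiring: `ClassPreprocess` is constructed empty and its `run()` returns the train/dev split, while the trainer is built from keyword arguments with the loss and optimizer wrapped in `Loss` and `Optimizer` objects. A condensed sketch of just that wiring, reusing the `ClassificationModel` defined above (`use_cuda` flipped to False only so the sketch stays CPU-friendly):

```python
from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.dataset_loader import ClassDatasetLoader

data = ClassDatasetLoader("./data_for_tests/text_classify.txt").load()

# run() now hands the split datasets back instead of only pickling them to disk.
pre = ClassPreprocess()
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path="save/")

model = ClassificationModel(num_classes=pre.num_classes, vocab_size=pre.vocab_size)

trainer = ClassificationTrainer(
    epochs=3,
    batch_size=16,
    pickle_path="save/",
    validate=False,
    save_best_dev=False,
    model_saved_path=None,
    use_cuda=False,                         # the test uses True; False keeps the sketch CPU-only
    loss=Loss("cross_entropy"),             # a named loss, wrapped in a Loss object
    optimizer=Optimizer("Adam", lr=0.001),  # likewise for the optimizer
)
trainer.train(model, train_data=train_set, dev_data=dev_set)
```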
| @@ -33,7 +33,7 @@ data_infer_path = args.infer | |||||
| def infer(): | def infer(): | ||||
| # Load the inference configuration, identical to the test configuration | # Load the inference configuration, identical to the test configuration | ||||
| test_args = ConfigSection() | test_args = ConfigSection() | ||||
| ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args}) | |||||
| ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args}) | |||||
| # fetch dictionary size and number of labels from pickle files | # fetch dictionary size and number of labels from pickle files | ||||
| word2index = load_pickle(pickle_path, "word2id.pkl") | word2index = load_pickle(pickle_path, "word2id.pkl") | ||||
| @@ -49,7 +49,7 @@ def infer(): | |||||
| print("model loaded!") | print("model loaded!") | ||||
| # Data Loader | # Data Loader | ||||
| raw_data_loader = BaseLoader("xxx", data_infer_path) | |||||
| raw_data_loader = BaseLoader(data_infer_path) | |||||
| infer_data = raw_data_loader.load_lines() | infer_data = raw_data_loader.load_lines() | ||||
| # Inference interface | # Inference interface | ||||
| @@ -65,11 +65,11 @@ def train_and_test(): | |||||
| # Config Loader | # Config Loader | ||||
| trainer_args = ConfigSection() | trainer_args = ConfigSection() | ||||
| model_args = ConfigSection() | model_args = ConfigSection() | ||||
| ConfigLoader("config.cfg", "").load_config(config_dir, { | |||||
| ConfigLoader("config.cfg").load_config(config_dir, { | |||||
| "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args}) | "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args}) | ||||
| # Data Loader | # Data Loader | ||||
| pos_loader = POSDatasetLoader("xxx", data_path) | |||||
| pos_loader = POSDatasetLoader(data_path) | |||||
| train_data = pos_loader.load_lines() | train_data = pos_loader.load_lines() | ||||
| # Preprocessor | # Preprocessor | ||||
| @@ -117,7 +117,7 @@ def train_and_test(): | |||||
| # Load test configuration | # Load test configuration | ||||
| tester_args = ConfigSection() | tester_args = ConfigSection() | ||||
| ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args}) | |||||
| ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args}) | |||||
| # Tester | # Tester | ||||
| tester = SeqLabelTester(save_output=False, | tester = SeqLabelTester(save_output=False, | ||||
| @@ -134,10 +134,10 @@ def train_and_test(): | |||||
| tester.test(model, data_dev) | tester.test(model, data_dev) | ||||
| # print test results | # print test results | ||||
| print(tester.show_matrices()) | |||||
| print(tester.show_metrics()) | |||||
| print("model tested!") | print("model tested!") | ||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| train_and_test() | |||||
| # infer() | |||||
| # train_and_test() | |||||
| infer() | |||||
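Several hunks here make the same mechanical change: `ConfigLoader` loses its unused second argument. The pattern in isolation, assuming `ConfigLoader` and `ConfigSection` are importable from `fastNLP.loader.config_loader`: empty `ConfigSection` objects are passed in, filled in place by `load_config`, and then read like dictionaries.

```python
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection  # assumed module path

train_args = ConfigSection()
test_args = ConfigSection()

# One call fills every requested section of config.cfg into its ConfigSection, in place.
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {
    "POS": train_args,
    "POS_test": test_args,
})

# Sections then read and write like dicts, as the tests do with vocab_size/num_classes.
train_args["vocab_size"] = 100
```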
| @@ -22,7 +22,7 @@ data_infer_path = "data_for_tests/people_infer.txt" | |||||
| def infer(): | def infer(): | ||||
| # Load the inference configuration, identical to the test configuration | # Load the inference configuration, identical to the test configuration | ||||
| test_args = ConfigSection() | test_args = ConfigSection() | ||||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||||
| ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||||
| # fetch dictionary size and number of labels from pickle files | # fetch dictionary size and number of labels from pickle files | ||||
| word2index = load_pickle(pickle_path, "word2id.pkl") | word2index = load_pickle(pickle_path, "word2id.pkl") | ||||
| @@ -38,7 +38,7 @@ def infer(): | |||||
| print("model loaded!") | print("model loaded!") | ||||
| # Data Loader | # Data Loader | ||||
| raw_data_loader = BaseLoader(data_name, data_infer_path) | |||||
| raw_data_loader = BaseLoader(data_infer_path) | |||||
| infer_data = raw_data_loader.load_lines() | infer_data = raw_data_loader.load_lines() | ||||
| """ | """ | ||||
| Transform strings into a list of lists of strings. | Transform strings into a list of lists of strings. | ||||
| @@ -61,10 +61,10 @@ def infer(): | |||||
| def train_test(): | def train_test(): | ||||
| # Config Loader | # Config Loader | ||||
| train_args = ConfigSection() | train_args = ConfigSection() | ||||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||||
| ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) | |||||
| # Data Loader | # Data Loader | ||||
| loader = TokenizeDatasetLoader(data_name, cws_data_path) | |||||
| loader = TokenizeDatasetLoader(cws_data_path) | |||||
| train_data = loader.load_pku() | train_data = loader.load_pku() | ||||
| # Preprocessor | # Preprocessor | ||||
| @@ -74,7 +74,7 @@ def train_test(): | |||||
| train_args["num_classes"] = p.num_classes | train_args["num_classes"] = p.num_classes | ||||
| # Trainer | # Trainer | ||||
| trainer = SeqLabelTrainer(train_args) | |||||
| trainer = SeqLabelTrainer(**train_args.data) | |||||
| # Model | # Model | ||||
| model = SeqLabeling(train_args) | model = SeqLabeling(train_args) | ||||
| @@ -99,16 +99,16 @@ def train_test(): | |||||
| # Load test configuration | # Load test configuration | ||||
| test_args = ConfigSection() | test_args = ConfigSection() | ||||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||||
| ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||||
| # Tester | # Tester | ||||
| tester = SeqLabelTester(test_args) | |||||
| tester = SeqLabelTester(**test_args.data) | |||||
| # Start testing | # Start testing | ||||
| tester.test(model, data_train) | tester.test(model, data_train) | ||||
| # print test results | # print test results | ||||
| print(tester.show_matrices()) | |||||
| print(tester.show_metrics()) | |||||
| print("model tested!") | print("model tested!") | ||||
| @@ -1,9 +1,12 @@ | |||||
| import sys | import sys | ||||
| sys.path.append("..") | sys.path.append("..") | ||||
| from fastNLP.fastnlp import FastNLP | from fastNLP.fastnlp import FastNLP | ||||
| from fastNLP.fastnlp import interpret_word_seg_results | |||||
| from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results | |||||
| PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" | PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" | ||||
| PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/" | |||||
| def word_seg(): | def word_seg(): | ||||
| nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) | nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) | ||||
| @@ -39,5 +42,33 @@ def test_word_seg_interpret(): | |||||
| print(interpret_word_seg_results(chars, labels)) | print(interpret_word_seg_results(chars, labels)) | ||||
| def test_interpret_cws_pos_results(): | |||||
| foo = [ | |||||
| [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'), | |||||
| ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'), | |||||
| ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')] | |||||
| ] | |||||
| chars = [x[0] for x in foo[0]] | |||||
| labels = [x[1] for x in foo[0]] | |||||
| print(interpret_cws_pos_results(chars, labels)) | |||||
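The labels in this fixture pair a segmentation mark with a part-of-speech suffix: B and E mark the beginning and end of a multi-character word, S a single-character word (M, for middle characters, does not occur in this sample). The toy function below, shown purely to illustrate the tag scheme, is not the library's implementation; the real logic lives in `interpret_cws_pos_results`.

```python
def toy_interpret_cws_pos(chars, tags):
    """Illustrative only: merge B/M/E/S-pos character tags into (word, pos) pairs."""
    words, buf = [], ""
    for ch, tag in zip(chars, tags):
        mark, pos = tag.split("-", 1)   # e.g. "B-p" -> ("B", "p")
        buf += ch
        if mark in ("S", "E"):          # a word ends on S (single) or E (end)
            words.append((buf, pos))
            buf = ""
    return words

print(toy_interpret_cws_pos(
    ["基", "于", "深", "度"], ["B-p", "E-p", "B-d", "E-d"]))
# [('基于', 'p'), ('深度', 'd')]
```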
| def pos_tag(): | |||||
| nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES) | |||||
| nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model") | |||||
| text = ["这是最好的基于深度学习的中文分词系统。", | |||||
| "大王叫我来巡山。", | |||||
| "我党多年来致力于改善人民生活水平。"] | |||||
| results = nlp.run(text) | |||||
| for example in results: | |||||
| words, labels = [], [] | |||||
| for res in example: | |||||
| words.append(res[0]) | |||||
| labels.append(res[1]) | |||||
| print(interpret_cws_pos_results(words, labels)) | |||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| word_seg() | |||||
| pos_tag() | |||||
| @@ -5,19 +5,19 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader | |||||
| from fastNLP.models.sequence_modeling import SeqLabeling | from fastNLP.models.sequence_modeling import SeqLabeling | ||||
| data_name = "pku_training.utf8" | data_name = "pku_training.utf8" | ||||
| cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" | |||||
| pickle_path = "data_for_tests" | pickle_path = "data_for_tests" | ||||
| def foo(): | def foo(): | ||||
| loader = TokenizeDatasetLoader(data_name, "./data_for_tests/cws_pku_utf_8") | |||||
| loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8") | |||||
| train_data = loader.load_pku() | train_data = loader.load_pku() | ||||
| train_args = ConfigSection() | train_args = ConfigSection() | ||||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||||
| ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) | |||||
| # Preprocessor | # Preprocessor | ||||
| p = SeqLabelPreprocess(train_data, pickle_path) | |||||
| p = SeqLabelPreprocess() | |||||
| train_data = p.run(train_data) | |||||
| train_args["vocab_size"] = p.vocab_size | train_args["vocab_size"] = p.vocab_size | ||||
| train_args["num_classes"] = p.num_classes | train_args["num_classes"] = p.num_classes | ||||
| @@ -26,11 +26,11 @@ def foo(): | |||||
| valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, | valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, | ||||
| "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/", | "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/", | ||||
| "use_cuda": True} | "use_cuda": True} | ||||
| validator = SeqLabelTester(valid_args) | |||||
| validator = SeqLabelTester(**valid_args) | |||||
| print("start validation.") | print("start validation.") | ||||
| validator.test(model) | |||||
| print(validator.show_matrices()) | |||||
| validator.test(model, train_data) | |||||
| print(validator.show_metrics()) | |||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| @@ -34,7 +34,7 @@ config_dir = args.config | |||||
| def infer(): | def infer(): | ||||
| # load dataset | # load dataset | ||||
| print("Loading data...") | print("Loading data...") | ||||
| ds_loader = ClassDatasetLoader("train", train_data_dir) | |||||
| ds_loader = ClassDatasetLoader(train_data_dir) | |||||
| data = ds_loader.load() | data = ds_loader.load() | ||||
| unlabeled_data = [x[0] for x in data] | unlabeled_data = [x[0] for x in data] | ||||
| @@ -69,7 +69,7 @@ def train(): | |||||
| # load dataset | # load dataset | ||||
| print("Loading data...") | print("Loading data...") | ||||
| ds_loader = ClassDatasetLoader("train", train_data_dir) | |||||
| ds_loader = ClassDatasetLoader(train_data_dir) | |||||
| data = ds_loader.load() | data = ds_loader.load() | ||||
| print(data[0]) | print(data[0]) | ||||
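One change recurs across every file in this section: the dataset loaders drop their unused leading name argument (`"train"`, `"xxx"`, `data_name`) and are constructed from the data path alone. Collected in one place as a sketch, with the people-data path a hypothetical stand-in:

```python
from fastNLP.loader.dataset_loader import (   # POSDatasetLoader assumed to share this module
    ClassDatasetLoader,
    POSDatasetLoader,
    TokenizeDatasetLoader,
)

# Old: ClassDatasetLoader("train", path), POSDatasetLoader("xxx", path), TokenizeDatasetLoader(name, path)
# New: every loader is constructed from the data path alone.
class_data = ClassDatasetLoader("./data_for_tests/text_classify.txt").load()
pos_data = POSDatasetLoader("./data_for_tests/people.txt").load_lines()      # hypothetical path
pku_data = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8").load_pku()
```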