- improve code comments
- BaseLoader & its subclasses no longer need a data name
- update file tree
- add setup.py

Tag: v0.1.0
@@ -65,7 +65,7 @@ data_dir = 'save/' # directory to save data and model
 train_path = './data_for_tests/text_classify.txt' # training set file

 # load dataset
-ds_loader = ClassDatasetLoader("train", train_path)
+ds_loader = ClassDatasetLoader(train_path)
 data = ds_loader.load()

 # pre-process dataset
@@ -135,14 +135,15 @@ pip3 install torch torchvision
 ```
 FastNLP
 ├── docs
-│   └── quick_tutorial.md
 ├── fastNLP
-│   ├── action
+│   ├── core
 │   │   ├── action.py
-│   │   ├── inference.py
 │   │   ├── __init__.py
+│   │   ├── loss.py
 │   │   ├── metrics.py
 │   │   ├── optimizer.py
+│   │   ├── predictor.py
+│   │   ├── preprocess.py
 │   │   ├── README.md
 │   │   ├── tester.py
 │   │   └── trainer.py
@@ -154,71 +155,28 @@ FastNLP
 │   │   ├── dataset_loader.py
 │   │   ├── embed_loader.py
 │   │   ├── __init__.py
-│   │   ├── model_loader.py
-│   │   └── preprocess.py
+│   │   └── model_loader.py
 │   ├── models
-│   │   ├── base_model.py
-│   │   ├── char_language_model.py
-│   │   ├── cnn_text_classification.py
-│   │   ├── __init__.py
-│   │   └── sequence_modeling.py
 │   ├── modules
 │   │   ├── aggregation
-│   │   │   ├── attention.py
-│   │   │   ├── avg_pool.py
-│   │   │   ├── __init__.py
-│   │   │   ├── kmax_pool.py
-│   │   │   ├── max_pool.py
-│   │   │   └── self_attention.py
 │   │   ├── decoder
-│   │   │   ├── CRF.py
-│   │   │   └── __init__.py
 │   │   ├── encoder
-│   │   │   ├── char_embedding.py
-│   │   │   ├── conv_maxpool.py
-│   │   │   ├── conv.py
-│   │   │   ├── embedding.py
-│   │   │   ├── __init__.py
-│   │   │   ├── linear.py
-│   │   │   ├── lstm.py
-│   │   │   ├── masked_rnn.py
-│   │   │   └── variational_rnn.py
 │   │   ├── __init__.py
 │   │   ├── interaction
-│   │   │   └── __init__.py
 │   │   ├── other_modules.py
 │   │   └── utils.py
 │   └── saver
-│       ├── base_saver.py
-│       ├── __init__.py
-│       ├── logger.py
-│       └── model_saver.py
 ├── LICENSE
 ├── README.md
 ├── reproduction
-│   ├── Char-aware_NLM
-│   │
-│   ├── CNN-sentence_classification
-│   │
-│   ├── HAN-document_classification
-│   │
-│   └── LSTM+self_attention_sentiment_analysis
-│
 ├── requirements.txt
 ├── setup.py
 └── test
+    ├── core
     ├── data_for_tests
-    │   ├── charlm.txt
-    │   ├── config
-    │   ├── cws_test
-    │   ├── cws_train
-    │   ├── people_infer.txt
-    │   └── people.txt
-    ├── test_charlm.py
-    ├── test_cws.py
-    ├── test_fastNLP.py
-    ├── test_loader.py
-    ├── test_seq_labeling.py
-    ├── test_tester.py
-    └── test_trainer.py
+    ├── __init__.py
+    ├── loader
+    ├── modules
+    └── readme_example.py
 ```
@@ -9,7 +9,7 @@ class Loss(object):
     def __init__(self, args):
         if args is None:
-            # this is useful when
+            # this is useful when Trainer.__init__ performs type check
            self._loss = None
         elif isinstance(args, str):
            self._loss = self._borrow_from_pytorch(args)
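For context, the placeholder mentioned in the new comment is how the trainer's defaults survive the later type check. A minimal sketch of that pattern, assuming the import path `fastNLP.core.loss` from the file tree above:

```
from fastNLP.core.loss import Loss   # import path assumed from the package layout

# Trainer.__init__ keeps a placeholder Loss object so its isinstance-style type check passes
default_args = {"loss": Loss(None)}
assert isinstance(default_args["loss"], Loss)

# passing a string instead would route through Loss._borrow_from_pytorch;
# the accepted names live there and are not shown in this diff
```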
@@ -70,7 +70,7 @@ class Predictor(object):
     def predict(self, network, data):
         """Perform inference using the trained model.
-        :param network: a PyTorch model
+        :param network: a PyTorch model (cpu)
         :param data: list of list of strings
         :return: list of list of strings, [num_examples, tag_seq_length]
         """
@@ -38,7 +38,7 @@ class BaseTester(object):
         Obviously, "required_args" is the subset of "default_args".
         The value in "default_args" to the keys in "required_args" is simply for type check.
         """
-        # TODO: required arguments
+        # add required arguments here
         required_args = {}

         for req_key in required_args:
@@ -56,7 +56,7 @@ class BaseTester(object):
                 logger.error(msg)
                 raise ValueError(msg)
             else:
-                # BeseTester doesn't care about extra arguments
+                # BaseTester doesn't care about extra arguments
                 pass
         print(default_args)
@@ -69,8 +69,8 @@ class BaseTester(object):
         self.print_every_step = default_args["print_every_step"]

         self._model = None
-        self.eval_history = []
-        self.batch_output = []
+        self.eval_history = []  # evaluation results of all batches
+        self.batch_output = []  # outputs of all batches

     def test(self, network, dev_data):
         if torch.cuda.is_available() and self.use_cuda:
@@ -83,7 +83,7 @@ class BaseTester(object):
         self.eval_history.clear()
         self.batch_output.clear()
-        iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
+        iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=False))
         step = 0

         for batch_x, batch_y in self.make_batch(iterator):
@@ -99,7 +99,7 @@ class BaseTester(object):
             print_output = "[test step {}] {}".format(step, eval_results)
             logger.info(print_output)
             if self.print_every_step > 0 and step % self.print_every_step == 0:
-                print(print_output)
+                print(self.make_eval_output(prediction, eval_results))
             step += 1

     def mode(self, model, test):
@@ -115,16 +115,28 @@ class BaseTester(object):
         raise NotImplementedError

     def evaluate(self, predict, truth):
-        """Compute evaluation metrics for the model. """
+        """Compute evaluation metrics.
+        :param predict: Tensor
+        :param truth: Tensor
+        :return eval_results: can be anything. It will be stored in self.eval_history
+        """
         raise NotImplementedError

     @property
     def metrics(self):
-        """Return a list of metrics. """
+        """Compute and return metrics.
+        Use self.eval_history to compute metrics over the whole dev set.
+        Please refer to metrics.py for common metric functions.
+        :return : variable number of outputs
+        """
         raise NotImplementedError

     def show_metrics(self):
-        """This is called by Trainer to print evaluation results on dev set during training.
+        """Customize evaluation outputs in Trainer.
+        Called by Trainer to print evaluation results on dev set during training.
+        Use self.metrics to fetch available metrics.
         :return print_str: str
         """
@@ -133,6 +145,14 @@ class BaseTester(object):
     def make_batch(self, iterator):
         raise NotImplementedError

+    def make_eval_output(self, predictions, eval_results):
+        """Customize Tester outputs.
+        :param predictions: Tensor
+        :param eval_results: Tensor
+        :return: str, to be printed.
+        """
+        raise NotImplementedError
+

 class SeqLabelTester(BaseTester):
     """
@@ -211,7 +231,7 @@ class ClassificationTester(BaseTester):
     def __init__(self, **test_args):
         """
-        :param test_args: a dict-like object that has __getitem__ method, \
+        :param test_args: a dict-like object that has __getitem__ method.
            can be accessed by "test_args["key_str"]"
         """
         super(ClassificationTester, self).__init__(**test_args)
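Taken together, the expanded docstrings describe the subclassing contract for testers. A minimal sketch of a custom tester under that contract (class and metric names are illustrative, and `make_batch` is omitted):

```
from fastNLP.core.tester import BaseTester

class MyAccuracyTester(BaseTester):
    def evaluate(self, predict, truth):
        # per-batch result; whatever is returned here is stored in self.eval_history
        return float((predict.argmax(dim=-1) == truth).float().mean())

    @property
    def metrics(self):
        # aggregate over the whole dev set using self.eval_history
        return sum(self.eval_history) / len(self.eval_history)

    def show_metrics(self):
        # printed by the Trainer during validation
        return "dev accuracy={:.4f}".format(self.metrics)

    def make_eval_output(self, predictions, eval_results):
        # printed by BaseTester.test every print_every_step steps
        return "batch accuracy={:.4f}".format(eval_results)
```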
@@ -1,6 +1,4 @@
-import _pickle
 import copy
-import os
 import time
 from datetime import timedelta
@@ -15,16 +13,12 @@ from fastNLP.modules import utils
 from fastNLP.saver.logger import create_logger
 from fastNLP.saver.model_saver import ModelSaver

-DEFAULT_QUEUE_SIZE = 300
-
 logger = create_logger(__name__, "./train_test.log")


 class BaseTrainer(object):
-    """Operations to train a model, including data loading, SGD, and validation.
+    """Operations of training a model, including data loading, gradient descent, and validation.

-    Subclasses must implement the following abstract methods:
-    - grad_backward
-    - get_loss
     """

     def __init__(self, **kwargs):
@@ -47,7 +41,7 @@ class BaseTrainer(object):
         """
         default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/",
                         "save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1,
-                        "loss": Loss(None),
+                        "loss": Loss(None),  # used to pass type check
                         "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0)
                         }
         """
@@ -56,7 +50,7 @@ class BaseTrainer(object):
         Obviously, "required_args" is the subset of "default_args".
         The value in "default_args" to the keys in "required_args" is simply for type check.
         """
-        # TODO: required arguments
+        # add required arguments here
         required_args = {}

         for req_key in required_args:
@@ -198,21 +192,6 @@ class BaseTrainer(object):
             network_copy = copy.deepcopy(network)
             self.train(network_copy, train_data_cv[i], dev_data_cv[i])

-    def load_train_data(self, pickle_path):
-        """
-        For task-specific processing.
-        :param pickle_path:
-        :return data_train
-        """
-        file_path = os.path.join(pickle_path, "data_train.pkl")
-        if os.path.exists(file_path):
-            with open(file_path, 'rb') as f:
-                data = _pickle.load(f)
-        else:
-            logger.error("cannot find training data {}. invalid input path for training data.".format(file_path))
-            raise RuntimeError("cannot find training data {}".format(file_path))
-        return data
-
     def make_batch(self, iterator):
         raise NotImplementedError
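The annotated `default_args` docstring above doubles as a constructor reference. A hedged sketch of building a trainer with exactly those keyword arguments (the import paths for `Loss` and `Optimizer` are assumed from the core/ layout in the file tree):

```
from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.trainer import SeqLabelTrainer

trainer = SeqLabelTrainer(
    epochs=3, batch_size=8, validate=True, use_cuda=True,
    pickle_path="./save/", save_best_dev=True,
    model_name="default_model_name.pkl", print_every_step=1,
    loss=Loss(None),                                  # placeholder, only there to pass the type check
    optimizer=Optimizer("Adam", lr=0.001, weight_decay=0),
)
```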
@@ -1,9 +1,8 @@
 class BaseLoader(object):
     """docstring for BaseLoader"""

-    def __init__(self, data_name, data_path):
+    def __init__(self, data_path):
         super(BaseLoader, self).__init__()
-        self.data_name = data_name
         self.data_path = data_path

     def load(self):
@@ -25,8 +24,8 @@ class ToyLoader0(BaseLoader):
         For charLM
     """

-    def __init__(self, name, path):
-        super(ToyLoader0, self).__init__(name, path)
+    def __init__(self, data_path):
+        super(ToyLoader0, self).__init__(data_path)

     def load(self):
         with open(self.data_path, 'r') as f:
@@ -9,7 +9,7 @@ class ConfigLoader(BaseLoader):
     """loader for configuration files"""

     def __int__(self, data_name, data_path):
-        super(ConfigLoader, self).__init__(data_name, data_path)
+        super(ConfigLoader, self).__init__(data_path)
         self.config = self.parse(super(ConfigLoader, self).load())

     @staticmethod
@@ -100,7 +100,7 @@ class ConfigSection(object):

 if __name__ == "__main__":
-    config = ConfigLoader('configLoader', 'there is no data')
+    config = ConfigLoader('there is no data')
     section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()}
     """
@@ -6,8 +6,8 @@ from fastNLP.loader.base_loader import BaseLoader
 class DatasetLoader(BaseLoader):
     """"loader for data sets"""

-    def __init__(self, data_name, data_path):
-        super(DatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(DatasetLoader, self).__init__(data_path)


 class POSDatasetLoader(DatasetLoader):
@@ -31,8 +31,8 @@ class POSDatasetLoader(DatasetLoader):
         to label5.
     """

-    def __init__(self, data_name, data_path):
-        super(POSDatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(POSDatasetLoader, self).__init__(data_path)

     def load(self):
         assert os.path.exists(self.data_path)
@@ -84,8 +84,8 @@ class TokenizeDatasetLoader(DatasetLoader):
     Data set loader for tokenization data sets
     """

-    def __init__(self, data_name, data_path):
-        super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(TokenizeDatasetLoader, self).__init__(data_path)

     def load_pku(self, max_seq_len=32):
         """
@@ -138,8 +138,8 @@ class TokenizeDatasetLoader(DatasetLoader):
 class ClassDatasetLoader(DatasetLoader):
     """Loader for classification data sets"""

-    def __init__(self, data_name, data_path):
-        super(ClassDatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(ClassDatasetLoader, self).__init__(data_path)

     def load(self):
         assert os.path.exists(self.data_path)
@@ -177,7 +177,7 @@ class ConllLoader(DatasetLoader):
         :param str data_name: the name of the conll data set
         :param str data_path: the path to the conll data set
         """
-        super(ConllLoader, self).__init__(data_name, data_path)
+        super(ConllLoader, self).__init__(data_path)
         self.data_set = self.parse(self.load())

     def load(self):
@@ -209,8 +209,8 @@ class ConllLoader(DatasetLoader):
 class LMDatasetLoader(DatasetLoader):
-    def __init__(self, data_name, data_path):
-        super(LMDatasetLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(LMDatasetLoader, self).__init__(data_path)

     def load(self):
         if not os.path.exists(self.data_path):
@@ -226,7 +226,7 @@ class PeopleDailyCorpusLoader(DatasetLoader):
     """

     def __init__(self, data_path):
-        super(PeopleDailyCorpusLoader, self).__init__("people_daily_corpus", data_path)
+        super(PeopleDailyCorpusLoader, self).__init__(data_path)

     def load(self):
         with open(self.data_path, "r", encoding="utf-8") as f:
@@ -270,7 +270,7 @@ class PeopleDailyCorpusLoader(DatasetLoader):
         return pos_tag_examples, ner_examples

 if __name__ == "__main__":
-    loader = PeopleDailyCorpusLoader("/home/zyfeng/data/CWS_POS_TAG_NER_people_daily.txt")
+    loader = PeopleDailyCorpusLoader("./")
     pos, ner = loader.load()
     print(pos[:10])
     print(ner[:10])
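All dataset loaders now take only a path. A brief sketch of the updated calls, using paths that appear elsewhere in this changeset (the People's Daily path is illustrative):

```
from fastNLP.loader.dataset_loader import (ClassDatasetLoader,
                                            TokenizeDatasetLoader,
                                            PeopleDailyCorpusLoader)

# classification data: no data name argument any more
ds_loader = ClassDatasetLoader("./data_for_tests/text_classify.txt")
data = ds_loader.load()

# word segmentation data in PKU format
cws_loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
cws_data = cws_loader.load_pku(max_seq_len=32)

# People's Daily corpus: returns POS-tagging and NER examples
pd_loader = PeopleDailyCorpusLoader("./people_daily.txt")   # illustrative path
pos_examples, ner_examples = pd_loader.load()
```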
@@ -1,8 +1,50 @@
+import _pickle
+import os
+
+import numpy as np
+
 from fastNLP.loader.base_loader import BaseLoader


 class EmbedLoader(BaseLoader):
     """docstring for EmbedLoader"""

-    def __init__(self, data_name, data_path):
-        super(EmbedLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(EmbedLoader, self).__init__(data_path)
+
+    @staticmethod
+    def load_embedding(emb_dim, emb_file, word_dict, emb_pkl):
+        """Load the pre-trained embedding and combine with the given dictionary.
+
+        :param emb_file: str, the pre-trained embedding.
+            The embedding file should have the following format:
+                Each line is a word embedding, where a word string is followed by multiple floats.
+                Floats are separated by space. The word and the first float are separated by space.
+        :param word_dict: dict, a mapping from word to index.
+        :param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding.
+        :param emb_pkl: str, the embedding pickle file.
+        :return embedding_np: numpy array of shape (len(word_dict), emb_dim)
+
+        TODO: fragile code
+        """
+        # If the embedding pickle exists, load it and return.
+        if os.path.exists(emb_pkl):
+            with open(emb_pkl, "rb") as f:
+                embedding_np = _pickle.load(f)
+            return embedding_np
+        # Otherwise, load the pre-trained embedding.
+        with open(emb_file, "r", encoding="utf-8") as f:
+            # begin with a random embedding
+            embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
+            for line in f:
+                line = line.strip().split()
+                if len(line) != emb_dim + 1:
+                    # skip this line if two embedding dimension not match
+                    continue
+                if line[0] in word_dict:
+                    # find the word and replace its embedding with a pre-trained one
+                    embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
+        # save and return the result
+        with open(emb_pkl, "wb") as f:
+            _pickle.dump(embedding_np, f)
+        return embedding_np
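A hedged usage sketch of the new `EmbedLoader.load_embedding` helper added above (the module path is assumed from the file tree, and the paths and toy `word_dict` are illustrative):

```
from fastNLP.loader.embed_loader import EmbedLoader

word_dict = {"the": 0, "of": 1, "<unk>": 2}
embedding_np = EmbedLoader.load_embedding(
    emb_dim=50,
    emb_file="./data_for_tests/emb50.txt",   # one word followed by 50 floats per line
    word_dict=word_dict,
    emb_pkl="./save/embedding.pkl",          # cached pickle, reused on the next call
)
print(embedding_np.shape)                     # (len(word_dict), emb_dim)
```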
@@ -8,8 +8,8 @@ class ModelLoader(BaseLoader):
     Loader for models.
     """

-    def __init__(self, data_name, data_path):
-        super(ModelLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(ModelLoader, self).__init__(data_path)

     @staticmethod
     def load_pytorch(empty_model, model_path):
@@ -27,7 +27,7 @@ data_infer_path = os.path.join(datadir, "infer.utf8")
 def infer():
     # Config Loader
     test_args = ConfigSection()
-    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -47,7 +47,7 @@ def infer():
         raise

     # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
     print('data loaded')
@@ -63,10 +63,10 @@ def train():
     # Config Loader
     train_args = ConfigSection()
     test_args = ConfigSection()
-    ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
+    ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})

     # Data Loader
-    loader = TokenizeDatasetLoader(data_name, cws_data_path)
+    loader = TokenizeDatasetLoader(cws_data_path)
     train_data = loader.load_pku()

     # Preprocessor
@@ -100,7 +100,7 @@ def train():
 def test():
     # Config Loader
     test_args = ConfigSection()
-    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -28,7 +28,7 @@ data_infer_path = os.path.join(datadir, "infer.utf8")
 def infer():
     # Config Loader
     test_args = ConfigSection()
-    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -47,7 +47,7 @@ def infer():
         raise

     # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
     print('data loaded')
@@ -63,7 +63,7 @@ def train():
     # Config Loader
     train_args = ConfigSection()
     test_args = ConfigSection()
-    ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
+    ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args})

     # Data Loader
     loader = PeopleDailyCorpusLoader(pos_tag_data_path)
@@ -100,7 +100,7 @@ def train():
 def test():
     # Config Loader
     test_args = ConfigSection()
-    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# coding=utf-8
+from setuptools import setup, find_packages
+
+with open('README.md') as f:
+    readme = f.read()
+
+with open('LICENSE') as f:
+    license = f.read()
+
+with open('requirements.txt') as f:
+    reqs = f.read()
+
+setup(
+    name='fastNLP',
+    version='1.0',
+    description=('fudan fastNLP '),
+    long_description=readme,
+    license=license,
+    author='fudanNLP',
+    python_requires='>=3.5',
+    packages=find_packages(),
+    install_requires=reqs.strip().split('\n'),
+)
@@ -1,13 +1,12 @@
-import os
 import configparser
 import json
+import os
 import unittest

 from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
 from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader


 class TestConfigLoader(unittest.TestCase):
     def test_case_ConfigLoader(self):
@@ -33,8 +32,8 @@ class TestConfigLoader(unittest.TestCase):
             return dict

         test_arg = ConfigSection()
-        ConfigLoader("config", "").load_config(os.path.join("./test/loader", "config"), {"test": test_arg})
-        #ConfigLoader("config", "").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config",
+        ConfigLoader("config").load_config(os.path.join("./test/loader", "config"), {"test": test_arg})
+        # ConfigLoader("config").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config",
         #                                        {"test": test_arg})

         #dict = read_section_from_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", "test")
@@ -58,18 +57,18 @@ class TestConfigLoader(unittest.TestCase):
 class TestDatasetLoader(unittest.TestCase):
     def test_case_TokenizeDatasetLoader(self):
-        loader = TokenizeDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8")
+        loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
         data = loader.load_pku(max_seq_len=32)
         print("pass TokenizeDatasetLoader test!")

     def test_case_POSDatasetLoader(self):
-        loader = POSDatasetLoader("people", "./test/data_for_tests/people.txt")
+        loader = POSDatasetLoader("./test/data_for_tests/people.txt")
         data = loader.load()
         datas = loader.load_lines()
         print("pass POSDatasetLoader test!")

     def test_case_LMDatasetLoader(self):
-        loader = LMDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8")
+        loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
         data = loader.load()
         datas = loader.load_lines()
         print("pass TokenizeDatasetLoader test!")
@@ -1,138 +0,0 @@
-import _pickle
-import os
-
-import numpy as np
-import torch
-
-from fastNLP.core.preprocess import SeqLabelPreprocess
-from fastNLP.core.tester import SeqLabelTester
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.models.sequence_modeling import AdvSeqLabel
-
-
-class MyNERTrainer(SeqLabelTrainer):
-    def __init__(self, train_args):
-        super(MyNERTrainer, self).__init__(train_args)
-        self.scheduler = None
-
-    def define_optimizer(self):
-        """
-        override
-        :return:
-        """
-        self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001)
-        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5)
-
-    def update(self):
-        """
-        override
-        :return:
-        """
-        self.optimizer.step()
-        self.scheduler.step()
-
-    def _create_validator(self, valid_args):
-        return MyNERTester(valid_args)
-
-    def best_eval_result(self, validator):
-        accuracy = validator.metrics()
-        if accuracy > self.best_accuracy:
-            self.best_accuracy = accuracy
-            return True
-        else:
-            return False
-
-
-class MyNERTester(SeqLabelTester):
-    def __init__(self, test_args):
-        super(MyNERTester, self).__init__(test_args)
-
-    def _evaluate(self, prediction, batch_y, seq_len):
-        """
-        :param prediction: [batch_size, seq_len, num_classes]
-        :param batch_y: [batch_size, seq_len]
-        :param seq_len: [batch_size]
-        :return:
-        """
-        summ = 0
-        correct = 0
-        _, indices = torch.max(prediction, 2)
-        for p, y, l in zip(indices, batch_y, seq_len):
-            summ += l
-            correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy())
-        return float(correct / summ)
-
-    def evaluate(self, predict, truth):
-        return self._evaluate(predict, truth, self.seq_len)
-
-    def metrics(self):
-        return np.mean(self.eval_history)
-
-    def show_metrics(self):
-        return "dev accuracy={:.2f}".format(float(self.metrics()))
-
-
-def embedding_process(emb_file, word_dict, emb_dim, emb_pkl):
-    if os.path.exists(emb_pkl):
-        with open(emb_pkl, "rb") as f:
-            embedding_np = _pickle.load(f)
-        return embedding_np
-    with open(emb_file, "r", encoding="utf-8") as f:
-        embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
-        for line in f:
-            line = line.strip().split()
-            if len(line) != emb_dim + 1:
-                continue
-            if line[0] in word_dict:
-                embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
-    with open(emb_pkl, "wb") as f:
-        _pickle.dump(embedding_np, f)
-    return embedding_np
-
-
-def data_load(data_file):
-    with open(data_file, "r", encoding="utf-8") as f:
-        all_data = []
-        sent = []
-        label = []
-        for line in f:
-            line = line.strip().split()
-            if not len(line) <= 1:
-                sent.append(line[0])
-                label.append(line[1])
-            else:
-                all_data.append([sent, label])
-                sent = []
-                label = []
-    return all_data
-
-
-data_path = "data_for_tests/people.txt"
-pick_path = "data_for_tests/"
-emb_path = "data_for_tests/emb50.txt"
-save_path = "data_for_tests/"
-
-if __name__ == "__main__":
-    data = data_load(data_path)
-    preprocess = SeqLabelPreprocess()
-    data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3)
-    # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
-    emb = None
-    args = {"epochs": 20,
-            "batch_size": 1,
-            "pickle_path": pick_path,
-            "validate": True,
-            "save_best_dev": True,
-            "model_saved_path": save_path,
-            "use_cuda": True,
-            "vocab_size": preprocess.vocab_size,
-            "num_classes": preprocess.num_classes,
-            "word_emb_dim": 50,
-            "rnn_hidden_units": 100
-            }
-    # emb = torch.Tensor(emb).float().cuda()
-    networks = AdvSeqLabel(args, emb)
-    trainer = MyNERTrainer(args)
-    trainer.train(networks, data_train, data_dev)
-    print("Training finished!")
@@ -1,129 +0,0 @@
-import _pickle
-import os
-
-import torch
-
-from fastNLP.core.predictor import SeqLabelInfer
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.loader.model_loader import ModelLoader
-from fastNLP.models.sequence_modeling import AdvSeqLabel
-
-
-class Decode(SeqLabelTrainer):
-    def __init__(self, args):
-        super(Decode, self).__init__(args)
-
-    def decoder(self, network, sents, model_path):
-        self.model = network
-        self.model.load_state_dict(torch.load(model_path))
-        out_put = []
-        self.mode(network, test=True)
-        for batch_x in sents:
-            prediction = self.data_forward(self.model, batch_x)
-            seq_tag = self.model.prediction(prediction, batch_x[1])
-            out_put.append(list(seq_tag)[0])
-        return out_put
-
-
-def process_sent(sents, word2id):
-    sents_num = []
-    for s in sents:
-        sent_num = []
-        for c in s:
-            if c in word2id:
-                sent_num.append(word2id[c])
-            else:
-                sent_num.append(word2id["<unk>"])
-        sents_num.append(([sent_num], [len(sent_num)]))  # batch_size is 1
-    return sents_num
-
-
-def process_tag(sents, tags, id2class):
-    Tags = []
-    for ttt in tags:
-        Tags.append([id2class[t] for t in ttt])
-
-    Segs = []
-    PosNers = []
-    for sent, tag in zip(sents, tags):
-        word__ = []
-        lll__ = []
-        for c, t in zip(sent, tag):
-            t = id2class[t]
-            l = t.split("-")
-            split_ = l[0]
-            pn = l[1]
-            if split_ == "S":
-                word__.append(c)
-                lll__.append(pn)
-                word_1 = ""
-            elif split_ == "E":
-                word_1 += c
-                word__.append(word_1)
-                lll__.append(pn)
-                word_1 = ""
-            elif split_ == "B":
-                word_1 = ""
-                word_1 += c
-            else:
-                word_1 += c
-        Segs.append(word__)
-        PosNers.append(lll__)
-    return Segs, PosNers
-
-
-pickle_path = "data_for_tests/"
-model_path = "data_for_tests/model_best_dev.pkl"
-
-if __name__ == "__main__":
-    with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f:
-        id2word = _pickle.load(f)
-    with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f:
-        word2id = _pickle.load(f)
-    with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f:
-        id2class = _pickle.load(f)
-
-    sent = ["中共中央总书记、国家主席江泽民",
-            "逆向处理输入序列并返回逆序后的序列"]  # here is input
-
-    args = {"epochs": 1,
-            "batch_size": 1,
-            "pickle_path": "data_for_tests/",
-            "validate": True,
-            "save_best_dev": True,
-            "model_saved_path": "data_for_tests/",
-            "use_cuda": False,
-            "vocab_size": len(word2id),
-            "num_classes": len(id2class),
-            "word_emb_dim": 50,
-            "rnn_hidden_units": 100,
-            }
-    """
-    network = AdvSeqLabel(args, None)
-    decoder_ = Decode(args)
-    tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path)
-    output_seg, output_pn = process_tag(sent, tags_num, id2class)  # here is output
-    print(output_seg)
-    print(output_pn)
-    """
-
-    # Define the same model
-    model = AdvSeqLabel(args, None)
-
-    # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl")
-    print("model loaded!")
-
-    # Inference interface
-    infer = SeqLabelInfer(pickle_path)
-    sent = [[ch for ch in s] for s in sent]
-    results = infer.predict(model, sent)
-
-    for res in results:
-        print(res)
-    print("Inference finished!")
@@ -36,7 +36,7 @@ data_dir = 'save/' # directory to save data and model
 train_path = './data_for_tests/text_classify.txt' # training set file

 # load dataset
-ds_loader = ClassDatasetLoader("train", train_path)
+ds_loader = ClassDatasetLoader(train_path)
 data = ds_loader.load()

 # pre-process dataset
@@ -33,7 +33,7 @@ data_infer_path = args.infer
 def infer():
     # Load infer configuration, the same as test
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args})
+    ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args})

     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -49,7 +49,7 @@ def infer():
     print("model loaded!")

     # Data Loader
-    raw_data_loader = BaseLoader("xxx", data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()

     # Inference interface
@@ -65,11 +65,11 @@ def train_and_test():
     # Config Loader
     trainer_args = ConfigSection()
     model_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {
+    ConfigLoader("config.cfg").load_config(config_dir, {
         "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})

     # Data Loader
-    pos_loader = POSDatasetLoader("xxx", data_path)
+    pos_loader = POSDatasetLoader(data_path)
     train_data = pos_loader.load_lines()

     # Preprocessor
@@ -117,7 +117,7 @@ def train_and_test():
     # Load test configuration
     tester_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args})
+    ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args})

     # Tester
     tester = SeqLabelTester(save_output=False,
@@ -139,5 +139,5 @@ def train_and_test():

 if __name__ == "__main__":
-    train_and_test()
-    # infer()
+    # train_and_test()
+    infer()
@@ -22,7 +22,7 @@ data_infer_path = "data_for_tests/people_infer.txt"
 def infer():
     # Load infer configuration, the same as test
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -38,7 +38,7 @@ def infer():
     print("model loaded!")

     # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
     """
     Transform strings into list of list of strings.
@@ -61,10 +61,10 @@ def infer():
 def train_test():
     # Config Loader
     train_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

     # Data Loader
-    loader = TokenizeDatasetLoader(data_name, cws_data_path)
+    loader = TokenizeDatasetLoader(cws_data_path)
     train_data = loader.load_pku()

     # Preprocessor
@@ -74,7 +74,7 @@ def train_test():
     train_args["num_classes"] = p.num_classes

     # Trainer
-    trainer = SeqLabelTrainer(train_args)
+    trainer = SeqLabelTrainer(**train_args.data)

     # Model
     model = SeqLabeling(train_args)
@@ -99,10 +99,10 @@ def train_test():
     # Load test configuration
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

     # Tester
-    tester = SeqLabelTester(test_args)
+    tester = SeqLabelTester(**test_args.data)

     # Start testing
     tester.test(model, data_train)
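The switch from `SeqLabelTrainer(train_args)` to `SeqLabelTrainer(**train_args.data)` reflects the new keyword-argument constructors. A short sketch of the pattern (the `.data` attribute on `ConfigSection` is assumed from the calls above):

```
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.core.tester import SeqLabelTester

train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader("config.cfg").load_config("./data_for_tests/config",
                                       {"POS": train_args, "POS_test": test_args})

# the parsed key/value pairs are unpacked as keyword arguments
trainer = SeqLabelTrainer(**train_args.data)
tester = SeqLabelTester(**test_args.data)
```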
@@ -9,15 +9,15 @@ pickle_path = "data_for_tests"

 def foo():
-    loader = TokenizeDatasetLoader(data_name, "./data_for_tests/cws_pku_utf_8")
+    loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8")
     train_data = loader.load_pku()

     train_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

     # Preprocessor
     p = SeqLabelPreprocess()
-    p.run(train_data)
+    train_data = p.run(train_data)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
@@ -26,10 +26,10 @@ def foo():
     valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
                   "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/",
                   "use_cuda": True}
-    validator = SeqLabelTester(valid_args)
+    validator = SeqLabelTester(**valid_args)

     print("start validation.")
-    validator.test(model)
+    validator.test(model, train_data)
     print(validator.show_metrics())
@@ -34,7 +34,7 @@ config_dir = args.config
 def infer():
     # load dataset
     print("Loading data...")
-    ds_loader = ClassDatasetLoader("train", train_data_dir)
+    ds_loader = ClassDatasetLoader(train_data_dir)
     data = ds_loader.load()
     unlabeled_data = [x[0] for x in data]
@@ -69,7 +69,7 @@ def train():
     # load dataset
     print("Loading data...")
-    ds_loader = ClassDatasetLoader("train", train_data_dir)
+    ds_loader = ClassDatasetLoader(train_data_dir)
     data = ds_loader.load()
     print(data[0])