diff --git a/.travis.yml b/.travis.yml index eb5cc5cd..11239eb4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,6 @@ python: install: - pip install --quiet -r requirements.txt - pip install pytest pytest-cov - - pip install -U scikit-learn # command to run tests script: - pytest --cov=./ diff --git a/README.md b/README.md index 84d658fd..8169520a 100644 --- a/README.md +++ b/README.md @@ -30,77 +30,36 @@ Run the following commands to install fastNLP package. pip install fastNLP ``` -### Cloning From GitHub - -If you just want to use fastNLP, use: -```shell -git clone https://github.com/fastnlp/fastNLP -cd fastNLP -``` - -### PyTorch Installation - -Visit the [PyTorch official website] for installation instructions based on your system. In general, you could use: -```shell -# using conda -conda install pytorch torchvision -c pytorch -# or using pip -pip3 install torch torchvision -``` - -### TensorboardX Installation - -```shell -pip3 install tensorboardX -``` ## Project Structure -``` -FastNLP -├── docs -├── fastNLP -│   ├── core -│   │   ├── action.py -│   │   ├── __init__.py -│   │   ├── loss.py -│   │   ├── metrics.py -│   │   ├── optimizer.py -│   │   ├── predictor.py -│   │   ├── preprocess.py -│   │   ├── README.md -│   │   ├── tester.py -│   │   └── trainer.py -│   ├── fastnlp.py -│   ├── __init__.py -│   ├── loader -│   │   ├── base_loader.py -│   │   ├── config_loader.py -│   │   ├── dataset_loader.py -│   │   ├── embed_loader.py -│   │   ├── __init__.py -│   │   └── model_loader.py -│   ├── models -│   ├── modules -│   │   ├── aggregation -│   │   ├── decoder -│   │   ├── encoder -│   │   ├── __init__.py -│   │   ├── interaction -│   │   ├── other_modules.py -│   │   └── utils.py -│   └── saver -├── LICENSE -├── README.md -├── reproduction -├── requirements.txt -├── setup.py -└── test - ├── core - ├── data_for_tests - ├── __init__.py - ├── loader - ├── modules - └── readme_example.py - -``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
fastNLP: an open-source NLP library
fastNLP.core: trainer, tester, predictor
fastNLP.loader: all kinds of loaders/readers
fastNLP.models: a collection of NLP models
fastNLP.modules: a collection of PyTorch sub-models/components/wheels
fastNLP.saver: all kinds of savers/writers
fastNLP.fastnlp: a high-level interface for prediction
\ No newline at end of file diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 3ee1a43d..13370969 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -1,4 +1,5 @@ import random +import sys from collections import defaultdict from copy import deepcopy @@ -184,6 +185,7 @@ class SeqLabelDataSet(DataSet): :param data: 3-level lists. Entries are strings. """ + bar = ProgressBar(total=len(data)) for example in data: word_seq, label_seq = example[0], example[1] # list, list @@ -197,6 +199,7 @@ class SeqLabelDataSet(DataSet): instance.add_field("truth", y) instance.add_field("word_seq_origin_len", x_len) self.append(instance) + bar.move() self.index_field("word_seq", self.word_vocab) self.index_field("truth", self.label_vocab) # no need to index "word_seq_origin_len" @@ -285,3 +288,19 @@ def change_field_is_target(data_set, field_name, new_target): for inst in data_set: inst.fields[field_name].is_target = new_target + +class ProgressBar: + + def __init__(self, count=0, total=0, width=100): + self.count = count + self.total = total + self.width = width + + def move(self): + self.count += 1 + progress = self.width * self.count // self.total + sys.stdout.write('{0:3}/{1:3}: '.format(self.count, self.total)) + sys.stdout.write('#' * progress + '-' * (self.width - progress) + '\r') + if progress == self.width: + sys.stdout.write('\n') + sys.stdout.flush() diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 75401194..6eedd214 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -45,9 +45,12 @@ class SeqLabelEvaluator(Evaluator): truth = [item["truth"] for item in truth] total_correct, total_count= 0., 0. 
for x, y in zip(predict, truth): - mask = torch.Tensor(x).ge(1) - correct = torch.sum(torch.Tensor(x) * mask.float() == (y * mask.long()).float()) - correct -= torch.sum(torch.Tensor(x).le(0)) + x = torch.Tensor(x) + y = y.to(x) # make sure they are in the same device + mask = x.ge(1).float() + # correct = torch.sum(x * mask.float() == (y * mask.long()).float()) + correct = torch.sum(x * mask == y * mask) + correct -= torch.sum(x.le(0)) total_correct += float(correct) total_count += float(torch.sum(mask)) accuracy = total_correct / total_count diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 597a4019..957a4757 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -141,15 +141,6 @@ class Trainer(object): logger.info("validation started") validator.test(network, dev_data) - if self.save_best_dev and self.best_eval_result(validator): - self.save_model(network, self.model_name) - print("Saved better model selected by validation.") - logger.info("Saved better model selected by validation.") - - valid_results = validator.show_metrics() - print("[epoch {}] {}".format(epoch, valid_results)) - logger.info("[epoch {}] {}".format(epoch, valid_results)) - def _train_step(self, data_iterator, network, **kwargs): """Training process in one epoch. 
diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py index cd38c7d4..f940c5b8 100644 --- a/reproduction/chinese_word_segment/run.py +++ b/reproduction/chinese_word_segment/run.py @@ -5,50 +5,52 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) from fastNLP.loader.config_loader import ConfigLoader, ConfigSection from fastNLP.core.trainer import SeqLabelTrainer -from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader -from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle +from fastNLP.loader.dataset_loader import BaseLoader, TokenizeDataSetLoader +from fastNLP.core.preprocess import load_pickle from fastNLP.saver.model_saver import ModelSaver from fastNLP.loader.model_loader import ModelLoader from fastNLP.core.tester import SeqLabelTester from fastNLP.models.sequence_modeling import AdvSeqLabel from fastNLP.core.predictor import SeqLabelInfer +from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target +from fastNLP.core.preprocess import save_pickle +from fastNLP.core.metrics import SeqLabelEvaluator # not in the file's dir if len(os.path.dirname(__file__)) != 0: os.chdir(os.path.dirname(__file__)) datadir = "/home/zyfeng/data/" cfgfile = './cws.cfg' -data_name = "pku_training.utf8" cws_data_path = os.path.join(datadir, "pku_training.utf8") pickle_path = "save" data_infer_path = os.path.join(datadir, "infer.utf8") + def infer(): # Config Loader test_args = ConfigSection() - ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) + ConfigLoader().load_config(cfgfile, {"POS_test": test_args}) # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "class2id.pkl") + index2label = load_pickle(pickle_path, "label2id.pkl") test_args["num_classes"] = len(index2label) - # Define the same model model = 
AdvSeqLabel(test_args) try: - ModelLoader.load_pytorch(model, "./save/saved_model.pkl") + ModelLoader.load_pytorch(model, "./save/trained_model.pkl") print('model loaded!') except Exception as e: print('cannot load model!') raise # Data Loader - raw_data_loader = BaseLoader(data_infer_path) - infer_data = raw_data_loader.load_lines() + infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines) + infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True) print('data loaded') # Inference interface @@ -63,20 +65,27 @@ def train(): # Config Loader train_args = ConfigSection() test_args = ConfigSection() - ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) + ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args}) - # Data Loader - loader = TokenizeDataSetLoader() - train_data = loader.load() + print("loading data set...") + data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load) + data.load(cws_data_path) + data_train, data_dev = data.split(ratio=0.3) + train_args["vocab_size"] = len(data.word_vocab) + train_args["num_classes"] = len(data.label_vocab) + print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab))) - # Preprocessor - preprocessor = SeqLabelPreprocess() - data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) - train_args["vocab_size"] = preprocessor.vocab_size - train_args["num_classes"] = preprocessor.num_classes + change_field_is_target(data_dev, "truth", True) + save_pickle(data_dev, "./save/", "data_dev.pkl") + save_pickle(data.word_vocab, "./save/", "word2id.pkl") + save_pickle(data.label_vocab, "./save/", "label2id.pkl") # Trainer - trainer = SeqLabelTrainer(**train_args.data) + trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"], + validate=train_args["validate"], + use_cuda=train_args["use_cuda"], pickle_path=train_args["pickle_path"], + 
save_best_dev=True, print_every_step=10, model_name="trained_model.pkl", + evaluator=SeqLabelEvaluator()) # Model model = AdvSeqLabel(train_args) @@ -86,26 +95,26 @@ def train(): except Exception as e: print("No saved model. Continue.") pass - + # Start training trainer.train(model, data_train, data_dev) print("Training finished!") # Saver - saver = ModelSaver("./save/saved_model.pkl") + saver = ModelSaver("./save/trained_model.pkl") saver.save_pytorch(model) print("Model saved!") -def test(): +def predict(): # Config Loader test_args = ConfigSection() - ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) + ConfigLoader().load_config(cfgfile, {"POS_test": test_args}) # fetch dictionary size and number of labels from pickle files word2index = load_pickle(pickle_path, "word2id.pkl") test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "class2id.pkl") + index2label = load_pickle(pickle_path, "label2id.pkl") test_args["num_classes"] = len(index2label) # load dev data @@ -115,29 +124,28 @@ def test(): model = AdvSeqLabel(test_args) # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./save/saved_model.pkl") + ModelLoader.load_pytorch(model, "./save/trained_model.pkl") print("model loaded!") # Tester + test_args["evaluator"] = SeqLabelEvaluator() tester = SeqLabelTester(**test_args.data) # Start testing tester.test(model, dev_data) - # print test results - print(tester.show_metrics()) - print("model tested!") - if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description='Run a chinese word segmentation model') parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer']) args = parser.parse_args() if args.mode == 'train': train() elif args.mode == 'test': - test() + predict() elif args.mode == 'infer': infer() else: