diff --git a/.travis.yml b/.travis.yml
index eb5cc5cd..11239eb4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,7 +5,6 @@ python:
install:
- pip install --quiet -r requirements.txt
- pip install pytest pytest-cov
- - pip install -U scikit-learn
# command to run tests
script:
- pytest --cov=./
diff --git a/README.md b/README.md
index 84d658fd..8169520a 100644
--- a/README.md
+++ b/README.md
@@ -30,77 +30,36 @@ Run the following commands to install fastNLP package.
pip install fastNLP
```
-### Cloning From GitHub
-
-If you just want to use fastNLP, use:
-```shell
-git clone https://github.com/fastnlp/fastNLP
-cd fastNLP
-```
-
-### PyTorch Installation
-
-Visit the [PyTorch official website] for installation instructions based on your system. In general, you could use:
-```shell
-# using conda
-conda install pytorch torchvision -c pytorch
-# or using pip
-pip3 install torch torchvision
-```
-
-### TensorboardX Installation
-
-```shell
-pip3 install tensorboardX
-```
## Project Structure
-```
-FastNLP
-├── docs
-├── fastNLP
-│ ├── core
-│ │ ├── action.py
-│ │ ├── __init__.py
-│ │ ├── loss.py
-│ │ ├── metrics.py
-│ │ ├── optimizer.py
-│ │ ├── predictor.py
-│ │ ├── preprocess.py
-│ │ ├── README.md
-│ │ ├── tester.py
-│ │ └── trainer.py
-│ ├── fastnlp.py
-│ ├── __init__.py
-│ ├── loader
-│ │ ├── base_loader.py
-│ │ ├── config_loader.py
-│ │ ├── dataset_loader.py
-│ │ ├── embed_loader.py
-│ │ ├── __init__.py
-│ │ └── model_loader.py
-│ ├── models
-│ ├── modules
-│ │ ├── aggregation
-│ │ ├── decoder
-│ │ ├── encoder
-│ │ ├── __init__.py
-│ │ ├── interaction
-│ │ ├── other_modules.py
-│ │ └── utils.py
-│ └── saver
-├── LICENSE
-├── README.md
-├── reproduction
-├── requirements.txt
-├── setup.py
-└── test
- ├── core
- ├── data_for_tests
- ├── __init__.py
- ├── loader
- ├── modules
- └── readme_example.py
-
-```
+<table>
+<tr>
+    <td> fastNLP </td>
+    <td> an open-source NLP library </td>
+</tr>
+<tr>
+    <td> fastNLP.core </td>
+    <td> trainer, tester, predictor </td>
+</tr>
+<tr>
+    <td> fastNLP.loader </td>
+    <td> all kinds of loaders/readers </td>
+</tr>
+<tr>
+    <td> fastNLP.models </td>
+    <td> a collection of NLP models </td>
+</tr>
+<tr>
+    <td> fastNLP.modules </td>
+    <td> a collection of PyTorch sub-models/components/wheels </td>
+</tr>
+<tr>
+    <td> fastNLP.saver </td>
+    <td> all kinds of savers/writers </td>
+</tr>
+<tr>
+    <td> fastNLP.fastnlp </td>
+    <td> a high-level interface for prediction </td>
+</tr>
+</table>
\ No newline at end of file
diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py
index 3ee1a43d..13370969 100644
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -1,4 +1,5 @@
import random
+import sys
from collections import defaultdict
from copy import deepcopy
@@ -184,6 +185,7 @@ class SeqLabelDataSet(DataSet):
:param data: 3-level lists. Entries are strings.
"""
+ bar = ProgressBar(total=len(data))
for example in data:
word_seq, label_seq = example[0], example[1]
# list, list
@@ -197,6 +199,7 @@ class SeqLabelDataSet(DataSet):
instance.add_field("truth", y)
instance.add_field("word_seq_origin_len", x_len)
self.append(instance)
+ bar.move()
self.index_field("word_seq", self.word_vocab)
self.index_field("truth", self.label_vocab)
# no need to index "word_seq_origin_len"
@@ -285,3 +288,19 @@ def change_field_is_target(data_set, field_name, new_target):
for inst in data_set:
inst.fields[field_name].is_target = new_target
+
+class ProgressBar:
+    """Minimal text progress bar written to ``sys.stdout``.
+
+    Each call to :meth:`move` advances the counter by one and redraws the
+    bar. The drawn line ends with a carriage return, so the next redraw
+    overwrites it in place.
+
+    :param count: number of steps already completed.
+    :param total: number of steps that make up a full bar. A value of zero
+        or less is treated as "total unknown": the counter still advances
+        but the bar stays empty.
+    :param width: width of the bar in characters.
+    """
+
+    def __init__(self, count=0, total=0, width=100):
+        self.count = count
+        self.total = total
+        self.width = width
+
+    def move(self):
+        """Advance the bar by one step and redraw it on the current line."""
+        self.count += 1
+        if self.total > 0:
+            progress = self.width * self.count // self.total
+        else:
+            # ``total`` defaults to 0; dividing by it would raise
+            # ZeroDivisionError, so draw an empty bar instead.
+            progress = 0
+        sys.stdout.write('{0:3}/{1:3}: '.format(self.count, self.total))
+        sys.stdout.write('#' * progress + '-' * (self.width - progress) + '\r')
+        # Once the bar is full, move to a fresh line so later output
+        # does not overwrite the finished bar.
+        if progress == self.width:
+            sys.stdout.write('\n')
+        sys.stdout.flush()
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 75401194..6eedd214 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -45,9 +45,12 @@ class SeqLabelEvaluator(Evaluator):
truth = [item["truth"] for item in truth]
total_correct, total_count= 0., 0.
for x, y in zip(predict, truth):
- mask = torch.Tensor(x).ge(1)
- correct = torch.sum(torch.Tensor(x) * mask.float() == (y * mask.long()).float())
- correct -= torch.sum(torch.Tensor(x).le(0))
+ x = torch.Tensor(x)
+ y = y.to(x) # make sure they are in the same device
+ mask = x.ge(1).float()
+ # correct = torch.sum(x * mask.float() == (y * mask.long()).float())
+ correct = torch.sum(x * mask == y * mask)
+ correct -= torch.sum(x.le(0))
total_correct += float(correct)
total_count += float(torch.sum(mask))
accuracy = total_correct / total_count
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index 597a4019..957a4757 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -141,15 +141,6 @@ class Trainer(object):
logger.info("validation started")
validator.test(network, dev_data)
- if self.save_best_dev and self.best_eval_result(validator):
- self.save_model(network, self.model_name)
- print("Saved better model selected by validation.")
- logger.info("Saved better model selected by validation.")
-
- valid_results = validator.show_metrics()
- print("[epoch {}] {}".format(epoch, valid_results))
- logger.info("[epoch {}] {}".format(epoch, valid_results))
-
def _train_step(self, data_iterator, network, **kwargs):
"""Training process in one epoch.
diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py
index cd38c7d4..f940c5b8 100644
--- a/reproduction/chinese_word_segment/run.py
+++ b/reproduction/chinese_word_segment/run.py
@@ -5,50 +5,52 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader
-from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
+from fastNLP.loader.dataset_loader import BaseLoader, TokenizeDataSetLoader
+from fastNLP.core.preprocess import load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer
+from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
+from fastNLP.core.preprocess import save_pickle
+from fastNLP.core.metrics import SeqLabelEvaluator
# not in the file's dir
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))
datadir = "/home/zyfeng/data/"
cfgfile = './cws.cfg'
-data_name = "pku_training.utf8"
cws_data_path = os.path.join(datadir, "pku_training.utf8")
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")
+
def infer():
# Config Loader
test_args = ConfigSection()
- ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
+ ConfigLoader().load_config(cfgfile, {"POS_test": test_args})
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
- index2label = load_pickle(pickle_path, "class2id.pkl")
+ index2label = load_pickle(pickle_path, "label2id.pkl")
test_args["num_classes"] = len(index2label)
-
# Define the same model
model = AdvSeqLabel(test_args)
try:
- ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+ ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
print('model loaded!')
except Exception as e:
print('cannot load model!')
raise
# Data Loader
- raw_data_loader = BaseLoader(data_infer_path)
- infer_data = raw_data_loader.load_lines()
+ infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
+ infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
print('data loaded')
# Inference interface
@@ -63,20 +65,27 @@ def train():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
- ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
+ ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})
- # Data Loader
- loader = TokenizeDataSetLoader()
- train_data = loader.load()
+ print("loading data set...")
+ data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
+ data.load(cws_data_path)
+ data_train, data_dev = data.split(ratio=0.3)
+ train_args["vocab_size"] = len(data.word_vocab)
+ train_args["num_classes"] = len(data.label_vocab)
+ print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))
- # Preprocessor
- preprocessor = SeqLabelPreprocess()
- data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
- train_args["vocab_size"] = preprocessor.vocab_size
- train_args["num_classes"] = preprocessor.num_classes
+ change_field_is_target(data_dev, "truth", True)
+ save_pickle(data_dev, "./save/", "data_dev.pkl")
+ save_pickle(data.word_vocab, "./save/", "word2id.pkl")
+ save_pickle(data.label_vocab, "./save/", "label2id.pkl")
# Trainer
- trainer = SeqLabelTrainer(**train_args.data)
+ trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"],
+ validate=train_args["validate"],
+ use_cuda=train_args["use_cuda"], pickle_path=train_args["pickle_path"],
+ save_best_dev=True, print_every_step=10, model_name="trained_model.pkl",
+ evaluator=SeqLabelEvaluator())
# Model
model = AdvSeqLabel(train_args)
@@ -86,26 +95,26 @@ def train():
except Exception as e:
print("No saved model. Continue.")
pass
-
+
# Start training
trainer.train(model, data_train, data_dev)
print("Training finished!")
# Saver
- saver = ModelSaver("./save/saved_model.pkl")
+ saver = ModelSaver("./save/trained_model.pkl")
saver.save_pytorch(model)
print("Model saved!")
-def test():
+def predict():
# Config Loader
test_args = ConfigSection()
- ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
+ ConfigLoader().load_config(cfgfile, {"POS_test": test_args})
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
- index2label = load_pickle(pickle_path, "class2id.pkl")
+ index2label = load_pickle(pickle_path, "label2id.pkl")
test_args["num_classes"] = len(index2label)
# load dev data
@@ -115,29 +124,28 @@ def test():
model = AdvSeqLabel(test_args)
# Dump trained parameters into the model
- ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+ ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
print("model loaded!")
# Tester
+ test_args["evaluator"] = SeqLabelEvaluator()
tester = SeqLabelTester(**test_args.data)
# Start testing
tester.test(model, dev_data)
- # print test results
- print(tester.show_metrics())
- print("model tested!")
-
if __name__ == "__main__":
+
import argparse
+
parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
args = parser.parse_args()
if args.mode == 'train':
train()
elif args.mode == 'test':
- test()
+ predict()
elif args.mode == 'infer':
infer()
else: