
- add progress bar for data set loading
- improve metrics code
- fix validator bugs in trainer; remove early saving
- make the CWS reproduction script runnable
- improve README.md
tags/v0.1.0^2
FengZiYjun 6 years ago
commit cc15588a77
6 changed files with 93 additions and 114 deletions
  1. .travis.yml (+0, -1)
  2. README.md (+30, -71)
  3. fastNLP/core/dataset.py (+19, -0)
  4. fastNLP/core/metrics.py (+6, -3)
  5. fastNLP/core/trainer.py (+0, -9)
  6. reproduction/chinese_word_segment/run.py (+38, -30)

.travis.yml (+0, -1)

@@ -5,7 +5,6 @@ python:
install:
- pip install --quiet -r requirements.txt
- pip install pytest pytest-cov
- pip install -U scikit-learn
# command to run tests
script:
- pytest --cov=./


README.md (+30, -71)

@@ -30,77 +30,36 @@ Run the following commands to install fastNLP package.
pip install fastNLP
```

### Cloning From GitHub

If you just want to use fastNLP, use:
```shell
git clone https://github.com/fastnlp/fastNLP
cd fastNLP
```

### PyTorch Installation

Visit the [PyTorch official website](https://pytorch.org/) for installation instructions based on your system. In general, you can use:
```shell
# using conda
conda install pytorch torchvision -c pytorch
# or using pip
pip3 install torch torchvision
```

### TensorboardX Installation

```shell
pip3 install tensorboardX
```

## Project Structure

```
FastNLP
├── docs
├── fastNLP
│   ├── core
│   │   ├── action.py
│   │   ├── __init__.py
│   │   ├── loss.py
│   │   ├── metrics.py
│   │   ├── optimizer.py
│   │   ├── predictor.py
│   │   ├── preprocess.py
│   │   ├── README.md
│   │   ├── tester.py
│   │   └── trainer.py
│   ├── fastnlp.py
│   ├── __init__.py
│   ├── loader
│   │   ├── base_loader.py
│   │   ├── config_loader.py
│   │   ├── dataset_loader.py
│   │   ├── embed_loader.py
│   │   ├── __init__.py
│   │   └── model_loader.py
│   ├── models
│   ├── modules
│   │   ├── aggregation
│   │   ├── decoder
│   │   ├── encoder
│   │   ├── __init__.py
│   │   ├── interaction
│   │   ├── other_modules.py
│   │   └── utils.py
│   └── saver
├── LICENSE
├── README.md
├── reproduction
├── requirements.txt
├── setup.py
└── test
    ├── core
    ├── data_for_tests
    ├── __init__.py
    ├── loader
    ├── modules
    └── readme_example.py

```
<table>
<tr>
<td><b> fastNLP </b></td>
<td> an open-source NLP library </td>
</tr>
<tr>
<td><b> fastNLP.core </b></td>
<td> trainer, tester, predictor </td>
</tr>
<tr>
<td><b> fastNLP.loader </b></td>
<td> all kinds of loaders/readers </td>
</tr>
<tr>
<td><b> fastNLP.models </b></td>
<td> a collection of NLP models </td>
</tr>
<tr>
<td><b> fastNLP.modules </b></td>
<td> a collection of PyTorch sub-models/components/wheels </td>
</tr>
<tr>
<td><b> fastNLP.saver </b></td>
<td> all kinds of savers/writers </td>
</tr>
<tr>
<td><b> fastNLP.fastnlp </b></td>
<td> a high-level interface for prediction </td>
</tr>
</table>

fastNLP/core/dataset.py (+19, -0)

@@ -1,4 +1,5 @@
import random
import sys
from collections import defaultdict
from copy import deepcopy

@@ -184,6 +185,7 @@ class SeqLabelDataSet(DataSet):

:param data: 3-level lists. Entries are strings.
"""
bar = ProgressBar(total=len(data))
for example in data:
word_seq, label_seq = example[0], example[1]
# list, list
@@ -197,6 +199,7 @@ class SeqLabelDataSet(DataSet):
instance.add_field("truth", y)
instance.add_field("word_seq_origin_len", x_len)
self.append(instance)
bar.move()
self.index_field("word_seq", self.word_vocab)
self.index_field("truth", self.label_vocab)
# no need to index "word_seq_origin_len"
@@ -285,3 +288,19 @@ def change_field_is_target(data_set, field_name, new_target):
for inst in data_set:
inst.fields[field_name].is_target = new_target


class ProgressBar:

    def __init__(self, count=0, total=0, width=100):
        self.count = count
        self.total = total
        self.width = width

    def move(self):
        self.count += 1
        progress = self.width * self.count // self.total
        sys.stdout.write('{0:3}/{1:3}: '.format(self.count, self.total))
        sys.stdout.write('#' * progress + '-' * (self.width - progress) + '\r')
        if progress == self.width:
            sys.stdout.write('\n')
        sys.stdout.flush()
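The class above is what SeqLabelDataSet now instantiates with total=len(data), calling move() once per appended instance. A minimal, hypothetical driver loop showing the same pattern (the import path assumes this commit's fastNLP/core/dataset.py):

```python
import time
from fastNLP.core.dataset import ProgressBar  # class added in this commit

examples = list(range(200))              # stand-in for the raw training examples
bar = ProgressBar(total=len(examples))
for _ in examples:
    time.sleep(0.01)                     # stand-in for building one Instance
    bar.move()                           # advance and redraw the bar
```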

fastNLP/core/metrics.py (+6, -3)

@@ -45,9 +45,12 @@ class SeqLabelEvaluator(Evaluator):
truth = [item["truth"] for item in truth]
total_correct, total_count= 0., 0.
for x, y in zip(predict, truth):
mask = torch.Tensor(x).ge(1)
correct = torch.sum(torch.Tensor(x) * mask.float() == (y * mask.long()).float())
correct -= torch.sum(torch.Tensor(x).le(0))
x = torch.Tensor(x)
y = y.to(x) # make sure they are in the same device
mask = x.ge(1).float()
# correct = torch.sum(x * mask.float() == (y * mask.long()).float())
correct = torch.sum(x * mask == y * mask)
correct -= torch.sum(x.le(0))
total_correct += float(correct)
total_count += float(torch.sum(mask))
accuracy = total_correct / total_count
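The new computation converts the prediction to a tensor, moves the truth onto the same dtype/device, and masks out padding (label index 0) before comparing. A self-contained sketch of that masking trick (masked_accuracy is a hypothetical helper for illustration, not part of fastNLP):

```python
import torch

def masked_accuracy(pred, truth):
    """Hypothetical helper illustrating the evaluator's masking trick:
    label index 0 marks padding and must not count toward accuracy."""
    pred = torch.Tensor(pred)
    truth = truth.to(pred)                 # same dtype/device as pred
    mask = pred.ge(1).float()              # 1.0 on real tokens, 0.0 on padding
    # padded positions compare as 0 == 0, so they match trivially and
    # have to be subtracted back out afterwards
    correct = torch.sum(pred * mask == truth * mask)
    correct -= torch.sum(pred.le(0))
    return float(correct) / float(torch.sum(mask))

# two of the three real tokens match; the padded position is ignored
print(masked_accuracy([2, 3, 1, 0], torch.Tensor([2, 3, 4, 0])))  # ~0.667
```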


fastNLP/core/trainer.py (+0, -9)

@@ -141,15 +141,6 @@ class Trainer(object):
logger.info("validation started")
validator.test(network, dev_data)

if self.save_best_dev and self.best_eval_result(validator):
self.save_model(network, self.model_name)
print("Saved better model selected by validation.")
logger.info("Saved better model selected by validation.")

valid_results = validator.show_metrics()
print("[epoch {}] {}".format(epoch, valid_results))
logger.info("[epoch {}] {}".format(epoch, valid_results))

def _train_step(self, data_iterator, network, **kwargs):
"""Training process in one epoch.



reproduction/chinese_word_segment/run.py (+38, -30)

@@ -5,50 +5,52 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.loader.dataset_loader import BaseLoader, TokenizeDataSetLoader
from fastNLP.core.preprocess import load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.metrics import SeqLabelEvaluator

# not in the file's dir
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))
datadir = "/home/zyfeng/data/"
cfgfile = './cws.cfg'
data_name = "pku_training.utf8"

cws_data_path = os.path.join(datadir, "pku_training.utf8")
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")


def infer():
# Config Loader
test_args = ConfigSection()
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "class2id.pkl")
index2label = load_pickle(pickle_path, "label2id.pkl")
test_args["num_classes"] = len(index2label)


# Define the same model
model = AdvSeqLabel(test_args)

try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
print('model loaded!')
except Exception as e:
print('cannot load model!')
raise

# Data Loader
raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines()
infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
print('data loaded')

# Inference interface
@@ -63,20 +65,27 @@ def train():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})

# Data Loader
loader = TokenizeDataSetLoader()
train_data = loader.load()
print("loading data set...")
data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
data.load(cws_data_path)
data_train, data_dev = data.split(ratio=0.3)
train_args["vocab_size"] = len(data.word_vocab)
train_args["num_classes"] = len(data.label_vocab)
print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))

# Preprocessor
preprocessor = SeqLabelPreprocess()
data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = preprocessor.vocab_size
train_args["num_classes"] = preprocessor.num_classes
change_field_is_target(data_dev, "truth", True)
save_pickle(data_dev, "./save/", "data_dev.pkl")
save_pickle(data.word_vocab, "./save/", "word2id.pkl")
save_pickle(data.label_vocab, "./save/", "label2id.pkl")

# Trainer
trainer = SeqLabelTrainer(**train_args.data)
trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"],
validate=train_args["validate"],
use_cuda=train_args["use_cuda"], pickle_path=train_args["pickle_path"],
save_best_dev=True, print_every_step=10, model_name="trained_model.pkl",
evaluator=SeqLabelEvaluator())

# Model
model = AdvSeqLabel(train_args)
@@ -86,26 +95,26 @@ def train():
except Exception as e:
print("No saved model. Continue.")
pass
# Start training
trainer.train(model, data_train, data_dev)
print("Training finished!")

# Saver
saver = ModelSaver("./save/saved_model.pkl")
saver = ModelSaver("./save/trained_model.pkl")
saver.save_pytorch(model)
print("Model saved!")


def test():
def predict():
# Config Loader
test_args = ConfigSection()
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "class2id.pkl")
index2label = load_pickle(pickle_path, "label2id.pkl")
test_args["num_classes"] = len(index2label)

# load dev data
@@ -115,29 +124,28 @@ def test():
model = AdvSeqLabel(test_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
print("model loaded!")

# Tester
test_args["evaluator"] = SeqLabelEvaluator()
tester = SeqLabelTester(**test_args.data)

# Start testing
tester.test(model, dev_data)

# print test results
print(tester.show_metrics())
print("model tested!")


if __name__ == "__main__":

import argparse

parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
parser.add_argument('--mode', help='set the running mode', choices=['train', 'test', 'infer'])
args = parser.parse_args()
if args.mode == 'train':
train()
elif args.mode == 'test':
test()
predict()
elif args.mode == 'infer':
infer()
else:


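Putting the train() changes together, the new preparation path builds the vocabularies while loading and pickles everything that predict() and infer() read back later. A condensed sketch of that flow (paths mirror the script above; the corpus location is the author's machine-specific path):

```python
from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
from fastNLP.core.preprocess import save_pickle

# build the data set directly; word/label vocabularies are created during load
data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
data.load("/home/zyfeng/data/pku_training.utf8")
data_train, data_dev = data.split(ratio=0.3)

# mark the dev truth field as a target, then persist data and vocabularies
# under ./save/ so that predict() and infer() can reload them
change_field_is_target(data_dev, "truth", True)
save_pickle(data_dev, "./save/", "data_dev.pkl")
save_pickle(data.word_vocab, "./save/", "word2id.pkl")
save_pickle(data.label_vocab, "./save/", "label2id.pkl")

# afterwards the script is driven from the command line, e.g.
#   python run.py --mode train
```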