@@ -2,6 +2,9 @@ | |||
[![Build Status](https://travis-ci.org/fastnlp/fastNLP.svg?branch=master)](https://travis-ci.org/fastnlp/fastNLP) | |||
[![codecov](https://codecov.io/gh/fastnlp/fastNLP/branch/master/graph/badge.svg)](https://codecov.io/gh/fastnlp/fastNLP) | |||
[![PyPI version](https://badge.fury.io/py/fastNLP.svg)](https://badge.fury.io/py/fastNLP) | |||
![Hex.pm](https://img.shields.io/hexpm/l/plug.svg) | |||
[![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest) | |||
fastNLP is a modular Natural Language Processing system based on PyTorch, designed for fast development of NLP tools. It divides deep-learning-based NLP models into different modules. These modules fall into 4 categories: encoder, interaction, aggregation and decoder, and each category contains several implemented modules. Encoder modules encode the input into an abstract representation, interaction modules let the information in the representation interact, aggregation modules aggregate and reduce information, and decoder modules decode the representation into the output. Most current NLP models can be built from these modules, which vastly simplifies the process of developing NLP models. The architecture of fastNLP is shown in the figure below:
@@ -13,93 +16,19 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa | |||
- numpy>=1.14.2 | |||
- torch==0.4.0 | |||
- torchvision>=0.1.8 | |||
- tensorboardX | |||
## Resources | |||
- [Documentation](https://fastnlp.readthedocs.io/en/latest/)
- [Source Code](https://github.com/fastnlp/fastNLP) | |||
## Example | |||
### Basic Usage | |||
A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model. | |||
```python | |||
from fastNLP.models.base_model import BaseModel | |||
from fastNLP.modules import encoder | |||
from fastNLP.modules import aggregation | |||
from fastNLP.modules import decoder | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
from fastNLP.loader.preprocess import ClassPreprocess | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.core.inference import ClassificationInfer | |||
class ClassificationModel(BaseModel): | |||
""" | |||
Simple text classification model based on CNN. | |||
""" | |||
def __init__(self, num_classes, vocab_size): | |||
super(ClassificationModel, self).__init__() | |||
self.emb = encoder.Embedding(nums=vocab_size, dims=300) | |||
self.enc = encoder.Conv( | |||
in_channels=300, out_channels=100, kernel_size=3) | |||
self.agg = aggregation.MaxPool() | |||
self.dec = decoder.MLP(100, num_classes=num_classes) | |||
def forward(self, x): | |||
x = self.emb(x) # [N,L] -> [N,L,C] | |||
x = self.enc(x) # [N,L,C_in] -> [N,L,C_out] | |||
x = self.agg(x) # [N,L,C] -> [N,C] | |||
x = self.dec(x) # [N,C] -> [N, N_class] | |||
return x | |||
data_dir = 'data' # directory to save data and model | |||
train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
# load dataset | |||
ds_loader = ClassDatasetLoader("train", train_path) | |||
data = ds_loader.load() | |||
# pre-process dataset | |||
pre = ClassPreprocess(data_dir) | |||
vocab_size, n_classes = pre.process(data, "data_train.pkl") | |||
# construct model | |||
model_args = { | |||
'num_classes': n_classes, | |||
'vocab_size': vocab_size | |||
} | |||
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | |||
# train model | |||
train_args = { | |||
"epochs": 20, | |||
"batch_size": 50, | |||
"pickle_path": data_dir, | |||
"validate": False, | |||
"save_best_dev": False, | |||
"model_saved_path": None, | |||
"use_cuda": True, | |||
"learn_rate": 1e-3, | |||
"momentum": 0.9} | |||
trainer = ClassificationTrainer(train_args) | |||
trainer.train(model) | |||
# predict using model | |||
seqs = [x[0] for x in data] | |||
infer = ClassificationInfer(data_dir) | |||
labels_pred = infer.predict(model, seqs) | |||
``` | |||
## Installation | |||
Run the following commands to install fastNLP package. | |||
```shell | |||
pip install fastNLP | |||
``` | |||
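If the installation succeeded, the package should be importable. A quick sanity check (an illustrative snippet, not part of the original README):
```python
import fastNLP
print("fastNLP imported from", fastNLP.__file__)
```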
### Cloning From GitHub | |||
@@ -119,20 +48,26 @@ conda install pytorch torchvision -c pytorch | |||
pip3 install torch torchvision | |||
``` | |||
### TensorboardX Installation | |||
```shell | |||
pip3 install tensorboardX | |||
``` | |||
## Project Structure | |||
``` | |||
FastNLP | |||
├── docs | |||
│ └── quick_tutorial.md | |||
├── fastNLP | |||
│ ├── action | |||
│ ├── core | |||
│ │ ├── action.py | |||
│ │ ├── inference.py | |||
│ │ ├── __init__.py | |||
│ │ ├── loss.py | |||
│ │ ├── metrics.py | |||
│ │ ├── optimizer.py | |||
│ │ ├── predictor.py | |||
│ │ ├── preprocess.py | |||
│ │ ├── README.md | |||
│ │ ├── tester.py | |||
│ │ └── trainer.py | |||
@@ -144,71 +79,28 @@ FastNLP | |||
│ │ ├── dataset_loader.py | |||
│ │ ├── embed_loader.py | |||
│ │ ├── __init__.py | |||
│ │ ├── model_loader.py | |||
│ │ └── preprocess.py | |||
│ │ └── model_loader.py | |||
│ ├── models | |||
│ │ ├── base_model.py | |||
│ │ ├── char_language_model.py | |||
│ │ ├── cnn_text_classification.py | |||
│ │ ├── __init__.py | |||
│ │ └── sequence_modeling.py | |||
│ ├── modules | |||
│ │ ├── aggregation | |||
│ │ │ ├── attention.py | |||
│ │ │ ├── avg_pool.py | |||
│ │ │ ├── __init__.py | |||
│ │ │ ├── kmax_pool.py | |||
│ │ │ ├── max_pool.py | |||
│ │ │ └── self_attention.py | |||
│ │ ├── decoder | |||
│ │ │ ├── CRF.py | |||
│ │ │ └── __init__.py | |||
│ │ ├── encoder | |||
│ │ │ ├── char_embedding.py | |||
│ │ │ ├── conv_maxpool.py | |||
│ │ │ ├── conv.py | |||
│ │ │ ├── embedding.py | |||
│ │ │ ├── __init__.py | |||
│ │ │ ├── linear.py | |||
│ │ │ ├── lstm.py | |||
│ │ │ ├── masked_rnn.py | |||
│ │ │ └── variational_rnn.py | |||
│ │ ├── __init__.py | |||
│ │ ├── interaction | |||
│ │ │ └── __init__.py | |||
│ │ ├── other_modules.py | |||
│ │ └── utils.py | |||
│ └── saver | |||
│ ├── base_saver.py | |||
│ ├── __init__.py | |||
│ ├── logger.py | |||
│ └── model_saver.py | |||
├── LICENSE | |||
├── README.md | |||
├── reproduction | |||
│ ├── Char-aware_NLM | |||
│ │ | |||
│ ├── CNN-sentence_classification | |||
│ │ | |||
│ ├── HAN-document_classification | |||
│ │ | |||
│ └── LSTM+self_attention_sentiment_analysis | |||
| | |||
├── requirements.txt | |||
├── setup.py | |||
└── test | |||
├── core | |||
├── data_for_tests | |||
│ ├── charlm.txt | |||
│ ├── config | |||
│ ├── cws_test | |||
│ ├── cws_train | |||
│ ├── people_infer.txt | |||
│ └── people.txt | |||
├── test_charlm.py | |||
├── test_cws.py | |||
├── test_fastNLP.py | |||
├── test_loader.py | |||
├── test_seq_labeling.py | |||
├── test_tester.py | |||
└── test_trainer.py | |||
├── __init__.py | |||
├── loader | |||
├── modules | |||
└── readme_example.py | |||
``` |
@@ -1,3 +1,4 @@ | |||
sphinx | |||
-e git://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme | |||
sphinxcontrib.katex | |||
numpy>=1.14.2 | |||
http://download.pytorch.org/whl/cpu/torch-0.4.1-cp35-cp35m-linux_x86_64.whl | |||
torchvision>=0.1.8 | |||
sphinx-rtd-theme==0.4.1 |
@@ -42,6 +42,8 @@ release = '1.0' | |||
extensions = [ | |||
'sphinx.ext.autodoc', | |||
'sphinx.ext.viewcode', | |||
'sphinx.ext.autosummary', | |||
] | |||
# Add any paths that contain templates here, relative to this directory. | |||
@@ -1,62 +1,54 @@ | |||
fastNLP.core package | |||
==================== | |||
fastNLP.core | |||
============= | |||
Submodules | |||
---------- | |||
fastNLP.core.action module | |||
-------------------------- | |||
fastNLP.core.action | |||
-------------------- | |||
.. automodule:: fastNLP.core.action | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.core.metrics module | |||
--------------------------- | |||
fastNLP.core.loss | |||
------------------ | |||
.. automodule:: fastNLP.core.loss | |||
:members: | |||
fastNLP.core.metrics | |||
--------------------- | |||
.. automodule:: fastNLP.core.metrics | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.core.optimizer module | |||
----------------------------- | |||
fastNLP.core.optimizer | |||
----------------------- | |||
.. automodule:: fastNLP.core.optimizer | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.core.predictor module | |||
----------------------------- | |||
fastNLP.core.predictor | |||
----------------------- | |||
.. automodule:: fastNLP.core.predictor | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.core.tester module | |||
-------------------------- | |||
fastNLP.core.preprocess | |||
------------------------ | |||
.. automodule:: fastNLP.core.preprocess | |||
:members: | |||
fastNLP.core.tester | |||
-------------------- | |||
.. automodule:: fastNLP.core.tester | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.core.trainer module | |||
--------------------------- | |||
fastNLP.core.trainer | |||
--------------------- | |||
.. automodule:: fastNLP.core.trainer | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP.core | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,62 +1,36 @@ | |||
fastNLP.loader package | |||
====================== | |||
fastNLP.loader | |||
=============== | |||
Submodules | |||
---------- | |||
fastNLP.loader.base\_loader module | |||
---------------------------------- | |||
fastNLP.loader.base\_loader | |||
---------------------------- | |||
.. automodule:: fastNLP.loader.base_loader | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.loader.config\_loader module | |||
------------------------------------ | |||
fastNLP.loader.config\_loader | |||
------------------------------ | |||
.. automodule:: fastNLP.loader.config_loader | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.loader.dataset\_loader module | |||
------------------------------------- | |||
fastNLP.loader.dataset\_loader | |||
------------------------------- | |||
.. automodule:: fastNLP.loader.dataset_loader | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.loader.embed\_loader module | |||
----------------------------------- | |||
fastNLP.loader.embed\_loader | |||
----------------------------- | |||
.. automodule:: fastNLP.loader.embed_loader | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.loader.model\_loader module | |||
----------------------------------- | |||
fastNLP.loader.model\_loader | |||
----------------------------- | |||
.. automodule:: fastNLP.loader.model_loader | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.loader.preprocess module | |||
-------------------------------- | |||
.. automodule:: fastNLP.loader.preprocess | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP.loader | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,46 +1,30 @@ | |||
fastNLP.models package | |||
====================== | |||
fastNLP.models | |||
=============== | |||
Submodules | |||
---------- | |||
fastNLP.models.base\_model module | |||
--------------------------------- | |||
fastNLP.models.base\_model | |||
--------------------------- | |||
.. automodule:: fastNLP.models.base_model | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.models.char\_language\_model module | |||
------------------------------------------- | |||
fastNLP.models.char\_language\_model | |||
------------------------------------- | |||
.. automodule:: fastNLP.models.char_language_model | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.models.cnn\_text\_classification module | |||
----------------------------------------------- | |||
fastNLP.models.cnn\_text\_classification | |||
----------------------------------------- | |||
.. automodule:: fastNLP.models.cnn_text_classification | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.models.sequence\_modeling module | |||
---------------------------------------- | |||
fastNLP.models.sequence\_modeling | |||
---------------------------------- | |||
.. automodule:: fastNLP.models.sequence_modeling | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP.models | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,54 +1,36 @@ | |||
fastNLP.modules.aggregation package | |||
=================================== | |||
fastNLP.modules.aggregation | |||
============================ | |||
Submodules | |||
---------- | |||
fastNLP.modules.aggregation.attention module | |||
-------------------------------------------- | |||
fastNLP.modules.aggregation.attention | |||
-------------------------------------- | |||
.. automodule:: fastNLP.modules.aggregation.attention | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.aggregation.avg\_pool module | |||
-------------------------------------------- | |||
fastNLP.modules.aggregation.avg\_pool | |||
-------------------------------------- | |||
.. automodule:: fastNLP.modules.aggregation.avg_pool | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.aggregation.kmax\_pool module | |||
--------------------------------------------- | |||
fastNLP.modules.aggregation.kmax\_pool | |||
--------------------------------------- | |||
.. automodule:: fastNLP.modules.aggregation.kmax_pool | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.aggregation.max\_pool module | |||
-------------------------------------------- | |||
fastNLP.modules.aggregation.max\_pool | |||
-------------------------------------- | |||
.. automodule:: fastNLP.modules.aggregation.max_pool | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.aggregation.self\_attention module | |||
-------------------------------------------------- | |||
fastNLP.modules.aggregation.self\_attention | |||
-------------------------------------------- | |||
.. automodule:: fastNLP.modules.aggregation.self_attention | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP.modules.aggregation | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,22 +1,18 @@ | |||
fastNLP.modules.decoder package | |||
=============================== | |||
fastNLP.modules.decoder | |||
======================== | |||
Submodules | |||
---------- | |||
fastNLP.modules.decoder.CRF module | |||
---------------------------------- | |||
fastNLP.modules.decoder.CRF | |||
---------------------------- | |||
.. automodule:: fastNLP.modules.decoder.CRF | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.decoder.MLP | |||
---------------------------- | |||
.. automodule:: fastNLP.modules.decoder.MLP | |||
:members: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP.modules.decoder | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,78 +1,54 @@ | |||
fastNLP.modules.encoder package | |||
=============================== | |||
fastNLP.modules.encoder | |||
======================== | |||
Submodules | |||
---------- | |||
fastNLP.modules.encoder.char\_embedding module | |||
---------------------------------------------- | |||
fastNLP.modules.encoder.char\_embedding | |||
---------------------------------------- | |||
.. automodule:: fastNLP.modules.encoder.char_embedding | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.encoder.conv module | |||
----------------------------------- | |||
fastNLP.modules.encoder.conv | |||
----------------------------- | |||
.. automodule:: fastNLP.modules.encoder.conv | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.encoder.conv\_maxpool module | |||
-------------------------------------------- | |||
fastNLP.modules.encoder.conv\_maxpool | |||
-------------------------------------- | |||
.. automodule:: fastNLP.modules.encoder.conv_maxpool | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.encoder.embedding module | |||
---------------------------------------- | |||
fastNLP.modules.encoder.embedding | |||
---------------------------------- | |||
.. automodule:: fastNLP.modules.encoder.embedding | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.encoder.linear module | |||
------------------------------------- | |||
fastNLP.modules.encoder.linear | |||
------------------------------- | |||
.. automodule:: fastNLP.modules.encoder.linear | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.encoder.lstm module | |||
----------------------------------- | |||
fastNLP.modules.encoder.lstm | |||
----------------------------- | |||
.. automodule:: fastNLP.modules.encoder.lstm | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.encoder.masked\_rnn module | |||
------------------------------------------ | |||
fastNLP.modules.encoder.masked\_rnn | |||
------------------------------------ | |||
.. automodule:: fastNLP.modules.encoder.masked_rnn | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.encoder.variational\_rnn module | |||
----------------------------------------------- | |||
fastNLP.modules.encoder.variational\_rnn | |||
----------------------------------------- | |||
.. automodule:: fastNLP.modules.encoder.variational_rnn | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP.modules.encoder | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,10 +1,5 @@ | |||
fastNLP.modules.interaction package | |||
=================================== | |||
Module contents | |||
--------------- | |||
fastNLP.modules.interaction | |||
============================ | |||
.. automodule:: fastNLP.modules.interaction | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,8 +1,5 @@ | |||
fastNLP.modules package | |||
======================= | |||
Subpackages | |||
----------- | |||
fastNLP.modules | |||
================ | |||
.. toctree:: | |||
@@ -11,30 +8,18 @@ Subpackages | |||
fastNLP.modules.encoder | |||
fastNLP.modules.interaction | |||
Submodules | |||
---------- | |||
fastNLP.modules.other\_modules module | |||
------------------------------------- | |||
fastNLP.modules.other\_modules | |||
------------------------------- | |||
.. automodule:: fastNLP.modules.other_modules | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.modules.utils module | |||
---------------------------- | |||
fastNLP.modules.utils | |||
---------------------- | |||
.. automodule:: fastNLP.modules.utils | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP.modules | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,8 +1,5 @@ | |||
fastNLP package | |||
=============== | |||
Subpackages | |||
----------- | |||
fastNLP | |||
======== | |||
.. toctree:: | |||
@@ -12,22 +9,12 @@ Subpackages | |||
fastNLP.modules | |||
fastNLP.saver | |||
Submodules | |||
---------- | |||
fastNLP.fastnlp module | |||
---------------------- | |||
fastNLP.fastnlp | |||
---------------- | |||
.. automodule:: fastNLP.fastnlp | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,30 +1,18 @@ | |||
fastNLP.saver package | |||
===================== | |||
fastNLP.saver | |||
============== | |||
Submodules | |||
---------- | |||
fastNLP.saver.logger module | |||
--------------------------- | |||
fastNLP.saver.logger | |||
--------------------- | |||
.. automodule:: fastNLP.saver.logger | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
fastNLP.saver.model\_saver module | |||
--------------------------------- | |||
fastNLP.saver.model\_saver | |||
--------------------------- | |||
.. automodule:: fastNLP.saver.model_saver | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Module contents | |||
--------------- | |||
.. automodule:: fastNLP.saver | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -1,16 +1,54 @@ | |||
.. fastNLP documentation master file, created by | |||
sphinx-quickstart on Mon Aug 20 17:06:44 2018. | |||
You can adapt this file completely to your liking, but it should at least | |||
contain the root `toctree` directive. | |||
fastNLP documentation | |||
===================== | |||
fastNLP is still incubating.
Welcome to fastNLP's documentation! | |||
=================================== | |||
Introduction | |||
------------ | |||
fastNLP is a modular natural language processing system based on PyTorch, designed for fast development of NLP tools.
It divides deep-learning-based NLP models into different modules.
These modules fall into 4 categories: encoder, interaction, aggregation and decoder,
and each category contains several implemented modules.
Most current NLP models can be built from these modules, which greatly simplifies the development of NLP models.
The architecture of fastNLP is shown in the figure below (left):
.. image:: figures/procedures_and_sequence_labeling.png
For the model-construction step, sequence labeling (upper-right figure) and text classification (figure below) serve as examples:
.. image:: figures/text_classification.png
* encoder module: encodes the input into an abstract representation; it takes a sequence of words and outputs a sequence of vectors.
* interaction module: lets the information in the representation interact; it takes a sequence of vectors and outputs a sequence of vectors.
* aggregation module: aggregates and reduces information; it takes a sequence of vectors and outputs a single vector.
* decoder module: decodes the representation into the output: a single label (text classification) or a sequence of labels (sequence labeling).
The interaction and aggregation modules are not necessarily present in every model; the sequence labeling model above, for example, uses neither. A minimal sketch of the pipeline follows.
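As a sketch of these four stages (module names are borrowed from the Quickstart example; the sizes are assumptions for illustration, and the interaction step is omitted as in the text classifier there):
.. code:: python

    from fastNLP.modules import encoder, aggregation, decoder

    vocab_size, num_classes = 10000, 5  # assumed sizes, for illustration only

    emb = encoder.Embedding(nums=vocab_size, dims=300)  # [N, L] -> [N, L, 300]
    enc = encoder.Conv(in_channels=300, out_channels=100, kernel_size=3)  # -> [N, L, 100]
    agg = aggregation.MaxPool()  # [N, L, 100] -> [N, 100]
    dec = decoder.MLP(100, num_classes=num_classes)  # [N, 100] -> [N, num_classes]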
User's Guide | |||
------------ | |||
.. toctree:: | |||
:maxdepth: 2 | |||
user/installation | |||
user/quickstart | |||
API Reference | |||
------------- | |||
If you are looking for information on a specific function, class or | |||
method, this part of the documentation is for you. | |||
.. toctree:: | |||
:maxdepth: 4 | |||
:caption: Contents: | |||
:maxdepth: 2 | |||
fastNLP | |||
fastNLP API <fastNLP> | |||
@@ -1,7 +0,0 @@ | |||
fastNLP | |||
======= | |||
.. toctree:: | |||
:maxdepth: 4 | |||
fastNLP |
@@ -0,0 +1,31 @@ | |||
============ | |||
Installation | |||
============ | |||
.. contents:: | |||
:local: | |||
Cloning From GitHub | |||
~~~~~~~~~~~~~~~~~~~ | |||
If you just want to use fastNLP, use: | |||
.. code:: shell | |||
git clone https://github.com/fastnlp/fastNLP | |||
cd fastNLP | |||
PyTorch Installation | |||
~~~~~~~~~~~~~~~~~~~~ | |||
Visit the PyTorch official website (https://pytorch.org) for installation
instructions tailored to your system. In general, you can use:
.. code:: shell | |||
# using conda | |||
conda install pytorch torchvision -c pytorch | |||
# or using pip | |||
pip3 install torch torchvision |
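To confirm that PyTorch is importable afterwards (an illustrative check, not from the original guide):
.. code:: python

    import torch
    print(torch.__version__)  # this project expects the 0.4.x series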
@@ -0,0 +1,84 @@ | |||
========== | |||
Quickstart | |||
========== | |||
Example | |||
------- | |||
Basic Usage | |||
~~~~~~~~~~~ | |||
A typical fastNLP routine is composed of four phases: loading dataset, | |||
pre-processing data, constructing model and training model. | |||
.. code:: python | |||
from fastNLP.models.base_model import BaseModel | |||
from fastNLP.modules import encoder | |||
from fastNLP.modules import aggregation | |||
from fastNLP.modules import decoder | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
from fastNLP.loader.preprocess import ClassPreprocess | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.core.inference import ClassificationInfer | |||
class ClassificationModel(BaseModel): | |||
""" | |||
Simple text classification model based on CNN. | |||
""" | |||
def __init__(self, num_classes, vocab_size): | |||
super(ClassificationModel, self).__init__() | |||
self.emb = encoder.Embedding(nums=vocab_size, dims=300) | |||
self.enc = encoder.Conv( | |||
in_channels=300, out_channels=100, kernel_size=3) | |||
self.agg = aggregation.MaxPool() | |||
self.dec = decoder.MLP(100, num_classes=num_classes) | |||
def forward(self, x): | |||
x = self.emb(x) # [N,L] -> [N,L,C] | |||
x = self.enc(x) # [N,L,C_in] -> [N,L,C_out] | |||
x = self.agg(x) # [N,L,C] -> [N,C] | |||
x = self.dec(x) # [N,C] -> [N, N_class] | |||
return x | |||
data_dir = 'data' # directory to save data and model | |||
train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
# load dataset | |||
ds_loader = ClassDatasetLoader("train", train_path) | |||
data = ds_loader.load() | |||
# pre-process dataset | |||
pre = ClassPreprocess(data_dir) | |||
vocab_size, n_classes = pre.process(data, "data_train.pkl") | |||
# construct model | |||
model_args = { | |||
'num_classes': n_classes, | |||
'vocab_size': vocab_size | |||
} | |||
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | |||
# train model | |||
train_args = { | |||
"epochs": 20, | |||
"batch_size": 50, | |||
"pickle_path": data_dir, | |||
"validate": False, | |||
"save_best_dev": False, | |||
"model_saved_path": None, | |||
"use_cuda": True, | |||
"learn_rate": 1e-3, | |||
"momentum": 0.9} | |||
trainer = ClassificationTrainer(train_args) | |||
trainer.train(model) | |||
# predict using model | |||
seqs = [x[0] for x in data] | |||
infer = ClassificationInfer(data_dir) | |||
labels_pred = infer.predict(model, seqs) |
@@ -1,7 +1,3 @@ | |||
""" | |||
This file defines Action(s) and sample methods. | |||
""" | |||
from collections import Counter | |||
import numpy as np | |||
@@ -9,13 +5,12 @@ import torch | |||
class Action(object): | |||
""" | |||
Operations shared by Trainer, Tester, or Inference. | |||
"""Operations shared by Trainer, Tester, or Inference. | |||
This is designed for reducing replicate codes. | |||
- make_batch: produce a min-batch of data. @staticmethod | |||
- pad: padding method used in sequence modeling. @staticmethod | |||
- mode: change network mode for either train or test. (for PyTorch) @staticmethod | |||
The base Action shall define operations shared by as much task-specific Actions as possible. | |||
""" | |||
def __init__(self): | |||
@@ -24,18 +19,20 @@ class Action(object): | |||
@staticmethod | |||
def make_batch(iterator, use_cuda, output_length=True, max_len=None): | |||
"""Batch and Pad data. | |||
:param iterator: an iterator, (object that implements __next__ method) which returns the next sample. | |||
:param use_cuda: bool, whether to use GPU | |||
:param output_length: bool, whether to output the original length of the sequence before padding. (default: True) | |||
:param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None) | |||
:return:
if output_length is True,
(batch_x, seq_len): tuple of two elements
batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
seq_len: list. The lengths of the sequences before padding. [batch_size]
batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
if output_length is False,
batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] | |||
batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] | |||
""" | |||
@@ -77,21 +74,21 @@ class Action(object): | |||
return batch | |||
@staticmethod | |||
def mode(model, is_test=False):
"""Train mode or Test mode. This is for PyTorch currently.
:param model: a PyTorch model
:param is_test: bool, whether in test mode or not.
"""
if is_test:
model.eval() | |||
else: | |||
model.train() | |||
def convert_to_torch_tensor(data_list, use_cuda): | |||
""" | |||
convert lists into (cuda) Tensors. | |||
"""Convert lists into (cuda) Tensors. | |||
:param data_list: 2-level lists | |||
:param use_cuda: bool, whether to use GPU or not | |||
:return data_list: PyTorch Tensor of shape [batch_size, max_seq_len] | |||
@@ -103,8 +100,8 @@ def convert_to_torch_tensor(data_list, use_cuda): | |||
def k_means_1d(x, k, max_iter=100): | |||
""" | |||
Perform k-means on 1-D data. | |||
"""Perform k-means on 1-D data. | |||
:param x: list of int, representing points in 1-D. | |||
:param k: the number of clusters required. | |||
:param max_iter: maximum iteration | |||
@@ -132,21 +129,28 @@ def k_means_1d(x, k, max_iter=100): | |||
def k_means_bucketing(all_inst, buckets): | |||
""" | |||
"""Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths. | |||
:param all_inst: 3-level list | |||
E.g. :: | |||
[ | |||
[[word_11, word_12, word_13], [label_11. label_12]], # sample 1 | |||
[[word_21, word_22, word_23], [label_21. label_22]], # sample 2 | |||
... | |||
] | |||
:param buckets: list of int. The length of the list is the number of buckets. Each element is the maximum length
threshold for the corresponding bucket (usually None, in which case the thresholds are decided by k-means).
:return data: 2-level list | |||
:: | |||
[ | |||
[index_11, index_12, ...], # bucket 1 | |||
[index_21, index_22, ...], # bucket 2 | |||
... | |||
] | |||
""" | |||
bucket_data = [[] for _ in buckets] | |||
num_buckets = len(buckets) | |||
@@ -160,11 +164,16 @@ def k_means_bucketing(all_inst, buckets): | |||
class BaseSampler(object): | |||
""" | |||
Base class for all samplers. | |||
"""The base class of all samplers. | |||
""" | |||
def __init__(self, data_set): | |||
""" | |||
:param data_set: multi-level list, of shape [num_example, *] | |||
""" | |||
self.data_set_length = len(data_set) | |||
self.data = data_set | |||
@@ -176,11 +185,16 @@ class BaseSampler(object): | |||
class SequentialSampler(BaseSampler): | |||
""" | |||
Sample data in the original order. | |||
"""Sample data in the original order. | |||
""" | |||
def __init__(self, data_set): | |||
""" | |||
:param data_set: multi-level list | |||
""" | |||
super(SequentialSampler, self).__init__(data_set) | |||
def __iter__(self): | |||
@@ -188,11 +202,16 @@ class SequentialSampler(BaseSampler): | |||
class RandomSampler(BaseSampler): | |||
""" | |||
Sample data in random permutation order. | |||
"""Sample data in random permutation order. | |||
""" | |||
def __init__(self, data_set): | |||
""" | |||
:param data_set: multi-level list | |||
""" | |||
super(RandomSampler, self).__init__(data_set) | |||
self.order = np.random.permutation(self.data_set_length) | |||
@@ -201,11 +220,18 @@ class RandomSampler(BaseSampler): | |||
class Batchifier(object): | |||
""" | |||
Wrap random or sequential sampler to generate a mini-batch. | |||
"""Wrap random or sequential sampler to generate a mini-batch. | |||
""" | |||
def __init__(self, sampler, batch_size, drop_last=True): | |||
""" | |||
:param sampler: a Sampler object | |||
:param batch_size: int, the size of the mini-batch | |||
:param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch. | |||
""" | |||
super(Batchifier, self).__init__() | |||
self.sampler = sampler | |||
self.batch_size = batch_size | |||
@@ -223,8 +249,7 @@ class Batchifier(object): | |||
class BucketBatchifier(Batchifier): | |||
""" | |||
Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. | |||
"""Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. | |||
In sampling, first random choose a bucket. Then sample data from it. | |||
The number of buckets is decided dynamically by the variance of sentence lengths. | |||
""" | |||
@@ -237,6 +262,7 @@ class BucketBatchifier(Batchifier): | |||
:param num_buckets: int, number of buckets for grouping these sequences. | |||
:param drop_last: bool, useless currently. | |||
:param sampler: Sampler, useless currently. | |||
""" | |||
super(BucketBatchifier, self).__init__(sampler, batch_size, drop_last) | |||
buckets = ([None] * num_buckets) | |||
@@ -8,8 +8,13 @@ class Loss(object): | |||
""" | |||
def __init__(self, args): | |||
""" | |||
:param args: None or str, the name of a loss function. | |||
""" | |||
if args is None: | |||
# this is useful when Trainer.__init__ performs type check
self._loss = None | |||
elif isinstance(args, str): | |||
self._loss = self._borrow_from_pytorch(args) | |||
@@ -17,10 +22,19 @@ class Loss(object): | |||
raise NotImplementedError | |||
def get(self): | |||
""" | |||
:return self._loss: the loss function | |||
""" | |||
return self._loss | |||
@staticmethod | |||
def _borrow_from_pytorch(loss_name): | |||
"""Given a name of a loss function, return it from PyTorch. | |||
:param loss_name: str, the name of a loss function | |||
:return loss: a PyTorch loss | |||
""" | |||
if loss_name == "cross_entropy": | |||
return torch.nn.CrossEntropyLoss() | |||
else: | |||
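# Usage sketch, grounded in Trainer's defaults later in this diff:
# Loss("cross_entropy").get() returns torch.nn.CrossEntropyLoss(), while
# Loss(None).get() returns None so that Trainer.__init__ can pass its type check.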
@@ -1,11 +1,12 @@ | |||
import warnings | |||
import numpy as np | |||
import torch | |||
def _conver_numpy(x): | |||
""" | |||
convert input data to numpy array | |||
"""convert input data to numpy array | |||
""" | |||
if isinstance(x, np.ndarray): | |||
return x | |||
@@ -17,21 +18,20 @@ def _conver_numpy(x): | |||
def _check_same_len(*arrays, axis=0): | |||
""" | |||
check if input array list has same length for one dimension | |||
"""check if input array list has same length for one dimension | |||
""" | |||
lens = set([x.shape[axis] for x in arrays if x is not None]) | |||
return len(lens) == 1 | |||
def _label_types(y): | |||
""" | |||
determine the type | |||
"binary" | |||
"multiclass" | |||
"multiclass-multioutput" | |||
"multilabel" | |||
"unknown" | |||
"""Determine the type | |||
- "binary" | |||
- "multiclass" | |||
- "multiclass-multioutput" | |||
- "multilabel" | |||
- "unknown" | |||
""" | |||
# never squeeze the first dimension | |||
y = y.squeeze() if y.shape[0] > 1 else y.resize(1, -1) | |||
@@ -46,8 +46,8 @@ def _label_types(y): | |||
def _check_data(y_true, y_pred): | |||
""" | |||
check if y_true and y_pred is same type of data e.g both binary or multiclass | |||
"""Check if y_true and y_pred is same type of data e.g both binary or multiclass | |||
""" | |||
y_true, y_pred = _conver_numpy(y_true), _conver_numpy(y_pred) | |||
if not _check_same_len(y_true, y_pred): | |||
@@ -174,16 +174,13 @@ def classification_report(y_true, y_pred, labels=None, target_names=None, digits | |||
def accuracy_topk(y_true, y_prob, k=1): | |||
""" | |||
Compute accuracy of y_true matching top-k probable | |||
"""Compute accuracy of y_true matching top-k probable | |||
labels in y_prob. | |||
Paras: | |||
y_ture - ndarray, true label, [n_samples] | |||
y_prob - ndarray, label probabilities, [n_samples, n_classes] | |||
k - int, k in top-k | |||
Returns: | |||
accuracy of top-k | |||
:param y_true: ndarray, true label, [n_samples] | |||
:param y_prob: ndarray, label probabilities, [n_samples, n_classes] | |||
:param k: int, k in top-k | |||
:return :accuracy of top-k | |||
""" | |||
y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1] | |||
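# Illustrative note (an assumed example, not in the diff): for a row of y_prob
# equal to [0.1, 0.7, 0.2] and k=2, the slice above yields [1, 2], i.e. the
# labels with the two highest probabilities in descending order.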
@@ -195,16 +192,14 @@ def accuracy_topk(y_true, y_prob, k=1): | |||
def pred_topk(y_prob, k=1): | |||
""" | |||
Return top-k predicted labels and corresponding probabilities. | |||
Args: | |||
y_prob - ndarray, size [n_samples, n_classes], probabilities on labels | |||
k - int, k of top-k | |||
Returns: | |||
y_pred_topk - ndarray, size [n_samples, k], predicted top-k labels | |||
y_prob_topk - ndarray, size [n_samples, k], probabilities for | |||
top-k labels | |||
"""Return top-k predicted labels and corresponding probabilities. | |||
:param y_prob: ndarray, size [n_samples, n_classes], probabilities on labels | |||
:param k: int, k of top-k | |||
:returns | |||
y_pred_topk: ndarray, size [n_samples, k], predicted top-k labels | |||
y_prob_topk: ndarray, size [n_samples, k], probabilities for top-k labels | |||
""" | |||
y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1] | |||
@@ -4,7 +4,6 @@ import torch | |||
class Optimizer(object): | |||
"""Wrapper of optimizer from framework | |||
names: arguments (type) | |||
1. Adam: lr (float), weight_decay (float) | |||
2. AdaGrad | |||
3. RMSProp | |||
@@ -16,20 +15,29 @@ class Optimizer(object): | |||
""" | |||
:param optimizer_name: str, the name of the optimizer | |||
:param kwargs: the arguments | |||
""" | |||
self.optim_name = optimizer_name | |||
self.kwargs = kwargs | |||
@property | |||
def name(self): | |||
"""The name of the optimizer. | |||
:return: str | |||
""" | |||
return self.optim_name | |||
@property | |||
def params(self): | |||
"""The arguments used to create the optimizer. | |||
:return: dict of (str, *) | |||
""" | |||
return self.kwargs | |||
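# Hedged usage sketch mirroring Trainer's defaults later in this diff (the Adam
# branch is assumed to exist beyond the truncated hunk below):
# proto = Optimizer("Adam", lr=0.001, weight_decay=0)
# optimizer = proto.construct_from_pytorch(model.parameters())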
def construct_from_pytorch(self, model_params): | |||
"""construct a optimizer from framework over given model parameters""" | |||
"""Construct a optimizer from framework over given model parameters.""" | |||
if self.optim_name in ["SGD", "sgd"]: | |||
if "lr" in self.kwargs: | |||
@@ -70,7 +70,7 @@ class Predictor(object): | |||
def predict(self, network, data): | |||
"""Perform inference using the trained model. | |||
:param network: a PyTorch model (cpu)
:param data: list of list of strings | |||
:return: list of list of strings, [num_examples, tag_seq_length] | |||
""" | |||
@@ -17,20 +17,33 @@ DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||
# the first vocab in dict with the index = 5 | |||
def save_pickle(obj, pickle_path, file_name): | |||
"""Save an object into a pickle file. | |||
:param obj: an object | |||
:param pickle_path: str, the directory where the pickle file is to be saved | |||
:param file_name: str, the name of the pickle file. In general, it should be ended by "pkl". | |||
""" | |||
with open(os.path.join(pickle_path, file_name), "wb") as f: | |||
_pickle.dump(obj, f) | |||
print("{} saved. ".format(file_name)) | |||
print("{} saved in {}".format(file_name, pickle_path)) | |||
def load_pickle(pickle_path, file_name): | |||
"""Load an object from a given pickle file. | |||
:param pickle_path: str, the directory where the pickle file is. | |||
:param file_name: str, the name of the pickle file. | |||
:return obj: an object stored in the pickle | |||
""" | |||
with open(os.path.join(pickle_path, file_name), "rb") as f: | |||
obj = _pickle.load(f) | |||
print("{} loaded. ".format(file_name)) | |||
print("{} loaded from {}".format(file_name, pickle_path)) | |||
return obj | |||
def pickle_exist(pickle_path, pickle_name): | |||
""" | |||
"""Check if a given pickle file exists in the directory. | |||
:param pickle_path: the directory of target pickle file | |||
:param pickle_name: the filename of target pickle file | |||
:return: True if file exists else False | |||
@@ -45,6 +58,19 @@ def pickle_exist(pickle_path, pickle_name): | |||
class BasePreprocess(object): | |||
"""Base class of all preprocessors. | |||
Preprocessors are responsible for converting data of strings into data of indices. | |||
During the pre-processing, the following pickle files will be built: | |||
- "word2id.pkl", a mapping from words(tokens) to indices | |||
- "id2word.pkl", a reversed dictionary | |||
- "label2id.pkl", a dictionary on labels | |||
- "id2label.pkl", a reversed dictionary on labels | |||
These four pickle files are expected to be saved in the given pickle directory once they are constructed. | |||
Preprocessors will check if those files are already in the directory and will reuse them in future calls. | |||
""" | |||
def __init__(self): | |||
self.word2index = None | |||
self.label2index = None | |||
@@ -68,6 +94,7 @@ class BasePreprocess(object): | |||
:param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True. | |||
:return results: a tuple of datasets after preprocessing. | |||
""" | |||
if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): | |||
self.word2index = load_pickle(pickle_path, "word2id.pkl") | |||
self.label2index = load_pickle(pickle_path, "class2id.pkl") | |||
@@ -98,6 +125,8 @@ class BasePreprocess(object): | |||
save_pickle(data_train, pickle_path, "data_train.pkl") | |||
else: | |||
data_train = load_pickle(pickle_path, "data_train.pkl") | |||
if pickle_exist(pickle_path, "data_dev.pkl"): | |||
data_dev = load_pickle(pickle_path, "data_dev.pkl") | |||
else: | |||
# cross_val is True | |||
if not pickle_exist(pickle_path, "data_train_0.pkl"): | |||
@@ -181,25 +210,31 @@ class SeqLabelPreprocess(BasePreprocess): | |||
"""Preprocess pipeline, including building mapping from words to index, from index to words, | |||
from labels/classes to index, from index to labels/classes. | |||
Designed for data of three-level lists, which have multiple labels in each sample.
:: | |||
[ | |||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
... | |||
] | |||
""" | |||
def __init__(self): | |||
super(SeqLabelPreprocess, self).__init__() | |||
def build_dict(self, data): | |||
""" | |||
Add new words with indices into self.word_dict, new labels with indices into self.label_dict. | |||
"""Add new words with indices into self.word_dict, new labels with indices into self.label_dict. | |||
:param data: three-level list | |||
:: | |||
[ | |||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
... | |||
] | |||
:return word2index: dict of {str, int} | |||
label2index: dict of {str, int} | |||
""" | |||
@@ -215,14 +250,17 @@ class SeqLabelPreprocess(BasePreprocess): | |||
return word2index, label2index | |||
def to_index(self, data): | |||
""" | |||
Convert word strings and label strings into indices. | |||
"""Convert word strings and label strings into indices. | |||
:param data: three-level list | |||
:: | |||
[ | |||
[ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
[ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
... | |||
] | |||
:return data_index: the same shape as data, but each string is replaced by its corresponding index | |||
""" | |||
data_index = [] | |||
@@ -241,11 +279,14 @@ class ClassPreprocess(BasePreprocess): | |||
Preprocess pipeline, including building mapping from words to index, from index to words, | |||
from labels/classes to index, from index to labels/classes. | |||
Designed for data of three-level lists, which have a single label in each sample.
:: | |||
[ | |||
[ [word_11, word_12, ...], label_1 ], | |||
[ [word_21, word_22, ...], label_2 ], | |||
... | |||
] | |||
""" | |||
def __init__(self): | |||
@@ -268,18 +309,21 @@ class ClassPreprocess(BasePreprocess): | |||
for word in sent: | |||
if word not in word2index: | |||
word2index[word] = len(word2index)
return word2index, label2index | |||
def to_index(self, data): | |||
""" | |||
Convert word strings and label strings into indices. | |||
"""Convert word strings and label strings into indices. | |||
:param data: three-level list | |||
:: | |||
[ | |||
[ [word_11, word_12, ...], label_1 ], | |||
[ [word_21, word_22, ...], label_2 ], | |||
... | |||
] | |||
:return data_index: the same shape as data, but each string is replaced by its corresponding index | |||
""" | |||
data_index = [] | |||
@@ -294,14 +338,15 @@ class ClassPreprocess(BasePreprocess): | |||
def infer_preprocess(pickle_path, data): | |||
""" | |||
Preprocess over inference data. | |||
Transform three-level list of strings into that of index. | |||
"""Preprocess over inference data. Transform three-level list of strings into that of index. | |||
:: | |||
[ | |||
[word_11, word_12, ...], | |||
[word_21, word_22, ...], | |||
... | |||
] | |||
""" | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
data_index = [] | |||
@@ -38,7 +38,7 @@ class BaseTester(object): | |||
Obviously, "required_args" is the subset of "default_args". | |||
The value in "default_args" to the keys in "required_args" is simply for type check. | |||
""" | |||
# add required arguments here
required_args = {} | |||
for req_key in required_args: | |||
@@ -56,7 +56,7 @@ class BaseTester(object): | |||
logger.error(msg) | |||
raise ValueError(msg) | |||
else: | |||
# BaseTester doesn't care about extra arguments
pass | |||
print(default_args) | |||
@@ -69,8 +69,8 @@ class BaseTester(object): | |||
self.print_every_step = default_args["print_every_step"] | |||
self._model = None | |||
self.eval_history = []  # evaluation results of all batches
self.batch_output = []  # outputs of all batches
def test(self, network, dev_data): | |||
if torch.cuda.is_available() and self.use_cuda: | |||
@@ -83,10 +83,10 @@ class BaseTester(object): | |||
self.eval_history.clear() | |||
self.batch_output.clear() | |||
iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=False))
step = 0 | |||
for batch_x, batch_y in self.make_batch(iterator):
with torch.no_grad(): | |||
prediction = self.data_forward(network, batch_x) | |||
eval_results = self.evaluate(prediction, batch_y) | |||
@@ -98,8 +98,8 @@ class BaseTester(object): | |||
print_output = "[test step {}] {}".format(step, eval_results) | |||
logger.info(print_output) | |||
if self.print_every_step > 0 and step % self.print_every_step == 0:
print(self.make_eval_output(prediction, eval_results))
step += 1 | |||
def mode(self, model, test): | |||
@@ -115,28 +115,48 @@ class BaseTester(object): | |||
raise NotImplementedError | |||
def evaluate(self, predict, truth): | |||
"""Compute evaluation metrics for the model. """ | |||
"""Compute evaluation metrics. | |||
:param predict: Tensor | |||
:param truth: Tensor | |||
:return eval_results: can be anything. It will be stored in self.eval_history | |||
""" | |||
raise NotImplementedError | |||
@property | |||
def metrics(self): | |||
"""Return a list of metrics. """ | |||
"""Compute and return metrics. | |||
Use self.eval_history to compute metrics over the whole dev set. | |||
Please refer to metrics.py for common metric functions. | |||
:return : variable number of outputs | |||
""" | |||
raise NotImplementedError | |||
def show_metrics(self):
"""Customize evaluation outputs in Trainer.
Called by Trainer to print evaluation results on dev set during training.
Use self.metrics to fetch available metrics. | |||
:return print_str: str | |||
""" | |||
raise NotImplementedError | |||
def make_batch(self, iterator):
raise NotImplementedError | |||
def make_eval_output(self, predictions, eval_results): | |||
"""Customize Tester outputs. | |||
:param predictions: Tensor | |||
:param eval_results: Tensor | |||
:return: str, to be printed. | |||
""" | |||
raise NotImplementedError | |||
class SeqLabelTester(BaseTester): | |||
""" | |||
Tester for sequence labeling. | |||
"""Tester for sequence labeling. | |||
""" | |||
def __init__(self, **test_args): | |||
@@ -187,22 +207,22 @@ class SeqLabelTester(BaseTester): | |||
# make sure "results" is in the same device as "truth" | |||
results = results.to(truth) | |||
accuracy = torch.sum(results == truth.view((-1,))).to(torch.float) / results.shape[0] | |||
return [float(loss), float(accuracy)]
def metrics(self): | |||
batch_loss = np.mean([x[0] for x in self.eval_history]) | |||
batch_accuracy = np.mean([x[1] for x in self.eval_history]) | |||
return batch_loss, batch_accuracy | |||
def show_metrics(self):
"""This is called by Trainer to print evaluation on dev set.
:return print_str: str | |||
""" | |||
loss, accuracy = self.metrics() | |||
return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy) | |||
def make_batch(self, iterator):
return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True) | |||
@@ -211,12 +231,12 @@ class ClassificationTester(BaseTester): | |||
def __init__(self, **test_args): | |||
""" | |||
:param test_args: a dict-like object that has a __getitem__ method;
values can be accessed by test_args["key_str"].
""" | |||
super(ClassificationTester, self).__init__(**test_args) | |||
def make_batch(self, iterator, max_len=None):
return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len) | |||
def data_forward(self, network, x): | |||
@@ -1,11 +1,11 @@ | |||
import _pickle | |||
import copy | |||
import os | |||
import time | |||
from datetime import timedelta | |||
import numpy as np | |||
import torch | |||
from tensorboardX import SummaryWriter
from fastNLP.core.action import Action | |||
from fastNLP.core.action import RandomSampler, Batchifier | |||
@@ -16,16 +16,12 @@ from fastNLP.modules import utils | |||
from fastNLP.saver.logger import create_logger | |||
from fastNLP.saver.model_saver import ModelSaver | |||
DEFAULT_QUEUE_SIZE = 300 | |||
logger = create_logger(__name__, "./train_test.log") | |||
class BaseTrainer(object): | |||
"""Operations to train a model, including data loading, SGD, and validation. | |||
"""Operations of training a model, including data loading, gradient descent, and validation. | |||
Subclasses must implement the following abstract methods: | |||
- grad_backward | |||
- get_loss | |||
""" | |||
def __init__(self, **kwargs): | |||
@@ -33,10 +29,10 @@ class BaseTrainer(object): | |||
:param kwargs: dict of (key, value), or dict-like object. key is str. | |||
The base trainer requires the following keys: | |||
- epochs: int, the number of epochs in training
- validate: bool, whether or not to validate on dev set
- batch_size: int
- pickle_path: str, the path to pickle files for pre-processing
""" | |||
super(BaseTrainer, self).__init__() | |||
@@ -47,8 +43,8 @@ class BaseTrainer(object): | |||
Otherwise, an error will be raised.
""" | |||
default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/", | |||
"save_best_dev": True, "model_name": "default_model_name.pkl", | |||
"loss": Loss(None), | |||
"save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1, | |||
"loss": Loss(None), # used to pass type check | |||
"optimizer": Optimizer("Adam", lr=0.001, weight_decay=0) | |||
} | |||
""" | |||
@@ -57,7 +53,7 @@ class BaseTrainer(object): | |||
Obviously, "required_args" is the subset of "default_args". | |||
The value in "default_args" to the keys in "required_args" is simply for type check. | |||
""" | |||
# add required arguments here
required_args = {} | |||
for req_key in required_args: | |||
@@ -86,55 +82,46 @@ class BaseTrainer(object): | |||
self.save_best_dev = default_args["save_best_dev"] | |||
self.use_cuda = default_args["use_cuda"] | |||
self.model_name = default_args["model_name"] | |||
self.print_every_step = default_args["print_every_step"] | |||
self._model = None | |||
self._loss_func = default_args["loss"].get() # return a pytorch loss function or None | |||
self._optimizer = None | |||
self._optimizer_proto = default_args["optimizer"] | |||
self._summary_writer = SummaryWriter(self.pickle_path + 'tensorboard_logs') | |||
self._graph_summaried = False | |||
def train(self, network, train_data, dev_data=None): | |||
"""General Training Steps | |||
"""General Training Procedure | |||
:param network: a model | |||
:param train_data: three-level list, the training set. | |||
:param dev_data: three-level list, the validation data (optional) | |||
The method is framework independent.
It works by calling the following methods:
- prepare_input | |||
- mode | |||
- define_optimizer | |||
- data_forward | |||
- get_loss | |||
- grad_backward | |||
- update | |||
Subclasses must implement these methods with a specific framework. | |||
""" | |||
# transfer model to gpu if available
if torch.cuda.is_available() and self.use_cuda: | |||
self._model = network.cuda() | |||
# self._model is used to access model-specific loss | |||
else: | |||
self._model = network | |||
# define Tester over dev data
if self.validate: | |||
default_valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, | |||
"save_loss": True, "batch_size": self.batch_size, "pickle_path": self.pickle_path, | |||
"use_cuda": self.use_cuda} | |||
"use_cuda": self.use_cuda, "print_every_step": 0} | |||
validator = self._create_validator(default_valid_args) | |||
logger.info("validator defined as {}".format(str(validator))) | |||
# optimizer and loss | |||
self.define_optimizer() | |||
logger.info("optimizer defined as {}".format(str(self._optimizer))) | |||
self.define_loss() | |||
logger.info("loss function defined as {}".format(str(self._loss_func))) | |||
# main training procedure
start = time.time() | |||
logger.info("training epochs started") | |||
for epoch in range(1, self.n_epochs + 1): | |||
logger.info("training epoch {}".format(epoch)) | |||
@@ -144,23 +131,31 @@ class BaseTrainer(object): | |||
data_iterator = iter(Batchifier(RandomSampler(train_data), self.batch_size, drop_last=False)) | |||
logger.info("prepared data iterator") | |||
# one forward and backward pass
self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch)
# validation | |||
if self.validate: | |||
logger.info("validation started") | |||
validator.test(network, dev_data) | |||
if self.save_best_dev and self.best_eval_result(validator): | |||
self.save_model(network, self.model_name) | |||
print("saved better model selected by dev") | |||
logger.info("saved better model selected by dev") | |||
print("Saved better model selected by validation.") | |||
logger.info("Saved better model selected by validation.") | |||
valid_results = validator.show_metrics()
print("[epoch {}] {}".format(epoch, valid_results)) | |||
logger.info("[epoch {}] {}".format(epoch, valid_results)) | |||
def _train_step(self, data_iterator, network, **kwargs): | |||
"""Training process in one epoch.""" | |||
"""Training process in one epoch. | |||
kwargs should contain: | |||
- n_print: int, print training information every n steps. | |||
- start: time.time(), the starting time of this step. | |||
- epoch: int, | |||
""" | |||
step = 0 | |||
for batch_x, batch_y in self.make_batch(data_iterator): | |||
@@ -169,8 +164,13 @@ class BaseTrainer(object): | |||
loss = self.get_loss(prediction, batch_y) | |||
self.grad_backward(loss) | |||
self.update() | |||
self._summary_writer.add_scalar("loss", loss.item(), global_step=step) | |||
if step % kwargs["n_print"] == 0: | |||
if not self._graph_summaried: | |||
self._summary_writer.add_graph(network, batch_x) | |||
self._graph_summaried = True | |||
if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0: | |||
end = time.time() | |||
diff = timedelta(seconds=round(end - kwargs["start"])) | |||
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.2} time: {}".format( | |||
@@ -204,21 +204,6 @@ class BaseTrainer(object): | |||
network_copy = copy.deepcopy(network) | |||
self.train(network_copy, train_data_cv[i], dev_data_cv[i]) | |||
def load_train_data(self, pickle_path): | |||
""" | |||
For task-specific processing. | |||
:param pickle_path: | |||
:return data_train | |||
""" | |||
file_path = os.path.join(pickle_path, "data_train.pkl") | |||
if os.path.exists(file_path): | |||
with open(file_path, 'rb') as f: | |||
data = _pickle.load(f) | |||
else: | |||
logger.error("cannot find training data {}. invalid input path for training data.".format(file_path)) | |||
raise RuntimeError("cannot find training data {}".format(file_path)) | |||
return data | |||
def make_batch(self, iterator): | |||
raise NotImplementedError | |||
@@ -226,14 +211,13 @@ class BaseTrainer(object): | |||
Action.mode(network, test) | |||
def define_optimizer(self): | |||
""" | |||
Define framework-specific optimizer specified by the models. | |||
"""Define framework-specific optimizer specified by the models. | |||
""" | |||
self._optimizer = self._optimizer_proto.construct_from_pytorch(self._model.parameters()) | |||
def update(self): | |||
""" | |||
Perform weight update on a model. | |||
"""Perform weight update on a model. | |||
For PyTorch, just call optimizer to update. | |||
""" | |||
@@ -243,8 +227,8 @@ class BaseTrainer(object): | |||
raise NotImplementedError | |||
def grad_backward(self, loss): | |||
""" | |||
Compute gradient with link rules. | |||
"""Compute gradient with link rules. | |||
:param loss: a scalar where back-prop starts | |||
For PyTorch, just do "loss.backward()" | |||
@@ -253,8 +237,8 @@ class BaseTrainer(object): | |||
loss.backward() | |||
def get_loss(self, predict, truth): | |||
""" | |||
Compute loss given prediction and ground truth. | |||
"""Compute loss given prediction and ground truth. | |||
:param predict: prediction label vector | |||
:param truth: ground truth label vector | |||
:return: a scalar | |||
@@ -262,8 +246,9 @@ class BaseTrainer(object): | |||
return self._loss_func(predict, truth) | |||
def define_loss(self): | |||
""" | |||
if the model defines a loss, use model's loss. | |||
"""Define a loss for the trainer. | |||
If the model defines a loss, use the model's loss. | |||
Otherwise, the Trainer must have a loss argument; use it as the loss. | |||
These two losses cannot be defined at the same time. | |||
Trainer does not handle loss definition or choose default losses. | |||
@@ -280,53 +265,30 @@ class BaseTrainer(object): | |||
logger.info("The model didn't define loss, use Trainer's loss.") | |||
def best_eval_result(self, validator): | |||
""" | |||
"""Check if the current epoch yields better validation results. | |||
:param validator: a Tester instance | |||
:return: bool, True means the current results on the dev set are the best so far. | |||
""" | |||
raise NotImplementedError | |||
def save_model(self, network, model_name): | |||
""" | |||
"""Save this model with such a name. | |||
This method may be called multiple times by Trainer to overwritten a better model. | |||
:param network: the PyTorch model | |||
:param model_name: str | |||
model_best_dev.pkl may be overwritten by a better model in future epochs. | |||
""" | |||
if model_name[-4:] != ".pkl": | |||
model_name += ".pkl" | |||
ModelSaver(self.pickle_path + model_name).save_pytorch(network) | |||
ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network) | |||
def _create_validator(self, valid_args): | |||
raise NotImplementedError | |||
class ToyTrainer(BaseTrainer): | |||
""" | |||
An example to show the definition of Trainer. | |||
""" | |||
def __init__(self, training_args): | |||
super(ToyTrainer, self).__init__(training_args) | |||
def load_train_data(self, data_path): | |||
data_train = _pickle.load(open(data_path + "/data_train.pkl", "rb")) | |||
data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb")) | |||
return data_train, data_dev, 0, 1 | |||
def data_forward(self, network, x): | |||
return network(x) | |||
def grad_backward(self, loss): | |||
self._model.zero_grad() | |||
loss.backward() | |||
def get_loss(self, pred, truth): | |||
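# a toy mean-squared-error loss; assumes pred and truth are numpy arrays | |||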
return np.mean(np.square(pred - truth)) | |||
class SeqLabelTrainer(BaseTrainer): | |||
""" | |||
Trainer for Sequence Modeling | |||
"""Trainer for Sequence Labeling | |||
""" | |||
@@ -356,11 +318,11 @@ class SeqLabelTrainer(BaseTrainer): | |||
return y | |||
def get_loss(self, predict, truth): | |||
""" | |||
Compute loss given prediction and ground truth. | |||
"""Compute loss given prediction and ground truth. | |||
:param predict: prediction label vector, [batch_size, max_len, tag_size] | |||
:param truth: ground truth label vector, [batch_size, max_len] | |||
:return: a scalar | |||
:return loss: a scalar | |||
""" | |||
batch_size, max_len = predict.size(0), predict.size(1) | |||
assert truth.shape == (batch_size, max_len) | |||
@@ -384,7 +346,7 @@ class SeqLabelTrainer(BaseTrainer): | |||
class ClassificationTrainer(BaseTrainer): | |||
"""Trainer for classification.""" | |||
"""Trainer for text classification.""" | |||
def __init__(self, **train_args): | |||
super(ClassificationTrainer, self).__init__(**train_args) | |||
@@ -1,4 +1,7 @@ | |||
import os | |||
from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer | |||
from fastNLP.core.preprocess import load_pickle | |||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
from fastNLP.loader.model_loader import ModelLoader | |||
@@ -7,14 +10,13 @@ mapping from model name to [URL, file_name.class_name, model_pickle_name] | |||
Notice that the class of the model should be in the "models" directory. | |||
Example: | |||
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"] | |||
""" | |||
FastNLP_MODEL_COLLECTION = { | |||
"seq_label_model": { | |||
"url": "www.fudan.edu.cn", | |||
"class": "sequence_modeling.SeqLabeling", | |||
"class": "sequence_modeling.SeqLabeling", # file_name.class_name in models/ | |||
"pickle": "seq_label_model.pkl", | |||
"type": "seq_label" | |||
"type": "seq_label", | |||
"config_file_name": "config", # the name of the config file which stores model initialization parameters | |||
"config_section_name": "text_class_model" # the name of the section in the config file which stores model init params | |||
}, | |||
"text_class_model": { | |||
"url": "www.fudan.edu.cn", | |||
@@ -22,11 +24,34 @@ FastNLP_MODEL_COLLECTION = { | |||
"pickle": "text_class_model.pkl", | |||
"type": "text_class" | |||
} | |||
""" | |||
FastNLP_MODEL_COLLECTION = { | |||
"cws_basic_model": { | |||
"url": "", | |||
"class": "sequence_modeling.AdvSeqLabel", | |||
"pickle": "cws_basic_model_v_0.pkl", | |||
"type": "seq_label", | |||
"config_file_name": "config", | |||
"config_section_name": "text_class_model" | |||
}, | |||
"pos_tag_model": { | |||
"url": "", | |||
"class": "sequence_modeling.AdvSeqLabel", | |||
"pickle": "pos_tag_model_v_0.pkl", | |||
"type": "seq_label", | |||
"config_file_name": "pos_tag.config", | |||
"config_section_name": "pos_tag_model" | |||
}, | |||
"text_classify_model": { | |||
"url": "", | |||
"class": "cnn_text_classification.CNNText", | |||
"pickle": "text_class_model_v0.pkl", | |||
"type": "text_class", | |||
"config_file_name": "text_classify.cfg", | |||
"config_section_name": "model" | |||
} | |||
} | |||
CONFIG_FILE_NAME = "config" | |||
SECTION_NAME = "text_class_model" | |||
class FastNLP(object): | |||
""" | |||
@@ -51,10 +76,13 @@ class FastNLP(object): | |||
self.model = None | |||
self.infer_type = None # "seq_label"/"text_class" | |||
def load(self, model_name): | |||
def load(self, model_name, config_file="config", section_name="model"): | |||
""" | |||
Load a pre-trained FastNLP model together with additional data. | |||
:param model_name: str, the name of a FastNLP model. | |||
:param config_file: str, the name of the config file which stores the initialization information of the model. | |||
(default: "config") | |||
:param section_name: str, the name of the corresponding section in the config file. (default: "model") | |||
""" | |||
assert type(model_name) is str | |||
if model_name not in FastNLP_MODEL_COLLECTION: | |||
@@ -64,37 +92,47 @@ class FastNLP(object): | |||
self._download(model_name, FastNLP_MODEL_COLLECTION[model_name]["url"]) | |||
model_class = self._get_model_class(FastNLP_MODEL_COLLECTION[model_name]["class"]) | |||
print("Restore model class {}".format(str(model_class))) | |||
model_args = ConfigSection() | |||
ConfigLoader.load_config(self.model_dir + CONFIG_FILE_NAME, {SECTION_NAME: model_args}) | |||
ConfigLoader.load_config(os.path.join(self.model_dir, config_file), {section_name: model_args}) | |||
print("Restore model hyper-parameters {}".format(str(model_args.data))) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(self.model_dir, "word2id.pkl") | |||
model_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(self.model_dir, "id2class.pkl") | |||
model_args["num_classes"] = len(index2label) | |||
# Construct the model | |||
model = model_class(model_args) | |||
print("Model constructed.") | |||
# To do: framework independent | |||
ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name]["pickle"]) | |||
ModelLoader.load_pytorch(model, os.path.join(self.model_dir, FastNLP_MODEL_COLLECTION[model_name]["pickle"])) | |||
print("Model weights loaded.") | |||
self.model = model | |||
self.infer_type = FastNLP_MODEL_COLLECTION[model_name]["type"] | |||
print("Model loaded. ") | |||
print("Inference ready.") | |||
def run(self, raw_input): | |||
""" | |||
Perform inference over given input using the loaded model. | |||
:param raw_input: str, raw text | |||
:param raw_input: list of string. Each element is an input query. | |||
:return results: list, each entry is the prediction for the corresponding query. | |||
""" | |||
infer = self._create_inference(self.model_dir) | |||
# string ---> 2-D list of string | |||
infer_input = self.string_to_list(raw_input) | |||
# tokenize: list of string ---> 2-D list of string | |||
infer_input = self.tokenize(raw_input, language="zh") | |||
# 2-D list of string ---> list of strings | |||
# 2-D list of string ---> 2-D list of tags | |||
results = infer.predict(self.model, infer_input) | |||
# list of strings ---> final answers | |||
# 2-D list of tags ---> list of final answers | |||
outputs = self._make_output(results, infer_input) | |||
return outputs | |||
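For orientation, a minimal end-to-end sketch of this interface; the model name, directory, and config file below are assumptions that mirror the test script at the end of this diff:

```python
from fastNLP.fastnlp import FastNLP

# assumed: the pickle and config files for "pos_tag_model" exist under model_dir
nlp = FastNLP(model_dir="/path/to/pos_tag/pickles/")
nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
results = nlp.run(["大王叫我来巡山。"])
print(results)  # for a seq_label model, e.g. [[('大', 'B-x'), ('王', 'E-x'), ...]]
```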
@@ -142,81 +180,135 @@ class FastNLP(object): | |||
""" | |||
return True | |||
def string_to_list(self, text, delimiter="\n"): | |||
""" | |||
This function is used to transform raw input to lists, which is done by DatasetLoader in training. | |||
Split text string into three-level lists. | |||
[ | |||
[word_11, word_12, ...], | |||
[word_21, word_22, ...], | |||
... | |||
] | |||
:param text: string | |||
:param delimiter: str, character used to split text into sentences. | |||
:return data: two-level lists | |||
def tokenize(self, text, language): | |||
"""Extract tokens from strings. | |||
For English, extract words separated by space. | |||
For Chinese, extract characters. | |||
TODO: more complex tokenization methods | |||
:param text: list of string | |||
:param language: str, one of ('zh', 'en'), Chinese or English. | |||
:return data: list of list of string, each string is a token. | |||
""" | |||
assert language in ("zh", "en") | |||
data = [] | |||
sents = text.strip().split(delimiter) | |||
for sent in sents: | |||
characters = [] | |||
for ch in sent: | |||
characters.append(ch) | |||
data.append(characters) | |||
for sent in text: | |||
if language == "en": | |||
tokens = sent.strip().split() | |||
elif language == "zh": | |||
tokens = [char for char in sent] | |||
else: | |||
raise RuntimeError("Unknown language {}".format(language)) | |||
data.append(tokens) | |||
return data | |||
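A quick sketch of the tokenizer's behavior for both supported languages (the model_dir is a placeholder; tokenize itself touches no model files):

```python
nlp = FastNLP(model_dir="./save/")
print(nlp.tokenize(["Hello world"], language="en"))    # [['Hello', 'world']]
print(nlp.tokenize(["大王叫我来巡山。"], language="zh"))  # [['大', '王', '叫', '我', '来', '巡', '山', '。']]
```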
def _make_output(self, results, infer_input): | |||
"""Transform the infer output into user-friendly output. | |||
:param results: 1 or 2-D list of strings. | |||
If self.infer_type == "seq_label", it is of shape [num_examples, tag_seq_length] | |||
If self.infer_type == "text_class", it is of shape [num_examples] | |||
:param infer_input: 2-D list of string, the input query before inference. | |||
:return outputs: list. Each entry is a prediction. | |||
""" | |||
if self.infer_type == "seq_label": | |||
outputs = make_seq_label_output(results, infer_input) | |||
elif self.infer_type == "text_class": | |||
outputs = make_class_output(results, infer_input) | |||
else: | |||
raise ValueError("fail to make outputs with infer type {}".format(self.infer_type)) | |||
raise RuntimeError("fail to make outputs with infer type {}".format(self.infer_type)) | |||
return outputs | |||
def make_seq_label_output(result, infer_input): | |||
""" | |||
Transform model output into user-friendly contents. | |||
:param result: 1-D list of strings. (model output) | |||
"""Transform model output into user-friendly contents. | |||
:param result: 2-D list of strings. (model output) | |||
:param infer_input: 2-D list of string (model input) | |||
:return outputs: | |||
:return ret: list of list of tuples | |||
[ | |||
[(word_11, label_11), (word_12, label_12), ...], | |||
[(word_21, label_21), (word_22, label_22), ...], | |||
... | |||
] | |||
""" | |||
return result | |||
ret = [] | |||
for example_x, example_y in zip(infer_input, result): | |||
ret.append([(x, y) for x, y in zip(example_x, example_y)]) | |||
return ret | |||
def make_class_output(result, infer_input): | |||
"""Transform model output into user-friendly contents. | |||
:param result: 1-D list of strings. (model output) | |||
:param infer_input: 2-D list of string (model input) | |||
:return ret: the same as result, [label_1, label_2, ...] | |||
""" | |||
return result | |||
def interpret_word_seg_results(infer_input, results): | |||
""" | |||
Transform model output into user-friendly contents. | |||
def interpret_word_seg_results(char_seq, label_seq): | |||
"""Transform model output into user-friendly contents. | |||
Example: In CWS, convert <BMES> labeling into segmented text. | |||
:param results: list of strings. (model output) | |||
:param infer_input: 2-D list of string (model input) | |||
:return output: list of strings | |||
:param char_seq: list of string, | |||
:param label_seq: list of string, the same length as char_seq | |||
Each entry is one of ('B', 'M', 'E', 'S'). | |||
:return output: list of words | |||
""" | |||
outputs = [] | |||
for sent_char, sent_label in zip(infer_input, results): | |||
words = [] | |||
word = "" | |||
for char, label in zip(sent_char, sent_label): | |||
if label[0] == "B": | |||
if word != "": | |||
words.append(word) | |||
word = char | |||
elif label[0] == "M": | |||
word += char | |||
elif label[0] == "E": | |||
word += char | |||
words = [] | |||
word = "" | |||
for char, label in zip(char_seq, label_seq): | |||
if label[0] == "B": | |||
if word != "": | |||
words.append(word) | |||
word = "" | |||
elif label[0] == "S": | |||
if word != "": | |||
words.append(word) | |||
word = "" | |||
words.append(char) | |||
else: | |||
raise ValueError("invalid label") | |||
outputs.append(" ".join(words)) | |||
word = char | |||
elif label[0] == "M": | |||
word += char | |||
elif label[0] == "E": | |||
word += char | |||
words.append(word) | |||
word = "" | |||
elif label[0] == "S": | |||
if word != "": | |||
words.append(word) | |||
word = "" | |||
words.append(char) | |||
else: | |||
raise ValueError("invalid label {}".format(label[0])) | |||
return words | |||
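A minimal worked example of the BMES decoding above, consistent with the interpretation test near the end of this diff:

```python
chars  = ['这', '是', '中', '文', '分', '词']
labels = ['S', 'S', 'B', 'E', 'B', 'E']
print(interpret_word_seg_results(chars, labels))  # ['这', '是', '中文', '分词']
```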
def interpret_cws_pos_results(char_seq, label_seq): | |||
"""Transform model output into user-friendly contents. | |||
:param char_seq: list of string | |||
:param label_seq: list of string, the same length as char_seq. | |||
:return outputs: list of tuple (words, pos_tag): | |||
""" | |||
def pos_tag_check(seq): | |||
"""check whether all entries are the same """ | |||
return len(set(seq)) <= 1 | |||
word = [] | |||
word_pos = [] | |||
outputs = [] | |||
for char, label in zip(char_seq, label_seq): | |||
tmp = label.split("-") | |||
cws_label, pos_tag = tmp[0], tmp[1] | |||
if cws_label == "B" or cws_label == "M": | |||
word.append(char) | |||
word_pos.append(pos_tag) | |||
elif cws_label == "E": | |||
word.append(char) | |||
word_pos.append(pos_tag) | |||
if not pos_tag_check(word_pos): | |||
raise RuntimeError("character-wise pos tags inconsistent. ") | |||
outputs.append(("".join(word), word_pos[0])) | |||
word.clear() | |||
word_pos.clear() | |||
elif cws_label == "S": | |||
outputs.append((char, pos_tag)) | |||
return outputs |
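And the joint CWS+POS variant, where each label is `<cws_label>-<pos_tag>` as in the test file below:

```python
chars  = ['深', '度', '学', '习']
labels = ['B-d', 'E-d', 'B-v', 'E-v']
print(interpret_cws_pos_results(chars, labels))  # [('深度', 'd'), ('学习', 'v')]
```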
@@ -1,9 +1,8 @@ | |||
class BaseLoader(object): | |||
"""docstring for BaseLoader""" | |||
def __init__(self, data_name, data_path): | |||
def __init__(self, data_path): | |||
super(BaseLoader, self).__init__() | |||
self.data_name = data_name | |||
self.data_path = data_path | |||
def load(self): | |||
@@ -25,8 +24,8 @@ class ToyLoader0(BaseLoader): | |||
For charLM | |||
""" | |||
def __init__(self, name, path): | |||
super(ToyLoader0, self).__init__(name, path) | |||
def __init__(self, data_path): | |||
super(ToyLoader0, self).__init__(data_path) | |||
def load(self): | |||
with open(self.data_path, 'r') as f: | |||
@@ -6,8 +6,8 @@ from fastNLP.loader.base_loader import BaseLoader | |||
class DatasetLoader(BaseLoader): | |||
""""loader for data sets""" | |||
def __init__(self, data_name, data_path): | |||
super(DatasetLoader, self).__init__(data_name, data_path) | |||
def __init__(self, data_path): | |||
super(DatasetLoader, self).__init__(data_path) | |||
class POSDatasetLoader(DatasetLoader): | |||
@@ -31,8 +31,8 @@ class POSDatasetLoader(DatasetLoader): | |||
to label5. | |||
""" | |||
def __init__(self, data_name, data_path): | |||
super(POSDatasetLoader, self).__init__(data_name, data_path) | |||
def __init__(self, data_path): | |||
super(POSDatasetLoader, self).__init__(data_path) | |||
def load(self): | |||
assert os.path.exists(self.data_path) | |||
@@ -84,8 +84,8 @@ class TokenizeDatasetLoader(DatasetLoader): | |||
Data set loader for tokenization data sets | |||
""" | |||
def __init__(self, data_name, data_path): | |||
super(TokenizeDatasetLoader, self).__init__(data_name, data_path) | |||
def __init__(self, data_path): | |||
super(TokenizeDatasetLoader, self).__init__(data_path) | |||
def load_pku(self, max_seq_len=32): | |||
""" | |||
@@ -138,8 +138,8 @@ class TokenizeDatasetLoader(DatasetLoader): | |||
class ClassDatasetLoader(DatasetLoader): | |||
"""Loader for classification data sets""" | |||
def __init__(self, data_name, data_path): | |||
super(ClassDatasetLoader, self).__init__(data_name, data_path) | |||
def __init__(self, data_path): | |||
super(ClassDatasetLoader, self).__init__(data_path) | |||
def load(self): | |||
assert os.path.exists(self.data_path) | |||
@@ -177,7 +177,7 @@ class ConllLoader(DatasetLoader): | |||
:param str data_name: the name of the conll data set | |||
:param str data_path: the path to the conll data set | |||
""" | |||
super(ConllLoader, self).__init__(data_name, data_path) | |||
super(ConllLoader, self).__init__(data_path) | |||
self.data_set = self.parse(self.load()) | |||
def load(self): | |||
@@ -209,8 +209,8 @@ class ConllLoader(DatasetLoader): | |||
class LMDatasetLoader(DatasetLoader): | |||
def __init__(self, data_name, data_path): | |||
super(LMDatasetLoader, self).__init__(data_name, data_path) | |||
def __init__(self, data_path): | |||
super(LMDatasetLoader, self).__init__(data_path) | |||
def load(self): | |||
if not os.path.exists(self.data_path): | |||
@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader): | |||
return text.strip().split() | |||
if __name__ == "__main__": | |||
class PeopleDailyCorpusLoader(DatasetLoader): | |||
""" | |||
data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines() | |||
for example in data: | |||
for w, l in zip(example[0], example[1]): | |||
print(w, l) | |||
People's Daily Corpus: Chinese word segmentation, POS tagging, and NER annotations. | |||
""" | |||
ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku() | |||
print(ans) | |||
def __init__(self, data_path): | |||
super(PeopleDailyCorpusLoader, self).__init__(data_path) | |||
def load(self): | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
sents = f.readlines() | |||
pos_tag_examples = [] | |||
ner_examples = [] | |||
for sent in sents: | |||
inside_ne = False | |||
sent_pos_tag = [] | |||
sent_words = [] | |||
sent_ner = [] | |||
words = sent.strip().split()[1:] | |||
for word in words: | |||
if "[" in word and "]" in word: | |||
ner_tag = "U" | |||
print(word) | |||
elif "[" in word: | |||
inside_ne = True | |||
ner_tag = "B" | |||
word = word[1:] | |||
elif "]" in word: | |||
ner_tag = "L" | |||
word = word[:word.index("]")] | |||
if inside_ne is True: | |||
inside_ne = False | |||
else: | |||
raise RuntimeError("only ] appears!") | |||
else: | |||
if inside_ne is True: | |||
ner_tag = "I" | |||
else: | |||
ner_tag = "O" | |||
tmp = word.split("/") | |||
token, pos = tmp[0], tmp[1] | |||
sent_ner.append(ner_tag) | |||
sent_pos_tag.append(pos) | |||
sent_words.append(token) | |||
pos_tag_examples.append([sent_words, sent_pos_tag]) | |||
ner_examples.append([sent_words, sent_ner]) | |||
return pos_tag_examples, ner_examples | |||
if __name__ == "__main__": | |||
loader = PeopleDailyCorpusLoader("./") | |||
pos, ner = loader.load() | |||
print(pos[:10]) | |||
print(ner[:10]) |
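A sketch of the input format this loader expects, inferred from the parsing logic above (the sample line is illustrative, not real corpus data):

```python
# each sentence: an ID token, then word/pos tokens; [ ... ]xx brackets a named entity
sample = "19980101-01-001-001/m 迈向/v [新/a 世纪/n]nz 的/u 中国/ns\n"
with open("people_daily_sample.txt", "w", encoding="utf-8") as f:
    f.write(sample)

loader = PeopleDailyCorpusLoader("people_daily_sample.txt")
pos, ner = loader.load()
print(pos[0])  # [['迈向', '新', '世纪', '的', '中国'], ['v', 'a', 'n', 'u', 'ns']]
print(ner[0])  # [['迈向', '新', '世纪', '的', '中国'], ['O', 'B', 'L', 'O', 'O']]
```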
@@ -1,8 +1,50 @@ | |||
import _pickle | |||
import os | |||
import numpy as np | |||
from fastNLP.loader.base_loader import BaseLoader | |||
class EmbedLoader(BaseLoader): | |||
"""docstring for EmbedLoader""" | |||
def __init__(self, data_name, data_path): | |||
super(EmbedLoader, self).__init__(data_name, data_path) | |||
def __init__(self, data_path): | |||
super(EmbedLoader, self).__init__(data_path) | |||
@staticmethod | |||
def load_embedding(emb_dim, emb_file, word_dict, emb_pkl): | |||
"""Load the pre-trained embedding and combine with the given dictionary. | |||
:param emb_file: str, the pre-trained embedding. | |||
The embedding file should have the following format: | |||
Each line is a word embedding, where a word string is followed by multiple floats. | |||
Floats are separated by space. The word and the first float are separated by space. | |||
:param word_dict: dict, a mapping from word to index. | |||
:param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding. | |||
:param emb_pkl: str, the embedding pickle file. | |||
:return embedding_np: numpy array of shape (len(word_dict), emb_dim) | |||
TODO: fragile code | |||
""" | |||
# If the embedding pickle exists, load it and return. | |||
if os.path.exists(emb_pkl): | |||
with open(emb_pkl, "rb") as f: | |||
embedding_np = _pickle.load(f) | |||
return embedding_np | |||
# Otherwise, load the pre-trained embedding. | |||
with open(emb_file, "r", encoding="utf-8") as f: | |||
# begin with a random embedding | |||
embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim)) | |||
for line in f: | |||
line = line.strip().split() | |||
if len(line) != emb_dim + 1: | |||
# skip this line if the embedding dimension does not match | |||
continue | |||
if line[0] in word_dict: | |||
# find the word and replace its embedding with a pre-trained one | |||
embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]] | |||
# save and return the result | |||
with open(emb_pkl, "wb") as f: | |||
_pickle.dump(embedding_np, f) | |||
return embedding_np |
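A minimal usage sketch under the file format described in the docstring (the file names are placeholders):

```python
# glove_subset.txt contains lines such as "the 0.1 -0.2 0.3" for emb_dim=3
word_dict = {"the": 0, "cat": 1}
emb = EmbedLoader.load_embedding(emb_dim=3, emb_file="glove_subset.txt",
                                 word_dict=word_dict, emb_pkl="emb.pkl")
print(emb.shape)  # (2, 3): one row per word in word_dict
```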
@@ -8,8 +8,8 @@ class ModelLoader(BaseLoader): | |||
Loader for models. | |||
""" | |||
def __init__(self, data_name, data_path): | |||
super(ModelLoader, self).__init__(data_name, data_path) | |||
def __init__(self, data_path): | |||
super(ModelLoader, self).__init__(data_path) | |||
@staticmethod | |||
def load_pytorch(empty_model, model_path): | |||
@@ -5,7 +5,7 @@ import torch | |||
import torch.nn as nn | |||
# import torch.nn.functional as F | |||
from fastNLP.modules.encoder.conv_maxpool import ConvMaxpool | |||
import fastNLP.modules.encoder as encoder | |||
class CNNText(torch.nn.Module): | |||
@@ -18,22 +18,22 @@ class CNNText(torch.nn.Module): | |||
def __init__(self, args): | |||
super(CNNText, self).__init__() | |||
class_num = args["num_classes"] | |||
num_classes = args["num_classes"] | |||
kernel_nums = [100, 100, 100] | |||
kernel_sizes = [3, 4, 5] | |||
embed_num = args["vocab_size"] | |||
vocab_size = args["vocab_size"] | |||
embed_dim = 300 | |||
pretrained_embed = None | |||
drop_prob = 0.5 | |||
# no support for pre-trained embedding currently | |||
self.embed = nn.Embedding(embed_num, embed_dim, padding_idx=0) | |||
self.conv_pool = ConvMaxpool( | |||
self.embed = encoder.embedding.Embedding(vocab_size, embed_dim) | |||
self.conv_pool = encoder.conv_maxpool.ConvMaxpool( | |||
in_channels=embed_dim, | |||
out_channels=kernel_nums, | |||
kernel_sizes=kernel_sizes) | |||
self.dropout = nn.Dropout(drop_prob) | |||
self.fc = nn.Linear(sum(kernel_nums), class_num) | |||
self.fc = encoder.linear.Linear(sum(kernel_nums), num_classes) | |||
def forward(self, x): | |||
x = self.embed(x) # [N,L] -> [N,L,C] | |||
@@ -1,3 +1,4 @@ | |||
from .CRF import ConditionalRandomField | |||
from .MLP import MLP | |||
__all__ = ["ConditionalRandomField"] | |||
__all__ = ["ConditionalRandomField", "MLP"] |
@@ -2,8 +2,10 @@ from .embedding import Embedding | |||
from .linear import Linear | |||
from .lstm import Lstm | |||
from .conv import Conv | |||
from .conv_maxpool import ConvMaxpool | |||
__all__ = ["Lstm", | |||
"Embedding", | |||
"Linear", | |||
"Conv"] | |||
"Conv", | |||
"ConvMaxpool"] |
@@ -4,6 +4,7 @@ | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from torch.nn.init import xavier_uniform_ | |||
class ConvMaxpool(nn.Module): | |||
@@ -21,6 +22,7 @@ class ConvMaxpool(nn.Module): | |||
if isinstance(kernel_sizes, int): | |||
out_channels = [out_channels] | |||
kernel_sizes = [kernel_sizes] | |||
self.convs = nn.ModuleList([nn.Conv1d( | |||
in_channels=in_channels, | |||
out_channels=oc, | |||
@@ -31,6 +33,9 @@ class ConvMaxpool(nn.Module): | |||
groups=groups, | |||
bias=bias) | |||
for oc, ks in zip(out_channels, kernel_sizes)]) | |||
for conv in self.convs: | |||
xavier_uniform_(conv.weight) # weight initialization | |||
else: | |||
raise Exception( | |||
'Incorrect kernel sizes: should be list, tuple or int') | |||
@@ -15,7 +15,7 @@ class Embedding(nn.Module): | |||
def __init__(self, nums, dims, padding_idx=0, sparse=False, init_emb=None, dropout=0.0): | |||
super(Embedding, self).__init__() | |||
self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse) | |||
if init_emb: | |||
if init_emb is not None: | |||
self.embed.weight = nn.Parameter(init_emb) | |||
self.dropout = nn.Dropout(dropout) | |||
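The switch from `if init_emb:` to `if init_emb is not None:` is more than style: truth-testing a tensor with more than one element raises an error in PyTorch. A minimal illustration:

```python
import torch

t = torch.zeros(2, 3)
print(t is not None)  # True; an identity check is safe for any tensor
# bool(t) would raise: "bool value of Tensor with more than one element is ambiguous"
```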
@@ -1,114 +0,0 @@ | |||
import sys | |||
sys.path.append("..") | |||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
from fastNLP.core.trainer import SeqLabelTrainer | |||
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader | |||
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
from fastNLP.saver.model_saver import ModelSaver | |||
from fastNLP.loader.model_loader import ModelLoader | |||
from fastNLP.core.tester import SeqLabelTester | |||
from fastNLP.models.sequence_modeling import SeqLabeling | |||
from fastNLP.core.predictor import Predictor | |||
data_name = "pku_training.utf8" | |||
cws_data_path = "/home/zyfeng/data/pku_training.utf8" | |||
pickle_path = "./save/" | |||
data_infer_path = "/home/zyfeng/data/pku_test.utf8" | |||
def infer(): | |||
# Load infer configuration, the same as test | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# Define the same model | |||
model = SeqLabeling(test_args) | |||
# Dump trained parameters into the model | |||
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
print("model loaded!") | |||
# Data Loader | |||
raw_data_loader = BaseLoader(data_name, data_infer_path) | |||
infer_data = raw_data_loader.load_lines() | |||
# Inference interface | |||
infer = Predictor(pickle_path) | |||
results = infer.predict(model, infer_data) | |||
print(results) | |||
print("Inference finished!") | |||
def train_test(): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
test_args = ConfigSection() | |||
ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args}) | |||
# Data Loader | |||
loader = TokenizeDatasetLoader(data_name, cws_data_path) | |||
train_data = loader.load_pku() | |||
# Preprocessor | |||
preprocess = SeqLabelPreprocess() | |||
data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) | |||
train_args["vocab_size"] = preprocess.vocab_size | |||
train_args["num_classes"] = preprocess.num_classes | |||
# Trainer | |||
trainer = SeqLabelTrainer(train_args) | |||
# Model | |||
model = SeqLabeling(train_args) | |||
# Start training | |||
trainer.train(model, data_train, data_dev) | |||
print("Training finished!") | |||
# Saver | |||
saver = ModelSaver("./save/saved_model.pkl") | |||
saver.save_pytorch(model) | |||
print("Model saved!") | |||
# testing with validation set | |||
test(data_dev) | |||
def test(test_data): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
# Define the same model | |||
model = SeqLabeling(train_args) | |||
# Dump trained parameters into the model | |||
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
print("model loaded!") | |||
# Load test configuration | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
# Tester | |||
tester = SeqLabelTester(test_args) | |||
# Start testing | |||
tester.test(model, test_data) | |||
# print test results | |||
print(tester.show_matrices()) | |||
print("model tested!") | |||
if __name__ == "__main__": | |||
train_test() |
@@ -31,4 +31,16 @@ pickle_path = "./save/" | |||
use_crf = true | |||
use_cuda = true | |||
rnn_hidden_units = 100 | |||
word_emb_dim = 100 | |||
[model] | |||
save_output = true | |||
validate_in_training = true | |||
save_dev_input = false | |||
save_loss = true | |||
batch_size = 640 | |||
pickle_path = "./save/" | |||
use_crf = true | |||
use_cuda = true | |||
rnn_hidden_units = 100 | |||
word_emb_dim = 100 |
@@ -1,33 +1,33 @@ | |||
import sys, os | |||
import os | |||
import sys | |||
sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) | |||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
from fastNLP.core.trainer import SeqLabelTrainer | |||
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader | |||
from fastNLP.loader.preprocess import POSPreprocess, load_pickle | |||
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
from fastNLP.saver.model_saver import ModelSaver | |||
from fastNLP.loader.model_loader import ModelLoader | |||
from fastNLP.core.tester import SeqLabelTester | |||
from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
from fastNLP.core.inference import SeqLabelInfer | |||
from fastNLP.core.optimizer import SGD | |||
from fastNLP.core.predictor import SeqLabelInfer | |||
# change to the script's directory if launched from elsewhere | |||
if len(os.path.dirname(__file__)) != 0: | |||
os.chdir(os.path.dirname(__file__)) | |||
datadir = 'icwb2-data' | |||
cfgfile = 'cws.cfg' | |||
datadir = "/home/zyfeng/data/" | |||
cfgfile = './cws.cfg' | |||
data_name = "pku_training.utf8" | |||
cws_data_path = os.path.join(datadir, "training/pku_training.utf8") | |||
cws_data_path = os.path.join(datadir, "pku_training.utf8") | |||
pickle_path = "save" | |||
data_infer_path = os.path.join(datadir, "infer.utf8") | |||
def infer(): | |||
# Config Loader | |||
test_args = ConfigSection() | |||
ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) | |||
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
@@ -47,7 +47,7 @@ def infer(): | |||
raise | |||
# Data Loader | |||
raw_data_loader = BaseLoader(data_name, data_infer_path) | |||
raw_data_loader = BaseLoader(data_infer_path) | |||
infer_data = raw_data_loader.load_lines() | |||
print('data loaded') | |||
@@ -63,19 +63,20 @@ def train(): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
test_args = ConfigSection() | |||
ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) | |||
ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) | |||
# Data Loader | |||
loader = TokenizeDatasetLoader(data_name, cws_data_path) | |||
loader = TokenizeDatasetLoader(cws_data_path) | |||
train_data = loader.load_pku() | |||
# Preprocessor | |||
p = POSPreprocess(train_data, pickle_path, train_dev_split=0.3) | |||
train_args["vocab_size"] = p.vocab_size | |||
train_args["num_classes"] = p.num_classes | |||
preprocessor = SeqLabelPreprocess() | |||
data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) | |||
train_args["vocab_size"] = preprocessor.vocab_size | |||
train_args["num_classes"] = preprocessor.num_classes | |||
# Trainer | |||
trainer = SeqLabelTrainer(train_args) | |||
trainer = SeqLabelTrainer(**train_args.data) | |||
# Model | |||
model = AdvSeqLabel(train_args) | |||
@@ -83,10 +84,11 @@ def train(): | |||
ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
print('model parameter loaded!') | |||
except Exception as e: | |||
print("No saved model. Continue.") | |||
pass | |||
# Start training | |||
trainer.train(model) | |||
trainer.train(model, data_train, data_dev) | |||
print("Training finished!") | |||
# Saver | |||
@@ -98,7 +100,7 @@ def train(): | |||
def test(): | |||
# Config Loader | |||
test_args = ConfigSection() | |||
ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) | |||
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
@@ -106,6 +108,9 @@ def test(): | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# load dev data | |||
dev_data = load_pickle(pickle_path, "data_dev.pkl") | |||
# Define the same model | |||
model = AdvSeqLabel(test_args) | |||
@@ -114,13 +119,13 @@ def test(): | |||
print("model loaded!") | |||
# Tester | |||
tester = SeqLabelTester(test_args) | |||
tester = SeqLabelTester(**test_args.data) | |||
# Start testing | |||
tester.test(model) | |||
tester.test(model, dev_data) | |||
# print test results | |||
print(tester.show_matrices()) | |||
print(tester.show_metrics()) | |||
print("model tested!") | |||
@@ -1,29 +1,35 @@ | |||
[train] | |||
epochs = 10 | |||
batch_size = 32 | |||
epochs = 30 | |||
batch_size = 64 | |||
pickle_path = "./save/" | |||
validate = true | |||
save_best_dev = true | |||
model_saved_path = "./save/" | |||
rnn_hidden_units = 100 | |||
rnn_layers = 2 | |||
rnn_bi_direction = true | |||
word_emb_dim = 100 | |||
dropout = 0.5 | |||
use_crf = true | |||
use_cuda = true | |||
print_every_step = 10 | |||
[test] | |||
save_output = true | |||
validate_in_training = true | |||
save_dev_input = false | |||
save_loss = true | |||
batch_size = 64 | |||
batch_size = 640 | |||
pickle_path = "./save/" | |||
use_crf = true | |||
use_cuda = true | |||
[POS_test] | |||
save_output = true | |||
validate_in_training = true | |||
save_dev_input = false | |||
save_loss = true | |||
batch_size = 640 | |||
pickle_path = "./save/" | |||
rnn_hidden_units = 100 | |||
rnn_layers = 1 | |||
rnn_bi_direction = true | |||
word_emb_dim = 100 | |||
dropout = 0.5 | |||
use_crf = true | |||
use_cuda = true | |||
rnn_hidden_units = 100 | |||
word_emb_dim = 100 |
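A sketch of how such config sections are read into ConfigSection objects, matching the scripts elsewhere in this diff:

```python
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

train_args = ConfigSection()
ConfigLoader("config").load_config("./cws.cfg", {"train": train_args})
print(train_args["epochs"], train_args["batch_size"])  # 30 64
```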
@@ -0,0 +1,146 @@ | |||
import os | |||
import sys | |||
sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) | |||
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
from fastNLP.core.trainer import SeqLabelTrainer | |||
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader | |||
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
from fastNLP.saver.model_saver import ModelSaver | |||
from fastNLP.loader.model_loader import ModelLoader | |||
from fastNLP.core.tester import SeqLabelTester | |||
from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
from fastNLP.core.predictor import SeqLabelInfer | |||
# change to the script's directory if launched from elsewhere | |||
if len(os.path.dirname(__file__)) != 0: | |||
os.chdir(os.path.dirname(__file__)) | |||
datadir = "/home/zyfeng/data/" | |||
cfgfile = './pos_tag.cfg' | |||
data_name = "CWS_POS_TAG_NER_people_daily.txt" | |||
pos_tag_data_path = os.path.join(datadir, data_name) | |||
pickle_path = "save" | |||
data_infer_path = os.path.join(datadir, "infer.utf8") | |||
def infer(): | |||
# Config Loader | |||
test_args = ConfigSection() | |||
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# Define the same model | |||
model = AdvSeqLabel(test_args) | |||
try: | |||
ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
print('model loaded!') | |||
except Exception as e: | |||
print('cannot load model!') | |||
raise | |||
# Data Loader | |||
raw_data_loader = BaseLoader(data_infer_path) | |||
infer_data = raw_data_loader.load_lines() | |||
print('data loaded') | |||
# Inference interface | |||
infer = SeqLabelInfer(pickle_path) | |||
results = infer.predict(model, infer_data) | |||
print(results) | |||
print("Inference finished!") | |||
def train(): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
test_args = ConfigSection() | |||
ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args}) | |||
# Data Loader | |||
loader = PeopleDailyCorpusLoader(pos_tag_data_path) | |||
train_data, _ = loader.load() | |||
# Preprocessor | |||
preprocessor = SeqLabelPreprocess() | |||
data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) | |||
train_args["vocab_size"] = preprocessor.vocab_size | |||
train_args["num_classes"] = preprocessor.num_classes | |||
# Trainer | |||
trainer = SeqLabelTrainer(**train_args.data) | |||
# Model | |||
model = AdvSeqLabel(train_args) | |||
try: | |||
ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
print('model parameter loaded!') | |||
except Exception as e: | |||
print("No saved model. Continue.") | |||
pass | |||
# Start training | |||
trainer.train(model, data_train, data_dev) | |||
print("Training finished!") | |||
# Saver | |||
saver = ModelSaver("./save/saved_model.pkl") | |||
saver.save_pytorch(model) | |||
print("Model saved!") | |||
def test(): | |||
# Config Loader | |||
test_args = ConfigSection() | |||
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
test_args["vocab_size"] = len(word2index) | |||
index2label = load_pickle(pickle_path, "id2class.pkl") | |||
test_args["num_classes"] = len(index2label) | |||
# load dev data | |||
dev_data = load_pickle(pickle_path, "data_dev.pkl") | |||
# Define the same model | |||
model = AdvSeqLabel(test_args) | |||
# Dump trained parameters into the model | |||
ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
print("model loaded!") | |||
# Tester | |||
tester = SeqLabelTester(**test_args.data) | |||
# Start testing | |||
tester.test(model, dev_data) | |||
# print test results | |||
print(tester.show_metrics()) | |||
print("model tested!") | |||
if __name__ == "__main__": | |||
import argparse | |||
parser = argparse.ArgumentParser(description='Run a Chinese POS tagging model') | |||
parser.add_argument('--mode', help="set the model's running mode", choices=['train', 'test', 'infer']) | |||
args = parser.parse_args() | |||
if args.mode == 'train': | |||
train() | |||
elif args.mode == 'test': | |||
test() | |||
elif args.mode == 'infer': | |||
infer() | |||
else: | |||
print('no mode specified for model!') | |||
parser.print_help() |
@@ -1,3 +1,4 @@ | |||
numpy>=1.14.2 | |||
torch==0.4.0 | |||
torchvision>=0.1.8 | |||
tensorboardX |
@@ -0,0 +1,24 @@ | |||
#!/usr/bin/env python | |||
# coding=utf-8 | |||
from setuptools import setup, find_packages | |||
with open('README.md') as f: | |||
readme = f.read() | |||
with open('LICENSE') as f: | |||
license = f.read() | |||
with open('requirements.txt') as f: | |||
reqs = f.read() | |||
setup( | |||
name='fastNLP', | |||
version='0.0.1', | |||
description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team', | |||
long_description=readme, | |||
license=license, | |||
author='fudanNLP', | |||
python_requires='>=3.5', | |||
packages=find_packages(), | |||
install_requires=reqs.strip().split('\n'), | |||
) |
@@ -1,9 +1,8 @@ | |||
import os | |||
import unittest | |||
from fastNLP.core.action import Action, Batchifier, SequentialSampler | |||
class TestAction(unittest.TestCase): | |||
def test_case_1(self): | |||
x = [1, 2, 3, 4, 5, 6, 7, 8] | |||
@@ -33,8 +33,10 @@ class TestConfigLoader(unittest.TestCase): | |||
test_arg = ConfigSection() | |||
ConfigLoader("config").load_config(os.path.join("./test/loader", "config"), {"test": test_arg}) | |||
section = read_section_from_config(os.path.join("./test/loader", "config"), "test") | |||
for sec in section: | |||
if (sec not in test_arg) or (section[sec] != test_arg[sec]): | |||
raise AttributeError("ERROR") | |||
@@ -1,138 +0,0 @@ | |||
import _pickle | |||
import os | |||
import numpy as np | |||
import torch | |||
from fastNLP.core.preprocess import SeqLabelPreprocess | |||
from fastNLP.core.tester import SeqLabelTester | |||
from fastNLP.core.trainer import SeqLabelTrainer | |||
from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
class MyNERTrainer(SeqLabelTrainer): | |||
def __init__(self, train_args): | |||
super(MyNERTrainer, self).__init__(train_args) | |||
self.scheduler = None | |||
def define_optimizer(self): | |||
""" | |||
override | |||
:return: | |||
""" | |||
self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001) | |||
self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5) | |||
def update(self): | |||
""" | |||
override | |||
:return: | |||
""" | |||
self.optimizer.step() | |||
self.scheduler.step() | |||
def _create_validator(self, valid_args): | |||
return MyNERTester(valid_args) | |||
def best_eval_result(self, validator): | |||
accuracy = validator.metrics() | |||
if accuracy > self.best_accuracy: | |||
self.best_accuracy = accuracy | |||
return True | |||
else: | |||
return False | |||
class MyNERTester(SeqLabelTester): | |||
def __init__(self, test_args): | |||
super(MyNERTester, self).__init__(test_args) | |||
def _evaluate(self, prediction, batch_y, seq_len): | |||
""" | |||
:param prediction: [batch_size, seq_len, num_classes] | |||
:param batch_y: [batch_size, seq_len] | |||
:param seq_len: [batch_size] | |||
:return: | |||
""" | |||
summ = 0 | |||
correct = 0 | |||
_, indices = torch.max(prediction, 2) | |||
for p, y, l in zip(indices, batch_y, seq_len): | |||
summ += l | |||
correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy()) | |||
return float(correct / summ) | |||
def evaluate(self, predict, truth): | |||
return self._evaluate(predict, truth, self.seq_len) | |||
def metrics(self): | |||
return np.mean(self.eval_history) | |||
def show_matrices(self): | |||
return "dev accuracy={:.2f}".format(float(self.metrics())) | |||
def embedding_process(emb_file, word_dict, emb_dim, emb_pkl): | |||
if os.path.exists(emb_pkl): | |||
with open(emb_pkl, "rb") as f: | |||
embedding_np = _pickle.load(f) | |||
return embedding_np | |||
with open(emb_file, "r", encoding="utf-8") as f: | |||
embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim)) | |||
for line in f: | |||
line = line.strip().split() | |||
if len(line) != emb_dim + 1: | |||
continue | |||
if line[0] in word_dict: | |||
embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]] | |||
with open(emb_pkl, "wb") as f: | |||
_pickle.dump(embedding_np, f) | |||
return embedding_np | |||
def data_load(data_file): | |||
with open(data_file, "r", encoding="utf-8") as f: | |||
all_data = [] | |||
sent = [] | |||
label = [] | |||
for line in f: | |||
line = line.strip().split() | |||
if not len(line) <= 1: | |||
sent.append(line[0]) | |||
label.append(line[1]) | |||
else: | |||
all_data.append([sent, label]) | |||
sent = [] | |||
label = [] | |||
return all_data | |||
data_path = "data_for_tests/people.txt" | |||
pick_path = "data_for_tests/" | |||
emb_path = "data_for_tests/emb50.txt" | |||
save_path = "data_for_tests/" | |||
if __name__ == "__main__": | |||
data = data_load(data_path) | |||
preprocess = SeqLabelPreprocess() | |||
data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3) | |||
# emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl")) | |||
emb = None | |||
args = {"epochs": 20, | |||
"batch_size": 1, | |||
"pickle_path": pick_path, | |||
"validate": True, | |||
"save_best_dev": True, | |||
"model_saved_path": save_path, | |||
"use_cuda": True, | |||
"vocab_size": preprocess.vocab_size, | |||
"num_classes": preprocess.num_classes, | |||
"word_emb_dim": 50, | |||
"rnn_hidden_units": 100 | |||
} | |||
# emb = torch.Tensor(emb).float().cuda() | |||
networks = AdvSeqLabel(args, emb) | |||
trainer = MyNERTrainer(args) | |||
trainer.train(networks, data_train, data_dev) | |||
print("Training finished!") |
@@ -1,129 +0,0 @@ | |||
import _pickle | |||
import os | |||
import torch | |||
from fastNLP.core.predictor import SeqLabelInfer | |||
from fastNLP.core.trainer import SeqLabelTrainer | |||
from fastNLP.loader.model_loader import ModelLoader | |||
from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
class Decode(SeqLabelTrainer): | |||
def __init__(self, args): | |||
super(Decode, self).__init__(args) | |||
def decoder(self, network, sents, model_path): | |||
self.model = network | |||
self.model.load_state_dict(torch.load(model_path)) | |||
out_put = [] | |||
self.mode(network, test=True) | |||
for batch_x in sents: | |||
prediction = self.data_forward(self.model, batch_x) | |||
seq_tag = self.model.prediction(prediction, batch_x[1]) | |||
out_put.append(list(seq_tag)[0]) | |||
return out_put | |||
def process_sent(sents, word2id): | |||
sents_num = [] | |||
for s in sents: | |||
sent_num = [] | |||
for c in s: | |||
if c in word2id: | |||
sent_num.append(word2id[c]) | |||
else: | |||
sent_num.append(word2id["<unk>"]) | |||
sents_num.append(([sent_num], [len(sent_num)])) # batch_size is 1 | |||
return sents_num | |||
def process_tag(sents, tags, id2class): | |||
Tags = [] | |||
for ttt in tags: | |||
Tags.append([id2class[t] for t in ttt]) | |||
Segs = [] | |||
PosNers = [] | |||
for sent, tag in zip(sents, tags): | |||
word__ = [] | |||
lll__ = [] | |||
for c, t in zip(sent, tag): | |||
t = id2class[t] | |||
l = t.split("-") | |||
split_ = l[0] | |||
pn = l[1] | |||
if split_ == "S": | |||
word__.append(c) | |||
lll__.append(pn) | |||
word_1 = "" | |||
elif split_ == "E": | |||
word_1 += c | |||
word__.append(word_1) | |||
lll__.append(pn) | |||
word_1 = "" | |||
elif split_ == "B": | |||
word_1 = "" | |||
word_1 += c | |||
else: | |||
word_1 += c | |||
Segs.append(word__) | |||
PosNers.append(lll__) | |||
return Segs, PosNers | |||
pickle_path = "data_for_tests/" | |||
model_path = "data_for_tests/model_best_dev.pkl" | |||
if __name__ == "__main__": | |||
with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f: | |||
id2word = _pickle.load(f) | |||
with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f: | |||
word2id = _pickle.load(f) | |||
with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f: | |||
id2class = _pickle.load(f) | |||
sent = ["中共中央总书记、国家主席江泽民", | |||
"逆向处理输入序列并返回逆序后的序列"] # here is input | |||
args = {"epochs": 1, | |||
"batch_size": 1, | |||
"pickle_path": "data_for_tests/", | |||
"validate": True, | |||
"save_best_dev": True, | |||
"model_saved_path": "data_for_tests/", | |||
"use_cuda": False, | |||
"vocab_size": len(word2id), | |||
"num_classes": len(id2class), | |||
"word_emb_dim": 50, | |||
"rnn_hidden_units": 100, | |||
} | |||
""" | |||
network = AdvSeqLabel(args, None) | |||
decoder_ = Decode(args) | |||
tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path) | |||
output_seg, output_pn = process_tag(sent, tags_num, id2class) # here is output | |||
print(output_seg) | |||
print(output_pn) | |||
""" | |||
# Define the same model | |||
model = AdvSeqLabel(args, None) | |||
# Dump trained parameters into the model | |||
ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl") | |||
print("model loaded!") | |||
# Inference interface | |||
infer = SeqLabelInfer(pickle_path) | |||
sent = [[ch for ch in s] for s in sent] | |||
results = infer.predict(model, sent) | |||
for res in results: | |||
print(res) | |||
print("Inference finished!") |
@@ -1,19 +1,13 @@ | |||
# python: 3.5 | |||
# pytorch: 0.4 | |||
################ | |||
# Test cross validation. | |||
################ | |||
from fastNLP.loader.preprocess import ClassPreprocess | |||
from fastNLP.core.loss import Loss | |||
from fastNLP.core.optimizer import Optimizer | |||
from fastNLP.core.predictor import ClassificationInfer | |||
from fastNLP.core.preprocess import ClassPreprocess | |||
from fastNLP.core.trainer import ClassificationTrainer | |||
from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
from fastNLP.models.base_model import BaseModel | |||
from fastNLP.modules import aggregation | |||
from fastNLP.modules import encoder | |||
from fastNLP.modules import decoder | |||
from fastNLP.modules import encoder | |||
class ClassificationModel(BaseModel): | |||
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel): | |||
self.enc = encoder.Conv( | |||
in_channels=300, out_channels=100, kernel_size=3) | |||
self.agg = aggregation.MaxPool() | |||
self.dec = decoder.MLP(100, num_classes=num_classes) | |||
self.dec = decoder.MLP(size_layer=[100, num_classes]) | |||
def forward(self, x): | |||
x = self.emb(x) # [N,L] -> [N,L,C] | |||
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel): | |||
return x | |||
data_dir = 'data' # directory to save data and model | |||
train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
data_dir = 'save/' # directory to save data and model | |||
train_path = './data_for_tests/text_classify.txt' # training set file | |||
# load dataset | |||
ds_loader = ClassDatasetLoader("train", train_path) | |||
ds_loader = ClassDatasetLoader(train_path) | |||
data = ds_loader.load() | |||
# pre-process dataset | |||
pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5) | |||
# pre = ClassPreprocess(data, data_dir) | |||
n_classes = pre.num_classes | |||
vocab_size = pre.vocab_size | |||
pre = ClassPreprocess() | |||
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir) | |||
n_classes, vocab_size = pre.num_classes, pre.vocab_size | |||
# construct model | |||
model_args = { | |||
@@ -58,22 +51,25 @@ model_args = { | |||
} | |||
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | |||
# train model | |||
# construct trainer | |||
train_args = { | |||
"epochs": 10, | |||
"batch_size": 50, | |||
"epochs": 3, | |||
"batch_size": 16, | |||
"pickle_path": data_dir, | |||
"validate": False, | |||
"save_best_dev": False, | |||
"model_saved_path": None, | |||
"use_cuda": True, | |||
"learn_rate": 1e-3, | |||
"momentum": 0.9} | |||
trainer = ClassificationTrainer(train_args) | |||
# trainer.train(model, ['data_train.pkl', 'data_dev.pkl']) | |||
trainer.cross_validate(model) | |||
"loss": Loss("cross_entropy"), | |||
"optimizer": Optimizer("Adam", lr=0.001) | |||
} | |||
trainer = ClassificationTrainer(**train_args) | |||
# start training | |||
trainer.train(model, train_data=train_set, dev_data=dev_set) | |||
# predict using model | |||
data_infer = [x[0] for x in data] | |||
infer = ClassificationInfer(data_dir) | |||
labels_pred = infer.predict(model, data_infer) | |||
labels_pred = infer.predict(model.cpu(), data_infer) | |||
print(labels_pred) |
@@ -33,7 +33,7 @@ data_infer_path = args.infer | |||
def infer(): | |||
# Load infer configuration, the same as test | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args}) | |||
ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
@@ -49,7 +49,7 @@ def infer(): | |||
print("model loaded!") | |||
# Data Loader | |||
raw_data_loader = BaseLoader("xxx", data_infer_path) | |||
raw_data_loader = BaseLoader(data_infer_path) | |||
infer_data = raw_data_loader.load_lines() | |||
# Inference interface | |||
@@ -65,11 +65,11 @@ def train_and_test(): | |||
# Config Loader | |||
trainer_args = ConfigSection() | |||
model_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config(config_dir, { | |||
ConfigLoader("config.cfg").load_config(config_dir, { | |||
"test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args}) | |||
# Data Loader | |||
pos_loader = POSDatasetLoader("xxx", data_path) | |||
pos_loader = POSDatasetLoader(data_path) | |||
train_data = pos_loader.load_lines() | |||
# Preprocessor | |||
@@ -117,13 +117,13 @@ def train_and_test(): | |||
# Load test configuration | |||
tester_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args}) | |||
ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args}) | |||
# Tester | |||
tester = SeqLabelTester(save_output=False, | |||
save_loss=False, | |||
save_best_dev=False, | |||
batch_size=8, | |||
batch_size=4, | |||
use_cuda=False, | |||
pickle_path=pickle_path, | |||
model_name="seq_label_in_test.pkl", | |||
@@ -134,10 +134,10 @@ def train_and_test(): | |||
tester.test(model, data_dev) | |||
# print test results | |||
print(tester.show_matrices()) | |||
print(tester.show_metrics()) | |||
print("model tested!") | |||
if __name__ == "__main__": | |||
train_and_test() | |||
# train_and_test() | |||
infer() |
@@ -22,7 +22,7 @@ data_infer_path = "data_for_tests/people_infer.txt" | |||
def infer(): | |||
# Load infer configuration, the same as test | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
# fetch dictionary size and number of labels from pickle files | |||
word2index = load_pickle(pickle_path, "word2id.pkl") | |||
@@ -38,7 +38,7 @@ def infer(): | |||
print("model loaded!") | |||
# Data Loader | |||
raw_data_loader = BaseLoader(data_name, data_infer_path) | |||
raw_data_loader = BaseLoader(data_infer_path) | |||
infer_data = raw_data_loader.load_lines() | |||
""" | |||
Transform strings into list of list of strings. | |||
@@ -61,10 +61,10 @@ def infer(): | |||
def train_test(): | |||
# Config Loader | |||
train_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) | |||
# Data Loader | |||
loader = TokenizeDatasetLoader(data_name, cws_data_path) | |||
loader = TokenizeDatasetLoader(cws_data_path) | |||
train_data = loader.load_pku() | |||
# Preprocessor | |||
@@ -74,7 +74,7 @@ def train_test(): | |||
train_args["num_classes"] = p.num_classes | |||
# Trainer | |||
- trainer = SeqLabelTrainer(train_args)
+ trainer = SeqLabelTrainer(**train_args.data)
# Model | |||
model = SeqLabeling(train_args) | |||
@@ -99,16 +99,16 @@ def train_test(): | |||
# Load test configuration | |||
test_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
# Tester | |||
- tester = SeqLabelTester(test_args)
+ tester = SeqLabelTester(**test_args.data)
# Start testing | |||
tester.test(model, data_train) | |||
# print test results | |||
- print(tester.show_matrices())
+ print(tester.show_metrics())
print("model tested!") | |||
@@ -1,13 +1,27 @@ | |||
import sys | |||
sys.path.append("..") | |||
from fastNLP.fastnlp import FastNLP | |||
+ from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results
+ PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
+ PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
+ PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/"
def word_seg(): | |||
nlp = FastNLP("./data_for_tests/") | |||
nlp.load("seq_label_model") | |||
text = "这是最好的基于深度学习的中文分词系统。" | |||
result = nlp.run(text) | |||
print(result) | |||
print("FastNLP finished!") | |||
nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) | |||
nlp.load("cws_basic_model", config_file="cws.cfg", section_name="POS_test") | |||
text = ["这是最好的基于深度学习的中文分词系统。", | |||
"大王叫我来巡山。", | |||
"我党多年来致力于改善人民生活水平。"] | |||
results = nlp.run(text) | |||
print(results) | |||
for example in results: | |||
words, labels = [], [] | |||
for res in example: | |||
words.append(res[0]) | |||
labels.append(res[1]) | |||
print(interpret_word_seg_results(words, labels)) | |||
def text_class(): | |||
@@ -19,5 +33,53 @@ def text_class(): | |||
print("FastNLP finished!") | |||
+ def test_word_seg_interpret():
+ foo = [[('这', 'S'), ('是', 'S'), ('最', 'S'), ('好', 'S'), ('的', 'S'), ('基', 'B'), ('于', 'E'), ('深', 'B'), ('度', 'E'),
+ ('学', 'B'), ('习', 'E'), ('的', 'S'), ('中', 'B'), ('文', 'E'), ('分', 'B'), ('词', 'E'), ('系', 'B'), ('统', 'E'),
+ ('。', 'S')]]
+ chars = [x[0] for x in foo[0]]
+ labels = [x[1] for x in foo[0]]
+ print(interpret_word_seg_results(chars, labels))
+ def test_interpret_cws_pos_results():
+ foo = [
+ [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'),
+ ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'),
+ ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')]
+ ]
+ chars = [x[0] for x in foo[0]]
+ labels = [x[1] for x in foo[0]]
+ print(interpret_cws_pos_results(chars, labels))
+ def pos_tag():
+ nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
+ nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
+ text = ["这是最好的基于深度学习的中文分词系统。",
+ "大王叫我来巡山。",
+ "我党多年来致力于改善人民生活水平。"]
+ results = nlp.run(text)
+ for example in results:
+ words, labels = [], []
+ for res in example:
+ words.append(res[0])
+ labels.append(res[1])
+ print(interpret_cws_pos_results(words, labels))
+ def text_classify():
+ nlp = FastNLP(model_dir=PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES)
+ nlp.load("text_classify_model", config_file="text_classify.cfg", section_name="model")
+ text = [
+ "世界物联网大会明日在京召开龙头股启动在即",
+ "乌鲁木齐市新增一处城市中心旅游目的地",
+ "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"]
+ results = nlp.run(text)
+ print(results)
+ """
+ ['finance', 'travel', 'history']
+ """
if __name__ == "__main__": | |||
- text_class()
+ text_classify()
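Taken together, the rewritten tests show the intended high-level workflow: point `FastNLP` at a directory of saved pickles, load a named model together with its config file and section, run a batch of raw strings, and post-process the resulting (token, label) pairs with an interpreter function. A condensed sketch built only from calls that appear above (the pickle path is site-specific):

```python
from fastNLP.fastnlp import FastNLP, interpret_word_seg_results

nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)  # directory of saved pickles
nlp.load("cws_basic_model", config_file="cws.cfg", section_name="POS_test")

results = nlp.run(["大王叫我来巡山。"])  # one list of (char, BMES-label) pairs per input
for example in results:
    chars = [char for char, _ in example]
    labels = [label for _, label in example]
    print(interpret_word_seg_results(chars, labels))
```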
@@ -5,19 +5,19 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader | |||
from fastNLP.models.sequence_modeling import SeqLabeling | |||
data_name = "pku_training.utf8" | |||
cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" | |||
pickle_path = "data_for_tests" | |||
def foo(): | |||
- loader = TokenizeDatasetLoader(data_name, "./data_for_tests/cws_pku_utf_8")
+ loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8")
train_data = loader.load_pku() | |||
train_args = ConfigSection() | |||
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) | |||
# Preprocessor | |||
- p = SeqLabelPreprocess(train_data, pickle_path)
+ p = SeqLabelPreprocess()
train_data = p.run(train_data) | |||
train_args["vocab_size"] = p.vocab_size | |||
train_args["num_classes"] = p.num_classes | |||
@@ -26,11 +26,11 @@ def foo(): | |||
valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, | |||
"save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/", | |||
"use_cuda": True} | |||
- validator = SeqLabelTester(valid_args)
+ validator = SeqLabelTester(**valid_args)
print("start validation.") | |||
- validator.test(model)
- print(validator.show_matrices())
+ validator.test(model, train_data)
+ print(validator.show_metrics())
if __name__ == "__main__": | |||
@@ -34,7 +34,7 @@ config_dir = args.config | |||
def infer(): | |||
# load dataset | |||
print("Loading data...") | |||
ds_loader = ClassDatasetLoader("train", train_data_dir) | |||
ds_loader = ClassDatasetLoader(train_data_dir) | |||
data = ds_loader.load() | |||
unlabeled_data = [x[0] for x in data] | |||
@@ -69,7 +69,7 @@ def train(): | |||
# load dataset | |||
print("Loading data...") | |||
ds_loader = ClassDatasetLoader("train", train_data_dir) | |||
ds_loader = ClassDatasetLoader(train_data_dir) | |||
data = ds_loader.load() | |||
print(data[0]) | |||
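As in the other scripts, `ClassDatasetLoader` now takes just the data path. Each loaded example appears to be a (text, label) pair, hence the `x[0]` indexing in the inference branch; a brief sketch, with `train_data_dir` taken from the surrounding script:

```python
ds_loader = ClassDatasetLoader(train_data_dir)  # name argument removed
data = ds_loader.load()                         # [(text, label), ...]
unlabeled_data = [x[0] for x in data]           # keep only the text for inference
```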