@@ -20,87 +20,10 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fast development of NLP tools.

## Resources

- [Documentation](https://fastnlp.readthedocs.io/en/latest/)
- [Source Code](https://github.com/fastnlp/fastNLP)

## Example

### Basic Usage

A typical fastNLP routine is composed of four phases: loading a dataset, pre-processing the data, constructing a model, and training the model.
```python
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import encoder
from fastNLP.modules import aggregation
from fastNLP.modules import decoder
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.loader.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.core.inference import ClassificationInfer


class ClassificationModel(BaseModel):
    """
    Simple text classification model based on CNN.
    """

    def __init__(self, num_classes, vocab_size):
        super(ClassificationModel, self).__init__()

        self.emb = encoder.Embedding(nums=vocab_size, dims=300)
        self.enc = encoder.Conv(
            in_channels=300, out_channels=100, kernel_size=3)
        self.agg = aggregation.MaxPool()
        self.dec = decoder.MLP(100, num_classes=num_classes)

    def forward(self, x):
        x = self.emb(x)  # [N,L] -> [N,L,C]
        x = self.enc(x)  # [N,L,C_in] -> [N,L,C_out]
        x = self.agg(x)  # [N,L,C] -> [N,C]
        x = self.dec(x)  # [N,C] -> [N,N_class]
        return x


data_dir = 'data'  # directory to save data and model
train_path = 'test/data_for_tests/text_classify.txt'  # training set file

# load dataset
ds_loader = ClassDatasetLoader("train", train_path)
data = ds_loader.load()

# pre-process dataset: build vocabulary and label set, save pickles to data_dir
pre = ClassPreprocess(data_dir)
vocab_size, n_classes = pre.process(data, "data_train.pkl")

# construct model
model_args = {
    'num_classes': n_classes,
    'vocab_size': vocab_size
}
model = ClassificationModel(**model_args)

# train model
train_args = {
    "epochs": 20,
    "batch_size": 50,
    "pickle_path": data_dir,
    "validate": False,
    "save_best_dev": False,
    "model_saved_path": None,
    "use_cuda": True,
    "learn_rate": 1e-3,
    "momentum": 0.9}
trainer = ClassificationTrainer(train_args)
trainer.train(model)

# predict labels for the raw word sequences using the trained model
seqs = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model, seqs)
```
## Installation
@@ -7,6 +7,12 @@ fastNLP.core.action
.. automodule:: fastNLP.core.action
    :members:

fastNLP.core.loss
------------------

.. automodule:: fastNLP.core.loss
    :members:

fastNLP.core.metrics
---------------------
@@ -7,6 +7,12 @@ fastNLP.modules.decoder.CRF
.. automodule:: fastNLP.modules.decoder.CRF
    :members:

fastNLP.modules.decoder.MLP
----------------------------

.. automodule:: fastNLP.modules.decoder.MLP
    :members:

.. automodule:: fastNLP.modules.decoder
    :members:
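
A minimal usage sketch of the MLP decoder, mirroring the quickstart example (the hidden size and class count here are illustrative, not part of the API reference):

.. code:: python

    from fastNLP.modules import decoder

    # map a 100-dim aggregated representation to scores over 5 classes,
    # as in the quickstart's CNN text classifier
    mlp = decoder.MLP(100, num_classes=5)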
@@ -1,15 +1,43 @@
.. fastNLP documentation master file, created by
   sphinx-quickstart on Mon Aug 20 17:06:44 2018.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

fastNLP documentation
=====================

fastNLP is still under incubation.

Introduction
------------

fastNLP is a modular natural language processing system based on PyTorch, for fast development of NLP tools.
It decomposes deep-learning-based NLP models into different modules.
These modules fall into 4 categories: encoder, interaction, aggregation and decoder,
and each category contains several implemented modules.
Most current NLP models can be built from these modules, which greatly simplifies the development of NLP models.
The architecture of fastNLP is shown in the left figure below:

.. image:: figures/procedures_and_sequence_labeling.png

For the model-construction phase, we take sequence labeling (upper-right figure above) and text classification (figure below) as examples:

.. image:: figures/text_classification.png

* encoder module: encodes the input into an abstract representation; it takes a word sequence and outputs a sequence of vectors.
* interaction module: lets the information within the representation interact; it takes a sequence of vectors and outputs another sequence of vectors.
* aggregation module: aggregates and condenses information; it takes a sequence of vectors and outputs a single vector.
* decoder module: decodes the representation into the output, either a single label (text classification) or a label sequence (sequence labeling).

The interaction and aggregation modules are not necessarily present in a model, as in the sequence labeling model above; the sketch below shows how the text classification model composes the other three.
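
A minimal sketch of that composition, mirroring the quickstart's CNN text classifier (the vocabulary size and other sizes are illustrative):

.. code:: python

    from fastNLP.modules import encoder, aggregation, decoder

    # encoder -> aggregation -> decoder, with no interaction module
    emb = encoder.Embedding(nums=10000, dims=300)  # word ids -> vector sequence
    enc = encoder.Conv(in_channels=300, out_channels=100, kernel_size=3)
    agg = aggregation.MaxPool()                    # vector sequence -> one vector
    dec = decoder.MLP(100, num_classes=5)          # one vector -> class scores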
User's Guide
------------

.. toctree::
    :maxdepth: 2

    user/installation
    user/quickstart


API Reference
-------------
@@ -0,0 +1,31 @@
============
Installation
============

.. contents::
    :local:

Cloning From GitHub
~~~~~~~~~~~~~~~~~~~

If you just want to use fastNLP, use:

.. code:: shell

    git clone https://github.com/fastnlp/fastNLP
    cd fastNLP
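
Once PyTorch is installed (see the next section), you can check that the package is importable from the repository root (a quick sanity check, not an official install step):

.. code:: python

    # run from the root of the fastNLP checkout; requires PyTorch
    import fastNLP
    print(fastNLP.__file__)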
PyTorch Installation
~~~~~~~~~~~~~~~~~~~~

Visit the `PyTorch official website <https://pytorch.org/>`_ for installation instructions based
on your system. In general, you can use:

.. code:: shell

    # using conda
    conda install pytorch torchvision -c pytorch

    # or using pip
    pip3 install torch torchvision
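
To verify the PyTorch installation (and whether CUDA is available, which the quickstart's ``use_cuda=True`` setting requires):

.. code:: python

    import torch

    print(torch.__version__)          # installed PyTorch version
    print(torch.cuda.is_available())  # True if a usable GPU is detected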
@@ -0,0 +1,84 @@
==========
Quickstart
==========

Example
-------

Basic Usage
~~~~~~~~~~~

A typical fastNLP routine is composed of four phases: loading a dataset,
pre-processing the data, constructing a model, and training the model.
.. code:: python

    from fastNLP.models.base_model import BaseModel
    from fastNLP.modules import encoder
    from fastNLP.modules import aggregation
    from fastNLP.modules import decoder
    from fastNLP.loader.dataset_loader import ClassDatasetLoader
    from fastNLP.loader.preprocess import ClassPreprocess
    from fastNLP.core.trainer import ClassificationTrainer
    from fastNLP.core.inference import ClassificationInfer


    class ClassificationModel(BaseModel):
        """
        Simple text classification model based on CNN.
        """

        def __init__(self, num_classes, vocab_size):
            super(ClassificationModel, self).__init__()

            self.emb = encoder.Embedding(nums=vocab_size, dims=300)
            self.enc = encoder.Conv(
                in_channels=300, out_channels=100, kernel_size=3)
            self.agg = aggregation.MaxPool()
            self.dec = decoder.MLP(100, num_classes=num_classes)

        def forward(self, x):
            x = self.emb(x)  # [N,L] -> [N,L,C]
            x = self.enc(x)  # [N,L,C_in] -> [N,L,C_out]
            x = self.agg(x)  # [N,L,C] -> [N,C]
            x = self.dec(x)  # [N,C] -> [N,N_class]
            return x


    data_dir = 'data'  # directory to save data and model
    train_path = 'test/data_for_tests/text_classify.txt'  # training set file

    # load dataset
    ds_loader = ClassDatasetLoader("train", train_path)
    data = ds_loader.load()

    # pre-process dataset: build vocabulary and label set, save pickles to data_dir
    pre = ClassPreprocess(data_dir)
    vocab_size, n_classes = pre.process(data, "data_train.pkl")

    # construct model
    model_args = {
        'num_classes': n_classes,
        'vocab_size': vocab_size
    }
    model = ClassificationModel(**model_args)

    # train model
    train_args = {
        "epochs": 20,
        "batch_size": 50,
        "pickle_path": data_dir,
        "validate": False,
        "save_best_dev": False,
        "model_saved_path": None,
        "use_cuda": True,
        "learn_rate": 1e-3,
        "momentum": 0.9}
    trainer = ClassificationTrainer(train_args)
    trainer.train(model)

    # predict labels for the raw word sequences using the trained model
    seqs = [x[0] for x in data]
    infer = ClassificationInfer(data_dir)
    labels_pred = infer.predict(model, seqs)
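
``labels_pred`` should contain one predicted label per input sequence; a quick way to inspect a few predictions (a sketch, assuming each sequence is a list of tokens and ``predict`` returns a plain list):

.. code:: python

    for seq, label in zip(seqs[:3], labels_pred[:3]):
        print(label, " ".join(seq))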