Merge branch 'master' into test_code

tags/v0.1.0
Yige XU, 6 years ago
commit be2f4aade3
60 changed files with 1203 additions and 11838 deletions
1. README.md  +25 -133
2. docs/requirements.txt  +4 -3
3. docs/source/conf.py  +2 -0
4. docs/source/fastNLP.core.rst  +26 -34
5. docs/source/fastNLP.loader.rst  +12 -38
6. docs/source/fastNLP.models.rst  +10 -26
7. docs/source/fastNLP.modules.aggregation.rst  +12 -30
8. docs/source/fastNLP.modules.decoder.rst  +9 -13
9. docs/source/fastNLP.modules.encoder.rst  +18 -42
10. docs/source/fastNLP.modules.interaction.rst  +2 -7
11. docs/source/fastNLP.modules.rst  +6 -21
12. docs/source/fastNLP.rst  +4 -17
13. docs/source/fastNLP.saver.rst  +6 -18
14. docs/source/figures/procedures_and_sequence_labeling.png  BIN
15. docs/source/figures/text_classification.png  BIN
16. docs/source/index.rst  +47 -9
17. docs/source/modules.rst  +0 -7
18. docs/source/user/installation.rst  +31 -0
19. docs/source/user/quickstart.rst  +84 -0
20. fastNLP/core/action.py  +57 -31
21. fastNLP/core/loss.py  +15 -1
22. fastNLP/core/metrics.py  +26 -31
23. fastNLP/core/optimizer.py  +10 -2
24. fastNLP/core/predictor.py  +1 -1
25. fastNLP/core/preprocess.py  +58 -13
26. fastNLP/core/tester.py  +42 -22
27. fastNLP/core/trainer.py  +61 -99
28. fastNLP/fastnlp.py  +162 -70
29. fastNLP/loader/base_loader.py  +3 -4
30. fastNLP/loader/dataset_loader.py  +62 -18
31. fastNLP/loader/embed_loader.py  +44 -2
32. fastNLP/loader/model_loader.py  +2 -2
33. fastNLP/models/cnn_text_classification.py  +6 -6
34. fastNLP/modules/decoder/__init__.py  +2 -1
35. fastNLP/modules/encoder/__init__.py  +3 -1
36. fastNLP/modules/encoder/conv_maxpool.py  +5 -0
37. fastNLP/modules/encoder/embedding.py  +1 -1
38. fastnlp-architecture.jpg  BIN
39. reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg  +0 -5331
40. reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos  +0 -5331
41. reproduction/HAN-document_classification/data/test_samples.pkl  BIN
42. reproduction/HAN-document_classification/data/train_samples.pkl  BIN
43. reproduction/HAN-document_classification/data/yelp.word2vec  BIN
44. reproduction/chinese_word_seg/cws_train.py  +0 -114
45. reproduction/chinese_word_segment/cws.cfg  +12 -0
46. reproduction/chinese_word_segment/run.py  +25 -20
47. reproduction/pos_tag_model/pos_tag.cfg  +17 -11
48. reproduction/pos_tag_model/train_pos_tag.py  +146 -0
49. requirements.txt  +1 -0
50. setup.py  +24 -0
51. test/core/test_action.py  +1 -2
52. test/loader/test_loader.py  +2 -0
53. test/ner.py  +0 -138
54. test/ner_decode.py  +0 -129
55. test/readme_example.py  +23 -27
56. test/seq_labeling.py  +8 -8
57. test/test_cws.py  +8 -8
58. test/test_fastNLP.py  +69 -7
59. test/test_tester.py  +7 -7
60. test/text_classify.py  +2 -2

README.md  +25 -133

@@ -2,6 +2,9 @@

[![Build Status](https://travis-ci.org/fastnlp/fastNLP.svg?branch=master)](https://travis-ci.org/fastnlp/fastNLP)
[![codecov](https://codecov.io/gh/fastnlp/fastNLP/branch/master/graph/badge.svg)](https://codecov.io/gh/fastnlp/fastNLP)
[![PyPI version](https://badge.fury.io/py/fastNLP.svg)](https://badge.fury.io/py/fastNLP)
![Hex.pm](https://img.shields.io/hexpm/l/plug.svg)
[![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest)

fastNLP is a modular Natural Language Processing system based on PyTorch, built for fast development of NLP tools. It divides deep-learning-based NLP models into different modules. These modules fall into four categories: encoder, interaction, aggregation, and decoder, and each category contains several implemented modules. Encoder modules encode the input into an abstract representation, interaction modules make the information in the representation interact with each other, aggregation modules aggregate and reduce information, and decoder modules decode the representation into the output. Most current NLP models can be built on these modules, which vastly simplifies the process of developing NLP models. The architecture of fastNLP is shown in the figure below:

@@ -13,93 +16,19 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa
- numpy>=1.14.2
- torch==0.4.0
- torchvision>=0.1.8
- tensorboardX


## Resources

- [Documentation](https://github.com/fastnlp/fastNLP)
- [Documentation](https://fastnlp.readthedocs.io/en/latest/)
- [Source Code](https://github.com/fastnlp/fastNLP)


## Example

### Basic Usage

A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model.
```python
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import encoder
from fastNLP.modules import aggregation
from fastNLP.modules import decoder

from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.loader.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.core.inference import ClassificationInfer


class ClassificationModel(BaseModel):
"""
Simple text classification model based on CNN.
"""

def __init__(self, num_classes, vocab_size):
super(ClassificationModel, self).__init__()

self.emb = encoder.Embedding(nums=vocab_size, dims=300)
self.enc = encoder.Conv(
in_channels=300, out_channels=100, kernel_size=3)
self.agg = aggregation.MaxPool()
self.dec = decoder.MLP(100, num_classes=num_classes)

def forward(self, x):
x = self.emb(x) # [N,L] -> [N,L,C]
x = self.enc(x) # [N,L,C_in] -> [N,L,C_out]
x = self.agg(x) # [N,L,C] -> [N,C]
x = self.dec(x) # [N,C] -> [N, N_class]
return x


data_dir = 'data' # directory to save data and model
train_path = 'test/data_for_tests/text_classify.txt' # training set file

# load dataset
ds_loader = ClassDatasetLoader("train", train_path)
data = ds_loader.load()

# pre-process dataset
pre = ClassPreprocess(data_dir)
vocab_size, n_classes = pre.process(data, "data_train.pkl")

# construct model
model_args = {
'num_classes': n_classes,
'vocab_size': vocab_size
}
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)

# train model
train_args = {
"epochs": 20,
"batch_size": 50,
"pickle_path": data_dir,
"validate": False,
"save_best_dev": False,
"model_saved_path": None,
"use_cuda": True,
"learn_rate": 1e-3,
"momentum": 0.9}
trainer = ClassificationTrainer(train_args)
trainer.train(model)

# predict using model
seqs = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model, seqs)
```


## Installation
Run the following commands to install fastNLP package.
```shell
pip install fastNLP
```

### Cloning From GitHub

@@ -119,20 +48,26 @@ conda install pytorch torchvision -c pytorch
pip3 install torch torchvision
```

### TensorboardX Installation

```shell
pip3 install tensorboardX
```

## Project Structure

```
FastNLP
├── docs
│   └── quick_tutorial.md
├── fastNLP
│   ├── action
│   ├── core
│   │   ├── action.py
│   │   ├── inference.py
│   │   ├── __init__.py
│   │   ├── loss.py
│   │   ├── metrics.py
│   │   ├── optimizer.py
│   │   ├── predictor.py
│   │   ├── preprocess.py
│   │   ├── README.md
│   │   ├── tester.py
│   │   └── trainer.py
@@ -144,71 +79,28 @@ FastNLP
│   │   ├── dataset_loader.py
│   │   ├── embed_loader.py
│   │   ├── __init__.py
│   │   ├── model_loader.py
│   │   └── preprocess.py
│   │   └── model_loader.py
│   ├── models
│   │   ├── base_model.py
│   │   ├── char_language_model.py
│   │   ├── cnn_text_classification.py
│   │   ├── __init__.py
│   │   └── sequence_modeling.py
│   ├── modules
│   │   ├── aggregation
│   │   │   ├── attention.py
│   │   │   ├── avg_pool.py
│   │   │   ├── __init__.py
│   │   │   ├── kmax_pool.py
│   │   │   ├── max_pool.py
│   │   │   └── self_attention.py
│   │   ├── decoder
│   │   │   ├── CRF.py
│   │   │   └── __init__.py
│   │   ├── encoder
│   │   │   ├── char_embedding.py
│   │   │   ├── conv_maxpool.py
│   │   │   ├── conv.py
│   │   │   ├── embedding.py
│   │   │   ├── __init__.py
│   │   │   ├── linear.py
│   │   │   ├── lstm.py
│   │   │   ├── masked_rnn.py
│   │   │   └── variational_rnn.py
│   │   ├── __init__.py
│   │   ├── interaction
│   │   │   └── __init__.py
│   │   ├── other_modules.py
│   │   └── utils.py
│   └── saver
│   ├── base_saver.py
│   ├── __init__.py
│   ├── logger.py
│   └── model_saver.py
├── LICENSE
├── README.md
├── reproduction
│   ├── Char-aware_NLM
│   │  
│   ├── CNN-sentence_classification
│   │  
│   ├── HAN-document_classification
│   │  
│   └── LSTM+self_attention_sentiment_analysis
|
├── requirements.txt
├── setup.py
└── test
├── core
├── data_for_tests
│   ├── charlm.txt
│   ├── config
│   ├── cws_test
│   ├── cws_train
│   ├── people_infer.txt
│   └── people.txt
├── test_charlm.py
├── test_cws.py
├── test_fastNLP.py
├── test_loader.py
├── test_seq_labeling.py
├── test_tester.py
└── test_trainer.py
├── __init__.py
├── loader
├── modules
└── readme_example.py

```

docs/requirements.txt  +4 -3

@@ -1,3 +1,4 @@
sphinx
-e git://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme
sphinxcontrib.katex
numpy>=1.14.2
http://download.pytorch.org/whl/cpu/torch-0.4.1-cp35-cp35m-linux_x86_64.whl
torchvision>=0.1.8
sphinx-rtd-theme==0.4.1

docs/source/conf.py  +2 -0

@@ -42,6 +42,8 @@ release = '1.0'
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx.ext.autosummary',

]

# Add any paths that contain templates here, relative to this directory.


docs/source/fastNLP.core.rst  +26 -34

@@ -1,62 +1,54 @@
fastNLP.core package
====================
fastNLP.core
=============

Submodules
----------

fastNLP.core.action module
--------------------------
fastNLP.core.action
--------------------

.. automodule:: fastNLP.core.action
:members:
:undoc-members:
:show-inheritance:

fastNLP.core.metrics module
---------------------------
fastNLP.core.loss
------------------

.. automodule:: fastNLP.core.loss
:members:

fastNLP.core.metrics
---------------------

.. automodule:: fastNLP.core.metrics
:members:
:undoc-members:
:show-inheritance:

fastNLP.core.optimizer module
-----------------------------
fastNLP.core.optimizer
-----------------------

.. automodule:: fastNLP.core.optimizer
:members:
:undoc-members:
:show-inheritance:

fastNLP.core.predictor module
-----------------------------
fastNLP.core.predictor
-----------------------

.. automodule:: fastNLP.core.predictor
:members:
:undoc-members:
:show-inheritance:

fastNLP.core.tester module
--------------------------
fastNLP.core.preprocess
------------------------

.. automodule:: fastNLP.core.preprocess
:members:

fastNLP.core.tester
--------------------

.. automodule:: fastNLP.core.tester
:members:
:undoc-members:
:show-inheritance:

fastNLP.core.trainer module
---------------------------
fastNLP.core.trainer
---------------------

.. automodule:: fastNLP.core.trainer
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------

.. automodule:: fastNLP.core
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.loader.rst  +12 -38

@@ -1,62 +1,36 @@
fastNLP.loader package
======================
fastNLP.loader
===============

Submodules
----------

fastNLP.loader.base\_loader module
----------------------------------
fastNLP.loader.base\_loader
----------------------------

.. automodule:: fastNLP.loader.base_loader
:members:
:undoc-members:
:show-inheritance:

fastNLP.loader.config\_loader module
------------------------------------
fastNLP.loader.config\_loader
------------------------------

.. automodule:: fastNLP.loader.config_loader
:members:
:undoc-members:
:show-inheritance:

fastNLP.loader.dataset\_loader module
-------------------------------------
fastNLP.loader.dataset\_loader
-------------------------------

.. automodule:: fastNLP.loader.dataset_loader
:members:
:undoc-members:
:show-inheritance:

fastNLP.loader.embed\_loader module
-----------------------------------
fastNLP.loader.embed\_loader
-----------------------------

.. automodule:: fastNLP.loader.embed_loader
:members:
:undoc-members:
:show-inheritance:

fastNLP.loader.model\_loader module
-----------------------------------
fastNLP.loader.model\_loader
-----------------------------

.. automodule:: fastNLP.loader.model_loader
:members:
:undoc-members:
:show-inheritance:

fastNLP.loader.preprocess module
--------------------------------

.. automodule:: fastNLP.loader.preprocess
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------

.. automodule:: fastNLP.loader
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.models.rst  +10 -26

@@ -1,46 +1,30 @@
fastNLP.models package
======================
fastNLP.models
===============

Submodules
----------

fastNLP.models.base\_model module
---------------------------------
fastNLP.models.base\_model
---------------------------

.. automodule:: fastNLP.models.base_model
:members:
:undoc-members:
:show-inheritance:

fastNLP.models.char\_language\_model module
-------------------------------------------
fastNLP.models.char\_language\_model
-------------------------------------

.. automodule:: fastNLP.models.char_language_model
:members:
:undoc-members:
:show-inheritance:

fastNLP.models.cnn\_text\_classification module
-----------------------------------------------
fastNLP.models.cnn\_text\_classification
-----------------------------------------

.. automodule:: fastNLP.models.cnn_text_classification
:members:
:undoc-members:
:show-inheritance:

fastNLP.models.sequence\_modeling module
----------------------------------------
fastNLP.models.sequence\_modeling
----------------------------------

.. automodule:: fastNLP.models.sequence_modeling
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------

.. automodule:: fastNLP.models
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.modules.aggregation.rst  +12 -30

@@ -1,54 +1,36 @@
fastNLP.modules.aggregation package
===================================
fastNLP.modules.aggregation
============================

Submodules
----------

fastNLP.modules.aggregation.attention module
--------------------------------------------
fastNLP.modules.aggregation.attention
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.attention
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.aggregation.avg\_pool module
--------------------------------------------
fastNLP.modules.aggregation.avg\_pool
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.avg_pool
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.aggregation.kmax\_pool module
---------------------------------------------
fastNLP.modules.aggregation.kmax\_pool
---------------------------------------

.. automodule:: fastNLP.modules.aggregation.kmax_pool
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.aggregation.max\_pool module
--------------------------------------------
fastNLP.modules.aggregation.max\_pool
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.max_pool
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.aggregation.self\_attention module
--------------------------------------------------
fastNLP.modules.aggregation.self\_attention
--------------------------------------------

.. automodule:: fastNLP.modules.aggregation.self_attention
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------

.. automodule:: fastNLP.modules.aggregation
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.modules.decoder.rst  +9 -13

@@ -1,22 +1,18 @@
fastNLP.modules.decoder package
===============================
fastNLP.modules.decoder
========================

Submodules
----------

fastNLP.modules.decoder.CRF module
----------------------------------
fastNLP.modules.decoder.CRF
----------------------------

.. automodule:: fastNLP.modules.decoder.CRF
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.decoder.MLP
----------------------------

.. automodule:: fastNLP.modules.decoder.MLP
:members:

Module contents
---------------

.. automodule:: fastNLP.modules.decoder
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.modules.encoder.rst  +18 -42

@@ -1,78 +1,54 @@
fastNLP.modules.encoder package
===============================
fastNLP.modules.encoder
========================

Submodules
----------

fastNLP.modules.encoder.char\_embedding module
----------------------------------------------
fastNLP.modules.encoder.char\_embedding
----------------------------------------

.. automodule:: fastNLP.modules.encoder.char_embedding
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.encoder.conv module
-----------------------------------
fastNLP.modules.encoder.conv
-----------------------------

.. automodule:: fastNLP.modules.encoder.conv
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.encoder.conv\_maxpool module
--------------------------------------------
fastNLP.modules.encoder.conv\_maxpool
--------------------------------------

.. automodule:: fastNLP.modules.encoder.conv_maxpool
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.encoder.embedding module
----------------------------------------
fastNLP.modules.encoder.embedding
----------------------------------

.. automodule:: fastNLP.modules.encoder.embedding
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.encoder.linear module
-------------------------------------
fastNLP.modules.encoder.linear
-------------------------------

.. automodule:: fastNLP.modules.encoder.linear
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.encoder.lstm module
-----------------------------------
fastNLP.modules.encoder.lstm
-----------------------------

.. automodule:: fastNLP.modules.encoder.lstm
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.encoder.masked\_rnn module
------------------------------------------
fastNLP.modules.encoder.masked\_rnn
------------------------------------

.. automodule:: fastNLP.modules.encoder.masked_rnn
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.encoder.variational\_rnn module
-----------------------------------------------
fastNLP.modules.encoder.variational\_rnn
-----------------------------------------

.. automodule:: fastNLP.modules.encoder.variational_rnn
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------

.. automodule:: fastNLP.modules.encoder
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.modules.interaction.rst  +2 -7

@@ -1,10 +1,5 @@
fastNLP.modules.interaction package
===================================

Module contents
---------------
fastNLP.modules.interaction
============================

.. automodule:: fastNLP.modules.interaction
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.modules.rst  +6 -21

@@ -1,8 +1,5 @@
fastNLP.modules package
=======================

Subpackages
-----------
fastNLP.modules
================

.. toctree::

@@ -11,30 +8,18 @@ Subpackages
fastNLP.modules.encoder
fastNLP.modules.interaction

Submodules
----------

fastNLP.modules.other\_modules module
-------------------------------------
fastNLP.modules.other\_modules
-------------------------------

.. automodule:: fastNLP.modules.other_modules
:members:
:undoc-members:
:show-inheritance:

fastNLP.modules.utils module
----------------------------
fastNLP.modules.utils
----------------------

.. automodule:: fastNLP.modules.utils
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------

.. automodule:: fastNLP.modules
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.rst  +4 -17

@@ -1,8 +1,5 @@
fastNLP package
===============

Subpackages
-----------
fastNLP
========

.. toctree::

@@ -12,22 +9,12 @@ Subpackages
fastNLP.modules
fastNLP.saver

Submodules
----------

fastNLP.fastnlp module
----------------------
fastNLP.fastnlp
----------------

.. automodule:: fastNLP.fastnlp
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------

.. automodule:: fastNLP
:members:
:undoc-members:
:show-inheritance:

docs/source/fastNLP.saver.rst  +6 -18

@@ -1,30 +1,18 @@
fastNLP.saver package
=====================
fastNLP.saver
==============

Submodules
----------

fastNLP.saver.logger module
---------------------------
fastNLP.saver.logger
---------------------

.. automodule:: fastNLP.saver.logger
:members:
:undoc-members:
:show-inheritance:

fastNLP.saver.model\_saver module
---------------------------------
fastNLP.saver.model\_saver
---------------------------

.. automodule:: fastNLP.saver.model_saver
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------

.. automodule:: fastNLP.saver
:members:
:undoc-members:
:show-inheritance:

docs/source/figures/procedures_and_sequence_labeling.png  BIN

(binary image: 1079 × 558 px, 51 kB)

docs/source/figures/text_classification.png  BIN

(binary image: 1217 × 543 px, 54 kB)

docs/source/index.rst  +47 -9

@@ -1,16 +1,54 @@
.. fastNLP documentation master file, created by
sphinx-quickstart on Mon Aug 20 17:06:44 2018.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
fastNLP documentation
=====================
fastNLP is still incubating.

Welcome to fastNLP's documentation!
===================================

Introduction
------------

fastNLP is a modular Natural Language Processing system based on PyTorch, designed for fast development of NLP tools.
It divides deep-learning-based NLP models into different modules.
These modules fall into four categories: encoder, interaction, aggregation, and decoder,
and each category contains several implemented modules.

Most current NLP models can be built from these modules, which greatly simplifies the process of developing NLP models.
The architecture of fastNLP is shown on the left of the figure below:

.. image:: figures/procedures_and_sequence_labeling.png

For the constructing-model part, sequence labeling (right of the figure above) and text classification (figure below) serve as examples:

.. image:: figures/text_classification.png

* encoder module: encodes the input into an abstract representation; it takes a sequence of words and outputs a sequence of vectors.
* interaction module: makes the information in the representation interact with each other; it takes a sequence of vectors and outputs a sequence of vectors.
* aggregation module: aggregates and reduces information; it takes a sequence of vectors and outputs a single vector.
* decoder module: decodes the representation into the output, either a single label (text classification) or a label sequence (sequence labeling).

The interaction and aggregation modules are not necessarily present in a model; the sequence labeling model above, for example, contains neither.




User's Guide
------------
.. toctree::
:maxdepth: 2

user/installation
user/quickstart


API Reference
-------------

If you are looking for information on a specific function, class or
method, this part of the documentation is for you.

.. toctree::
:maxdepth: 4
:caption: Contents:
:maxdepth: 2
fastNLP
fastNLP API <fastNLP>





docs/source/modules.rst  +0 -7

@@ -1,7 +0,0 @@
fastNLP
=======

.. toctree::
:maxdepth: 4

fastNLP

docs/source/user/installation.rst  +31 -0

@@ -0,0 +1,31 @@
============
Installation
============

.. contents::
:local:


Cloning From GitHub
~~~~~~~~~~~~~~~~~~~

If you just want to use fastNLP, use:

.. code:: shell

git clone https://github.com/fastnlp/fastNLP
cd fastNLP

PyTorch Installation
~~~~~~~~~~~~~~~~~~~~

Visit the [PyTorch official website] for installation instructions based
on your system. In general, you could use:

.. code:: shell

# using conda
conda install pytorch torchvision -c pytorch
# or using pip
pip3 install torch torchvision

docs/source/user/quickstart.rst  +84 -0

@@ -0,0 +1,84 @@
==========
Quickstart
==========

Example
-------

Basic Usage
~~~~~~~~~~~

A typical fastNLP routine is composed of four phases: loading dataset,
pre-processing data, constructing model and training model.

.. code:: python

from fastNLP.models.base_model import BaseModel
from fastNLP.modules import encoder
from fastNLP.modules import aggregation
from fastNLP.modules import decoder

from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.loader.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.core.inference import ClassificationInfer


class ClassificationModel(BaseModel):
"""
Simple text classification model based on CNN.
"""

def __init__(self, num_classes, vocab_size):
super(ClassificationModel, self).__init__()

self.emb = encoder.Embedding(nums=vocab_size, dims=300)
self.enc = encoder.Conv(
in_channels=300, out_channels=100, kernel_size=3)
self.agg = aggregation.MaxPool()
self.dec = decoder.MLP(100, num_classes=num_classes)

def forward(self, x):
x = self.emb(x) # [N,L] -> [N,L,C]
x = self.enc(x) # [N,L,C_in] -> [N,L,C_out]
x = self.agg(x) # [N,L,C] -> [N,C]
x = self.dec(x) # [N,C] -> [N, N_class]
return x


data_dir = 'data' # directory to save data and model
train_path = 'test/data_for_tests/text_classify.txt' # training set file

# load dataset
ds_loader = ClassDatasetLoader("train", train_path)
data = ds_loader.load()

# pre-process dataset
pre = ClassPreprocess(data_dir)
vocab_size, n_classes = pre.process(data, "data_train.pkl")

# construct model
model_args = {
'num_classes': n_classes,
'vocab_size': vocab_size
}
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)

# train model
train_args = {
"epochs": 20,
"batch_size": 50,
"pickle_path": data_dir,
"validate": False,
"save_best_dev": False,
"model_saved_path": None,
"use_cuda": True,
"learn_rate": 1e-3,
"momentum": 0.9}
trainer = ClassificationTrainer(train_args)
trainer.train(model)

# predict using model
seqs = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model, seqs)

fastNLP/core/action.py  +57 -31

@@ -1,7 +1,3 @@
"""
This file defines Action(s) and sample methods.

"""
from collections import Counter

import numpy as np
@@ -9,13 +5,12 @@ import torch


class Action(object):
"""
Operations shared by Trainer, Tester, or Inference.
"""Operations shared by Trainer, Tester, or Inference.
This is designed for reducing replicate codes.
- make_batch: produce a mini-batch of data. @staticmethod
- pad: padding method used in sequence modeling. @staticmethod
- mode: change network mode for either train or test. (for PyTorch) @staticmethod
The base Action shall define operations shared by as much task-specific Actions as possible.
"""

def __init__(self):
@@ -24,18 +19,20 @@ class Action(object):
@staticmethod
def make_batch(iterator, use_cuda, output_length=True, max_len=None):
"""Batch and Pad data.

:param iterator: an iterator, (object that implements __next__ method) which returns the next sample.
:param use_cuda: bool, whether to use GPU
:param output_length: bool, whether to output the original length of the sequence before padding. (default: True)
:param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None)
:return
if output_length is True:
:return :

if output_length is True,
(batch_x, seq_len): tuple of two elements
batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
seq_len: list. The length of the pre-padded sequence, if output_length is True.
batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]

if output_length is False:
if output_length is False,
batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
"""
@@ -77,21 +74,21 @@ class Action(object):
return batch

@staticmethod
def mode(model, test=False):
"""
Train mode or Test mode. This is for PyTorch currently.
:param model:
:param test:
def mode(model, is_test=False):
"""Train mode or Test mode. This is for PyTorch currently.
:param model: a PyTorch model
:param is_test: bool, whether in test mode or not.
"""
if test:
if is_test:
model.eval()
else:
model.train()


def convert_to_torch_tensor(data_list, use_cuda):
"""
convert lists into (cuda) Tensors.
"""Convert lists into (cuda) Tensors.
:param data_list: 2-level lists
:param use_cuda: bool, whether to use GPU or not
:return data_list: PyTorch Tensor of shape [batch_size, max_seq_len]
@@ -103,8 +100,8 @@ def convert_to_torch_tensor(data_list, use_cuda):


def k_means_1d(x, k, max_iter=100):
"""
Perform k-means on 1-D data.
"""Perform k-means on 1-D data.
:param x: list of int, representing points in 1-D.
:param k: the number of clusters required.
:param max_iter: maximum iteration
@@ -132,21 +129,28 @@ def k_means_1d(x, k, max_iter=100):


def k_means_bucketing(all_inst, buckets):
"""
"""Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths.

:param all_inst: 3-level list
E.g. ::

[
[[word_11, word_12, word_13], [label_11. label_12]], # sample 1
[[word_21, word_22, word_23], [label_21. label_22]], # sample 2
...
]

:param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length
threshold for each bucket (This is usually None.).
:return data: 2-level list
::

[
[index_11, index_12, ...], # bucket 1
[index_21, index_22, ...], # bucket 2
...
]

"""
bucket_data = [[] for _ in buckets]
num_buckets = len(buckets)
@@ -160,11 +164,16 @@ def k_means_bucketing(all_inst, buckets):


class BaseSampler(object):
"""
Base class for all samplers.
"""The base class of all samplers.
"""

def __init__(self, data_set):
"""

:param data_set: multi-level list, of shape [num_example, *]

"""
self.data_set_length = len(data_set)
self.data = data_set

@@ -176,11 +185,16 @@ class BaseSampler(object):


class SequentialSampler(BaseSampler):
"""
Sample data in the original order.
"""Sample data in the original order.
"""

def __init__(self, data_set):
"""

:param data_set: multi-level list

"""
super(SequentialSampler, self).__init__(data_set)

def __iter__(self):
@@ -188,11 +202,16 @@ class SequentialSampler(BaseSampler):


class RandomSampler(BaseSampler):
"""
Sample data in random permutation order.
"""Sample data in random permutation order.
"""

def __init__(self, data_set):
"""

:param data_set: multi-level list

"""
super(RandomSampler, self).__init__(data_set)
self.order = np.random.permutation(self.data_set_length)

@@ -201,11 +220,18 @@ class RandomSampler(BaseSampler):


class Batchifier(object):
"""
Wrap random or sequential sampler to generate a mini-batch.
"""Wrap random or sequential sampler to generate a mini-batch.
"""

def __init__(self, sampler, batch_size, drop_last=True):
"""

:param sampler: a Sampler object
:param batch_size: int, the size of the mini-batch
:param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch.

"""
super(Batchifier, self).__init__()
self.sampler = sampler
self.batch_size = batch_size
@@ -223,8 +249,7 @@ class Batchifier(object):


class BucketBatchifier(Batchifier):
"""
Partition all samples into multiple buckets, each of which contains sentences of approximately the same length.
"""Partition all samples into multiple buckets, each of which contains sentences of approximately the same length.
In sampling, first random choose a bucket. Then sample data from it.
The number of buckets is decided dynamically by the variance of sentence lengths.
"""
@@ -237,6 +262,7 @@ class BucketBatchifier(Batchifier):
:param num_buckets: int, number of buckets for grouping these sequences.
:param drop_last: bool, useless currently.
:param sampler: Sampler, useless currently.

"""
super(BucketBatchifier, self).__init__(sampler, batch_size, drop_last)
buckets = ([None] * num_buckets)
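
For reference, a minimal sketch of how the sampler, batchifier, and `Action.make_batch` shown above compose. The data is a toy assumption, and the yield shape of `make_batch` is inferred from its docstring and the Tester usage further down, so treat it as a sketch rather than a definitive recipe:

```python
# Hypothetical usage of the batching utilities above; not part of this commit.
from fastNLP.core.action import Action, Batchifier, RandomSampler

data_set = [
    [[1, 2, 3], [0, 1, 0]],  # sample: (word indices, label indices)
    [[4, 5], [1, 0]],
    [[6], [1]],
]

# RandomSampler permutes the samples; Batchifier groups them into mini-batches.
iterator = iter(Batchifier(RandomSampler(data_set), batch_size=2, drop_last=False))

# make_batch pads each mini-batch; with output_length=True, batch_x is assumed
# to be a (padded_data, seq_len) pair, as in SeqLabelTester.make_batch below.
for batch_x, batch_y in Action.make_batch(iterator, use_cuda=False, output_length=True):
    padded, seq_len = batch_x
    print(padded, seq_len, batch_y)
```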


fastNLP/core/loss.py  +15 -1

@@ -8,8 +8,13 @@ class Loss(object):
"""

def __init__(self, args):
"""

:param args: None or str, the name of a loss function.

"""
if args is None:
# this is useful when
# this is useful when Trainer.__init__ performs type check
self._loss = None
elif isinstance(args, str):
self._loss = self._borrow_from_pytorch(args)
@@ -17,10 +22,19 @@ class Loss(object):
raise NotImplementedError

def get(self):
"""

:return self._loss: the loss function
"""
return self._loss

@staticmethod
def _borrow_from_pytorch(loss_name):
"""Given a name of a loss function, return it from PyTorch.

:param loss_name: str, the name of a loss function
:return loss: a PyTorch loss
"""
if loss_name == "cross_entropy":
return torch.nn.CrossEntropyLoss()
else:
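
A small usage sketch of the `Loss` wrapper above; the names come directly from this diff:

```python
# Hypothetical usage; "cross_entropy" is the only name handled in this hunk.
from fastNLP.core.loss import Loss

loss_func = Loss("cross_entropy").get()  # a torch.nn.CrossEntropyLoss instance
no_loss = Loss(None).get()               # None, used to pass Trainer's type check
```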


fastNLP/core/metrics.py  +26 -31

@@ -1,11 +1,12 @@
import warnings

import numpy as np
import torch


def _conver_numpy(x):
"""
convert input data to numpy array
"""convert input data to numpy array
"""
if isinstance(x, np.ndarray):
return x
@@ -17,21 +18,20 @@ def _conver_numpy(x):


def _check_same_len(*arrays, axis=0):
"""
check if input array list has same length for one dimension
"""check if input array list has same length for one dimension
"""
lens = set([x.shape[axis] for x in arrays if x is not None])
return len(lens) == 1


def _label_types(y):
"""
determine the type
"binary"
"multiclass"
"multiclass-multioutput"
"multilabel"
"unknown"
"""Determine the type
- "binary"
- "multiclass"
- "multiclass-multioutput"
- "multilabel"
- "unknown"
"""
# never squeeze the first dimension
y = y.squeeze() if y.shape[0] > 1 else y.resize(1, -1)
@@ -46,8 +46,8 @@ def _label_types(y):


def _check_data(y_true, y_pred):
"""
check if y_true and y_pred is same type of data e.g both binary or multiclass
"""Check if y_true and y_pred is same type of data e.g both binary or multiclass
"""
y_true, y_pred = _conver_numpy(y_true), _conver_numpy(y_pred)
if not _check_same_len(y_true, y_pred):
@@ -174,16 +174,13 @@ def classification_report(y_true, y_pred, labels=None, target_names=None, digits


def accuracy_topk(y_true, y_prob, k=1):
"""
Compute accuracy of y_true matching top-k probable
"""Compute accuracy of y_true matching top-k probable
labels in y_prob.

Paras:
y_ture - ndarray, true label, [n_samples]
y_prob - ndarray, label probabilities, [n_samples, n_classes]
k - int, k in top-k
Returns:
accuracy of top-k
:param y_true: ndarray, true label, [n_samples]
:param y_prob: ndarray, label probabilities, [n_samples, n_classes]
:param k: int, k in top-k
:return: accuracy of top-k
"""

y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1]
@@ -195,16 +192,14 @@ def accuracy_topk(y_true, y_prob, k=1):


def pred_topk(y_prob, k=1):
"""
Return top-k predicted labels and corresponding probabilities.

Args:
y_prob - ndarray, size [n_samples, n_classes], probabilities on labels
k - int, k of top-k
Returns:
y_pred_topk - ndarray, size [n_samples, k], predicted top-k labels
y_prob_topk - ndarray, size [n_samples, k], probabilities for
top-k labels
"""Return top-k predicted labels and corresponding probabilities.


:param y_prob: ndarray, size [n_samples, n_classes], probabilities on labels
:param k: int, k of top-k
:returns
y_pred_topk: ndarray, size [n_samples, k], predicted top-k labels
y_prob_topk: ndarray, size [n_samples, k], probabilities for top-k labels
"""

y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1]
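
To make the top-k logic above concrete, here is a self-contained numpy rerun of the same `argsort` slicing on toy data; the membership test mirrors what `accuracy_topk` is expected to compute:

```python
import numpy as np

y_true = np.array([0, 2, 1])
y_prob = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.3, 0.6],
                   [0.2, 0.5, 0.3]])
k = 2

# Same slicing as above: sort ascending, then take the last k columns reversed.
y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1]  # [[0, 1], [2, 1], [1, 2]]

# A hit whenever the true label appears among the top-k predicted labels.
hits = np.any(y_pred_topk == y_true.reshape(-1, 1), axis=-1)
print(hits.mean())  # 1.0 on this toy example
```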


fastNLP/core/optimizer.py  +10 -2

@@ -4,7 +4,6 @@ import torch
class Optimizer(object):
"""Wrapper of optimizer from framework

names: arguments (type)
1. Adam: lr (float), weight_decay (float)
2. AdaGrad
3. RMSProp
@@ -16,20 +15,29 @@ class Optimizer(object):
"""
:param optimizer_name: str, the name of the optimizer
:param kwargs: the arguments

"""
self.optim_name = optimizer_name
self.kwargs = kwargs

@property
def name(self):
"""The name of the optimizer.

:return: str
"""
return self.optim_name

@property
def params(self):
"""The arguments used to create the optimizer.

:return: dict of (str, *)
"""
return self.kwargs

def construct_from_pytorch(self, model_params):
"""construct a optimizer from framework over given model parameters"""
"""Construct a optimizer from framework over given model parameters."""

if self.optim_name in ["SGD", "sgd"]:
if "lr" in self.kwargs:
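
A short sketch of the `Optimizer` wrapper above; the argument names follow the docstring in this diff, and the model is a placeholder:

```python
# Hypothetical usage; construct_from_pytorch returns a real torch.optim object.
import torch
from fastNLP.core.optimizer import Optimizer

model = torch.nn.Linear(10, 2)
optim_proto = Optimizer("SGD", lr=0.01, momentum=0.9)
print(optim_proto.name)    # "SGD"
print(optim_proto.params)  # {"lr": 0.01, "momentum": 0.9}

sgd = optim_proto.construct_from_pytorch(model.parameters())
```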


fastNLP/core/predictor.py  +1 -1

@@ -70,7 +70,7 @@ class Predictor(object):
def predict(self, network, data):
"""Perform inference using the trained model.

:param network: a PyTorch model
:param network: a PyTorch model (cpu)
:param data: list of list of strings
:return: list of list of strings, [num_examples, tag_seq_length]
"""


fastNLP/core/preprocess.py  +58 -13

@@ -17,20 +17,33 @@ DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
# the first vocab in dict with the index = 5

def save_pickle(obj, pickle_path, file_name):
"""Save an object into a pickle file.

:param obj: an object
:param pickle_path: str, the directory where the pickle file is to be saved
:param file_name: str, the name of the pickle file. In general, it should end with "pkl".
"""
with open(os.path.join(pickle_path, file_name), "wb") as f:
_pickle.dump(obj, f)
print("{} saved. ".format(file_name))
print("{} saved in {}".format(file_name, pickle_path))


def load_pickle(pickle_path, file_name):
"""Load an object from a given pickle file.

:param pickle_path: str, the directory where the pickle file is.
:param file_name: str, the name of the pickle file.
:return obj: an object stored in the pickle
"""
with open(os.path.join(pickle_path, file_name), "rb") as f:
obj = _pickle.load(f)
print("{} loaded. ".format(file_name))
print("{} loaded from {}".format(file_name, pickle_path))
return obj


def pickle_exist(pickle_path, pickle_name):
"""
"""Check if a given pickle file exists in the directory.

:param pickle_path: the directory of target pickle file
:param pickle_name: the filename of target pickle file
:return: True if file exists else False
@@ -45,6 +58,19 @@ def pickle_exist(pickle_path, pickle_name):


class BasePreprocess(object):
"""Base class of all preprocessors.
Preprocessors are responsible for converting data of strings into data of indices.
During the pre-processing, the following pickle files will be built:

- "word2id.pkl", a mapping from words(tokens) to indices
- "id2word.pkl", a reversed dictionary
- "label2id.pkl", a dictionary on labels
- "id2label.pkl", a reversed dictionary on labels

These four pickle files are expected to be saved in the given pickle directory once they are constructed.
Preprocessors will check if those files are already in the directory and will reuse them in future calls.
"""

def __init__(self):
self.word2index = None
self.label2index = None
@@ -68,6 +94,7 @@ class BasePreprocess(object):
:param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True.
:return results: a tuple of datasets after preprocessing.
"""

if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
self.word2index = load_pickle(pickle_path, "word2id.pkl")
self.label2index = load_pickle(pickle_path, "class2id.pkl")
@@ -98,6 +125,8 @@ class BasePreprocess(object):
save_pickle(data_train, pickle_path, "data_train.pkl")
else:
data_train = load_pickle(pickle_path, "data_train.pkl")
if pickle_exist(pickle_path, "data_dev.pkl"):
data_dev = load_pickle(pickle_path, "data_dev.pkl")
else:
# cross_val is True
if not pickle_exist(pickle_path, "data_train_0.pkl"):
@@ -181,25 +210,31 @@ class SeqLabelPreprocess(BasePreprocess):
"""Preprocess pipeline, including building mapping from words to index, from index to words,
from labels/classes to index, from index to labels/classes.
data of three-level list which have multiple labels in each sample.
::

[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]

"""

def __init__(self):
super(SeqLabelPreprocess, self).__init__()

def build_dict(self, data):
"""
Add new words with indices into self.word_dict, new labels with indices into self.label_dict.
"""Add new words with indices into self.word_dict, new labels with indices into self.label_dict.
:param data: three-level list
::

[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]

:return word2index: dict of {str, int}
label2index: dict of {str, int}
"""
@@ -215,14 +250,17 @@ class SeqLabelPreprocess(BasePreprocess):
return word2index, label2index

def to_index(self, data):
"""
Convert word strings and label strings into indices.
"""Convert word strings and label strings into indices.
:param data: three-level list
::

[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]

:return data_index: the same shape as data, but each string is replaced by its corresponding index
"""
data_index = []
@@ -241,11 +279,14 @@ class ClassPreprocess(BasePreprocess):
Preprocess pipeline, including building mapping from words to index, from index to words,
from labels/classes to index, from index to labels/classes.
design for data of three-level list which has a single label in each sample.
::

[
[ [word_11, word_12, ...], label_1 ],
[ [word_21, word_22, ...], label_2 ],
...
]

"""

def __init__(self):
@@ -268,18 +309,21 @@ class ClassPreprocess(BasePreprocess):

for word in sent:
if word not in word2index:
word2index[word[0]] = len(word2index)
word2index[word] = len(word2index)
return word2index, label2index

def to_index(self, data):
"""
Convert word strings and label strings into indices.
"""Convert word strings and label strings into indices.
:param data: three-level list
::

[
[ [word_11, word_12, ...], label_1 ],
[ [word_21, word_22, ...], label_2 ],
...
]

:return data_index: the same shape as data, but each string is replaced by its corresponding index
"""
data_index = []
@@ -294,14 +338,15 @@ class ClassPreprocess(BasePreprocess):


def infer_preprocess(pickle_path, data):
"""
Preprocess over inference data.
Transform three-level list of strings into that of index.
"""Preprocess over inference data. Transform three-level list of strings into that of index.
::
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]

"""
word2index = load_pickle(pickle_path, "word2id.pkl")
data_index = []
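
A minimal sketch of the pickle helpers documented above; the path and data below are toy assumptions:

```python
import os
from fastNLP.core.preprocess import load_pickle, save_pickle

os.makedirs("./save", exist_ok=True)
data = [
    [["we", "like", "it"], ["O", "O", "O"]],  # three-level list: words, labels
]

save_pickle(data, "./save/", "data_train.pkl")       # prints: data_train.pkl saved in ./save/
restored = load_pickle("./save/", "data_train.pkl")  # prints: data_train.pkl loaded from ./save/
assert restored == data
```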


fastNLP/core/tester.py  +42 -22

@@ -38,7 +38,7 @@ class BaseTester(object):
Obviously, "required_args" is the subset of "default_args".
The value in "default_args" to the keys in "required_args" is simply for type check.
"""
# TODO: required arguments
# add required arguments here
required_args = {}

for req_key in required_args:
@@ -56,7 +56,7 @@ class BaseTester(object):
logger.error(msg)
raise ValueError(msg)
else:
# BeseTester doesn't care about extra arguments
# BaseTester doesn't care about extra arguments
pass
print(default_args)

@@ -69,8 +69,8 @@ class BaseTester(object):
self.print_every_step = default_args["print_every_step"]

self._model = None
self.eval_history = []
self.batch_output = []
self.eval_history = [] # evaluation results of all batches
self.batch_output = [] # outputs of all batches

def test(self, network, dev_data):
if torch.cuda.is_available() and self.use_cuda:
@@ -83,10 +83,10 @@ class BaseTester(object):
self.eval_history.clear()
self.batch_output.clear()

iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=False))
step = 0

for batch_x, batch_y in self.make_batch(iterator, dev_data):
for batch_x, batch_y in self.make_batch(iterator):
with torch.no_grad():
prediction = self.data_forward(network, batch_x)
eval_results = self.evaluate(prediction, batch_y)
@@ -98,8 +98,8 @@ class BaseTester(object):

print_output = "[test step {}] {}".format(step, eval_results)
logger.info(print_output)
if step % self.print_every_step == 0:
print(print_output)
if self.print_every_step > 0 and step % self.print_every_step == 0:
print(self.make_eval_output(prediction, eval_results))
step += 1

def mode(self, model, test):
@@ -115,28 +115,48 @@ class BaseTester(object):
raise NotImplementedError

def evaluate(self, predict, truth):
"""Compute evaluation metrics for the model. """
"""Compute evaluation metrics.

:param predict: Tensor
:param truth: Tensor
:return eval_results: can be anything. It will be stored in self.eval_history
"""
raise NotImplementedError

@property
def metrics(self):
"""Return a list of metrics. """
"""Compute and return metrics.
Use self.eval_history to compute metrics over the whole dev set.
Please refer to metrics.py for common metric functions.

:return : variable number of outputs
"""
raise NotImplementedError

def show_matrices(self):
"""This is called by Trainer to print evaluation results on dev set during training.
def show_metrics(self):
"""Customize evaluation outputs in Trainer.
Called by Trainer to print evaluation results on dev set during training.
Use self.metrics to fetch available metrics.

:return print_str: str
"""
raise NotImplementedError

def make_batch(self, iterator, data):
def make_batch(self, iterator):
raise NotImplementedError

def make_eval_output(self, predictions, eval_results):
"""Customize Tester outputs.

:param predictions: Tensor
:param eval_results: Tensor
:return: str, to be printed.
"""
raise NotImplementedError

class SeqLabelTester(BaseTester):
"""
Tester for sequence labeling.
"""Tester for sequence labeling.
"""

def __init__(self, **test_args):
@@ -187,22 +207,22 @@ class SeqLabelTester(BaseTester):
# make sure "results" is in the same device as "truth"
results = results.to(truth)
accuracy = torch.sum(results == truth.view((-1,))).to(torch.float) / results.shape[0]
return [loss.data, accuracy.data]
return [float(loss), float(accuracy)]

def metrics(self):
batch_loss = np.mean([x[0] for x in self.eval_history])
batch_accuracy = np.mean([x[1] for x in self.eval_history])
return batch_loss, batch_accuracy

def show_matrices(self):
"""
This is called by Trainer to print evaluation on dev set.
def show_metrics(self):
"""This is called by Trainer to print evaluation on dev set.
:return print_str: str
"""
loss, accuracy = self.metrics()
return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy)

def make_batch(self, iterator, data):
def make_batch(self, iterator):
return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True)


@@ -211,12 +231,12 @@ class ClassificationTester(BaseTester):

def __init__(self, **test_args):
"""
:param test_args: a dict-like object that has __getitem__ method, \
:param test_args: a dict-like object that has __getitem__ method.
can be accessed by "test_args["key_str"]"
"""
super(ClassificationTester, self).__init__(**test_args)

def make_batch(self, iterator, data, max_len=None):
def make_batch(self, iterator, max_len=None):
return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len)

def data_forward(self, network, x):
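
A hedged sketch of driving `SeqLabelTester` directly; the argument names are taken from `default_valid_args` in the Trainer diff below, and the model and dev data are placeholders:

```python
from fastNLP.core.tester import SeqLabelTester

test_args = {"save_output": True, "validate_in_training": True,
             "save_dev_input": True, "save_loss": True,
             "batch_size": 8, "pickle_path": "./save/",
             "use_cuda": False, "print_every_step": 10}
tester = SeqLabelTester(**test_args)

# model: a trained PyTorch sequence-labeling model; dev_data: indexed dev set.
# tester.test(model, dev_data)
# print(tester.show_metrics())  # e.g. "dev loss=1.23, accuracy=0.90"
```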


fastNLP/core/trainer.py  +61 -99

@@ -1,11 +1,11 @@
import _pickle
import copy
import os
import time
from datetime import timedelta

import numpy as np
import torch
import tensorboardX
from tensorboardX import SummaryWriter

from fastNLP.core.action import Action
from fastNLP.core.action import RandomSampler, Batchifier
@@ -16,16 +16,12 @@ from fastNLP.modules import utils
from fastNLP.saver.logger import create_logger
from fastNLP.saver.model_saver import ModelSaver

DEFAULT_QUEUE_SIZE = 300
logger = create_logger(__name__, "./train_test.log")


class BaseTrainer(object):
"""Operations to train a model, including data loading, SGD, and validation.
"""Operations of training a model, including data loading, gradient descent, and validation.

Subclasses must implement the following abstract methods:
- grad_backward
- get_loss
"""

def __init__(self, **kwargs):
@@ -33,10 +29,10 @@ class BaseTrainer(object):
:param kwargs: dict of (key, value), or dict-like object. key is str.

The base trainer requires the following keys:
- epochs: int, the number of epochs in training
- validate: bool, whether or not to validate on dev set
- batch_size: int
- pickle_path: str, the path to pickle files for pre-processing
- epochs: int, the number of epochs in training
- validate: bool, whether or not to validate on dev set
- batch_size: int
- pickle_path: str, the path to pickle files for pre-processing
"""
super(BaseTrainer, self).__init__()

@@ -47,8 +43,8 @@ class BaseTrainer(object):
Otherwise, error will raise.
"""
default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/",
"save_best_dev": True, "model_name": "default_model_name.pkl",
"loss": Loss(None),
"save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1,
"loss": Loss(None), # used to pass type check
"optimizer": Optimizer("Adam", lr=0.001, weight_decay=0)
}
"""
@@ -57,7 +53,7 @@ class BaseTrainer(object):
Obviously, "required_args" is the subset of "default_args".
The value in "default_args" to the keys in "required_args" is simply for type check.
"""
# TODO: required arguments
# add required arguments here
required_args = {}

for req_key in required_args:
@@ -86,55 +82,46 @@ class BaseTrainer(object):
self.save_best_dev = default_args["save_best_dev"]
self.use_cuda = default_args["use_cuda"]
self.model_name = default_args["model_name"]
self.print_every_step = default_args["print_every_step"]

self._model = None
self._loss_func = default_args["loss"].get() # return a pytorch loss function or None
self._optimizer = None
self._optimizer_proto = default_args["optimizer"]
self._summary_writer = SummaryWriter(self.pickle_path + 'tensorboard_logs')
self._graph_summaried = False

def train(self, network, train_data, dev_data=None):
"""General Training Steps
"""General Training Procedure

:param network: a model
:param train_data: three-level list, the training set.
:param dev_data: three-level list, the validation data (optional)

The method is framework independent.
Work by calling the following methods:
- prepare_input
- mode
- define_optimizer
- data_forward
- get_loss
- grad_backward
- update
Subclasses must implement these methods with a specific framework.
"""
# prepare model and data, transfer model to gpu if available
# transfer model to gpu if available
if torch.cuda.is_available() and self.use_cuda:
self._model = network.cuda()
# self._model is used to access model-specific loss
else:
self._model = network

# define tester over dev data
# define Tester over dev data
if self.validate:
default_valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
"save_loss": True, "batch_size": self.batch_size, "pickle_path": self.pickle_path,
"use_cuda": self.use_cuda}
"use_cuda": self.use_cuda, "print_every_step": 0}
validator = self._create_validator(default_valid_args)
logger.info("validator defined as {}".format(str(validator)))

# optimizer and loss
self.define_optimizer()
logger.info("optimizer defined as {}".format(str(self._optimizer)))
self.define_loss()
logger.info("loss function defined as {}".format(str(self._loss_func)))

# main training epochs
n_samples = len(train_data)
n_batches = n_samples // self.batch_size
n_print = 1
# main training procedure
start = time.time()
logger.info("training epochs started")

for epoch in range(1, self.n_epochs + 1):
logger.info("training epoch {}".format(epoch))

@@ -144,23 +131,31 @@ class BaseTrainer(object):
data_iterator = iter(Batchifier(RandomSampler(train_data), self.batch_size, drop_last=False))
logger.info("prepared data iterator")

self._train_step(data_iterator, network, start=start, n_print=n_print, epoch=epoch)
# one forward and backward pass
self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch)

# validation
if self.validate:
logger.info("validation started")
validator.test(network, dev_data)

if self.save_best_dev and self.best_eval_result(validator):
self.save_model(network, self.model_name)
print("saved better model selected by dev")
logger.info("saved better model selected by dev")
print("Saved better model selected by validation.")
logger.info("Saved better model selected by validation.")

valid_results = validator.show_matrices()
valid_results = validator.show_metrics()
print("[epoch {}] {}".format(epoch, valid_results))
logger.info("[epoch {}] {}".format(epoch, valid_results))

def _train_step(self, data_iterator, network, **kwargs):
"""Training process in one epoch."""
"""Training process in one epoch.

kwargs should contain:
- n_print: int, print training information every n steps.
- start: time.time(), the starting time of this step.
- epoch: int,
"""
step = 0
for batch_x, batch_y in self.make_batch(data_iterator):

@@ -169,8 +164,13 @@ class BaseTrainer(object):
loss = self.get_loss(prediction, batch_y)
self.grad_backward(loss)
self.update()
self._summary_writer.add_scalar("loss", loss.item(), global_step=step)

if step % kwargs["n_print"] == 0:
if not self._graph_summaried:
self._summary_writer.add_graph(network, batch_x)
self._graph_summaried = True

if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
end = time.time()
diff = timedelta(seconds=round(end - kwargs["start"]))
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.2} time: {}".format(
@@ -204,21 +204,6 @@ class BaseTrainer(object):
network_copy = copy.deepcopy(network)
self.train(network_copy, train_data_cv[i], dev_data_cv[i])

def load_train_data(self, pickle_path):
"""
For task-specific processing.
:param pickle_path:
:return data_train
"""
file_path = os.path.join(pickle_path, "data_train.pkl")
if os.path.exists(file_path):
with open(file_path, 'rb') as f:
data = _pickle.load(f)
else:
logger.error("cannot find training data {}. invalid input path for training data.".format(file_path))
raise RuntimeError("cannot find training data {}".format(file_path))
return data

def make_batch(self, iterator):
raise NotImplementedError

@@ -226,14 +211,13 @@ class BaseTrainer(object):
Action.mode(network, test)

def define_optimizer(self):
"""
Define framework-specific optimizer specified by the models.
"""Define framework-specific optimizer specified by the models.
"""
self._optimizer = self._optimizer_proto.construct_from_pytorch(self._model.parameters())

def update(self):
"""
Perform weight update on a model.
"""Perform weight update on a model.

For PyTorch, just call optimizer to update.
"""
@@ -243,8 +227,8 @@ class BaseTrainer(object):
raise NotImplementedError

def grad_backward(self, loss):
"""
Compute gradient with link rules.
"""Compute gradient with link rules.
:param loss: a scalar where back-prop starts

For PyTorch, just do "loss.backward()"
@@ -253,8 +237,8 @@ class BaseTrainer(object):
loss.backward()

def get_loss(self, predict, truth):
"""
Compute loss given prediction and ground truth.
"""Compute loss given prediction and ground truth.
:param predict: prediction label vector
:param truth: ground truth label vector
:return: a scalar
@@ -262,8 +246,9 @@ class BaseTrainer(object):
return self._loss_func(predict, truth)

def define_loss(self):
"""
if the model defines a loss, use model's loss.
"""Define a loss for the trainer.

If the model defines a loss, use model's loss.
Otherwise, Trainer must has a loss argument, use it as loss.
These two losses cannot be defined at the same time.
Trainer does not handle loss definition or choose default losses.
@@ -280,53 +265,30 @@ class BaseTrainer(object):
logger.info("The model didn't define loss, use Trainer's loss.")

def best_eval_result(self, validator):
"""
"""Check if the current epoch yields better validation results.

:param validator: a Tester instance
:return: bool, True means current results on dev set is the best.
"""
raise NotImplementedError

def save_model(self, network, model_name):
"""
"""Save this model with such a name.
This method may be called multiple times by Trainer to overwrite the saved model with a better one.

:param network: the PyTorch model
:param model_name: str
model_best_dev.pkl may be overwritten by a better model in future epochs.
"""
if model_name[-4:] != ".pkl":
model_name += ".pkl"
ModelSaver(self.pickle_path + model_name).save_pytorch(network)
ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network)

def _create_validator(self, valid_args):
raise NotImplementedError


class ToyTrainer(BaseTrainer):
"""
An example to show the definition of Trainer.
"""

def __init__(self, training_args):
super(ToyTrainer, self).__init__(training_args)

def load_train_data(self, data_path):
data_train = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
return data_train, data_dev, 0, 1

def data_forward(self, network, x):
return network(x)

def grad_backward(self, loss):
self._model.zero_grad()
loss.backward()

def get_loss(self, pred, truth):
return np.mean(np.square(pred - truth))


class SeqLabelTrainer(BaseTrainer):
"""
Trainer for Sequence Modeling
"""Trainer for Sequence Labeling

"""

@@ -356,11 +318,11 @@ class SeqLabelTrainer(BaseTrainer):
return y

def get_loss(self, predict, truth):
"""
Compute loss given prediction and ground truth.
"""Compute loss given prediction and ground truth.
:param predict: prediction label vector, [batch_size, max_len, tag_size]
:param truth: ground truth label vector, [batch_size, max_len]
:return: a scalar
:return loss: a scalar
"""
batch_size, max_len = predict.size(0), predict.size(1)
assert truth.shape == (batch_size, max_len)
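# illustrative shapes per the docstring: predict [B, L, T] (e.g. [32, 20, 10]) is scored against truth [B, L] to yield a scalar loss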
@@ -384,7 +346,7 @@ class SeqLabelTrainer(BaseTrainer):


class ClassificationTrainer(BaseTrainer):
"""Trainer for classification."""
"""Trainer for text classification."""

def __init__(self, **train_args):
super(ClassificationTrainer, self).__init__(**train_args)


+ 162 - 70  fastNLP/fastnlp.py

@@ -1,4 +1,7 @@
import os

from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer
from fastNLP.core.preprocess import load_pickle
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader

@@ -7,14 +10,13 @@ mapping from model name to [URL, file_name.class_name, model_pickle_name]
Notice that the class of the model should be in "models" directory.

Example:
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
"""
FastNLP_MODEL_COLLECTION = {
"seq_label_model": {
"url": "www.fudan.edu.cn",
"class": "sequence_modeling.SeqLabeling",
"class": "sequence_modeling.SeqLabeling", # file_name.class_name in models/
"pickle": "seq_label_model.pkl",
"type": "seq_label"
"type": "seq_label",
"config_file_name": "config", # the name of the config file which stores model initialization parameters
"config_section_name": "text_class_model" # the name of the section in the config file which stores model init params
},
"text_class_model": {
"url": "www.fudan.edu.cn",
@@ -22,11 +24,34 @@ FastNLP_MODEL_COLLECTION = {
"pickle": "text_class_model.pkl",
"type": "text_class"
}
"""
FastNLP_MODEL_COLLECTION = {
"cws_basic_model": {
"url": "",
"class": "sequence_modeling.AdvSeqLabel",
"pickle": "cws_basic_model_v_0.pkl",
"type": "seq_label",
"config_file_name": "config",
"config_section_name": "text_class_model"
},
"pos_tag_model": {
"url": "",
"class": "sequence_modeling.AdvSeqLabel",
"pickle": "pos_tag_model_v_0.pkl",
"type": "seq_label",
"config_file_name": "pos_tag.config",
"config_section_name": "pos_tag_model"
},
"text_classify_model": {
"url": "",
"class": "cnn_text_classification.CNNText",
"pickle": "text_class_model_v0.pkl",
"type": "text_class",
"config_file_name": "text_classify.cfg",
"config_section_name": "model"
}
}

CONFIG_FILE_NAME = "config"
SECTION_NAME = "text_class_model"


class FastNLP(object):
"""
@@ -51,10 +76,13 @@ class FastNLP(object):
self.model = None
self.infer_type = None # "seq_label"/"text_class"

def load(self, model_name):
def load(self, model_name, config_file="config", section_name="model"):
"""
Load a pre-trained FastNLP model together with additional data.
:param model_name: str, the name of a FastNLP model.
:param config_file: str, the name of the config file which stores the initialization information of the model.
(default: "config")
:param section_name: str, the name of the corresponding section in the config file. (default: "model")
"""
assert type(model_name) is str
if model_name not in FastNLP_MODEL_COLLECTION:
@@ -64,37 +92,47 @@ class FastNLP(object):
self._download(model_name, FastNLP_MODEL_COLLECTION[model_name]["url"])

model_class = self._get_model_class(FastNLP_MODEL_COLLECTION[model_name]["class"])
print("Restore model class {}".format(str(model_class)))

model_args = ConfigSection()
ConfigLoader.load_config(self.model_dir + CONFIG_FILE_NAME, {SECTION_NAME: model_args})
ConfigLoader.load_config(os.path.join(self.model_dir, config_file), {section_name: model_args})
print("Restore model hyper-parameters {}".format(str(model_args.data)))

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(self.model_dir, "word2id.pkl")
model_args["vocab_size"] = len(word2index)
index2label = load_pickle(self.model_dir, "id2class.pkl")
model_args["num_classes"] = len(index2label)

# Construct the model
model = model_class(model_args)
print("Model constructed.")

# TODO: make model loading framework-independent
ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name]["pickle"])
ModelLoader.load_pytorch(model, os.path.join(self.model_dir, FastNLP_MODEL_COLLECTION[model_name]["pickle"]))
print("Model weights loaded.")

self.model = model
self.infer_type = FastNLP_MODEL_COLLECTION[model_name]["type"]

print("Model loaded. ")
print("Inference ready.")

def run(self, raw_input):
"""
Perform inference over given input using the loaded model.
:param raw_input: str, raw text
:param raw_input: list of string. Each string is an input query.
:return outputs: list, one prediction per input query.
"""

infer = self._create_inference(self.model_dir)

# string ---> 2-D list of string
infer_input = self.string_to_list(raw_input)
# tokenize: list of string ---> 2-D list of string
infer_input = self.tokenize(raw_input, language="zh")

# 2-D list of string ---> list of strings
# 2-D list of string ---> 2-D list of tags
results = infer.predict(self.model, infer_input)

# list of strings ---> final answers
# 2-D list of tags ---> list of final answers
outputs = self._make_output(results, infer_input)
return outputs
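A minimal end-to-end sketch of the load/run API above. The directory is hypothetical; it must already hold the pickles (word2id.pkl, id2class.pkl, the model .pkl) and the config file named in FastNLP_MODEL_COLLECTION:

from fastNLP.fastnlp import FastNLP

nlp = FastNLP(model_dir="/path/to/cws/save/")  # hypothetical directory of saved pickles
nlp.load("cws_basic_model", config_file="cws.cfg", section_name="POS_test")
results = nlp.run(["这是一个测试。"])
# for a "seq_label" model: one [(token, tag), ...] list per input query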

@@ -142,81 +180,135 @@ class FastNLP(object):
"""
return True

def string_to_list(self, text, delimiter="\n"):
"""
This function is used to transform raw input to lists, which is done by DatasetLoader in training.
Split text string into three-level lists.
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
:param text: string
:param delimiter: str, character used to split text into sentences.
:return data: two-level lists
def tokenize(self, text, language):
"""Extract tokens from strings.
For English, extract words separated by space.
For Chinese, extract characters.
TODO: more complex tokenization methods

:param text: list of string
:param language: str, one of ('zh', 'en'), Chinese or English.
:return data: list of list of string, each string is a token.
"""
assert language in ("zh", "en")
data = []
sents = text.strip().split(delimiter)
for sent in sents:
characters = []
for ch in sent:
characters.append(ch)
data.append(characters)
for sent in text:
if language == "en":
tokens = sent.strip().split()
elif language == "zh":
tokens = [char for char in sent]
else:
raise RuntimeError("Unknown language {}".format(language))
data.append(tokens)
return data
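A quick sketch of the two tokenization modes (`nlp` stands for any FastNLP instance):

nlp.tokenize(["this is fastNLP"], language="en")  # -> [["this", "is", "fastNLP"]]
nlp.tokenize(["深度学习"], language="zh")  # -> [["深", "度", "学", "习"]]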

def _make_output(self, results, infer_input):
"""Transform the infer output into user-friendly output.

:param results: 1 or 2-D list of strings.
If self.infer_type == "seq_label", it is of shape [num_examples, tag_seq_length]
If self.infer_type == "text_class", it is of shape [num_examples]
:param infer_input: 2-D list of string, the input query before inference.
:return outputs: list. Each entry is a prediction.
"""
if self.infer_type == "seq_label":
outputs = make_seq_label_output(results, infer_input)
elif self.infer_type == "text_class":
outputs = make_class_output(results, infer_input)
else:
raise ValueError("fail to make outputs with infer type {}".format(self.infer_type))
raise RuntimeError("fail to make outputs with infer type {}".format(self.infer_type))
return outputs


def make_seq_label_output(result, infer_input):
"""
Transform model output into user-friendly contents.
:param result: 1-D list of strings. (model output)
"""Transform model output into user-friendly contents.
:param result: 2-D list of strings. (model output)
:param infer_input: 2-D list of string (model input)
:return outputs:
:return ret: list of list of tuples
[
[(word_11, label_11), (word_12, label_12), ...],
[(word_21, label_21), (word_22, label_22), ...],
...
]
"""
return result

ret = []
for example_x, example_y in zip(infer_input, result):
ret.append([(x, y) for x, y in zip(example_x, example_y)])
return ret

def make_class_output(result, infer_input):
"""Transform model output into user-friendly contents.

:param result: list of strings, one predicted label per example. (model output)
:param infer_input: 2-D list of string. (model input)
:return ret: the same as result, [label_1, label_2, ...]
"""
return result


def interpret_word_seg_results(infer_input, results):
"""
Transform model output into user-friendly contents.
def interpret_word_seg_results(char_seq, label_seq):
"""Transform model output into user-friendly contents.
Example: In CWS, convert <BMES> labeling into segmented text.
:param results: list of strings. (model output)
:param infer_input: 2-D list of string (model input)
:return output: list of strings
:param char_seq: list of string.
:param label_seq: list of string, the same length as char_seq
Each entry is one of ('B', 'M', 'E', 'S').
:return output: list of words
"""
outputs = []
for sent_char, sent_label in zip(infer_input, results):
words = []
word = ""
for char, label in zip(sent_char, sent_label):
if label[0] == "B":
if word != "":
words.append(word)
word = char
elif label[0] == "M":
word += char
elif label[0] == "E":
word += char
words = []
word = ""
for char, label in zip(char_seq, label_seq):
if label[0] == "B":
if word != "":
words.append(word)
word = ""
elif label[0] == "S":
if word != "":
words.append(word)
word = ""
words.append(char)
else:
raise ValueError("invalid label")
outputs.append(" ".join(words))
word = char
elif label[0] == "M":
word += char
elif label[0] == "E":
word += char
words.append(word)
word = ""
elif label[0] == "S":
if word != "":
words.append(word)
word = ""
words.append(char)
else:
raise ValueError("invalid label {}".format(label[0]))
return words
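A worked example of the BMES decoding above, on toy input:

chars = ["这", "是", "中", "文", "分", "词"]
labels = ["S", "S", "B", "E", "B", "E"]
interpret_word_seg_results(chars, labels)  # -> ["这", "是", "中文", "分词"]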


def interpret_cws_pos_results(char_seq, label_seq):
"""Transform model output into user-friendly contents.

:param char_seq: list of string
:param label_seq: list of string, the same length as char_seq.
:return outputs: list of tuples (word, pos_tag).
"""

def pos_tag_check(seq):
"""check whether all entries are the same """
return len(set(seq)) <= 1

word = []
word_pos = []
outputs = []
for char, label in zip(char_seq, label_seq):
tmp = label.split("-")
cws_label, pos_tag = tmp[0], tmp[1]

if cws_label == "B" or cws_label == "M":
word.append(char)
word_pos.append(pos_tag)
elif cws_label == "E":
word.append(char)
word_pos.append(pos_tag)
if not pos_tag_check(word_pos):
raise RuntimeError("character-wise pos tags inconsistent. ")
outputs.append(("".join(word), word_pos[0]))
word.clear()
word_pos.clear()
elif cws_label == "S":
outputs.append((char, pos_tag))
return outputs
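A worked example of the joint CWS+POS decoding, on toy input whose labels follow the <cws_label>-<pos_tag> format the function splits on:

chars = ["江", "泽", "民", "说"]
labels = ["B-nr", "M-nr", "E-nr", "S-v"]
interpret_cws_pos_results(chars, labels)  # -> [("江泽民", "nr"), ("说", "v")]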

+ 3 - 4  fastNLP/loader/base_loader.py

@@ -1,9 +1,8 @@
class BaseLoader(object):
"""docstring for BaseLoader"""

def __init__(self, data_name, data_path):
def __init__(self, data_path):
super(BaseLoader, self).__init__()
self.data_name = data_name
self.data_path = data_path

def load(self):
@@ -25,8 +24,8 @@ class ToyLoader0(BaseLoader):
For charLM
"""

def __init__(self, name, path):
super(ToyLoader0, self).__init__(name, path)
def __init__(self, data_path):
super(ToyLoader0, self).__init__(data_path)

def load(self):
with open(self.data_path, 'r') as f:


+ 62 - 18  fastNLP/loader/dataset_loader.py

@@ -6,8 +6,8 @@ from fastNLP.loader.base_loader import BaseLoader
class DatasetLoader(BaseLoader):
""""loader for data sets"""

def __init__(self, data_name, data_path):
super(DatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(DatasetLoader, self).__init__(data_path)


class POSDatasetLoader(DatasetLoader):
@@ -31,8 +31,8 @@ class POSDatasetLoader(DatasetLoader):
to label5.
"""

def __init__(self, data_name, data_path):
super(POSDatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(POSDatasetLoader, self).__init__(data_path)

def load(self):
assert os.path.exists(self.data_path)
@@ -84,8 +84,8 @@ class TokenizeDatasetLoader(DatasetLoader):
Data set loader for tokenization data sets
"""

def __init__(self, data_name, data_path):
super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(TokenizeDatasetLoader, self).__init__(data_path)

def load_pku(self, max_seq_len=32):
"""
@@ -138,8 +138,8 @@ class TokenizeDatasetLoader(DatasetLoader):
class ClassDatasetLoader(DatasetLoader):
"""Loader for classification data sets"""

def __init__(self, data_name, data_path):
super(ClassDatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(ClassDatasetLoader, self).__init__(data_path)

def load(self):
assert os.path.exists(self.data_path)
@@ -177,7 +177,7 @@ class ConllLoader(DatasetLoader):
:param str data_name: the name of the conll data set
:param str data_path: the path to the conll data set
"""
super(ConllLoader, self).__init__(data_name, data_path)
super(ConllLoader, self).__init__(data_path)
self.data_set = self.parse(self.load())

def load(self):
@@ -209,8 +209,8 @@ class ConllLoader(DatasetLoader):


class LMDatasetLoader(DatasetLoader):
def __init__(self, data_name, data_path):
super(LMDatasetLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(LMDatasetLoader, self).__init__(data_path)

def load(self):
if not os.path.exists(self.data_path):
@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader):
return text.strip().split()


if __name__ == "__main__":
class PeopleDailyCorpusLoader(DatasetLoader):
"""
data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
for example in data:
for w, l in zip(example[0], example[1]):
print(w, l)
People's Daily Corpus: Chinese word segmentation, POS tagging and NER annotations
"""

ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
print(ans)
def __init__(self, data_path):
super(PeopleDailyCorpusLoader, self).__init__(data_path)

def load(self):
with open(self.data_path, "r", encoding="utf-8") as f:
sents = f.readlines()

pos_tag_examples = []
ner_examples = []
for sent in sents:
inside_ne = False
sent_pos_tag = []
sent_words = []
sent_ner = []
words = sent.strip().split()[1:]
for word in words:
if "[" in word and "]" in word:
ner_tag = "U"
print(word)
elif "[" in word:
inside_ne = True
ner_tag = "B"
word = word[1:]
elif "]" in word:
ner_tag = "L"
word = word[:word.index("]")]
if inside_ne is True:
inside_ne = False
else:
raise RuntimeError("only ] appears!")
else:
if inside_ne is True:
ner_tag = "I"
else:
ner_tag = "O"
tmp = word.split("/")
token, pos = tmp[0], tmp[1]
sent_ner.append(ner_tag)
sent_pos_tag.append(pos)
sent_words.append(token)
pos_tag_examples.append([sent_words, sent_pos_tag])
ner_examples.append([sent_words, sent_ner])
return pos_tag_examples, ner_examples
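For reference, a sketch of the bracketed named-entity convention this parser expects; the sample line is illustrative, not taken from the corpus, and the first token of each line is a document id that split()[1:] drops:

# input line:  19980101-01-001-001/m  迈向/v  [中共/j 中央/n]nt  总书记/n
# pos_tag_examples[0] -> [["迈向", "中共", "中央", "总书记"], ["v", "j", "n", "n"]]
# ner_examples[0]     -> [["迈向", "中共", "中央", "总书记"], ["O", "B", "L", "O"]]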

if __name__ == "__main__":
loader = PeopleDailyCorpusLoader("./")
pos, ner = loader.load()
print(pos[:10])
print(ner[:10])

+ 44 - 2  fastNLP/loader/embed_loader.py

@@ -1,8 +1,50 @@
import _pickle
import os

import numpy as np

from fastNLP.loader.base_loader import BaseLoader


class EmbedLoader(BaseLoader):
"""docstring for EmbedLoader"""

def __init__(self, data_name, data_path):
super(EmbedLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(EmbedLoader, self).__init__(data_path)

@staticmethod
def load_embedding(emb_dim, emb_file, word_dict, emb_pkl):
"""Load the pre-trained embedding and combine with the given dictionary.

:param emb_file: str, the pre-trained embedding.
The embedding file should have the following format:
Each line is a word embedding, where a word string is followed by multiple floats.
Floats are separated by space. The word and the first float are separated by space.
:param word_dict: dict, a mapping from word to index.
:param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding.
:param emb_pkl: str, the embedding pickle file.
:return embedding_np: numpy array of shape (len(word_dict), emb_dim)

TODO: fragile code
"""
# If the embedding pickle exists, load it and return.
if os.path.exists(emb_pkl):
with open(emb_pkl, "rb") as f:
embedding_np = _pickle.load(f)
return embedding_np
# Otherwise, load the pre-trained embedding.
with open(emb_file, "r", encoding="utf-8") as f:
# begin with a random embedding
embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
for line in f:
line = line.strip().split()
if len(line) != emb_dim + 1:
# skip this line if the embedding dimension does not match
continue
if line[0] in word_dict:
# find the word and replace its embedding with a pre-trained one
embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
# save and return the result
with open(emb_pkl, "wb") as f:
_pickle.dump(embedding_np, f)
return embedding_np
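A minimal usage sketch; the file names are hypothetical, and on the first call the combined matrix is cached to emb_pkl while later calls simply load the cache:

word_dict = {"the": 0, "cat": 1, "sat": 2}
emb = EmbedLoader.load_embedding(emb_dim=50, emb_file="glove.6B.50d.txt",
                                 word_dict=word_dict, emb_pkl="save/emb.pkl")
# emb: numpy array of shape (3, 50); words absent from emb_file keep their random init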

+ 2 - 2  fastNLP/loader/model_loader.py

@@ -8,8 +8,8 @@ class ModelLoader(BaseLoader):
Loader for models.
"""

def __init__(self, data_name, data_path):
super(ModelLoader, self).__init__(data_name, data_path)
def __init__(self, data_path):
super(ModelLoader, self).__init__(data_path)

@staticmethod
def load_pytorch(empty_model, model_path):


+ 6 - 6  fastNLP/models/cnn_text_classification.py

@@ -5,7 +5,7 @@ import torch
import torch.nn as nn

# import torch.nn.functional as F
from fastNLP.modules.encoder.conv_maxpool import ConvMaxpool
import fastNLP.modules.encoder as encoder


class CNNText(torch.nn.Module):
@@ -18,22 +18,22 @@ class CNNText(torch.nn.Module):
def __init__(self, args):
super(CNNText, self).__init__()

class_num = args["num_classes"]
num_classes = args["num_classes"]
kernel_nums = [100, 100, 100]
kernel_sizes = [3, 4, 5]
embed_num = args["vocab_size"]
vocab_size = args["vocab_size"]
embed_dim = 300
pretrained_embed = None
drop_prob = 0.5

# no support for pre-trained embedding currently
self.embed = nn.Embedding(embed_num, embed_dim, padding_idx=0)
self.conv_pool = ConvMaxpool(
self.embed = encoder.embedding.Embedding(vocab_size, embed_dim)
self.conv_pool = encoder.conv_maxpool.ConvMaxpool(
in_channels=embed_dim,
out_channels=kernel_nums,
kernel_sizes=kernel_sizes)
self.dropout = nn.Dropout(drop_prob)
self.fc = nn.Linear(sum(kernel_nums), class_num)
self.fc = encoder.linear.Linear(sum(kernel_nums), num_classes)

def forward(self, x):
x = self.embed(x) # [N,L] -> [N,L,C]
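A shape walk-through of the constructor above, as a sketch under the stated hyper-parameters (not the project's test code):

import torch
from fastNLP.models.cnn_text_classification import CNNText

model = CNNText({"num_classes": 5, "vocab_size": 100})
x = torch.randint(0, 100, (4, 20))  # [N=4, L=20] token ids
logits = model(x)
# embed: [4, 20] -> [4, 20, 300]; conv + max-pool over kernel sizes (3, 4, 5) -> [4, 300]; fc -> [4, 5]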


+ 2 - 1  fastNLP/modules/decoder/__init__.py

@@ -1,3 +1,4 @@
from .CRF import ConditionalRandomField
from .MLP import MLP

__all__ = ["ConditionalRandomField"]
__all__ = ["ConditionalRandomField", "MLP"]

+ 3 - 1  fastNLP/modules/encoder/__init__.py

@@ -2,8 +2,10 @@ from .embedding import Embedding
from .linear import Linear
from .lstm import Lstm
from .conv import Conv
from .conv_maxpool import ConvMaxpool

__all__ = ["Lstm",
"Embedding",
"Linear",
"Conv"]
"Conv",
"ConvMaxpool"]

+ 5 - 0  fastNLP/modules/encoder/conv_maxpool.py

@@ -4,6 +4,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_


class ConvMaxpool(nn.Module):
@@ -21,6 +22,7 @@ class ConvMaxpool(nn.Module):
if isinstance(kernel_sizes, int):
out_channels = [out_channels]
kernel_sizes = [kernel_sizes]

self.convs = nn.ModuleList([nn.Conv1d(
in_channels=in_channels,
out_channels=oc,
@@ -31,6 +33,9 @@ class ConvMaxpool(nn.Module):
groups=groups,
bias=bias)
for oc, ks in zip(out_channels, kernel_sizes)])

for conv in self.convs:
xavier_uniform_(conv.weight) # weight initialization
else:
raise Exception(
'Incorrect kernel sizes: should be list, tuple or int')


+ 1 - 1  fastNLP/modules/encoder/embedding.py

@@ -15,7 +15,7 @@ class Embedding(nn.Module):
def __init__(self, nums, dims, padding_idx=0, sparse=False, init_emb=None, dropout=0.0):
super(Embedding, self).__init__()
self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)
if init_emb:
if init_emb is not None:
self.embed.weight = nn.Parameter(init_emb)
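# note: `if init_emb:` would ask for the truth value of a Tensor, which is ambiguous
# (and raises) for multi-element tensors; comparing against None is the safe check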
self.dropout = nn.Dropout(dropout)



BIN  fastnlp-architecture.jpg (960 × 540, 36 kB)

+ 0 - 5331  reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg (file diff suppressed because it is too large)


+ 0 - 5331  reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos (file diff suppressed because it is too large)


BIN  reproduction/HAN-document_classification/data/test_samples.pkl


BIN  reproduction/HAN-document_classification/data/train_samples.pkl


BIN  reproduction/HAN-document_classification/data/yelp.word2vec


+ 0 - 114  reproduction/chinese_word_seg/cws_train.py

@@ -1,114 +0,0 @@
import sys

sys.path.append("..")

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.core.predictor import Predictor

data_name = "pku_training.utf8"
cws_data_path = "/home/zyfeng/data/pku_training.utf8"
pickle_path = "./save/"
data_infer_path = "/home/zyfeng/data/pku_test.utf8"


def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl")
test_args["num_classes"] = len(index2label)

# Define the same model
model = SeqLabeling(test_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")

# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
infer_data = raw_data_loader.load_lines()

# Inference interface
infer = Predictor(pickle_path)
results = infer.predict(model, infer_data)

print(results)
print("Inference finished!")


def train_test():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args})

# Data Loader
loader = TokenizeDatasetLoader(data_name, cws_data_path)
train_data = loader.load_pku()

# Preprocessor
preprocess = SeqLabelPreprocess()
data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = preprocess.vocab_size
train_args["num_classes"] = preprocess.num_classes

# Trainer
trainer = SeqLabelTrainer(train_args)

# Model
model = SeqLabeling(train_args)

# Start training
trainer.train(model, data_train, data_dev)
print("Training finished!")

# Saver
saver = ModelSaver("./save/saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")

# testing with validation set
test(data_dev)


def test(test_data):
# Config Loader
train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})

# Define the same model
model = SeqLabeling(train_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
print("model loaded!")

# Load test configuration
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

# Tester
tester = SeqLabelTester(test_args)

# Start testing
tester.test(model, test_data)

# print test results
print(tester.show_matrices())
print("model tested!")


if __name__ == "__main__":
train_test()

+ 12 - 0  reproduction/chinese_word_segment/cws.cfg

@@ -31,4 +31,16 @@ pickle_path = "./save/"
use_crf = true
use_cuda = true
rnn_hidden_units = 100
word_emb_dim = 100

[model]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 640
pickle_path = "./save/"
use_crf = true
use_cuda = true
rnn_hidden_units = 100
word_emb_dim = 100

+ 25 - 20  reproduction/chinese_word_segment/run.py

@@ -1,33 +1,33 @@
import sys, os
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.inference import SeqLabelInfer
from fastNLP.core.optimizer import SGD
from fastNLP.core.predictor import SeqLabelInfer

# chdir to this file's directory if the script is launched from elsewhere
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))
datadir = 'icwb2-data'
cfgfile = 'cws.cfg'
datadir = "/home/zyfeng/data/"
cfgfile = './cws.cfg'
data_name = "pku_training.utf8"

cws_data_path = os.path.join(datadir, "training/pku_training.utf8")
cws_data_path = os.path.join(datadir, "pku_training.utf8")
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")

def infer():
# Config Loader
test_args = ConfigSection()
ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -47,7 +47,7 @@ def infer():
raise

# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines()
print('data loaded')

@@ -63,19 +63,20 @@ def train():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})

# Data Loader
loader = TokenizeDatasetLoader(data_name, cws_data_path)
loader = TokenizeDatasetLoader(cws_data_path)
train_data = loader.load_pku()

# Preprocessor
p = POSPreprocess(train_data, pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes
preprocessor = SeqLabelPreprocess()
data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = preprocessor.vocab_size
train_args["num_classes"] = preprocessor.num_classes

# Trainer
trainer = SeqLabelTrainer(train_args)
trainer = SeqLabelTrainer(**train_args.data)

# Model
model = AdvSeqLabel(train_args)
@@ -83,10 +84,11 @@ def train():
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as e:
print("No saved model. Continue.")
pass
# Start training
trainer.train(model)
trainer.train(model, data_train, data_dev)
print("Training finished!")

# Saver
@@ -98,7 +100,7 @@ def train():
def test():
# Config Loader
test_args = ConfigSection()
ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -106,6 +108,9 @@ def test():
index2label = load_pickle(pickle_path, "id2class.pkl")
test_args["num_classes"] = len(index2label)

# load dev data
dev_data = load_pickle(pickle_path, "data_dev.pkl")

# Define the same model
model = AdvSeqLabel(test_args)

@@ -114,13 +119,13 @@ def test():
print("model loaded!")

# Tester
tester = SeqLabelTester(test_args)
tester = SeqLabelTester(**test_args.data)

# Start testing
tester.test(model)
tester.test(model, dev_data)

# print test results
print(tester.show_matrices())
print(tester.show_metrics())
print("model tested!")




reproduction/chinese_word_seg/cws.cfg → reproduction/pos_tag_model/pos_tag.cfg

@@ -1,29 +1,35 @@
[train]
epochs = 10
batch_size = 32
epochs = 30
batch_size = 64
pickle_path = "./save/"
validate = true
save_best_dev = true
model_saved_path = "./save/"
rnn_hidden_units = 100
rnn_layers = 2
rnn_bi_direction = true
word_emb_dim = 100
dropout = 0.5
use_crf = true
use_cuda = true
print_every_step = 10

[test]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 64
batch_size = 640
pickle_path = "./save/"
use_crf = true
use_cuda = true


[POS_test]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 640
pickle_path = "./save/"
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
word_emb_dim = 100
dropout = 0.5
use_crf = true
use_cuda = true
rnn_hidden_units = 100
word_emb_dim = 100

+ 146 - 0  reproduction/pos_tag_model/train_pos_tag.py

@@ -0,0 +1,146 @@
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer

# chdir to this file's directory if the script is launched from elsewhere
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))
datadir = "/home/zyfeng/data/"
cfgfile = './pos_tag.cfg'
data_name = "CWS_POS_TAG_NER_people_daily.txt"

pos_tag_data_path = os.path.join(datadir, data_name)
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")


def infer():
# Config Loader
test_args = ConfigSection()
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl")
test_args["num_classes"] = len(index2label)

# Define the same model
model = AdvSeqLabel(test_args)

try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model loaded!')
except Exception as e:
print('cannot load model!')
raise

# Data Loader
raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines()
print('data loaded')

# Inference interface
infer = SeqLabelInfer(pickle_path)
results = infer.predict(model, infer_data)

print(results)
print("Inference finished!")


def train():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args})

# Data Loader
loader = PeopleDailyCorpusLoader(pos_tag_data_path)
train_data, _ = loader.load()

# Preprocessor
preprocessor = SeqLabelPreprocess()
data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = preprocessor.vocab_size
train_args["num_classes"] = preprocessor.num_classes

# Trainer
trainer = SeqLabelTrainer(**train_args.data)

# Model
model = AdvSeqLabel(train_args)
try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as e:
print("No saved model. Continue.")
pass

# Start training
trainer.train(model, data_train, data_dev)
print("Training finished!")

# Saver
saver = ModelSaver("./save/saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")


def test():
# Config Loader
test_args = ConfigSection()
ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl")
test_args["num_classes"] = len(index2label)

# load dev data
dev_data = load_pickle(pickle_path, "data_dev.pkl")

# Define the same model
model = AdvSeqLabel(test_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print("model loaded!")

# Tester
tester = SeqLabelTester(**test_args.data)

# Start testing
tester.test(model, dev_data)

# print test results
print(tester.show_metrics())
print("model tested!")


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(description='Run a Chinese POS-tagging model')
parser.add_argument('--mode', help='set the running mode', choices=['train', 'test', 'infer'])
args = parser.parse_args()
if args.mode == 'train':
train()
elif args.mode == 'test':
test()
elif args.mode == 'infer':
infer()
else:
print('no mode specified for model!')
parser.print_help()

+ 1 - 0  requirements.txt

@@ -1,3 +1,4 @@
numpy>=1.14.2
torch==0.4.0
torchvision>=0.1.8
tensorboardX

+ 24 - 0  setup.py

@@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding=utf-8
from setuptools import setup, find_packages

with open('README.md') as f:
readme = f.read()

with open('LICENSE') as f:
license = f.read()

with open('requirements.txt') as f:
reqs = f.read()

setup(
name='fastNLP',
version='0.0.1',
description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team',
long_description=readme,
license=license,
author='fudanNLP',
python_requires='>=3.5',
packages=find_packages(),
install_requires=reqs.strip().split('\n'),
)

+ 1 - 2  test/core/test_action.py

@@ -1,9 +1,8 @@
import os

import unittest

from fastNLP.core.action import Action, Batchifier, SequentialSampler


class TestAction(unittest.TestCase):
def test_case_1(self):
x = [1, 2, 3, 4, 5, 6, 7, 8]


+ 2 - 0  test/loader/test_loader.py

@@ -33,8 +33,10 @@ class TestConfigLoader(unittest.TestCase):

test_arg = ConfigSection()
ConfigLoader("config").load_config(os.path.join("./test/loader", "config"), {"test": test_arg})

section = read_section_from_config(os.path.join("./test/loader", "config"), "test")


for sec in section:
if (sec not in test_arg) or (section[sec] != test_arg[sec]):
raise AttributeError("ERROR")


+ 0 - 138  test/ner.py

@@ -1,138 +0,0 @@
import _pickle
import os

import numpy as np
import torch

from fastNLP.core.preprocess import SeqLabelPreprocess
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.models.sequence_modeling import AdvSeqLabel


class MyNERTrainer(SeqLabelTrainer):
def __init__(self, train_args):
super(MyNERTrainer, self).__init__(train_args)
self.scheduler = None

def define_optimizer(self):
"""
override
:return:
"""
self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001)
self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5)

def update(self):
"""
override
:return:
"""
self.optimizer.step()
self.scheduler.step()

def _create_validator(self, valid_args):
return MyNERTester(valid_args)

def best_eval_result(self, validator):
accuracy = validator.metrics()
if accuracy > self.best_accuracy:
self.best_accuracy = accuracy
return True
else:
return False


class MyNERTester(SeqLabelTester):
def __init__(self, test_args):
super(MyNERTester, self).__init__(test_args)

def _evaluate(self, prediction, batch_y, seq_len):
"""
:param prediction: [batch_size, seq_len, num_classes]
:param batch_y: [batch_size, seq_len]
:param seq_len: [batch_size]
:return:
"""
summ = 0
correct = 0
_, indices = torch.max(prediction, 2)
for p, y, l in zip(indices, batch_y, seq_len):
summ += l
correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy())
return float(correct / summ)

def evaluate(self, predict, truth):
return self._evaluate(predict, truth, self.seq_len)

def metrics(self):
return np.mean(self.eval_history)

def show_matrices(self):
return "dev accuracy={:.2f}".format(float(self.metrics()))


def embedding_process(emb_file, word_dict, emb_dim, emb_pkl):
if os.path.exists(emb_pkl):
with open(emb_pkl, "rb") as f:
embedding_np = _pickle.load(f)
return embedding_np
with open(emb_file, "r", encoding="utf-8") as f:
embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
for line in f:
line = line.strip().split()
if len(line) != emb_dim + 1:
continue
if line[0] in word_dict:
embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
with open(emb_pkl, "wb") as f:
_pickle.dump(embedding_np, f)
return embedding_np


def data_load(data_file):
with open(data_file, "r", encoding="utf-8") as f:
all_data = []
sent = []
label = []
for line in f:
line = line.strip().split()

if not len(line) <= 1:
sent.append(line[0])
label.append(line[1])
else:
all_data.append([sent, label])
sent = []
label = []
return all_data


data_path = "data_for_tests/people.txt"
pick_path = "data_for_tests/"
emb_path = "data_for_tests/emb50.txt"
save_path = "data_for_tests/"
if __name__ == "__main__":
data = data_load(data_path)
preprocess = SeqLabelPreprocess()
data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3)
# emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
emb = None
args = {"epochs": 20,
"batch_size": 1,
"pickle_path": pick_path,
"validate": True,
"save_best_dev": True,
"model_saved_path": save_path,
"use_cuda": True,

"vocab_size": preprocess.vocab_size,
"num_classes": preprocess.num_classes,
"word_emb_dim": 50,
"rnn_hidden_units": 100
}
# emb = torch.Tensor(emb).float().cuda()
networks = AdvSeqLabel(args, emb)
trainer = MyNERTrainer(args)
trainer.train(networks, data_train, data_dev)
print("Training finished!")

+ 0 - 129  test/ner_decode.py

@@ -1,129 +0,0 @@
import _pickle
import os

import torch

from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.models.sequence_modeling import AdvSeqLabel


class Decode(SeqLabelTrainer):
def __init__(self, args):
super(Decode, self).__init__(args)

def decoder(self, network, sents, model_path):
self.model = network
self.model.load_state_dict(torch.load(model_path))
out_put = []
self.mode(network, test=True)
for batch_x in sents:
prediction = self.data_forward(self.model, batch_x)

seq_tag = self.model.prediction(prediction, batch_x[1])

out_put.append(list(seq_tag)[0])
return out_put


def process_sent(sents, word2id):
sents_num = []
for s in sents:
sent_num = []
for c in s:
if c in word2id:
sent_num.append(word2id[c])
else:
sent_num.append(word2id["<unk>"])
sents_num.append(([sent_num], [len(sent_num)])) # batch_size is 1

return sents_num


def process_tag(sents, tags, id2class):
Tags = []
for ttt in tags:
Tags.append([id2class[t] for t in ttt])

Segs = []
PosNers = []
for sent, tag in zip(sents, tags):
word__ = []
lll__ = []
for c, t in zip(sent, tag):

t = id2class[t]
l = t.split("-")
split_ = l[0]
pn = l[1]

if split_ == "S":
word__.append(c)
lll__.append(pn)
word_1 = ""
elif split_ == "E":
word_1 += c
word__.append(word_1)
lll__.append(pn)
word_1 = ""
elif split_ == "B":
word_1 = ""
word_1 += c
else:
word_1 += c
Segs.append(word__)
PosNers.append(lll__)
return Segs, PosNers


pickle_path = "data_for_tests/"
model_path = "data_for_tests/model_best_dev.pkl"
if __name__ == "__main__":

with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f:
id2word = _pickle.load(f)
with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f:
word2id = _pickle.load(f)
with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f:
id2class = _pickle.load(f)

sent = ["中共中央总书记、国家主席江泽民",
"逆向处理输入序列并返回逆序后的序列"] # here is input

args = {"epochs": 1,
"batch_size": 1,
"pickle_path": "data_for_tests/",
"validate": True,
"save_best_dev": True,
"model_saved_path": "data_for_tests/",
"use_cuda": False,

"vocab_size": len(word2id),
"num_classes": len(id2class),
"word_emb_dim": 50,
"rnn_hidden_units": 100,
}
"""
network = AdvSeqLabel(args, None)
decoder_ = Decode(args)
tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path)
output_seg, output_pn = process_tag(sent, tags_num, id2class) # here is output
print(output_seg)
print(output_pn)
"""
# Define the same model
model = AdvSeqLabel(args, None)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl")
print("model loaded!")

# Inference interface
infer = SeqLabelInfer(pickle_path)
sent = [[ch for ch in s] for s in sent]
results = infer.predict(model, sent)

for res in results:
print(res)
print("Inference finished!")

+ 23 - 27  test/readme_example.py

@@ -1,19 +1,13 @@
# python: 3.5
# pytorch: 0.4

################
# Test cross validation.
################

from fastNLP.loader.preprocess import ClassPreprocess

from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.predictor import ClassificationInfer
from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import aggregation
from fastNLP.modules import encoder
from fastNLP.modules import decoder
from fastNLP.modules import encoder


class ClassificationModel(BaseModel):
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel):
self.enc = encoder.Conv(
in_channels=300, out_channels=100, kernel_size=3)
self.agg = aggregation.MaxPool()
self.dec = decoder.MLP(100, num_classes=num_classes)
self.dec = decoder.MLP(size_layer=[100, num_classes])

def forward(self, x):
x = self.emb(x) # [N,L] -> [N,L,C]
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel):
return x


data_dir = 'data' # directory to save data and model
train_path = 'test/data_for_tests/text_classify.txt' # training set file
data_dir = 'save/' # directory to save data and model
train_path = './data_for_tests/text_classify.txt' # training set file

# load dataset
ds_loader = ClassDatasetLoader("train", train_path)
ds_loader = ClassDatasetLoader(train_path)
data = ds_loader.load()

# pre-process dataset
pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5)
# pre = ClassPreprocess(data, data_dir)
n_classes = pre.num_classes
vocab_size = pre.vocab_size
pre = ClassPreprocess()
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
n_classes, vocab_size = pre.num_classes, pre.vocab_size

# construct model
model_args = {
@@ -58,22 +51,25 @@ model_args = {
}
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)

# train model
# construct trainer
train_args = {
"epochs": 10,
"batch_size": 50,
"epochs": 3,
"batch_size": 16,
"pickle_path": data_dir,
"validate": False,
"save_best_dev": False,
"model_saved_path": None,
"use_cuda": True,
"learn_rate": 1e-3,
"momentum": 0.9}
trainer = ClassificationTrainer(train_args)
# trainer.train(model, ['data_train.pkl', 'data_dev.pkl'])
trainer.cross_validate(model)
"loss": Loss("cross_entropy"),
"optimizer": Optimizer("Adam", lr=0.001)
}
trainer = ClassificationTrainer(**train_args)

# start training
trainer.train(model, train_data=train_set, dev_data=dev_set)

# predict using model
data_infer = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model, data_infer)
labels_pred = infer.predict(model.cpu(), data_infer)
print(labels_pred)

+ 8 - 8  test/seq_labeling.py

@@ -33,7 +33,7 @@ data_infer_path = args.infer
def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args})
ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -49,7 +49,7 @@ def infer():
print("model loaded!")

# Data Loader
raw_data_loader = BaseLoader("xxx", data_infer_path)
raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines()

# Inference interface
@@ -65,11 +65,11 @@ def train_and_test():
# Config Loader
trainer_args = ConfigSection()
model_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config(config_dir, {
ConfigLoader("config.cfg").load_config(config_dir, {
"test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})

# Data Loader
pos_loader = POSDatasetLoader("xxx", data_path)
pos_loader = POSDatasetLoader(data_path)
train_data = pos_loader.load_lines()

# Preprocessor
@@ -117,13 +117,13 @@ def train_and_test():

# Load test configuration
tester_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args})
ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args})

# Tester
tester = SeqLabelTester(save_output=False,
save_loss=False,
save_best_dev=False,
batch_size=8,
batch_size=4,
use_cuda=False,
pickle_path=pickle_path,
model_name="seq_label_in_test.pkl",
@@ -134,10 +134,10 @@ def train_and_test():
tester.test(model, data_dev)

# print test results
print(tester.show_matrices())
print(tester.show_metrics())
print("model tested!")


if __name__ == "__main__":
train_and_test()
# train_and_test()
infer()

+ 8 - 8  test/test_cws.py

@@ -22,7 +22,7 @@ data_infer_path = "data_for_tests/people_infer.txt"
def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -38,7 +38,7 @@ def infer():
print("model loaded!")

# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines()
"""
Transform strings into list of list of strings.
@@ -61,10 +61,10 @@ def infer():
def train_test():
# Config Loader
train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

# Data Loader
loader = TokenizeDatasetLoader(data_name, cws_data_path)
loader = TokenizeDatasetLoader(cws_data_path)
train_data = loader.load_pku()

# Preprocessor
@@ -74,7 +74,7 @@ def train_test():
train_args["num_classes"] = p.num_classes

# Trainer
trainer = SeqLabelTrainer(train_args)
trainer = SeqLabelTrainer(**train_args.data)

# Model
model = SeqLabeling(train_args)
@@ -99,16 +99,16 @@ def train_test():

# Load test configuration
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

# Tester
tester = SeqLabelTester(test_args)
tester = SeqLabelTester(**test_args.data)

# Start testing
tester.test(model, data_train)

# print test results
print(tester.show_matrices())
print(tester.show_metrics())
print("model tested!")




+ 69 - 7  test/test_fastNLP.py

@@ -1,13 +1,27 @@
import sys

sys.path.append("..")
from fastNLP.fastnlp import FastNLP
from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results

PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/"

def word_seg():
nlp = FastNLP("./data_for_tests/")
nlp.load("seq_label_model")
text = "这是最好的基于深度学习的中文分词系统。"
result = nlp.run(text)
print(result)
print("FastNLP finished!")
nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)
nlp.load("cws_basic_model", config_file="cws.cfg", section_name="POS_test")
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
results = nlp.run(text)
print(results)
for example in results:
words, labels = [], []
for res in example:
words.append(res[0])
labels.append(res[1])
print(interpret_word_seg_results(words, labels))


def text_class():
@@ -19,5 +33,53 @@ def text_class():
print("FastNLP finished!")


def test_word_seg_interpret():
foo = [[('这', 'S'), ('是', 'S'), ('最', 'S'), ('好', 'S'), ('的', 'S'), ('基', 'B'), ('于', 'E'), ('深', 'B'), ('度', 'E'),
('学', 'B'), ('习', 'E'), ('的', 'S'), ('中', 'B'), ('文', 'E'), ('分', 'B'), ('词', 'E'), ('系', 'B'), ('统', 'E'),
('。', 'S')]]
chars = [x[0] for x in foo[0]]
labels = [x[1] for x in foo[0]]
print(interpret_word_seg_results(chars, labels))


def test_interpret_cws_pos_results():
foo = [
[('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'),
('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'),
('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')]
]
chars = [x[0] for x in foo[0]]
labels = [x[1] for x in foo[0]]
print(interpret_cws_pos_results(chars, labels))


def pos_tag():
nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
results = nlp.run(text)
for example in results:
words, labels = [], []
for res in example:
words.append(res[0])
labels.append(res[1])
print(interpret_cws_pos_results(words, labels))


def text_classify():
nlp = FastNLP(model_dir=PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES)
nlp.load("text_classify_model", config_file="text_classify.cfg", section_name="model")
text = [
"世界物联网大会明日在京召开龙头股启动在即",
"乌鲁木齐市新增一处城市中心旅游目的地",
"朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"]
results = nlp.run(text)
print(results)
"""
['finance', 'travel', 'history']
"""

if __name__ == "__main__":
text_class()
text_classify()

+ 7 - 7  test/test_tester.py

@@ -5,19 +5,19 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "pku_training.utf8"
cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
pickle_path = "data_for_tests"


def foo():
loader = TokenizeDatasetLoader(data_name, "./data_for_tests/cws_pku_utf_8")
loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8")
train_data = loader.load_pku()

train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

# Preprocessor
p = SeqLabelPreprocess(train_data, pickle_path)
p = SeqLabelPreprocess()
train_data = p.run(train_data)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes

@@ -26,11 +26,11 @@ def foo():
valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
"save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/",
"use_cuda": True}
validator = SeqLabelTester(valid_args)
validator = SeqLabelTester(**valid_args)

print("start validation.")
validator.test(model)
print(validator.show_matrices())
validator.test(model, train_data)
print(validator.show_metrics())


if __name__ == "__main__":


+ 2 - 2  test/text_classify.py

@@ -34,7 +34,7 @@ config_dir = args.config
def infer():
# load dataset
print("Loading data...")
ds_loader = ClassDatasetLoader("train", train_data_dir)
ds_loader = ClassDatasetLoader(train_data_dir)
data = ds_loader.load()
unlabeled_data = [x[0] for x in data]

@@ -69,7 +69,7 @@ def train():

# load dataset
print("Loading data...")
ds_loader = ClassDatasetLoader("train", train_data_dir)
ds_loader = ClassDatasetLoader(train_data_dir)
data = ds_loader.load()
print(data[0])


