diff --git a/README.md b/README.md
index 8ad0f018..84d658fd 100644
--- a/README.md
+++ b/README.md
@@ -16,93 +16,19 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa
 - numpy>=1.14.2
 - torch==0.4.0
 - torchvision>=0.1.8
+- tensorboardX
 
 ## Resources
 
-- [Documentation](https://github.com/fastnlp/fastNLP)
+- [Documentation](https://fastnlp.readthedocs.io/en/latest/)
 - [Source Code](https://github.com/fastnlp/fastNLP)
 
-
-## Example
-
-### Basic Usage
-
-A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model.
-```python
-from fastNLP.models.base_model import BaseModel
-from fastNLP.modules import encoder
-from fastNLP.modules import aggregation
-from fastNLP.modules import decoder
-
-from fastNLP.loader.dataset_loader import ClassDatasetLoader
-from fastNLP.loader.preprocess import ClassPreprocess
-from fastNLP.core.trainer import ClassificationTrainer
-from fastNLP.core.inference import ClassificationInfer
-
-
-class ClassificationModel(BaseModel):
-    """
-    Simple text classification model based on CNN.
-    """
-
-    def __init__(self, num_classes, vocab_size):
-        super(ClassificationModel, self).__init__()
-
-        self.emb = encoder.Embedding(nums=vocab_size, dims=300)
-        self.enc = encoder.Conv(
-            in_channels=300, out_channels=100, kernel_size=3)
-        self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
-
-    def forward(self, x):
-        x = self.emb(x)  # [N,L] -> [N,L,C]
-        x = self.enc(x)  # [N,L,C_in] -> [N,L,C_out]
-        x = self.agg(x)  # [N,L,C] -> [N,C]
-        x = self.dec(x)  # [N,C] -> [N, N_class]
-        return x
-
-
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
-
-# load dataset
-ds_loader = ClassDatasetLoader("train", train_path)
-data = ds_loader.load()
-
-# pre-process dataset
-pre = ClassPreprocess(data_dir)
-vocab_size, n_classes = pre.process(data, "data_train.pkl")
-
-# construct model
-model_args = {
-    'num_classes': n_classes,
-    'vocab_size': vocab_size
-}
-model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
-
-# train model
-train_args = {
-    "epochs": 20,
-    "batch_size": 50,
-    "pickle_path": data_dir,
-    "validate": False,
-    "save_best_dev": False,
-    "model_saved_path": None,
-    "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
-trainer.train(model)
-
-# predict using model
-seqs = [x[0] for x in data]
-infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, seqs)
-```
-
-
 ## Installation
+Run the following command to install the fastNLP package.
+```shell +pip install fastNLP +``` ### Cloning From GitHub @@ -122,20 +48,26 @@ conda install pytorch torchvision -c pytorch pip3 install torch torchvision ``` +### TensorboardX Installation + +```shell +pip3 install tensorboardX +``` ## Project Structure ``` FastNLP ├── docs -│   └── quick_tutorial.md ├── fastNLP -│   ├── action +│   ├── core │   │   ├── action.py -│   │   ├── inference.py │   │   ├── __init__.py +│   │   ├── loss.py │   │   ├── metrics.py │   │   ├── optimizer.py +│   │   ├── predictor.py +│   │   ├── preprocess.py │   │   ├── README.md │   │   ├── tester.py │   │   └── trainer.py @@ -147,71 +79,28 @@ FastNLP │   │   ├── dataset_loader.py │   │   ├── embed_loader.py │   │   ├── __init__.py -│   │   ├── model_loader.py -│   │   └── preprocess.py +│   │   └── model_loader.py │   ├── models -│   │   ├── base_model.py -│   │   ├── char_language_model.py -│   │   ├── cnn_text_classification.py -│   │   ├── __init__.py -│   │   └── sequence_modeling.py │   ├── modules │   │   ├── aggregation -│   │   │   ├── attention.py -│   │   │   ├── avg_pool.py -│   │   │   ├── __init__.py -│   │   │   ├── kmax_pool.py -│   │   │   ├── max_pool.py -│   │   │   └── self_attention.py │   │   ├── decoder -│   │   │   ├── CRF.py -│   │   │   └── __init__.py │   │   ├── encoder -│   │   │   ├── char_embedding.py -│   │   │   ├── conv_maxpool.py -│   │   │   ├── conv.py -│   │   │   ├── embedding.py -│   │   │   ├── __init__.py -│   │   │   ├── linear.py -│   │   │   ├── lstm.py -│   │   │   ├── masked_rnn.py -│   │   │   └── variational_rnn.py │   │   ├── __init__.py │   │   ├── interaction -│   │   │   └── __init__.py │   │   ├── other_modules.py │   │   └── utils.py │   └── saver -│   ├── base_saver.py -│   ├── __init__.py -│   ├── logger.py -│   └── model_saver.py ├── LICENSE ├── README.md ├── reproduction -│   ├── Char-aware_NLM -│   │   -│   ├── CNN-sentence_classification -│   │   -│   ├── HAN-document_classification -│   │   -│   └── LSTM+self_attention_sentiment_analysis -| ├── requirements.txt ├── setup.py └── test + ├── core ├── data_for_tests - │   ├── charlm.txt - │   ├── config - │   ├── cws_test - │   ├── cws_train - │   ├── people_infer.txt - │   └── people.txt - ├── test_charlm.py - ├── test_cws.py - ├── test_fastNLP.py - ├── test_loader.py - ├── test_seq_labeling.py - ├── test_tester.py - └── test_trainer.py + ├── __init__.py + ├── loader + ├── modules + └── readme_example.py + ``` diff --git a/docs/requirements.txt b/docs/requirements.txt index 3749c2cd..2809876b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ -sphinx --e git://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme -sphinxcontrib.katex \ No newline at end of file +numpy>=1.14.2 +http://download.pytorch.org/whl/cpu/torch-0.4.1-cp35-cp35m-linux_x86_64.whl +torchvision>=0.1.8 +sphinx-rtd-theme==0.4.1 \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index d4d73d2a..ff3639fa 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -42,6 +42,8 @@ release = '1.0' extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', + 'sphinx.ext.autosummary', + ] # Add any paths that contain templates here, relative to this directory. 
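Since tensorboardX is now a hard dependency (the `fastNLP/core/trainer.py` changes further down log the training loss and the model graph through it), a quick sanity check after installing is to write a few dummy scalars and open them in TensorBoard. This snippet is an illustrative sketch, not part of the patch; the log directory name is arbitrary:

```python
from tensorboardX import SummaryWriter

# write a dummy decreasing "loss" curve to ./sanity_logs (arbitrary path)
writer = SummaryWriter("./sanity_logs")
for step in range(10):
    writer.add_scalar("loss", 1.0 / (step + 1), global_step=step)
writer.close()

# inspect afterwards with: tensorboard --logdir ./sanity_logs
```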
diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst index 880be59f..13943f72 100644 --- a/docs/source/fastNLP.core.rst +++ b/docs/source/fastNLP.core.rst @@ -1,62 +1,54 @@ -fastNLP.core package -==================== +fastNLP.core +============= -Submodules ----------- - -fastNLP.core.action module --------------------------- +fastNLP.core.action +-------------------- .. automodule:: fastNLP.core.action :members: - :undoc-members: - :show-inheritance: -fastNLP.core.metrics module ---------------------------- +fastNLP.core.loss +------------------ + +.. automodule:: fastNLP.core.loss + :members: + +fastNLP.core.metrics +--------------------- .. automodule:: fastNLP.core.metrics :members: - :undoc-members: - :show-inheritance: -fastNLP.core.optimizer module ------------------------------ +fastNLP.core.optimizer +----------------------- .. automodule:: fastNLP.core.optimizer :members: - :undoc-members: - :show-inheritance: -fastNLP.core.predictor module ------------------------------ +fastNLP.core.predictor +----------------------- .. automodule:: fastNLP.core.predictor :members: - :undoc-members: - :show-inheritance: -fastNLP.core.tester module --------------------------- +fastNLP.core.preprocess +------------------------ + +.. automodule:: fastNLP.core.preprocess + :members: + +fastNLP.core.tester +-------------------- .. automodule:: fastNLP.core.tester :members: - :undoc-members: - :show-inheritance: -fastNLP.core.trainer module ---------------------------- +fastNLP.core.trainer +--------------------- .. automodule:: fastNLP.core.trainer :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- .. automodule:: fastNLP.core :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.loader.rst b/docs/source/fastNLP.loader.rst index 90123b5b..658e07ff 100644 --- a/docs/source/fastNLP.loader.rst +++ b/docs/source/fastNLP.loader.rst @@ -1,62 +1,36 @@ -fastNLP.loader package -====================== +fastNLP.loader +=============== -Submodules ----------- - -fastNLP.loader.base\_loader module ----------------------------------- +fastNLP.loader.base\_loader +---------------------------- .. automodule:: fastNLP.loader.base_loader :members: - :undoc-members: - :show-inheritance: -fastNLP.loader.config\_loader module ------------------------------------- +fastNLP.loader.config\_loader +------------------------------ .. automodule:: fastNLP.loader.config_loader :members: - :undoc-members: - :show-inheritance: -fastNLP.loader.dataset\_loader module -------------------------------------- +fastNLP.loader.dataset\_loader +------------------------------- .. automodule:: fastNLP.loader.dataset_loader :members: - :undoc-members: - :show-inheritance: -fastNLP.loader.embed\_loader module ------------------------------------ +fastNLP.loader.embed\_loader +----------------------------- .. automodule:: fastNLP.loader.embed_loader :members: - :undoc-members: - :show-inheritance: -fastNLP.loader.model\_loader module ------------------------------------ +fastNLP.loader.model\_loader +----------------------------- .. automodule:: fastNLP.loader.model_loader :members: - :undoc-members: - :show-inheritance: - -fastNLP.loader.preprocess module --------------------------------- - -.. automodule:: fastNLP.loader.preprocess - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- .. 
automodule:: fastNLP.loader :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 49481ac1..f17b1d49 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -1,46 +1,30 @@ -fastNLP.models package -====================== +fastNLP.models +=============== -Submodules ----------- - -fastNLP.models.base\_model module ---------------------------------- +fastNLP.models.base\_model +--------------------------- .. automodule:: fastNLP.models.base_model :members: - :undoc-members: - :show-inheritance: -fastNLP.models.char\_language\_model module -------------------------------------------- +fastNLP.models.char\_language\_model +------------------------------------- .. automodule:: fastNLP.models.char_language_model :members: - :undoc-members: - :show-inheritance: -fastNLP.models.cnn\_text\_classification module ------------------------------------------------ +fastNLP.models.cnn\_text\_classification +----------------------------------------- .. automodule:: fastNLP.models.cnn_text_classification :members: - :undoc-members: - :show-inheritance: -fastNLP.models.sequence\_modeling module ----------------------------------------- +fastNLP.models.sequence\_modeling +---------------------------------- .. automodule:: fastNLP.models.sequence_modeling :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- .. automodule:: fastNLP.models :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.modules.aggregation.rst b/docs/source/fastNLP.modules.aggregation.rst index 7106f7bc..bfaf8646 100644 --- a/docs/source/fastNLP.modules.aggregation.rst +++ b/docs/source/fastNLP.modules.aggregation.rst @@ -1,54 +1,36 @@ -fastNLP.modules.aggregation package -=================================== +fastNLP.modules.aggregation +============================ -Submodules ----------- - -fastNLP.modules.aggregation.attention module --------------------------------------------- +fastNLP.modules.aggregation.attention +-------------------------------------- .. automodule:: fastNLP.modules.aggregation.attention :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.aggregation.avg\_pool module --------------------------------------------- +fastNLP.modules.aggregation.avg\_pool +-------------------------------------- .. automodule:: fastNLP.modules.aggregation.avg_pool :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.aggregation.kmax\_pool module ---------------------------------------------- +fastNLP.modules.aggregation.kmax\_pool +--------------------------------------- .. automodule:: fastNLP.modules.aggregation.kmax_pool :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.aggregation.max\_pool module --------------------------------------------- +fastNLP.modules.aggregation.max\_pool +-------------------------------------- .. automodule:: fastNLP.modules.aggregation.max_pool :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.aggregation.self\_attention module --------------------------------------------------- +fastNLP.modules.aggregation.self\_attention +-------------------------------------------- .. automodule:: fastNLP.modules.aggregation.self_attention :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- .. 
automodule:: fastNLP.modules.aggregation :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.modules.decoder.rst b/docs/source/fastNLP.modules.decoder.rst index 914802da..6844543a 100644 --- a/docs/source/fastNLP.modules.decoder.rst +++ b/docs/source/fastNLP.modules.decoder.rst @@ -1,22 +1,18 @@ -fastNLP.modules.decoder package -=============================== +fastNLP.modules.decoder +======================== -Submodules ----------- - -fastNLP.modules.decoder.CRF module ----------------------------------- +fastNLP.modules.decoder.CRF +---------------------------- .. automodule:: fastNLP.modules.decoder.CRF :members: - :undoc-members: - :show-inheritance: +fastNLP.modules.decoder.MLP +---------------------------- + +.. automodule:: fastNLP.modules.decoder.MLP + :members: -Module contents ---------------- .. automodule:: fastNLP.modules.decoder :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.modules.encoder.rst b/docs/source/fastNLP.modules.encoder.rst index 3af14b64..41b4ce13 100644 --- a/docs/source/fastNLP.modules.encoder.rst +++ b/docs/source/fastNLP.modules.encoder.rst @@ -1,78 +1,54 @@ -fastNLP.modules.encoder package -=============================== +fastNLP.modules.encoder +======================== -Submodules ----------- - -fastNLP.modules.encoder.char\_embedding module ----------------------------------------------- +fastNLP.modules.encoder.char\_embedding +---------------------------------------- .. automodule:: fastNLP.modules.encoder.char_embedding :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.encoder.conv module ------------------------------------ +fastNLP.modules.encoder.conv +----------------------------- .. automodule:: fastNLP.modules.encoder.conv :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.encoder.conv\_maxpool module --------------------------------------------- +fastNLP.modules.encoder.conv\_maxpool +-------------------------------------- .. automodule:: fastNLP.modules.encoder.conv_maxpool :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.encoder.embedding module ----------------------------------------- +fastNLP.modules.encoder.embedding +---------------------------------- .. automodule:: fastNLP.modules.encoder.embedding :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.encoder.linear module -------------------------------------- +fastNLP.modules.encoder.linear +------------------------------- .. automodule:: fastNLP.modules.encoder.linear :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.encoder.lstm module ------------------------------------ +fastNLP.modules.encoder.lstm +----------------------------- .. automodule:: fastNLP.modules.encoder.lstm :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.encoder.masked\_rnn module ------------------------------------------- +fastNLP.modules.encoder.masked\_rnn +------------------------------------ .. automodule:: fastNLP.modules.encoder.masked_rnn :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.encoder.variational\_rnn module ------------------------------------------------ +fastNLP.modules.encoder.variational\_rnn +----------------------------------------- .. automodule:: fastNLP.modules.encoder.variational_rnn :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- .. 
automodule:: fastNLP.modules.encoder :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.modules.interaction.rst b/docs/source/fastNLP.modules.interaction.rst index 32552231..91a34268 100644 --- a/docs/source/fastNLP.modules.interaction.rst +++ b/docs/source/fastNLP.modules.interaction.rst @@ -1,10 +1,5 @@ -fastNLP.modules.interaction package -=================================== - -Module contents ---------------- +fastNLP.modules.interaction +============================ .. automodule:: fastNLP.modules.interaction :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst index 5a6cac28..6ccdc21a 100644 --- a/docs/source/fastNLP.modules.rst +++ b/docs/source/fastNLP.modules.rst @@ -1,8 +1,5 @@ -fastNLP.modules package -======================= - -Subpackages ------------ +fastNLP.modules +================ .. toctree:: @@ -11,30 +8,18 @@ Subpackages fastNLP.modules.encoder fastNLP.modules.interaction -Submodules ----------- - -fastNLP.modules.other\_modules module -------------------------------------- +fastNLP.modules.other\_modules +------------------------------- .. automodule:: fastNLP.modules.other_modules :members: - :undoc-members: - :show-inheritance: -fastNLP.modules.utils module ----------------------------- +fastNLP.modules.utils +---------------------- .. automodule:: fastNLP.modules.utils :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- .. automodule:: fastNLP.modules :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst index fbc3a922..bb5037ce 100644 --- a/docs/source/fastNLP.rst +++ b/docs/source/fastNLP.rst @@ -1,8 +1,5 @@ -fastNLP package -=============== - -Subpackages ------------ +fastNLP +======== .. toctree:: @@ -12,22 +9,12 @@ Subpackages fastNLP.modules fastNLP.saver -Submodules ----------- - -fastNLP.fastnlp module ----------------------- +fastNLP.fastnlp +---------------- .. automodule:: fastNLP.fastnlp :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- .. automodule:: fastNLP :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.saver.rst b/docs/source/fastNLP.saver.rst index 7699c2e8..daa6fbe8 100644 --- a/docs/source/fastNLP.saver.rst +++ b/docs/source/fastNLP.saver.rst @@ -1,30 +1,18 @@ -fastNLP.saver package -===================== +fastNLP.saver +============== -Submodules ----------- - -fastNLP.saver.logger module ---------------------------- +fastNLP.saver.logger +--------------------- .. automodule:: fastNLP.saver.logger :members: - :undoc-members: - :show-inheritance: -fastNLP.saver.model\_saver module ---------------------------------- +fastNLP.saver.model\_saver +--------------------------- .. automodule:: fastNLP.saver.model_saver :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- .. 
automodule:: fastNLP.saver
    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/docs/source/figures/procedures_and_sequence_labeling.png b/docs/source/figures/procedures_and_sequence_labeling.png
new file mode 100644
index 00000000..06adc051
Binary files /dev/null and b/docs/source/figures/procedures_and_sequence_labeling.png differ
diff --git a/docs/source/figures/text_classification.png b/docs/source/figures/text_classification.png
new file mode 100644
index 00000000..5884c64e
Binary files /dev/null and b/docs/source/figures/text_classification.png differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1caf2373..37798321 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,16 +1,54 @@
-.. fastNLP documentation master file, created by
-   sphinx-quickstart on Mon Aug 20 17:06:44 2018.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
+fastNLP documentation
+=====================
+fastNLP is still in incubation.
 
-Welcome to fastNLP's documentation!
-===================================
+
+Introduction
+------------
+
+fastNLP is a modular natural language processing system based on PyTorch, built for fast development of NLP tools.
+It decomposes deep-learning-based NLP models into distinct modules.
+These modules fall into four categories: encoder, interaction, aggregation and decoder,
+and each category contains several concrete implementations.
+
+Most current NLP models can be built on top of these modules, which greatly simplifies the process of developing NLP models.
+The architecture of fastNLP is shown in the left part of the figure below:
+
+.. image:: figures/procedures_and_sequence_labeling.png
+
+For the model-constructing step, sequence labeling (right part of the figure above) and text classification (figure below) serve as examples:
+
+.. image:: figures/text_classification.png
+
+* encoder module: encodes the input into an abstract representation; it takes a sequence of words and outputs a sequence of vectors.
+* interaction module: lets the information inside the representation interact; it takes a sequence of vectors and outputs a sequence of vectors.
+* aggregation module: aggregates and reduces information; it takes a sequence of vectors and outputs a single vector.
+* decoder module: decodes the representation into the output, either a single label (text classification) or a sequence of labels (sequence labeling).
+
+The interaction and aggregation modules are not necessarily present in every model; the sequence labeling model above, for example, uses neither.
+
+
+
+
+User's Guide
+------------
+.. toctree::
+   :maxdepth: 2
+
+   user/installation
+   user/quickstart
+
+
+API Reference
+-------------
+
+If you are looking for information on a specific function, class or
+method, this part of the documentation is for you.
 
 .. toctree::
-   :maxdepth: 4
-   :caption: Contents:
+   :maxdepth: 2
 
-   fastNLP
+   fastNLP API
 
 
diff --git a/docs/source/modules.rst b/docs/source/modules.rst
deleted file mode 100644
index e9a92cb7..00000000
--- a/docs/source/modules.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-fastNLP
-=======
-
-.. toctree::
-   :maxdepth: 4
-
-   fastNLP
diff --git a/docs/source/user/installation.rst b/docs/source/user/installation.rst
new file mode 100644
index 00000000..0655041b
--- /dev/null
+++ b/docs/source/user/installation.rst
@@ -0,0 +1,31 @@
+============
+Installation
+============
+
+.. contents::
+   :local:
+
+
+Cloning From GitHub
+~~~~~~~~~~~~~~~~~~~
+
+If you just want to use fastNLP, use:
+
+.. code:: shell
+
+   git clone https://github.com/fastnlp/fastNLP
+   cd fastNLP
+
+
+PyTorch Installation
+~~~~~~~~~~~~~~~~~~~~
+
+Visit the `PyTorch official website <https://pytorch.org/>`_ for installation instructions based
+on your system. In general, you can use:
+
+.. code:: shell
+
+   # using conda
+   conda install pytorch torchvision -c pytorch
+   # or using pip
+   pip3 install torch torchvision
diff --git a/docs/source/user/quickstart.rst b/docs/source/user/quickstart.rst
new file mode 100644
index 00000000..c8340053
--- /dev/null
+++ b/docs/source/user/quickstart.rst
@@ -0,0 +1,84 @@
+==========
+Quickstart
+==========
+
+Example
+-------
+
+Basic Usage
+~~~~~~~~~~~
+
+A typical fastNLP routine is composed of four phases: loading dataset,
+pre-processing data, constructing model and training model.
+
+..
code:: python + + from fastNLP.models.base_model import BaseModel + from fastNLP.modules import encoder + from fastNLP.modules import aggregation + from fastNLP.modules import decoder + + from fastNLP.loader.dataset_loader import ClassDatasetLoader + from fastNLP.loader.preprocess import ClassPreprocess + from fastNLP.core.trainer import ClassificationTrainer + from fastNLP.core.inference import ClassificationInfer + + + class ClassificationModel(BaseModel): + """ + Simple text classification model based on CNN. + """ + + def __init__(self, num_classes, vocab_size): + super(ClassificationModel, self).__init__() + + self.emb = encoder.Embedding(nums=vocab_size, dims=300) + self.enc = encoder.Conv( + in_channels=300, out_channels=100, kernel_size=3) + self.agg = aggregation.MaxPool() + self.dec = decoder.MLP(100, num_classes=num_classes) + + def forward(self, x): + x = self.emb(x) # [N,L] -> [N,L,C] + x = self.enc(x) # [N,L,C_in] -> [N,L,C_out] + x = self.agg(x) # [N,L,C] -> [N,C] + x = self.dec(x) # [N,C] -> [N, N_class] + return x + + + data_dir = 'data' # directory to save data and model + train_path = 'test/data_for_tests/text_classify.txt' # training set file + + # load dataset + ds_loader = ClassDatasetLoader("train", train_path) + data = ds_loader.load() + + # pre-process dataset + pre = ClassPreprocess(data_dir) + vocab_size, n_classes = pre.process(data, "data_train.pkl") + + # construct model + model_args = { + 'num_classes': n_classes, + 'vocab_size': vocab_size + } + model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) + + # train model + train_args = { + "epochs": 20, + "batch_size": 50, + "pickle_path": data_dir, + "validate": False, + "save_best_dev": False, + "model_saved_path": None, + "use_cuda": True, + "learn_rate": 1e-3, + "momentum": 0.9} + trainer = ClassificationTrainer(train_args) + trainer.train(model) + + # predict using model + seqs = [x[0] for x in data] + infer = ClassificationInfer(data_dir) + labels_pred = infer.predict(model, seqs) \ No newline at end of file diff --git a/fastNLP/core/action.py b/fastNLP/core/action.py index 358db499..ef595cbb 100644 --- a/fastNLP/core/action.py +++ b/fastNLP/core/action.py @@ -1,7 +1,3 @@ -""" - This file defines Action(s) and sample methods. - -""" from collections import Counter import numpy as np @@ -9,13 +5,12 @@ import torch class Action(object): - """ - Operations shared by Trainer, Tester, or Inference. + """Operations shared by Trainer, Tester, or Inference. + This is designed for reducing replicate codes. - make_batch: produce a min-batch of data. @staticmethod - pad: padding method used in sequence modeling. @staticmethod - mode: change network mode for either train or test. (for PyTorch) @staticmethod - The base Action shall define operations shared by as much task-specific Actions as possible. """ def __init__(self): @@ -24,18 +19,20 @@ class Action(object): @staticmethod def make_batch(iterator, use_cuda, output_length=True, max_len=None): """Batch and Pad data. + :param iterator: an iterator, (object that implements __next__ method) which returns the next sample. :param use_cuda: bool, whether to use GPU :param output_length: bool, whether to output the original length of the sequence before padding. (default: True) :param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None) - :return - if output_length is True: + :return : + + if output_length is True, (batch_x, seq_len): tuple of two elements batch_x: list. Each entry is a list of features of a sample. 
[batch_size, max_len] seq_len: list. The length of the pre-padded sequence, if output_length is True. batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] - if output_length is False: + if output_length is False, batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] """ @@ -77,21 +74,21 @@ class Action(object): return batch @staticmethod - def mode(model, test=False): - """ - Train mode or Test mode. This is for PyTorch currently. - :param model: - :param test: + def mode(model, is_test=False): + """Train mode or Test mode. This is for PyTorch currently. + + :param model: a PyTorch model + :param is_test: bool, whether in test mode or not. """ - if test: + if is_test: model.eval() else: model.train() def convert_to_torch_tensor(data_list, use_cuda): - """ - convert lists into (cuda) Tensors. + """Convert lists into (cuda) Tensors. + :param data_list: 2-level lists :param use_cuda: bool, whether to use GPU or not :return data_list: PyTorch Tensor of shape [batch_size, max_seq_len] @@ -103,8 +100,8 @@ def convert_to_torch_tensor(data_list, use_cuda): def k_means_1d(x, k, max_iter=100): - """ - Perform k-means on 1-D data. + """Perform k-means on 1-D data. + :param x: list of int, representing points in 1-D. :param k: the number of clusters required. :param max_iter: maximum iteration @@ -132,21 +129,28 @@ def k_means_1d(x, k, max_iter=100): def k_means_bucketing(all_inst, buckets): - """ + """Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths. + :param all_inst: 3-level list + E.g. :: + [ [[word_11, word_12, word_13], [label_11. label_12]], # sample 1 [[word_21, word_22, word_23], [label_21. label_22]], # sample 2 ... ] + :param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length threshold for each bucket (This is usually None.). :return data: 2-level list + :: + [ [index_11, index_12, ...], # bucket 1 [index_21, index_22, ...], # bucket 2 ... ] + """ bucket_data = [[] for _ in buckets] num_buckets = len(buckets) @@ -160,11 +164,16 @@ def k_means_bucketing(all_inst, buckets): class BaseSampler(object): - """ - Base class for all samplers. + """The base class of all samplers. + """ def __init__(self, data_set): + """ + + :param data_set: multi-level list, of shape [num_example, *] + + """ self.data_set_length = len(data_set) self.data = data_set @@ -176,11 +185,16 @@ class BaseSampler(object): class SequentialSampler(BaseSampler): - """ - Sample data in the original order. + """Sample data in the original order. + """ def __init__(self, data_set): + """ + + :param data_set: multi-level list + + """ super(SequentialSampler, self).__init__(data_set) def __iter__(self): @@ -188,11 +202,16 @@ class SequentialSampler(BaseSampler): class RandomSampler(BaseSampler): - """ - Sample data in random permutation order. + """Sample data in random permutation order. + """ def __init__(self, data_set): + """ + + :param data_set: multi-level list + + """ super(RandomSampler, self).__init__(data_set) self.order = np.random.permutation(self.data_set_length) @@ -201,11 +220,18 @@ class RandomSampler(BaseSampler): class Batchifier(object): - """ - Wrap random or sequential sampler to generate a mini-batch. + """Wrap random or sequential sampler to generate a mini-batch. 
+ """ def __init__(self, sampler, batch_size, drop_last=True): + """ + + :param sampler: a Sampler object + :param batch_size: int, the size of the mini-batch + :param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch. + + """ super(Batchifier, self).__init__() self.sampler = sampler self.batch_size = batch_size @@ -223,8 +249,7 @@ class Batchifier(object): class BucketBatchifier(Batchifier): - """ - Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. + """Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. In sampling, first random choose a bucket. Then sample data from it. The number of buckets is decided dynamically by the variance of sentence lengths. """ @@ -237,6 +262,7 @@ class BucketBatchifier(Batchifier): :param num_buckets: int, number of buckets for grouping these sequences. :param drop_last: bool, useless currently. :param sampler: Sampler, useless currently. + """ super(BucketBatchifier, self).__init__(sampler, batch_size, drop_last) buckets = ([None] * num_buckets) diff --git a/fastNLP/core/loss.py b/fastNLP/core/loss.py index f83b4959..8d866bbf 100644 --- a/fastNLP/core/loss.py +++ b/fastNLP/core/loss.py @@ -8,8 +8,13 @@ class Loss(object): """ def __init__(self, args): + """ + + :param args: None or str, the name of a loss function. + + """ if args is None: - # this is useful when + # this is useful when Trainer.__init__ performs type check self._loss = None elif isinstance(args, str): self._loss = self._borrow_from_pytorch(args) @@ -17,10 +22,19 @@ class Loss(object): raise NotImplementedError def get(self): + """ + + :return self._loss: the loss function + """ return self._loss @staticmethod def _borrow_from_pytorch(loss_name): + """Given a name of a loss function, return it from PyTorch. 
+
+        :param loss_name: str, the name of a loss function
+        :return loss: a PyTorch loss
+        """
         if loss_name == "cross_entropy":
             return torch.nn.CrossEntropyLoss()
         else:
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index c8d7fe52..7bf4b034 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -1,11 +1,12 @@
 import warnings
+
 import numpy as np
 import torch
 
 
 def _conver_numpy(x):
-    """
-    convert input data to numpy array
+    """Convert input data to a numpy array.
+
     """
     if isinstance(x, np.ndarray):
         return x
@@ -17,21 +18,20 @@
 
 def _check_same_len(*arrays, axis=0):
-    """
-    check if input array list has same length for one dimension
+    """Check whether all input arrays have the same length along one dimension.
+
     """
     lens = set([x.shape[axis] for x in arrays if x is not None])
     return len(lens) == 1
 
 
 def _label_types(y):
-    """
-    determine the type
-    "binary"
-    "multiclass"
-    "multiclass-multioutput"
-    "multilabel"
-    "unknown"
+    """Determine the type
+        - "binary"
+        - "multiclass"
+        - "multiclass-multioutput"
+        - "multilabel"
+        - "unknown"
     """
     # never squeeze the first dimension
     y = y.squeeze() if y.shape[0] > 1 else y.resize(1, -1)
@@ -46,8 +46,8 @@
 
 def _check_data(y_true, y_pred):
-    """
-    check if y_true and y_pred is same type of data e.g both binary or multiclass
+    """Check whether y_true and y_pred are the same type of data, e.g. both binary or both multiclass.
+
     """
     y_true, y_pred = _conver_numpy(y_true), _conver_numpy(y_pred)
     if not _check_same_len(y_true, y_pred):
@@ -174,16 +174,13 @@
 
 def accuracy_topk(y_true, y_prob, k=1):
-    """
-    Compute accuracy of y_true matching top-k probable
+    """Compute accuracy of y_true matching top-k probable
     labels in y_prob.
 
-    Paras:
-        y_ture - ndarray, true label, [n_samples]
-        y_prob - ndarray, label probabilities, [n_samples, n_classes]
-        k - int, k in top-k
-    Returns:
-        accuracy of top-k
+    :param y_true: ndarray, true label, [n_samples]
+    :param y_prob: ndarray, label probabilities, [n_samples, n_classes]
+    :param k: int, k in top-k
+    :return: accuracy of top-k
     """
 
     y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1]
@@ -195,16 +192,14 @@
 
 def pred_topk(y_prob, k=1):
-    """
-    Return top-k predicted labels and corresponding probabilities.
-
-    Args:
-        y_prob - ndarray, size [n_samples, n_classes], probabilities on labels
-        k - int, k of top-k
-    Returns:
-        y_pred_topk - ndarray, size [n_samples, k], predicted top-k labels
-        y_prob_topk - ndarray, size [n_samples, k], probabilities for
-        top-k labels
+    """Return top-k predicted labels and corresponding probabilities.
+
+    :param y_prob: ndarray, size [n_samples, n_classes], probabilities on labels
+    :param k: int, k of top-k
+    :return:
+        y_pred_topk: ndarray, size [n_samples, k], predicted top-k labels
+        y_prob_topk: ndarray, size [n_samples, k], probabilities for top-k labels
     """
 
     y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1]
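For orientation, here is how the two public helpers documented above are meant to be called. A minimal sketch with made-up arrays: `accuracy_topk` returns the fraction of samples whose true label appears among the k most probable classes, and `pred_topk` returns the top-k labels together with their probabilities:

```python
import numpy as np

from fastNLP.core.metrics import accuracy_topk, pred_topk

# three samples, four classes; the probabilities are invented for illustration
y_prob = np.array([[0.1, 0.2, 0.3, 0.4],
                   [0.7, 0.1, 0.1, 0.1],
                   [0.2, 0.5, 0.2, 0.1]])
y_true = np.array([3, 0, 2])

acc = accuracy_topk(y_true, y_prob, k=2)   # scalar accuracy over the batch
labels, probs = pred_topk(y_prob, k=2)     # each of shape [3, 2]
```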
diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py
index e106fde0..ff2ee40e 100644
--- a/fastNLP/core/optimizer.py
+++ b/fastNLP/core/optimizer.py
@@ -4,7 +4,6 @@ import torch
 
 class Optimizer(object):
     """Wrapper of optimizer from framework
-
     names: arguments (type)
     1. Adam: lr (float), weight_decay (float)
     2. AdaGrad
     3. RMSProp
@@ -16,20 +15,29 @@ class Optimizer(object):
         """
         :param optimizer_name: str, the name of the optimizer
         :param kwargs: the arguments
+
         """
         self.optim_name = optimizer_name
         self.kwargs = kwargs
 
     @property
     def name(self):
+        """The name of the optimizer.
+
+        :return: str
+        """
         return self.optim_name
 
     @property
     def params(self):
+        """The arguments used to create the optimizer.
+
+        :return: dict of (str, *)
+        """
         return self.kwargs
 
     def construct_from_pytorch(self, model_params):
-        """construct a optimizer from framework over given model parameters"""
+        """Construct an optimizer from framework over given model parameters."""
 
         if self.optim_name in ["SGD", "sgd"]:
             if "lr" in self.kwargs:
diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py
index 03a6e43c..d04a6ef0 100644
--- a/fastNLP/core/predictor.py
+++ b/fastNLP/core/predictor.py
@@ -70,7 +70,7 @@ class Predictor(object):
     def predict(self, network, data):
         """Perform inference using the trained model.
 
-        :param network: a PyTorch model
+        :param network: a PyTorch model (cpu)
         :param data: list of list of strings
         :return: list of list of strings, [num_examples, tag_seq_length]
         """
diff --git a/fastNLP/core/preprocess.py b/fastNLP/core/preprocess.py
index f950929e..f8142c36 100644
--- a/fastNLP/core/preprocess.py
+++ b/fastNLP/core/preprocess.py
@@ -17,12 +17,24 @@ DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
 # the first vocab in dict with the index = 5
 
 def save_pickle(obj, pickle_path, file_name):
+    """Save an object into a pickle file.
+
+    :param obj: an object
+    :param pickle_path: str, the directory where the pickle file is to be saved
+    :param file_name: str, the name of the pickle file. In general, it should end with ".pkl".
+    """
     with open(os.path.join(pickle_path, file_name), "wb") as f:
         _pickle.dump(obj, f)
     print("{} saved in {}".format(file_name, pickle_path))
 
 
 def load_pickle(pickle_path, file_name):
+    """Load an object from a given pickle file.
+
+    :param pickle_path: str, the directory where the pickle file is.
+    :param file_name: str, the name of the pickle file.
+    :return obj: an object stored in the pickle
+    """
     with open(os.path.join(pickle_path, file_name), "rb") as f:
         obj = _pickle.load(f)
     print("{} loaded from {}".format(file_name, pickle_path))
@@ -30,7 +42,8 @@ def load_pickle(pickle_path, file_name):
 
 def pickle_exist(pickle_path, pickle_name):
-    """
+    """Check if a given pickle file exists in the directory.
+
     :param pickle_path: the directory of target pickle file
     :param pickle_name: the filename of target pickle file
     :return: True if file exists else False
@@ -45,6 +58,19 @@
 
 class BasePreprocess(object):
+    """Base class of all preprocessors.
+    Preprocessors are responsible for converting data of strings into data of indices.
+    During the pre-processing, the following pickle files will be built:
+
+    - "word2id.pkl", a mapping from words (tokens) to indices
+    - "id2word.pkl", a reversed dictionary
+    - "label2id.pkl", a dictionary on labels
+    - "id2label.pkl", a reversed dictionary on labels
+
+    These four pickle files are expected to be saved in the given pickle directory once they are constructed.
+    Preprocessors will check if those files are already in the directory and will reuse them in future calls.
+ """ + def __init__(self): self.word2index = None self.label2index = None @@ -59,6 +85,7 @@ class BasePreprocess(object): def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10): """Main preprocessing pipeline. + :param train_dev_data: three-level list, with either single label or multiple labels in a sample. :param test_data: three-level list, with either single label or multiple labels in a sample. (optional) :param pickle_path: str, the path to save the pickle files. @@ -67,6 +94,7 @@ class BasePreprocess(object): :param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True. :return results: a tuple of datasets after preprocessing. """ + if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): self.word2index = load_pickle(pickle_path, "word2id.pkl") self.label2index = load_pickle(pickle_path, "class2id.pkl") @@ -182,25 +210,31 @@ class SeqLabelPreprocess(BasePreprocess): """Preprocess pipeline, including building mapping from words to index, from index to words, from labels/classes to index, from index to labels/classes. data of three-level list which have multiple labels in each sample. + :: + [ [ [word_11, word_12, ...], [label_1, label_1, ...] ], [ [word_21, word_22, ...], [label_2, label_1, ...] ], ... ] + """ def __init__(self): super(SeqLabelPreprocess, self).__init__() def build_dict(self, data): - """ - Add new words with indices into self.word_dict, new labels with indices into self.label_dict. + """Add new words with indices into self.word_dict, new labels with indices into self.label_dict. + :param data: three-level list + :: + [ [ [word_11, word_12, ...], [label_1, label_1, ...] ], [ [word_21, word_22, ...], [label_2, label_1, ...] ], ... ] + :return word2index: dict of {str, int} label2index: dict of {str, int} """ @@ -216,14 +250,17 @@ class SeqLabelPreprocess(BasePreprocess): return word2index, label2index def to_index(self, data): - """ - Convert word strings and label strings into indices. + """Convert word strings and label strings into indices. + :param data: three-level list + :: + [ [ [word_11, word_12, ...], [label_1, label_1, ...] ], [ [word_21, word_22, ...], [label_2, label_1, ...] ], ... ] + :return data_index: the same shape as data, but each string is replaced by its corresponding index """ data_index = [] @@ -242,11 +279,14 @@ class ClassPreprocess(BasePreprocess): Preprocess pipeline, including building mapping from words to index, from index to words, from labels/classes to index, from index to labels/classes. design for data of three-level list which has a single label in each sample. + :: + [ [ [word_11, word_12, ...], label_1 ], [ [word_21, word_22, ...], label_2 ], ... ] + """ def __init__(self): @@ -269,18 +309,21 @@ class ClassPreprocess(BasePreprocess): for word in sent: if word not in word2index: - word2index[word[0]] = len(word2index) + word2index[word] = len(word2index) return word2index, label2index def to_index(self, data): - """ - Convert word strings and label strings into indices. + """Convert word strings and label strings into indices. + :param data: three-level list + :: + [ [ [word_11, word_12, ...], label_1 ], [ [word_21, word_22, ...], label_2 ], ... ] + :return data_index: the same shape as data, but each string is replaced by its corresponding index """ data_index = [] @@ -295,14 +338,15 @@ class ClassPreprocess(BasePreprocess): def infer_preprocess(pickle_path, data): - """ - Preprocess over inference data. 
- Transform three-level list of strings into that of index. + """Preprocess over inference data. Transform three-level list of strings into that of index. + :: + [ [word_11, word_12, ...], [word_21, word_22, ...], ... ] + """ word2index = load_pickle(pickle_path, "word2id.pkl") data_index = [] diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index c168822e..bcb6ba8c 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -38,7 +38,7 @@ class BaseTester(object): Obviously, "required_args" is the subset of "default_args". The value in "default_args" to the keys in "required_args" is simply for type check. """ - # TODO: required arguments + # add required arguments here required_args = {} for req_key in required_args: @@ -56,7 +56,7 @@ class BaseTester(object): logger.error(msg) raise ValueError(msg) else: - # BeseTester doesn't care about extra arguments + # BaseTester doesn't care about extra arguments pass print(default_args) @@ -69,8 +69,8 @@ class BaseTester(object): self.print_every_step = default_args["print_every_step"] self._model = None - self.eval_history = [] - self.batch_output = [] + self.eval_history = [] # evaluation results of all batches + self.batch_output = [] # outputs of all batches def test(self, network, dev_data): if torch.cuda.is_available() and self.use_cuda: @@ -83,10 +83,10 @@ class BaseTester(object): self.eval_history.clear() self.batch_output.clear() - iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) + iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=False)) step = 0 - for batch_x, batch_y in self.make_batch(iterator, dev_data): + for batch_x, batch_y in self.make_batch(iterator): with torch.no_grad(): prediction = self.data_forward(network, batch_x) eval_results = self.evaluate(prediction, batch_y) @@ -99,7 +99,7 @@ class BaseTester(object): print_output = "[test step {}] {}".format(step, eval_results) logger.info(print_output) if self.print_every_step > 0 and step % self.print_every_step == 0: - print(print_output) + print(self.make_eval_output(prediction, eval_results)) step += 1 def mode(self, model, test): @@ -115,28 +115,48 @@ class BaseTester(object): raise NotImplementedError def evaluate(self, predict, truth): - """Compute evaluation metrics for the model. """ + """Compute evaluation metrics. + + :param predict: Tensor + :param truth: Tensor + :return eval_results: can be anything. It will be stored in self.eval_history + """ raise NotImplementedError @property def metrics(self): - """Return a list of metrics. """ + """Compute and return metrics. + Use self.eval_history to compute metrics over the whole dev set. + Please refer to metrics.py for common metric functions. + + :return : variable number of outputs + """ raise NotImplementedError - def show_matrices(self): - """This is called by Trainer to print evaluation results on dev set during training. + def show_metrics(self): + """Customize evaluation outputs in Trainer. + Called by Trainer to print evaluation results on dev set during training. + Use self.metrics to fetch available metrics. :return print_str: str """ raise NotImplementedError - def make_batch(self, iterator, data): + def make_batch(self, iterator): raise NotImplementedError + def make_eval_output(self, predictions, eval_results): + """Customize Tester outputs. + + :param predictions: Tensor + :param eval_results: Tensor + :return: str, to be printed. 
+ """ + raise NotImplementedError class SeqLabelTester(BaseTester): - """ - Tester for sequence labeling. + """Tester for sequence labeling. + """ def __init__(self, **test_args): @@ -194,15 +214,15 @@ class SeqLabelTester(BaseTester): batch_accuracy = np.mean([x[1] for x in self.eval_history]) return batch_loss, batch_accuracy - def show_matrices(self): - """ - This is called by Trainer to print evaluation on dev set. + def show_metrics(self): + """This is called by Trainer to print evaluation on dev set. + :return print_str: str """ loss, accuracy = self.metrics() return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy) - def make_batch(self, iterator, data): + def make_batch(self, iterator): return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True) @@ -211,12 +231,12 @@ class ClassificationTester(BaseTester): def __init__(self, **test_args): """ - :param test_args: a dict-like object that has __getitem__ method, \ + :param test_args: a dict-like object that has __getitem__ method. can be accessed by "test_args["key_str"]" """ super(ClassificationTester, self).__init__(**test_args) - def make_batch(self, iterator, data, max_len=None): + def make_batch(self, iterator, max_len=None): return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len) def data_forward(self, network, x): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 7fc34da0..523a1763 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -1,10 +1,11 @@ -import _pickle import copy import os import time from datetime import timedelta import torch +import tensorboardX +from tensorboardX import SummaryWriter from fastNLP.core.action import Action from fastNLP.core.action import RandomSampler, Batchifier @@ -15,16 +16,12 @@ from fastNLP.modules import utils from fastNLP.saver.logger import create_logger from fastNLP.saver.model_saver import ModelSaver -DEFAULT_QUEUE_SIZE = 300 logger = create_logger(__name__, "./train_test.log") class BaseTrainer(object): - """Operations to train a model, including data loading, SGD, and validation. + """Operations of training a model, including data loading, gradient descent, and validation. - Subclasses must implement the following abstract methods: - - grad_backward - - get_loss """ def __init__(self, **kwargs): @@ -32,10 +29,10 @@ class BaseTrainer(object): :param kwargs: dict of (key, value), or dict-like object. key is str. The base trainer requires the following keys: - - epochs: int, the number of epochs in training - - validate: bool, whether or not to validate on dev set - - batch_size: int - - pickle_path: str, the path to pickle files for pre-processing + - epochs: int, the number of epochs in training + - validate: bool, whether or not to validate on dev set + - batch_size: int + - pickle_path: str, the path to pickle files for pre-processing """ super(BaseTrainer, self).__init__() @@ -47,7 +44,7 @@ class BaseTrainer(object): """ default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/", "save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1, - "loss": Loss(None), + "loss": Loss(None), # used to pass type check "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0) } """ @@ -56,7 +53,7 @@ class BaseTrainer(object): Obviously, "required_args" is the subset of "default_args". The value in "default_args" to the keys in "required_args" is simply for type check. 
""" - # TODO: required arguments + # add required arguments here required_args = {} for req_key in required_args: @@ -91,9 +88,12 @@ class BaseTrainer(object): self._loss_func = default_args["loss"].get() # return a pytorch loss function or None self._optimizer = None self._optimizer_proto = default_args["optimizer"] + self._summary_writer = SummaryWriter(self.pickle_path + 'tensorboard_logs') + self._graph_summaried = False def train(self, network, train_data, dev_data=None): """General Training Procedure + :param network: a model :param train_data: three-level list, the training set. :param dev_data: three-level list, the validation data (optional) @@ -144,12 +144,13 @@ class BaseTrainer(object): print("Saved better model selected by validation.") logger.info("Saved better model selected by validation.") - valid_results = validator.show_matrices() + valid_results = validator.show_metrics() print("[epoch {}] {}".format(epoch, valid_results)) logger.info("[epoch {}] {}".format(epoch, valid_results)) def _train_step(self, data_iterator, network, **kwargs): """Training process in one epoch. + kwargs should contain: - n_print: int, print training information every n steps. - start: time.time(), the starting time of this step. @@ -163,6 +164,11 @@ class BaseTrainer(object): loss = self.get_loss(prediction, batch_y) self.grad_backward(loss) self.update() + self._summary_writer.add_scalar("loss", loss.item(), global_step=step) + + if not self._graph_summaried: + self._summary_writer.add_graph(network, batch_x) + self._graph_summaried = True if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0: end = time.time() @@ -198,21 +204,6 @@ class BaseTrainer(object): network_copy = copy.deepcopy(network) self.train(network_copy, train_data_cv[i], dev_data_cv[i]) - def load_train_data(self, pickle_path): - """ - For task-specific processing. - :param pickle_path: - :return data_train - """ - file_path = os.path.join(pickle_path, "data_train.pkl") - if os.path.exists(file_path): - with open(file_path, 'rb') as f: - data = _pickle.load(f) - else: - logger.error("cannot find training data {}. invalid input path for training data.".format(file_path)) - raise RuntimeError("cannot find training data {}".format(file_path)) - return data - def make_batch(self, iterator): raise NotImplementedError @@ -220,14 +211,13 @@ class BaseTrainer(object): Action.mode(network, test) def define_optimizer(self): - """ - Define framework-specific optimizer specified by the models. + """Define framework-specific optimizer specified by the models. + """ self._optimizer = self._optimizer_proto.construct_from_pytorch(self._model.parameters()) def update(self): - """ - Perform weight update on a model. + """Perform weight update on a model. For PyTorch, just call optimizer to update. """ @@ -237,8 +227,8 @@ class BaseTrainer(object): raise NotImplementedError def grad_backward(self, loss): - """ - Compute gradient with link rules. + """Compute gradient with link rules. + :param loss: a scalar where back-prop starts For PyTorch, just do "loss.backward()" @@ -247,8 +237,8 @@ class BaseTrainer(object): loss.backward() def get_loss(self, predict, truth): - """ - Compute loss given prediction and ground truth. + """Compute loss given prediction and ground truth. + :param predict: prediction label vector :param truth: ground truth label vector :return: a scalar @@ -256,8 +246,9 @@ class BaseTrainer(object): return self._loss_func(predict, truth) def define_loss(self): - """ - if the model defines a loss, use model's loss. 
+ """Define a loss for the trainer. + + If the model defines a loss, use model's loss. Otherwise, Trainer must has a loss argument, use it as loss. These two losses cannot be defined at the same time. Trainer does not handle loss definition or choose default losses. @@ -274,7 +265,8 @@ class BaseTrainer(object): logger.info("The model didn't define loss, use Trainer's loss.") def best_eval_result(self, validator): - """ + """Check if the current epoch yields better validation results. + :param validator: a Tester instance :return: bool, True means current results on dev set is the best. """ @@ -289,15 +281,14 @@ class BaseTrainer(object): """ if model_name[-4:] != ".pkl": model_name += ".pkl" - ModelSaver(self.pickle_path + model_name).save_pytorch(network) + ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network) def _create_validator(self, valid_args): raise NotImplementedError class SeqLabelTrainer(BaseTrainer): - """ - Trainer for Sequence Labeling + """Trainer for Sequence Labeling """ @@ -327,11 +318,11 @@ class SeqLabelTrainer(BaseTrainer): return y def get_loss(self, predict, truth): - """ - Compute loss given prediction and ground truth. + """Compute loss given prediction and ground truth. + :param predict: prediction label vector, [batch_size, max_len, tag_size] :param truth: ground truth label vector, [batch_size, max_len] - :return: a scalar + :return loss: a scalar """ batch_size, max_len = predict.size(0), predict.size(1) assert truth.shape == (batch_size, max_len) diff --git a/fastNLP/fastnlp.py b/fastNLP/fastnlp.py index 67204161..c76e6681 100644 --- a/fastNLP/fastnlp.py +++ b/fastNLP/fastnlp.py @@ -1,3 +1,5 @@ +import os + from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer from fastNLP.core.preprocess import load_pickle from fastNLP.loader.config_loader import ConfigLoader, ConfigSection @@ -31,6 +33,22 @@ FastNLP_MODEL_COLLECTION = { "type": "seq_label", "config_file_name": "config", "config_section_name": "text_class_model" + }, + "pos_tag_model": { + "url": "", + "class": "sequence_modeling.AdvSeqLabel", + "pickle": "pos_tag_model_v_0.pkl", + "type": "seq_label", + "config_file_name": "pos_tag.config", + "config_section_name": "pos_tag_model" + }, + "text_classify_model": { + "url": "", + "class": "cnn_text_classification.CNNText", + "pickle": "text_class_model_v0.pkl", + "type": "text_class", + "config_file_name": "text_classify.cfg", + "config_section_name": "model" } } @@ -77,7 +95,7 @@ class FastNLP(object): print("Restore model class {}".format(str(model_class))) model_args = ConfigSection() - ConfigLoader.load_config(self.model_dir + config_file, {section_name: model_args}) + ConfigLoader.load_config(os.path.join(self.model_dir, config_file), {section_name: model_args}) print("Restore model hyper-parameters {}".format(str(model_args.data))) # fetch dictionary size and number of labels from pickle files @@ -91,7 +109,7 @@ class FastNLP(object): print("Model constructed.") # To do: framework independent - ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name]["pickle"]) + ModelLoader.load_pytorch(model, os.path.join(self.model_dir, FastNLP_MODEL_COLLECTION[model_name]["pickle"])) print("Model weights loaded.") self.model = model @@ -259,3 +277,38 @@ def interpret_word_seg_results(char_seq, label_seq): else: raise ValueError("invalid label {}".format(label[0])) return words + + +def interpret_cws_pos_results(char_seq, label_seq): + """Transform model output into user-friendly contents. 
+ + :param char_seq: list of string + :param label_seq: list of string, the same length as char_seq. + :return outputs: list of tuple (words, pos_tag): + """ + + def pos_tag_check(seq): + """check whether all entries are the same """ + return len(set(seq)) <= 1 + + word = [] + word_pos = [] + outputs = [] + for char, label in zip(char_seq, label_seq): + tmp = label.split("-") + cws_label, pos_tag = tmp[0], tmp[1] + + if cws_label == "B" or cws_label == "M": + word.append(char) + word_pos.append(pos_tag) + elif cws_label == "E": + word.append(char) + word_pos.append(pos_tag) + if not pos_tag_check(word_pos): + raise RuntimeError("character-wise pos tags inconsistent. ") + outputs.append(("".join(word), word_pos[0])) + word.clear() + word_pos.clear() + elif cws_label == "S": + outputs.append((char, pos_tag)) + return outputs diff --git a/fastNLP/loader/base_loader.py b/fastNLP/loader/base_loader.py index 45a379c1..808567fb 100644 --- a/fastNLP/loader/base_loader.py +++ b/fastNLP/loader/base_loader.py @@ -1,9 +1,8 @@ class BaseLoader(object): """docstring for BaseLoader""" - def __init__(self, data_name, data_path): + def __init__(self, data_path): super(BaseLoader, self).__init__() - self.data_name = data_name self.data_path = data_path def load(self): @@ -25,8 +24,8 @@ class ToyLoader0(BaseLoader): For charLM """ - def __init__(self, name, path): - super(ToyLoader0, self).__init__(name, path) + def __init__(self, data_path): + super(ToyLoader0, self).__init__(data_path) def load(self): with open(self.data_path, 'r') as f: diff --git a/fastNLP/loader/config_loader.py b/fastNLP/loader/config_loader.py index 9e3ebc1c..20d791c4 100644 --- a/fastNLP/loader/config_loader.py +++ b/fastNLP/loader/config_loader.py @@ -9,7 +9,7 @@ class ConfigLoader(BaseLoader): """loader for configuration files""" def __int__(self, data_name, data_path): - super(ConfigLoader, self).__init__(data_name, data_path) + super(ConfigLoader, self).__init__(data_path) self.config = self.parse(super(ConfigLoader, self).load()) @staticmethod @@ -100,7 +100,7 @@ class ConfigSection(object): if __name__ == "__main__": - config = ConfigLoader('configLoader', 'there is no data') + config = ConfigLoader('there is no data') section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()} """ diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index 13a96030..2f03bd8a 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -6,8 +6,8 @@ from fastNLP.loader.base_loader import BaseLoader class DatasetLoader(BaseLoader): """"loader for data sets""" - def __init__(self, data_name, data_path): - super(DatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(DatasetLoader, self).__init__(data_path) class POSDatasetLoader(DatasetLoader): @@ -31,8 +31,8 @@ class POSDatasetLoader(DatasetLoader): to label5. 
""" - def __init__(self, data_name, data_path): - super(POSDatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(POSDatasetLoader, self).__init__(data_path) def load(self): assert os.path.exists(self.data_path) @@ -84,8 +84,8 @@ class TokenizeDatasetLoader(DatasetLoader): Data set loader for tokenization data sets """ - def __init__(self, data_name, data_path): - super(TokenizeDatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(TokenizeDatasetLoader, self).__init__(data_path) def load_pku(self, max_seq_len=32): """ @@ -138,8 +138,8 @@ class TokenizeDatasetLoader(DatasetLoader): class ClassDatasetLoader(DatasetLoader): """Loader for classification data sets""" - def __init__(self, data_name, data_path): - super(ClassDatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(ClassDatasetLoader, self).__init__(data_path) def load(self): assert os.path.exists(self.data_path) @@ -177,7 +177,7 @@ class ConllLoader(DatasetLoader): :param str data_name: the name of the conll data set :param str data_path: the path to the conll data set """ - super(ConllLoader, self).__init__(data_name, data_path) + super(ConllLoader, self).__init__(data_path) self.data_set = self.parse(self.load()) def load(self): @@ -209,8 +209,8 @@ class ConllLoader(DatasetLoader): class LMDatasetLoader(DatasetLoader): - def __init__(self, data_name, data_path): - super(LMDatasetLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(LMDatasetLoader, self).__init__(data_path) def load(self): if not os.path.exists(self.data_path): @@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader): return text.strip().split() -if __name__ == "__main__": +class PeopleDailyCorpusLoader(DatasetLoader): """ - data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines() - for example in data: - for w, l in zip(example[0], example[1]): - print(w, l) + People Daily Corpus: Chinese word segmentation, POS tag, NER """ - ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku() - print(ans) + def __init__(self, data_path): + super(PeopleDailyCorpusLoader, self).__init__(data_path) + + def load(self): + with open(self.data_path, "r", encoding="utf-8") as f: + sents = f.readlines() + + pos_tag_examples = [] + ner_examples = [] + for sent in sents: + inside_ne = False + sent_pos_tag = [] + sent_words = [] + sent_ner = [] + words = sent.strip().split()[1:] + for word in words: + if "[" in word and "]" in word: + ner_tag = "U" + print(word) + elif "[" in word: + inside_ne = True + ner_tag = "B" + word = word[1:] + elif "]" in word: + ner_tag = "L" + word = word[:word.index("]")] + if inside_ne is True: + inside_ne = False + else: + raise RuntimeError("only ] appears!") + else: + if inside_ne is True: + ner_tag = "I" + else: + ner_tag = "O" + tmp = word.split("/") + token, pos = tmp[0], tmp[1] + sent_ner.append(ner_tag) + sent_pos_tag.append(pos) + sent_words.append(token) + pos_tag_examples.append([sent_words, sent_pos_tag]) + ner_examples.append([sent_words, sent_ner]) + return pos_tag_examples, ner_examples + +if __name__ == "__main__": + loader = PeopleDailyCorpusLoader("./") + pos, ner = loader.load() + print(pos[:10]) + print(ner[:10]) diff --git a/fastNLP/loader/embed_loader.py b/fastNLP/loader/embed_loader.py index 4b70dd0b..a84f6335 100644 --- a/fastNLP/loader/embed_loader.py +++ b/fastNLP/loader/embed_loader.py @@ -1,8 +1,50 @@ +import 
diff --git a/fastNLP/loader/embed_loader.py b/fastNLP/loader/embed_loader.py
index 4b70dd0b..a84f6335 100644
--- a/fastNLP/loader/embed_loader.py
+++ b/fastNLP/loader/embed_loader.py
@@ -1,8 +1,50 @@
+import _pickle
+import os
+
+import numpy as np
+
 from fastNLP.loader.base_loader import BaseLoader
 
 
 class EmbedLoader(BaseLoader):
     """docstring for EmbedLoader"""
 
-    def __init__(self, data_name, data_path):
-        super(EmbedLoader, self).__init__(data_name, data_path)
+    def __init__(self, data_path):
+        super(EmbedLoader, self).__init__(data_path)
+
+    @staticmethod
+    def load_embedding(emb_dim, emb_file, word_dict, emb_pkl):
+        """Load the pre-trained embedding and combine it with the given dictionary.
+
+        :param emb_file: str, the pre-trained embedding.
+            The embedding file should have the following format:
+            Each line is a word embedding, where a word string is followed by multiple floats.
+            Floats are separated by space. The word and the first float are separated by space.
+        :param word_dict: dict, a mapping from word to index.
+        :param emb_dim: int, the dimension of the embedding. Should be the same as the pre-trained embedding.
+        :param emb_pkl: str, the embedding pickle file.
+        :return embedding_np: numpy array of shape (len(word_dict), emb_dim)
+
+        TODO: fragile code
+        """
+        # If the embedding pickle exists, load it and return.
+        if os.path.exists(emb_pkl):
+            with open(emb_pkl, "rb") as f:
+                embedding_np = _pickle.load(f)
+            return embedding_np
+        # Otherwise, load the pre-trained embedding.
+        with open(emb_file, "r", encoding="utf-8") as f:
+            # begin with a random embedding
+            embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
+            for line in f:
+                line = line.strip().split()
+                if len(line) != emb_dim + 1:
+                    # skip this line if the embedding dimension does not match
+                    continue
+                if line[0] in word_dict:
+                    # find the word and replace its embedding with a pre-trained one
+                    embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
+        # save and return the result
+        with open(emb_pkl, "wb") as f:
+            _pickle.dump(embedding_np, f)
+        return embedding_np
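`load_embedding` caches the combined matrix in `emb_pkl`, so repeated runs skip the text parse entirely. A small usage sketch; the file names and the toy three-word vocabulary are made up:

```python
from fastNLP.loader.embed_loader import EmbedLoader

# A made-up 3-dimensional embedding file with two entries.
with open("toy_emb.txt", "w", encoding="utf-8") as f:
    f.write("the 0.1 0.2 0.3\n")
    f.write("cat 0.4 0.5 0.6\n")

word_dict = {"the": 0, "cat": 1, "dog": 2}
emb = EmbedLoader.load_embedding(emb_dim=3, emb_file="toy_emb.txt",
                                 word_dict=word_dict, emb_pkl="toy_emb.pkl")
print(emb.shape)  # (3, 3); "dog" keeps its random initialization
```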
""" - def __init__(self, data_name, data_path): - super(ModelLoader, self).__init__(data_name, data_path) + def __init__(self, data_path): + super(ModelLoader, self).__init__(data_path) @staticmethod def load_pytorch(empty_model, model_path): diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index b6dcafb3..fc7388a5 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn # import torch.nn.functional as F -from fastNLP.modules.encoder.conv_maxpool import ConvMaxpool +import fastNLP.modules.encoder as encoder class CNNText(torch.nn.Module): @@ -18,22 +18,22 @@ class CNNText(torch.nn.Module): def __init__(self, args): super(CNNText, self).__init__() - class_num = args["num_classes"] + num_classes = args["num_classes"] kernel_nums = [100, 100, 100] kernel_sizes = [3, 4, 5] - embed_num = args["vocab_size"] + vocab_size = args["vocab_size"] embed_dim = 300 pretrained_embed = None drop_prob = 0.5 # no support for pre-trained embedding currently - self.embed = nn.Embedding(embed_num, embed_dim, padding_idx=0) - self.conv_pool = ConvMaxpool( + self.embed = encoder.embedding.Embedding(vocab_size, embed_dim) + self.conv_pool = encoder.conv_maxpool.ConvMaxpool( in_channels=embed_dim, out_channels=kernel_nums, kernel_sizes=kernel_sizes) self.dropout = nn.Dropout(drop_prob) - self.fc = nn.Linear(sum(kernel_nums), class_num) + self.fc = encoder.linear.Linear(sum(kernel_nums), num_classes) def forward(self, x): x = self.embed(x) # [N,L] -> [N,L,C] diff --git a/fastNLP/modules/decoder/__init__.py b/fastNLP/modules/decoder/__init__.py index 6c0e5141..7b8b2814 100644 --- a/fastNLP/modules/decoder/__init__.py +++ b/fastNLP/modules/decoder/__init__.py @@ -1,3 +1,4 @@ from .CRF import ConditionalRandomField +from .MLP import MLP -__all__ = ["ConditionalRandomField"] +__all__ = ["ConditionalRandomField", "MLP"] diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py index b4e689a7..71b786b9 100644 --- a/fastNLP/modules/encoder/__init__.py +++ b/fastNLP/modules/encoder/__init__.py @@ -2,8 +2,10 @@ from .embedding import Embedding from .linear import Linear from .lstm import Lstm from .conv import Conv +from .conv_maxpool import ConvMaxpool __all__ = ["Lstm", "Embedding", "Linear", - "Conv"] + "Conv", + "ConvMaxpool"] diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py index 0012dce7..f666e7f9 100644 --- a/fastNLP/modules/encoder/conv_maxpool.py +++ b/fastNLP/modules/encoder/conv_maxpool.py @@ -4,6 +4,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from torch.nn.init import xavier_uniform_ class ConvMaxpool(nn.Module): @@ -21,6 +22,7 @@ class ConvMaxpool(nn.Module): if isinstance(kernel_sizes, int): out_channels = [out_channels] kernel_sizes = [kernel_sizes] + self.convs = nn.ModuleList([nn.Conv1d( in_channels=in_channels, out_channels=oc, @@ -31,6 +33,9 @@ class ConvMaxpool(nn.Module): groups=groups, bias=bias) for oc, ks in zip(out_channels, kernel_sizes)]) + + for conv in self.convs: + xavier_uniform_(conv.weight) # weight initialization else: raise Exception( 'Incorrect kernel sizes: should be list, tuple or int') diff --git a/reproduction/chinese_word_seg/cws_train.py b/reproduction/chinese_word_seg/cws_train.py deleted file mode 100644 index b63a9401..00000000 --- a/reproduction/chinese_word_seg/cws_train.py +++ /dev/null @@ -1,114 +0,0 @@ -import sys - 
-sys.path.append("..") - -from fastNLP.loader.config_loader import ConfigLoader, ConfigSection -from fastNLP.core.trainer import SeqLabelTrainer -from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader -from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle -from fastNLP.saver.model_saver import ModelSaver -from fastNLP.loader.model_loader import ModelLoader -from fastNLP.core.tester import SeqLabelTester -from fastNLP.models.sequence_modeling import SeqLabeling -from fastNLP.core.predictor import Predictor - -data_name = "pku_training.utf8" -cws_data_path = "/home/zyfeng/data/pku_training.utf8" -pickle_path = "./save/" -data_infer_path = "/home/zyfeng/data/pku_test.utf8" - - -def infer(): - # Load infer configuration, the same as test - test_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) - - # fetch dictionary size and number of labels from pickle files - word2index = load_pickle(pickle_path, "word2id.pkl") - test_args["vocab_size"] = len(word2index) - index2label = load_pickle(pickle_path, "id2class.pkl") - test_args["num_classes"] = len(index2label) - - # Define the same model - model = SeqLabeling(test_args) - - # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") - print("model loaded!") - - # Data Loader - raw_data_loader = BaseLoader(data_name, data_infer_path) - infer_data = raw_data_loader.load_lines() - - # Inference interface - infer = Predictor(pickle_path) - results = infer.predict(model, infer_data) - - print(results) - print("Inference finished!") - - -def train_test(): - # Config Loader - train_args = ConfigSection() - test_args = ConfigSection() - ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args}) - - # Data Loader - loader = TokenizeDatasetLoader(data_name, cws_data_path) - train_data = loader.load_pku() - - # Preprocessor - preprocess = SeqLabelPreprocess() - data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) - train_args["vocab_size"] = preprocess.vocab_size - train_args["num_classes"] = preprocess.num_classes - - # Trainer - trainer = SeqLabelTrainer(train_args) - - # Model - model = SeqLabeling(train_args) - - # Start training - trainer.train(model, data_train, data_dev) - print("Training finished!") - - # Saver - saver = ModelSaver("./save/saved_model.pkl") - saver.save_pytorch(model) - print("Model saved!") - - # testing with validation set - test(data_dev) - - -def test(test_data): - # Config Loader - train_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) - - # Define the same model - model = SeqLabeling(train_args) - - # Dump trained parameters into the model - ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") - print("model loaded!") - - # Load test configuration - test_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) - - # Tester - tester = SeqLabelTester(test_args) - - # Start testing - tester.test(model, test_data) - - # print test results - print(tester.show_matrices()) - print("model tested!") - - -if __name__ == "__main__": - train_test() diff --git a/reproduction/chinese_word_segment/cws.cfg b/reproduction/chinese_word_segment/cws.cfg index ab799428..033d3967 100644 --- a/reproduction/chinese_word_segment/cws.cfg +++ 
diff --git a/reproduction/chinese_word_segment/cws.cfg b/reproduction/chinese_word_segment/cws.cfg
index ab799428..033d3967 100644
--- a/reproduction/chinese_word_segment/cws.cfg
+++ b/reproduction/chinese_word_segment/cws.cfg
@@ -31,4 +31,16 @@ pickle_path = "./save/"
 use_crf = true
 use_cuda = true
 rnn_hidden_units = 100
+word_emb_dim = 100
+
+[model]
+save_output = true
+validate_in_training = true
+save_dev_input = false
+save_loss = true
+batch_size = 640
+pickle_path = "./save/"
+use_crf = true
+use_cuda = true
+rnn_hidden_units = 100
 word_emb_dim = 100
\ No newline at end of file
diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py
index 66d01038..d0a22e84 100644
--- a/reproduction/chinese_word_segment/run.py
+++ b/reproduction/chinese_word_segment/run.py
@@ -27,7 +27,7 @@ data_infer_path = os.path.join(datadir, "infer.utf8")
 def infer():
     # Config Loader
     test_args = ConfigSection()
-    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
 
     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -47,7 +47,7 @@ def infer():
         raise
 
     # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
     print('data loaded')
 
@@ -63,10 +63,10 @@ def train():
     # Config Loader
     train_args = ConfigSection()
     test_args = ConfigSection()
-    ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
+    ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
 
     # Data Loader
-    loader = TokenizeDatasetLoader(data_name, cws_data_path)
+    loader = TokenizeDatasetLoader(cws_data_path)
     train_data = loader.load_pku()
 
     # Preprocessor
@@ -100,7 +100,7 @@ def train():
 def test():
     # Config Loader
     test_args = ConfigSection()
-    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
 
     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -125,7 +125,7 @@ def test():
     tester.test(model, dev_data)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
diff --git a/reproduction/chinese_word_seg/cws.cfg b/reproduction/pos_tag_model/pos_tag.cfg
similarity index 59%
rename from reproduction/chinese_word_seg/cws.cfg
rename to reproduction/pos_tag_model/pos_tag.cfg
index cdcb4496..eb5e315d 100644
--- a/reproduction/chinese_word_seg/cws.cfg
+++ b/reproduction/pos_tag_model/pos_tag.cfg
@@ -1,29 +1,35 @@
 [train]
-epochs = 10
-batch_size = 32
+epochs = 30
+batch_size = 64
 pickle_path = "./save/"
 validate = true
 save_best_dev = true
 model_saved_path = "./save/"
 rnn_hidden_units = 100
-rnn_layers = 2
-rnn_bi_direction = true
 word_emb_dim = 100
-dropout = 0.5
 use_crf = true
 use_cuda = true
+print_every_step = 10
 
 [test]
 save_output = true
 validate_in_training = true
 save_dev_input = false
 save_loss = true
-batch_size = 64
+batch_size = 640
+pickle_path = "./save/"
+use_crf = true
+use_cuda = true
+
+
+[POS_test]
+save_output = true
+validate_in_training = true
+save_dev_input = false
+save_loss = true
+batch_size = 640
 pickle_path = "./save/"
-rnn_hidden_units = 100
-rnn_layers = 1
-rnn_bi_direction = true
-word_emb_dim = 100
-dropout = 0.5
 use_crf = true
 use_cuda = true
+rnn_hidden_units = 100
+word_emb_dim = 100
\ No newline at end of file
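The renamed `pos_tag.cfg` keeps the `[section]` / `name = value` layout that `ConfigLoader` maps onto `ConfigSection` objects, so one call can fill several sections at once. A small sketch of reading the `[train]` and `[POS_test]` sections shown above, assuming the file lives at the renamed path and that values are read back through the section's `data` dict:

```python
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

train_args = ConfigSection()
test_args = ConfigSection()

# One call fills both sections from the file above.
ConfigLoader("pos_tag.cfg").load_config(
    "./reproduction/pos_tag_model/pos_tag.cfg",
    {"train": train_args, "POS_test": test_args})

print(train_args.data["epochs"], train_args.data["batch_size"])  # 30 64
print(test_args.data["batch_size"])                              # 640
```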
diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/pos_tag_model/train_pos_tag.py
new file mode 100644
index 00000000..87a9f7e8
--- /dev/null
+++ b/reproduction/pos_tag_model/train_pos_tag.py
@@ -0,0 +1,146 @@
+import os
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
+
+from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader
+from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
+from fastNLP.saver.model_saver import ModelSaver
+from fastNLP.loader.model_loader import ModelLoader
+from fastNLP.core.tester import SeqLabelTester
+from fastNLP.models.sequence_modeling import AdvSeqLabel
+from fastNLP.core.predictor import SeqLabelInfer
+
+# not in the file's dir
+if len(os.path.dirname(__file__)) != 0:
+    os.chdir(os.path.dirname(__file__))
+datadir = "/home/zyfeng/data/"
+cfgfile = './pos_tag.cfg'
+data_name = "CWS_POS_TAG_NER_people_daily.txt"
+
+pos_tag_data_path = os.path.join(datadir, data_name)
+pickle_path = "save"
+data_infer_path = os.path.join(datadir, "infer.utf8")
+
+
+def infer():
+    # Config Loader
+    test_args = ConfigSection()
+    ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args})
+
+    # fetch dictionary size and number of labels from pickle files
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+
+    # Define the same model
+    model = AdvSeqLabel(test_args)
+
+    try:
+        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+        print('model loaded!')
+    except Exception as e:
+        print('cannot load model!')
+        raise
+
+    # Data Loader
+    raw_data_loader = BaseLoader(data_infer_path)
+    infer_data = raw_data_loader.load_lines()
+    print('data loaded')
+
+    # Inference interface
+    infer = SeqLabelInfer(pickle_path)
+    results = infer.predict(model, infer_data)
+
+    print(results)
+    print("Inference finished!")
+
+
+def train():
+    # Config Loader
+    train_args = ConfigSection()
+    test_args = ConfigSection()
+    ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args})
+
+    # Data Loader
+    loader = PeopleDailyCorpusLoader(pos_tag_data_path)
+    train_data, _ = loader.load()
+
+    # Preprocessor
+    preprocessor = SeqLabelPreprocess()
+    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
+    train_args["vocab_size"] = preprocessor.vocab_size
+    train_args["num_classes"] = preprocessor.num_classes
+
+    # Trainer
+    trainer = SeqLabelTrainer(**train_args.data)
+
+    # Model
+    model = AdvSeqLabel(train_args)
+    try:
+        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+        print('model parameter loaded!')
+    except Exception as e:
Continue.") + pass + + # Start training + trainer.train(model, data_train, data_dev) + print("Training finished!") + + # Saver + saver = ModelSaver("./save/saved_model.pkl") + saver.save_pytorch(model) + print("Model saved!") + + +def test(): + # Config Loader + test_args = ConfigSection() + ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) + + # fetch dictionary size and number of labels from pickle files + word2index = load_pickle(pickle_path, "word2id.pkl") + test_args["vocab_size"] = len(word2index) + index2label = load_pickle(pickle_path, "id2class.pkl") + test_args["num_classes"] = len(index2label) + + # load dev data + dev_data = load_pickle(pickle_path, "data_dev.pkl") + + # Define the same model + model = AdvSeqLabel(test_args) + + # Dump trained parameters into the model + ModelLoader.load_pytorch(model, "./save/saved_model.pkl") + print("model loaded!") + + # Tester + tester = SeqLabelTester(**test_args.data) + + # Start testing + tester.test(model, dev_data) + + # print test results + print(tester.show_metrics()) + print("model tested!") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description='Run a chinese word segmentation model') + parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer']) + args = parser.parse_args() + if args.mode == 'train': + train() + elif args.mode == 'test': + test() + elif args.mode == 'infer': + infer() + else: + print('no mode specified for model!') + parser.print_help() diff --git a/requirements.txt b/requirements.txt index d961dd92..954dd741 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.14.2 torch==0.4.0 torchvision>=0.1.8 +tensorboardX diff --git a/setup.py b/setup.py index e69de29b..25a645c5 100644 --- a/setup.py +++ b/setup.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# coding=utf-8 +from setuptools import setup, find_packages + +with open('README.md') as f: + readme = f.read() + +with open('LICENSE') as f: + license = f.read() + +with open('requirements.txt') as f: + reqs = f.read() + +setup( + name='fastNLP', + version='0.0.1', + description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team', + long_description=readme, + license=license, + author='fudanNLP', + python_requires='>=3.5', + packages=find_packages(), + install_requires=reqs.strip().split('\n'), +) diff --git a/test/core/test_action.py b/test/core/test_action.py index 6ad1bd29..8d0f628b 100644 --- a/test/core/test_action.py +++ b/test/core/test_action.py @@ -1,9 +1,8 @@ -import os - import unittest from fastNLP.core.action import Action, Batchifier, SequentialSampler + class TestAction(unittest.TestCase): def test_case_1(self): x = [1, 2, 3, 4, 5, 6, 7, 8] diff --git a/test/loader/test_loader.py b/test/loader/test_loader.py index fe826a6f..d2f22166 100644 --- a/test/loader/test_loader.py +++ b/test/loader/test_loader.py @@ -1,13 +1,12 @@ -import os import configparser - import json +import os import unittest - from fastNLP.loader.config_loader import ConfigSection, ConfigLoader from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader + class TestConfigLoader(unittest.TestCase): def test_case_ConfigLoader(self): @@ -33,8 +32,8 @@ class TestConfigLoader(unittest.TestCase): return dict test_arg = ConfigSection() - ConfigLoader("config", "").load_config(os.path.join("./test/loader", "config"), {"test": test_arg}) - #ConfigLoader("config", 
"").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", + ConfigLoader("config").load_config(os.path.join("./test/loader", "config"), {"test": test_arg}) + # ConfigLoader("config").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", # {"test": test_arg}) #dict = read_section_from_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", "test") @@ -58,18 +57,18 @@ class TestConfigLoader(unittest.TestCase): class TestDatasetLoader(unittest.TestCase): def test_case_TokenizeDatasetLoader(self): - loader = TokenizeDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8") + loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8") data = loader.load_pku(max_seq_len=32) print("pass TokenizeDatasetLoader test!") def test_case_POSDatasetLoader(self): - loader = POSDatasetLoader("people", "./test/data_for_tests/people.txt") + loader = POSDatasetLoader("./test/data_for_tests/people.txt") data = loader.load() datas = loader.load_lines() print("pass POSDatasetLoader test!") def test_case_LMDatasetLoader(self): - loader = LMDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8") + loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8") data = loader.load() datas = loader.load_lines() print("pass TokenizeDatasetLoader test!") diff --git a/test/ner.py b/test/ner.py deleted file mode 100644 index 150bd8c7..00000000 --- a/test/ner.py +++ /dev/null @@ -1,138 +0,0 @@ -import _pickle -import os - -import numpy as np -import torch - -from fastNLP.core.preprocess import SeqLabelPreprocess -from fastNLP.core.tester import SeqLabelTester -from fastNLP.core.trainer import SeqLabelTrainer -from fastNLP.models.sequence_modeling import AdvSeqLabel - - -class MyNERTrainer(SeqLabelTrainer): - def __init__(self, train_args): - super(MyNERTrainer, self).__init__(train_args) - self.scheduler = None - - def define_optimizer(self): - """ - override - :return: - """ - self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001) - self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5) - - def update(self): - """ - override - :return: - """ - self.optimizer.step() - self.scheduler.step() - - def _create_validator(self, valid_args): - return MyNERTester(valid_args) - - def best_eval_result(self, validator): - accuracy = validator.metrics() - if accuracy > self.best_accuracy: - self.best_accuracy = accuracy - return True - else: - return False - - -class MyNERTester(SeqLabelTester): - def __init__(self, test_args): - super(MyNERTester, self).__init__(test_args) - - def _evaluate(self, prediction, batch_y, seq_len): - """ - :param prediction: [batch_size, seq_len, num_classes] - :param batch_y: [batch_size, seq_len] - :param seq_len: [batch_size] - :return: - """ - summ = 0 - correct = 0 - _, indices = torch.max(prediction, 2) - for p, y, l in zip(indices, batch_y, seq_len): - summ += l - correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy()) - return float(correct / summ) - - def evaluate(self, predict, truth): - return self._evaluate(predict, truth, self.seq_len) - - def metrics(self): - return np.mean(self.eval_history) - - def show_matrices(self): - return "dev accuracy={:.2f}".format(float(self.metrics())) - - -def embedding_process(emb_file, word_dict, emb_dim, emb_pkl): - if os.path.exists(emb_pkl): - with open(emb_pkl, "rb") as f: - embedding_np = _pickle.load(f) - return embedding_np - with open(emb_file, "r", encoding="utf-8") as f: - embedding_np = 
-        embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
-        for line in f:
-            line = line.strip().split()
-            if len(line) != emb_dim + 1:
-                continue
-            if line[0] in word_dict:
-                embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
-    with open(emb_pkl, "wb") as f:
-        _pickle.dump(embedding_np, f)
-    return embedding_np
-
-
-def data_load(data_file):
-    with open(data_file, "r", encoding="utf-8") as f:
-        all_data = []
-        sent = []
-        label = []
-        for line in f:
-            line = line.strip().split()
-
-            if not len(line) <= 1:
-                sent.append(line[0])
-                label.append(line[1])
-            else:
-                all_data.append([sent, label])
-                sent = []
-                label = []
-    return all_data
-
-
-data_path = "data_for_tests/people.txt"
-pick_path = "data_for_tests/"
-emb_path = "data_for_tests/emb50.txt"
-save_path = "data_for_tests/"
-if __name__ == "__main__":
-    data = data_load(data_path)
-    preprocess = SeqLabelPreprocess()
-    data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3)
-    # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
-    emb = None
-    args = {"epochs": 20,
-            "batch_size": 1,
-            "pickle_path": pick_path,
-            "validate": True,
-            "save_best_dev": True,
-            "model_saved_path": save_path,
-            "use_cuda": True,
-
-            "vocab_size": preprocess.vocab_size,
-            "num_classes": preprocess.num_classes,
-            "word_emb_dim": 50,
-            "rnn_hidden_units": 100
-            }
-    # emb = torch.Tensor(emb).float().cuda()
-    networks = AdvSeqLabel(args, emb)
-    trainer = MyNERTrainer(args)
-    trainer.train(networks, data_train, data_dev)
-    print("Training finished!")
diff --git a/test/ner_decode.py b/test/ner_decode.py
deleted file mode 100644
index 5c09cbd2..00000000
--- a/test/ner_decode.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import _pickle
-import os
-
-import torch
-
-from fastNLP.core.predictor import SeqLabelInfer
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.loader.model_loader import ModelLoader
-from fastNLP.models.sequence_modeling import AdvSeqLabel
-
-
-class Decode(SeqLabelTrainer):
-    def __init__(self, args):
-        super(Decode, self).__init__(args)
-
-    def decoder(self, network, sents, model_path):
-        self.model = network
-        self.model.load_state_dict(torch.load(model_path))
-        out_put = []
-        self.mode(network, test=True)
-        for batch_x in sents:
-            prediction = self.data_forward(self.model, batch_x)
-
-            seq_tag = self.model.prediction(prediction, batch_x[1])
-
-            out_put.append(list(seq_tag)[0])
-        return out_put
-
-
-def process_sent(sents, word2id):
-    sents_num = []
-    for s in sents:
-        sent_num = []
-        for c in s:
-            if c in word2id:
-                sent_num.append(word2id[c])
-            else:
-                sent_num.append(word2id[""])
-        sents_num.append(([sent_num], [len(sent_num)]))  # batch_size is 1
-
-    return sents_num
-
-
-def process_tag(sents, tags, id2class):
-    Tags = []
-    for ttt in tags:
-        Tags.append([id2class[t] for t in ttt])
-
-    Segs = []
-    PosNers = []
-    for sent, tag in zip(sents, tags):
-        word__ = []
-        lll__ = []
-        for c, t in zip(sent, tag):
-
-            t = id2class[t]
-            l = t.split("-")
-            split_ = l[0]
-            pn = l[1]
-
-            if split_ == "S":
-                word__.append(c)
-                lll__.append(pn)
-                word_1 = ""
-            elif split_ == "E":
-                word_1 += c
-                word__.append(word_1)
-                lll__.append(pn)
-                word_1 = ""
-            elif split_ == "B":
-                word_1 = ""
-                word_1 += c
-            else:
-                word_1 += c
-        Segs.append(word__)
-        PosNers.append(lll__)
-    return Segs, PosNers
-
-
-pickle_path = "data_for_tests/"
-model_path = "data_for_tests/model_best_dev.pkl"
-if __name__ == "__main__":
-
-    with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f:
-        id2word = _pickle.load(f)
-    with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f:
-        word2id = _pickle.load(f)
-    with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f:
-        id2class = _pickle.load(f)
-
-    sent = ["中共中央总书记、国家主席江泽民",
-            "逆向处理输入序列并返回逆序后的序列"]  # here is input
-
-    args = {"epochs": 1,
-            "batch_size": 1,
-            "pickle_path": "data_for_tests/",
-            "validate": True,
-            "save_best_dev": True,
-            "model_saved_path": "data_for_tests/",
-            "use_cuda": False,
-
-            "vocab_size": len(word2id),
-            "num_classes": len(id2class),
-            "word_emb_dim": 50,
-            "rnn_hidden_units": 100,
-            }
-    """
-    network = AdvSeqLabel(args, None)
-    decoder_ = Decode(args)
-    tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path)
-    output_seg, output_pn = process_tag(sent, tags_num, id2class)  # here is output
-    print(output_seg)
-    print(output_pn)
-    """
-    # Define the same model
-    model = AdvSeqLabel(args, None)
-
-    # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl")
-    print("model loaded!")
-
-    # Inference interface
-    infer = SeqLabelInfer(pickle_path)
-    sent = [[ch for ch in s] for s in sent]
-    results = infer.predict(model, sent)
-
-    for res in results:
-        print(res)
-    print("Inference finished!")
diff --git a/test/readme_example.py b/test/readme_example.py
index 17ac92c2..bc50c48b 100644
--- a/test/readme_example.py
+++ b/test/readme_example.py
@@ -1,19 +1,13 @@
-# python: 3.5
-# pytorch: 0.4
-
-################
-# Test cross validation.
-################
-
-from fastNLP.loader.preprocess import ClassPreprocess
-
+from fastNLP.core.loss import Loss
+from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.predictor import ClassificationInfer
+from fastNLP.core.preprocess import ClassPreprocess
 from fastNLP.core.trainer import ClassificationTrainer
 from fastNLP.loader.dataset_loader import ClassDatasetLoader
 from fastNLP.models.base_model import BaseModel
 from fastNLP.modules import aggregation
-from fastNLP.modules import encoder
 from fastNLP.modules import decoder
+from fastNLP.modules import encoder
 
 
 class ClassificationModel(BaseModel):
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel):
         self.enc = encoder.Conv(
             in_channels=300, out_channels=100, kernel_size=3)
         self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
+        self.dec = decoder.MLP(size_layer=[100, num_classes])
 
     def forward(self, x):
         x = self.emb(x)  # [N,L] -> [N,L,C]
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel):
         return x
 
 
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+data_dir = 'save/'  # directory to save data and model
+train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
-ds_loader = ClassDatasetLoader("train", train_path)
+ds_loader = ClassDatasetLoader(train_path)
 data = ds_loader.load()
 
 # pre-process dataset
-pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5)
-# pre = ClassPreprocess(data, data_dir)
-n_classes = pre.num_classes
-vocab_size = pre.vocab_size
+pre = ClassPreprocess()
+train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
+n_classes, vocab_size = pre.num_classes, pre.vocab_size
 
 # construct model
 model_args = {
@@ -58,22 +51,25 @@ model_args = {
 }
 model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
 
-# train model
+# construct trainer
 train_args = {
-    "epochs": 10,
-    "batch_size": 50,
+    "epochs": 3,
+    "batch_size": 16,
     "pickle_path": data_dir,
     "validate": False,
     "save_best_dev": False,
     "model_saved_path": None,
     "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
+    "loss": Loss("cross_entropy"),
+    "optimizer": Optimizer("Adam", lr=0.001)
+}
+trainer = ClassificationTrainer(**train_args)
 
-# trainer.train(model, ['data_train.pkl', 'data_dev.pkl'])
-trainer.cross_validate(model)
+# start training
+trainer.train(model, train_data=train_set, dev_data=dev_set)
 
 # predict using model
 data_infer = [x[0] for x in data]
 infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, data_infer)
\ No newline at end of file
+labels_pred = infer.predict(model.cpu(), data_infer)
+print(labels_pred)
diff --git a/test/seq_labeling.py b/test/seq_labeling.py
index a9488834..0f7a072b 100644
--- a/test/seq_labeling.py
+++ b/test/seq_labeling.py
@@ -33,7 +33,7 @@ data_infer_path = args.infer
 def infer():
     # Load infer configuration, the same as test
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args})
+    ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args})
 
     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -49,7 +49,7 @@ def infer():
     print("model loaded!")
 
     # Data Loader
-    raw_data_loader = BaseLoader("xxx", data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
 
     # Inference interface
@@ -65,11 +65,11 @@ def train_and_test():
     # Config Loader
     trainer_args = ConfigSection()
     model_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {
+    ConfigLoader("config.cfg").load_config(config_dir, {
         "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})
 
     # Data Loader
-    pos_loader = POSDatasetLoader("xxx", data_path)
+    pos_loader = POSDatasetLoader(data_path)
     train_data = pos_loader.load_lines()
 
     # Preprocessor
@@ -117,7 +117,7 @@ def train_and_test():
 
     # Load test configuration
     tester_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args})
+    ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args})
 
     # Tester
     tester = SeqLabelTester(save_output=False,
@@ -134,10 +134,10 @@ def train_and_test():
     tester.test(model, data_dev)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
 
 
 if __name__ == "__main__":
-    train_and_test()
-    # infer()
+    # train_and_test()
+    infer()
diff --git a/test/test_cws.py b/test/test_cws.py
index bbbef67f..802d97ba 100644
--- a/test/test_cws.py
+++ b/test/test_cws.py
@@ -22,7 +22,7 @@ data_infer_path = "data_for_tests/people_infer.txt"
 def infer():
     # Load infer configuration, the same as test
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})
 
     # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
@@ -38,7 +38,7 @@ def infer():
     print("model loaded!")
 
     # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    raw_data_loader = BaseLoader(data_infer_path)
     infer_data = raw_data_loader.load_lines()
     """
         Transform strings into list of list of strings.
@@ -61,10 +61,10 @@ def infer():
 def train_test():
     # Config Loader
     train_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})
 
     # Data Loader
-    loader = TokenizeDatasetLoader(data_name, cws_data_path)
+    loader = TokenizeDatasetLoader(cws_data_path)
     train_data = loader.load_pku()
 
     # Preprocessor
@@ -74,7 +74,7 @@ def train_test():
     train_args["num_classes"] = p.num_classes
 
     # Trainer
-    trainer = SeqLabelTrainer(train_args)
+    trainer = SeqLabelTrainer(**train_args.data)
 
     # Model
     model = SeqLabeling(train_args)
@@ -99,16 +99,16 @@ def train_test():
 
     # Load test configuration
     test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})
 
     # Tester
-    tester = SeqLabelTester(test_args)
+    tester = SeqLabelTester(**test_args.data)
 
     # Start testing
     tester.test(model, data_train)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
diff --git a/test/test_fastNLP.py b/test/test_fastNLP.py
index 0776109a..92bc894f 100644
--- a/test/test_fastNLP.py
+++ b/test/test_fastNLP.py
@@ -1,9 +1,12 @@
 import sys
+
 sys.path.append("..")
 from fastNLP.fastnlp import FastNLP
-from fastNLP.fastnlp import interpret_word_seg_results
+from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results
 
 PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
+PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
+PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/"
 
 def word_seg():
     nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)
@@ -39,5 +42,44 @@ def test_word_seg_interpret():
     print(interpret_word_seg_results(chars, labels))
 
 
+def test_interpret_cws_pos_results():
+    foo = [
+        [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'),
+         ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'),
+         ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')]
+    ]
+    chars = [x[0] for x in foo[0]]
+    labels = [x[1] for x in foo[0]]
+    print(interpret_cws_pos_results(chars, labels))
+
+
+def pos_tag():
+    nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
+    nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
+    text = ["这是最好的基于深度学习的中文分词系统。",
+            "大王叫我来巡山。",
+            "我党多年来致力于改善人民生活水平。"]
+    results = nlp.run(text)
+    for example in results:
+        words, labels = [], []
+        for res in example:
+            words.append(res[0])
+            labels.append(res[1])
+        print(interpret_cws_pos_results(words, labels))
+
+
+def text_classify():
+    nlp = FastNLP(model_dir=PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES)
+    nlp.load("text_classify_model", config_file="text_classify.cfg", section_name="model")
+    text = [
+        "世界物联网大会明日在京召开龙头股启动在即",
+        "乌鲁木齐市新增一处城市中心旅游目的地",
+        "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"]
+    results = nlp.run(text)
+    print(results)
+    """
+    ['finance', 'travel', 'history']
+    """
+
+
 if __name__ == "__main__":
-    word_seg()
+    text_classify()
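These hunks also show an API shift that runs through all the tests: `SeqLabelTrainer` and `SeqLabelTester` now take plain keyword arguments rather than a `ConfigSection` object, so call sites unpack the section's dict. A before/after sketch, using the same config names as the tests above:

```python
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

train_args = ConfigSection()
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

# old style: trainer = SeqLabelTrainer(train_args)
trainer = SeqLabelTrainer(**train_args.data)  # new style: unpack the section's dict

test_args = ConfigSection()
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})
tester = SeqLabelTester(**test_args.data)
```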
"pku_training.utf8" -cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" pickle_path = "data_for_tests" def foo(): - loader = TokenizeDatasetLoader(data_name, "./data_for_tests/cws_pku_utf_8") + loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8") train_data = loader.load_pku() train_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) + ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) # Preprocessor - p = SeqLabelPreprocess(train_data, pickle_path) + p = SeqLabelPreprocess() + train_data = p.run(train_data) train_args["vocab_size"] = p.vocab_size train_args["num_classes"] = p.num_classes @@ -26,11 +26,11 @@ def foo(): valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/", "use_cuda": True} - validator = SeqLabelTester(valid_args) + validator = SeqLabelTester(**valid_args) print("start validation.") - validator.test(model) - print(validator.show_matrices()) + validator.test(model, train_data) + print(validator.show_metrics()) if __name__ == "__main__": diff --git a/test/text_classify.py b/test/text_classify.py index 64294d37..6ff3c059 100644 --- a/test/text_classify.py +++ b/test/text_classify.py @@ -34,7 +34,7 @@ config_dir = args.config def infer(): # load dataset print("Loading data...") - ds_loader = ClassDatasetLoader("train", train_data_dir) + ds_loader = ClassDatasetLoader(train_data_dir) data = ds_loader.load() unlabeled_data = [x[0] for x in data] @@ -69,7 +69,7 @@ def train(): # load dataset print("Loading data...") - ds_loader = ClassDatasetLoader("train", train_data_dir) + ds_loader = ClassDatasetLoader(train_data_dir) data = ds_loader.load() print(data[0])