| @@ -16,93 +16,19 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa | |||
| - numpy>=1.14.2 | |||
| - torch==0.4.0 | |||
| - torchvision>=0.1.8 | |||
| - tensorboardX | |||
| ## Resources | |||
| - [Documentation](https://github.com/fastnlp/fastNLP) | |||
| - [Documentation](https://fastnlp.readthedocs.io/en/latest/) | |||
| - [Source Code](https://github.com/fastnlp/fastNLP) | |||
| ## Example | |||
| ### Basic Usage | |||
| A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model. | |||
| ```python | |||
| from fastNLP.models.base_model import BaseModel | |||
| from fastNLP.modules import encoder | |||
| from fastNLP.modules import aggregation | |||
| from fastNLP.modules import decoder | |||
| from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
| from fastNLP.loader.preprocess import ClassPreprocess | |||
| from fastNLP.core.trainer import ClassificationTrainer | |||
| from fastNLP.core.inference import ClassificationInfer | |||
| class ClassificationModel(BaseModel): | |||
| """ | |||
| Simple text classification model based on CNN. | |||
| """ | |||
| def __init__(self, num_classes, vocab_size): | |||
| super(ClassificationModel, self).__init__() | |||
| self.emb = encoder.Embedding(nums=vocab_size, dims=300) | |||
| self.enc = encoder.Conv( | |||
| in_channels=300, out_channels=100, kernel_size=3) | |||
| self.agg = aggregation.MaxPool() | |||
| self.dec = decoder.MLP(100, num_classes=num_classes) | |||
| def forward(self, x): | |||
| x = self.emb(x) # [N,L] -> [N,L,C] | |||
| x = self.enc(x) # [N,L,C_in] -> [N,L,C_out] | |||
| x = self.agg(x) # [N,L,C] -> [N,C] | |||
| x = self.dec(x) # [N,C] -> [N, N_class] | |||
| return x | |||
| data_dir = 'data' # directory to save data and model | |||
| train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
| # load dataset | |||
| ds_loader = ClassDatasetLoader("train", train_path) | |||
| data = ds_loader.load() | |||
| # pre-process dataset | |||
| pre = ClassPreprocess(data_dir) | |||
| vocab_size, n_classes = pre.process(data, "data_train.pkl") | |||
| # construct model | |||
| model_args = { | |||
| 'num_classes': n_classes, | |||
| 'vocab_size': vocab_size | |||
| } | |||
| model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | |||
| # train model | |||
| train_args = { | |||
| "epochs": 20, | |||
| "batch_size": 50, | |||
| "pickle_path": data_dir, | |||
| "validate": False, | |||
| "save_best_dev": False, | |||
| "model_saved_path": None, | |||
| "use_cuda": True, | |||
| "learn_rate": 1e-3, | |||
| "momentum": 0.9} | |||
| trainer = ClassificationTrainer(train_args) | |||
| trainer.train(model) | |||
| # predict using model | |||
| seqs = [x[0] for x in data] | |||
| infer = ClassificationInfer(data_dir) | |||
| labels_pred = infer.predict(model, seqs) | |||
| ``` | |||
| ## Installation | |||
| Run the following command to install the fastNLP package. | |||
| ```shell | |||
| pip install fastNLP | |||
| ``` | |||
| ### Cloning From GitHub | |||
| @@ -122,20 +48,26 @@ conda install pytorch torchvision -c pytorch | |||
| pip3 install torch torchvision | |||
| ``` | |||
| ### TensorboardX Installation | |||
| ```shell | |||
| pip3 install tensorboardX | |||
| ``` | |||
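| fastNLP's trainer logs the training loss through tensorboardX (see the trainer changes below). As a minimal sketch of the underlying API, with an illustrative log directory: | |||
| ```python | |||
| from tensorboardX import SummaryWriter | |||
| writer = SummaryWriter('./save/tensorboard_logs')  # illustrative path | |||
| for step, loss in enumerate([0.9, 0.7, 0.5]):  # dummy loss values | |||
|     writer.add_scalar("loss", loss, global_step=step) | |||
| writer.close() | |||
| ``` | |||
| Run `tensorboard --logdir ./save/tensorboard_logs` to inspect the logged curves. | |||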
| ## Project Structure | |||
| ``` | |||
| FastNLP | |||
| ├── docs | |||
| │ └── quick_tutorial.md | |||
| ├── fastNLP | |||
| │ ├── action | |||
| │ ├── core | |||
| │ │ ├── action.py | |||
| │ │ ├── inference.py | |||
| │ │ ├── __init__.py | |||
| │ │ ├── loss.py | |||
| │ │ ├── metrics.py | |||
| │ │ ├── optimizer.py | |||
| │ │ ├── predictor.py | |||
| │ │ ├── preprocess.py | |||
| │ │ ├── README.md | |||
| │ │ ├── tester.py | |||
| │ │ └── trainer.py | |||
| @@ -147,71 +79,28 @@ FastNLP | |||
| │ │ ├── dataset_loader.py | |||
| │ │ ├── embed_loader.py | |||
| │ │ ├── __init__.py | |||
| │ │ ├── model_loader.py | |||
| │ │ └── preprocess.py | |||
| │ │ └── model_loader.py | |||
| │ ├── models | |||
| │ │ ├── base_model.py | |||
| │ │ ├── char_language_model.py | |||
| │ │ ├── cnn_text_classification.py | |||
| │ │ ├── __init__.py | |||
| │ │ └── sequence_modeling.py | |||
| │ ├── modules | |||
| │ │ ├── aggregation | |||
| │ │ │ ├── attention.py | |||
| │ │ │ ├── avg_pool.py | |||
| │ │ │ ├── __init__.py | |||
| │ │ │ ├── kmax_pool.py | |||
| │ │ │ ├── max_pool.py | |||
| │ │ │ └── self_attention.py | |||
| │ │ ├── decoder | |||
| │ │ │ ├── CRF.py | |||
| │ │ │ └── __init__.py | |||
| │ │ ├── encoder | |||
| │ │ │ ├── char_embedding.py | |||
| │ │ │ ├── conv_maxpool.py | |||
| │ │ │ ├── conv.py | |||
| │ │ │ ├── embedding.py | |||
| │ │ │ ├── __init__.py | |||
| │ │ │ ├── linear.py | |||
| │ │ │ ├── lstm.py | |||
| │ │ │ ├── masked_rnn.py | |||
| │ │ │ └── variational_rnn.py | |||
| │ │ ├── __init__.py | |||
| │ │ ├── interaction | |||
| │ │ │ └── __init__.py | |||
| │ │ ├── other_modules.py | |||
| │ │ └── utils.py | |||
| │ └── saver | |||
| │ ├── base_saver.py | |||
| │ ├── __init__.py | |||
| │ ├── logger.py | |||
| │ └── model_saver.py | |||
| ├── LICENSE | |||
| ├── README.md | |||
| ├── reproduction | |||
| │ ├── Char-aware_NLM | |||
| │ │ | |||
| │ ├── CNN-sentence_classification | |||
| │ │ | |||
| │ ├── HAN-document_classification | |||
| │ │ | |||
| │ └── LSTM+self_attention_sentiment_analysis | |||
| | | |||
| ├── requirements.txt | |||
| ├── setup.py | |||
| └── test | |||
| ├── core | |||
| ├── data_for_tests | |||
| │ ├── charlm.txt | |||
| │ ├── config | |||
| │ ├── cws_test | |||
| │ ├── cws_train | |||
| │ ├── people_infer.txt | |||
| │ └── people.txt | |||
| ├── test_charlm.py | |||
| ├── test_cws.py | |||
| ├── test_fastNLP.py | |||
| ├── test_loader.py | |||
| ├── test_seq_labeling.py | |||
| ├── test_tester.py | |||
| └── test_trainer.py | |||
| ├── __init__.py | |||
| ├── loader | |||
| ├── modules | |||
| └── readme_example.py | |||
| ``` | |||
| @@ -1,3 +1,4 @@ | |||
| sphinx | |||
| -e git://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme | |||
| sphinxcontrib.katex | |||
| numpy>=1.14.2 | |||
| http://download.pytorch.org/whl/cpu/torch-0.4.1-cp35-cp35m-linux_x86_64.whl | |||
| torchvision>=0.1.8 | |||
| sphinx-rtd-theme==0.4.1 | |||
| @@ -42,6 +42,8 @@ release = '1.0' | |||
| extensions = [ | |||
| 'sphinx.ext.autodoc', | |||
| 'sphinx.ext.viewcode', | |||
| 'sphinx.ext.autosummary', | |||
| ] | |||
| # Add any paths that contain templates here, relative to this directory. | |||
| @@ -1,62 +1,54 @@ | |||
| fastNLP.core package | |||
| ==================== | |||
| fastNLP.core | |||
| ============= | |||
| Submodules | |||
| ---------- | |||
| fastNLP.core.action module | |||
| -------------------------- | |||
| fastNLP.core.action | |||
| -------------------- | |||
| .. automodule:: fastNLP.core.action | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.core.metrics module | |||
| --------------------------- | |||
| fastNLP.core.loss | |||
| ------------------ | |||
| .. automodule:: fastNLP.core.loss | |||
| :members: | |||
| fastNLP.core.metrics | |||
| --------------------- | |||
| .. automodule:: fastNLP.core.metrics | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.core.optimizer module | |||
| ----------------------------- | |||
| fastNLP.core.optimizer | |||
| ----------------------- | |||
| .. automodule:: fastNLP.core.optimizer | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.core.predictor module | |||
| ----------------------------- | |||
| fastNLP.core.predictor | |||
| ----------------------- | |||
| .. automodule:: fastNLP.core.predictor | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.core.tester module | |||
| -------------------------- | |||
| fastNLP.core.preprocess | |||
| ------------------------ | |||
| .. automodule:: fastNLP.core.preprocess | |||
| :members: | |||
| fastNLP.core.tester | |||
| -------------------- | |||
| .. automodule:: fastNLP.core.tester | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.core.trainer module | |||
| --------------------------- | |||
| fastNLP.core.trainer | |||
| --------------------- | |||
| .. automodule:: fastNLP.core.trainer | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP.core | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,62 +1,36 @@ | |||
| fastNLP.loader package | |||
| ====================== | |||
| fastNLP.loader | |||
| =============== | |||
| Submodules | |||
| ---------- | |||
| fastNLP.loader.base\_loader module | |||
| ---------------------------------- | |||
| fastNLP.loader.base\_loader | |||
| ---------------------------- | |||
| .. automodule:: fastNLP.loader.base_loader | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.loader.config\_loader module | |||
| ------------------------------------ | |||
| fastNLP.loader.config\_loader | |||
| ------------------------------ | |||
| .. automodule:: fastNLP.loader.config_loader | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.loader.dataset\_loader module | |||
| ------------------------------------- | |||
| fastNLP.loader.dataset\_loader | |||
| ------------------------------- | |||
| .. automodule:: fastNLP.loader.dataset_loader | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.loader.embed\_loader module | |||
| ----------------------------------- | |||
| fastNLP.loader.embed\_loader | |||
| ----------------------------- | |||
| .. automodule:: fastNLP.loader.embed_loader | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.loader.model\_loader module | |||
| ----------------------------------- | |||
| fastNLP.loader.model\_loader | |||
| ----------------------------- | |||
| .. automodule:: fastNLP.loader.model_loader | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.loader.preprocess module | |||
| -------------------------------- | |||
| .. automodule:: fastNLP.loader.preprocess | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP.loader | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,46 +1,30 @@ | |||
| fastNLP.models package | |||
| ====================== | |||
| fastNLP.models | |||
| =============== | |||
| Submodules | |||
| ---------- | |||
| fastNLP.models.base\_model module | |||
| --------------------------------- | |||
| fastNLP.models.base\_model | |||
| --------------------------- | |||
| .. automodule:: fastNLP.models.base_model | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.models.char\_language\_model module | |||
| ------------------------------------------- | |||
| fastNLP.models.char\_language\_model | |||
| ------------------------------------- | |||
| .. automodule:: fastNLP.models.char_language_model | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.models.cnn\_text\_classification module | |||
| ----------------------------------------------- | |||
| fastNLP.models.cnn\_text\_classification | |||
| ----------------------------------------- | |||
| .. automodule:: fastNLP.models.cnn_text_classification | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.models.sequence\_modeling module | |||
| ---------------------------------------- | |||
| fastNLP.models.sequence\_modeling | |||
| ---------------------------------- | |||
| .. automodule:: fastNLP.models.sequence_modeling | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP.models | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,54 +1,36 @@ | |||
| fastNLP.modules.aggregation package | |||
| =================================== | |||
| fastNLP.modules.aggregation | |||
| ============================ | |||
| Submodules | |||
| ---------- | |||
| fastNLP.modules.aggregation.attention module | |||
| -------------------------------------------- | |||
| fastNLP.modules.aggregation.attention | |||
| -------------------------------------- | |||
| .. automodule:: fastNLP.modules.aggregation.attention | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.aggregation.avg\_pool module | |||
| -------------------------------------------- | |||
| fastNLP.modules.aggregation.avg\_pool | |||
| -------------------------------------- | |||
| .. automodule:: fastNLP.modules.aggregation.avg_pool | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.aggregation.kmax\_pool module | |||
| --------------------------------------------- | |||
| fastNLP.modules.aggregation.kmax\_pool | |||
| --------------------------------------- | |||
| .. automodule:: fastNLP.modules.aggregation.kmax_pool | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.aggregation.max\_pool module | |||
| -------------------------------------------- | |||
| fastNLP.modules.aggregation.max\_pool | |||
| -------------------------------------- | |||
| .. automodule:: fastNLP.modules.aggregation.max_pool | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.aggregation.self\_attention module | |||
| -------------------------------------------------- | |||
| fastNLP.modules.aggregation.self\_attention | |||
| -------------------------------------------- | |||
| .. automodule:: fastNLP.modules.aggregation.self_attention | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP.modules.aggregation | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,22 +1,18 @@ | |||
| fastNLP.modules.decoder package | |||
| =============================== | |||
| fastNLP.modules.decoder | |||
| ======================== | |||
| Submodules | |||
| ---------- | |||
| fastNLP.modules.decoder.CRF module | |||
| ---------------------------------- | |||
| fastNLP.modules.decoder.CRF | |||
| ---------------------------- | |||
| .. automodule:: fastNLP.modules.decoder.CRF | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.decoder.MLP | |||
| ---------------------------- | |||
| .. automodule:: fastNLP.modules.decoder.MLP | |||
| :members: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP.modules.decoder | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,78 +1,54 @@ | |||
| fastNLP.modules.encoder package | |||
| =============================== | |||
| fastNLP.modules.encoder | |||
| ======================== | |||
| Submodules | |||
| ---------- | |||
| fastNLP.modules.encoder.char\_embedding module | |||
| ---------------------------------------------- | |||
| fastNLP.modules.encoder.char\_embedding | |||
| ---------------------------------------- | |||
| .. automodule:: fastNLP.modules.encoder.char_embedding | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.encoder.conv module | |||
| ----------------------------------- | |||
| fastNLP.modules.encoder.conv | |||
| ----------------------------- | |||
| .. automodule:: fastNLP.modules.encoder.conv | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.encoder.conv\_maxpool module | |||
| -------------------------------------------- | |||
| fastNLP.modules.encoder.conv\_maxpool | |||
| -------------------------------------- | |||
| .. automodule:: fastNLP.modules.encoder.conv_maxpool | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.encoder.embedding module | |||
| ---------------------------------------- | |||
| fastNLP.modules.encoder.embedding | |||
| ---------------------------------- | |||
| .. automodule:: fastNLP.modules.encoder.embedding | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.encoder.linear module | |||
| ------------------------------------- | |||
| fastNLP.modules.encoder.linear | |||
| ------------------------------- | |||
| .. automodule:: fastNLP.modules.encoder.linear | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.encoder.lstm module | |||
| ----------------------------------- | |||
| fastNLP.modules.encoder.lstm | |||
| ----------------------------- | |||
| .. automodule:: fastNLP.modules.encoder.lstm | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.encoder.masked\_rnn module | |||
| ------------------------------------------ | |||
| fastNLP.modules.encoder.masked\_rnn | |||
| ------------------------------------ | |||
| .. automodule:: fastNLP.modules.encoder.masked_rnn | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.encoder.variational\_rnn module | |||
| ----------------------------------------------- | |||
| fastNLP.modules.encoder.variational\_rnn | |||
| ----------------------------------------- | |||
| .. automodule:: fastNLP.modules.encoder.variational_rnn | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP.modules.encoder | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,10 +1,5 @@ | |||
| fastNLP.modules.interaction package | |||
| =================================== | |||
| Module contents | |||
| --------------- | |||
| fastNLP.modules.interaction | |||
| ============================ | |||
| .. automodule:: fastNLP.modules.interaction | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,8 +1,5 @@ | |||
| fastNLP.modules package | |||
| ======================= | |||
| Subpackages | |||
| ----------- | |||
| fastNLP.modules | |||
| ================ | |||
| .. toctree:: | |||
| @@ -11,30 +8,18 @@ Subpackages | |||
| fastNLP.modules.encoder | |||
| fastNLP.modules.interaction | |||
| Submodules | |||
| ---------- | |||
| fastNLP.modules.other\_modules module | |||
| ------------------------------------- | |||
| fastNLP.modules.other\_modules | |||
| ------------------------------- | |||
| .. automodule:: fastNLP.modules.other_modules | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.modules.utils module | |||
| ---------------------------- | |||
| fastNLP.modules.utils | |||
| ---------------------- | |||
| .. automodule:: fastNLP.modules.utils | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP.modules | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,8 +1,5 @@ | |||
| fastNLP package | |||
| =============== | |||
| Subpackages | |||
| ----------- | |||
| fastNLP | |||
| ======== | |||
| .. toctree:: | |||
| @@ -12,22 +9,12 @@ Subpackages | |||
| fastNLP.modules | |||
| fastNLP.saver | |||
| Submodules | |||
| ---------- | |||
| fastNLP.fastnlp module | |||
| ---------------------- | |||
| fastNLP.fastnlp | |||
| ---------------- | |||
| .. automodule:: fastNLP.fastnlp | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,30 +1,18 @@ | |||
| fastNLP.saver package | |||
| ===================== | |||
| fastNLP.saver | |||
| ============== | |||
| Submodules | |||
| ---------- | |||
| fastNLP.saver.logger module | |||
| --------------------------- | |||
| fastNLP.saver.logger | |||
| --------------------- | |||
| .. automodule:: fastNLP.saver.logger | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| fastNLP.saver.model\_saver module | |||
| --------------------------------- | |||
| fastNLP.saver.model\_saver | |||
| --------------------------- | |||
| .. automodule:: fastNLP.saver.model_saver | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| Module contents | |||
| --------------- | |||
| .. automodule:: fastNLP.saver | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,16 +1,54 @@ | |||
| .. fastNLP documentation master file, created by | |||
| sphinx-quickstart on Mon Aug 20 17:06:44 2018. | |||
| You can adapt this file completely to your liking, but it should at least | |||
| contain the root `toctree` directive. | |||
| fastNLP documentation | |||
| ===================== | |||
| fastNLP is still incubating. | |||
| Welcome to fastNLP's documentation! | |||
| =================================== | |||
| Introduction | |||
| ------------ | |||
| fastNLP is a modular Natural Language Processing system based on PyTorch, for fast development of NLP tools. | |||
| It decomposes deep-learning-based NLP models into different modules. | |||
| These modules fall into 4 categories: encoder, interaction, aggregation and decoder, | |||
| and each category contains different implemented modules. | |||
| Most current NLP models can be built out of these modules, which greatly simplifies the process of developing NLP models. | |||
| The architecture of fastNLP is shown in the left figure below: | |||
| .. image:: figures/procedures_and_sequence_labeling.png | |||
| In the model-constructing phase, sequence labeling (upper-right figure) and text classification (figure below) serve as examples: | |||
| .. image:: figures/text_classification.png | |||
| * encoder module: encodes the input into an abstract representation; it takes a sequence of words and outputs a sequence of vectors. | |||
| * interaction module: lets the information inside the representation interact; it takes a sequence of vectors and outputs a sequence of vectors. | |||
| * aggregation module: aggregates and reduces information; it takes a sequence of vectors and outputs a single vector. | |||
| * decoder module: decodes the representation into the output, either a single label (text classification) or a sequence of labels (sequence labeling). | |||
| The interaction and aggregation modules are not necessarily present in a model, e.g. the sequence labeling model above. | |||
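| For example, the text classification model in the quickstart composes three of these categories (it has no interaction module); its forward pass, copied from the Basic Usage example: | |||
| .. code:: python | |||
|     x = self.emb(x)  # encoder: [N,L] -> [N,L,C] | |||
|     x = self.enc(x)  # encoder: [N,L,C_in] -> [N,L,C_out] | |||
|     x = self.agg(x)  # aggregation: [N,L,C] -> [N,C] | |||
|     x = self.dec(x)  # decoder: [N,C] -> [N, N_class] | |||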
| User's Guide | |||
| ------------ | |||
| .. toctree:: | |||
| :maxdepth: 2 | |||
| user/installation | |||
| user/quickstart | |||
| API Reference | |||
| ------------- | |||
| If you are looking for information on a specific function, class or | |||
| method, this part of the documentation is for you. | |||
| .. toctree:: | |||
| :maxdepth: 4 | |||
| :caption: Contents: | |||
| :maxdepth: 2 | |||
| fastNLP | |||
| fastNLP API <fastNLP> | |||
| @@ -1,7 +0,0 @@ | |||
| fastNLP | |||
| ======= | |||
| .. toctree:: | |||
| :maxdepth: 4 | |||
| fastNLP | |||
| @@ -0,0 +1,31 @@ | |||
| ============ | |||
| Installation | |||
| ============ | |||
| .. contents:: | |||
| :local: | |||
| Cloning From GitHub | |||
| ~~~~~~~~~~~~~~~~~~~ | |||
| If you just want to use fastNLP, use: | |||
| .. code:: shell | |||
| git clone https://github.com/fastnlp/fastNLP | |||
| cd fastNLP | |||
| PyTorch Installation | |||
| ~~~~~~~~~~~~~~~~~~~~ | |||
| Visit the `PyTorch official website <https://pytorch.org/>`_ for installation instructions based | |||
| on your system. In general, you could use: | |||
| .. code:: shell | |||
| # using conda | |||
| conda install pytorch torchvision -c pytorch | |||
| # or using pip | |||
| pip3 install torch torchvision | |||
| @@ -0,0 +1,84 @@ | |||
| ========== | |||
| Quickstart | |||
| ========== | |||
| Example | |||
| ------- | |||
| Basic Usage | |||
| ~~~~~~~~~~~ | |||
| A typical fastNLP routine is composed of four phases: loading dataset, | |||
| pre-processing data, constructing model and training model. | |||
| .. code:: python | |||
| from fastNLP.models.base_model import BaseModel | |||
| from fastNLP.modules import encoder | |||
| from fastNLP.modules import aggregation | |||
| from fastNLP.modules import decoder | |||
| from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
| from fastNLP.loader.preprocess import ClassPreprocess | |||
| from fastNLP.core.trainer import ClassificationTrainer | |||
| from fastNLP.core.inference import ClassificationInfer | |||
| class ClassificationModel(BaseModel): | |||
| """ | |||
| Simple text classification model based on CNN. | |||
| """ | |||
| def __init__(self, num_classes, vocab_size): | |||
| super(ClassificationModel, self).__init__() | |||
| self.emb = encoder.Embedding(nums=vocab_size, dims=300) | |||
| self.enc = encoder.Conv( | |||
| in_channels=300, out_channels=100, kernel_size=3) | |||
| self.agg = aggregation.MaxPool() | |||
| self.dec = decoder.MLP(100, num_classes=num_classes) | |||
| def forward(self, x): | |||
| x = self.emb(x) # [N,L] -> [N,L,C] | |||
| x = self.enc(x) # [N,L,C_in] -> [N,L,C_out] | |||
| x = self.agg(x) # [N,L,C] -> [N,C] | |||
| x = self.dec(x) # [N,C] -> [N, N_class] | |||
| return x | |||
| data_dir = 'data' # directory to save data and model | |||
| train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
| # load dataset | |||
| ds_loader = ClassDatasetLoader("train", train_path) | |||
| data = ds_loader.load() | |||
| # pre-process dataset | |||
| pre = ClassPreprocess(data_dir) | |||
| vocab_size, n_classes = pre.process(data, "data_train.pkl") | |||
| # construct model | |||
| model_args = { | |||
| 'num_classes': n_classes, | |||
| 'vocab_size': vocab_size | |||
| } | |||
| model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | |||
| # train model | |||
| train_args = { | |||
| "epochs": 20, | |||
| "batch_size": 50, | |||
| "pickle_path": data_dir, | |||
| "validate": False, | |||
| "save_best_dev": False, | |||
| "model_saved_path": None, | |||
| "use_cuda": True, | |||
| "learn_rate": 1e-3, | |||
| "momentum": 0.9} | |||
| trainer = ClassificationTrainer(train_args) | |||
| trainer.train(model) | |||
| # predict using model | |||
| seqs = [x[0] for x in data] | |||
| infer = ClassificationInfer(data_dir) | |||
| labels_pred = infer.predict(model, seqs) | |||
| @@ -1,7 +1,3 @@ | |||
| """ | |||
| This file defines Action(s) and sample methods. | |||
| """ | |||
| from collections import Counter | |||
| import numpy as np | |||
| @@ -9,13 +5,12 @@ import torch | |||
| class Action(object): | |||
| """ | |||
| Operations shared by Trainer, Tester, or Inference. | |||
| """Operations shared by Trainer, Tester, or Inference. | |||
| This is designed to reduce duplicated code. | |||
| - make_batch: produce a mini-batch of data. @staticmethod | |||
| - pad: padding method used in sequence modeling. @staticmethod | |||
| - mode: change network mode for either train or test. (for PyTorch) @staticmethod | |||
| The base Action shall define operations shared by as many task-specific Actions as possible. | |||
| """ | |||
| def __init__(self): | |||
| @@ -24,18 +19,20 @@ class Action(object): | |||
| @staticmethod | |||
| def make_batch(iterator, use_cuda, output_length=True, max_len=None): | |||
| """Batch and Pad data. | |||
| :param iterator: an iterator (an object that implements the __next__ method), which returns the next sample. | |||
| :param use_cuda: bool, whether to use GPU | |||
| :param output_length: bool, whether to output the original length of the sequence before padding. (default: True) | |||
| :param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None) | |||
| :return | |||
| if output_length is True: | |||
| :return : | |||
| if output_length is True, | |||
| (batch_x, seq_len): tuple of two elements | |||
| batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] | |||
| seq_len: list. The length of the pre-padded sequence, if output_length is True. | |||
| batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] | |||
| if output_length is False: | |||
| if output_length is False, | |||
| batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] | |||
| batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] | |||
| """ | |||
| @@ -77,21 +74,21 @@ class Action(object): | |||
| return batch | |||
| @staticmethod | |||
| def mode(model, test=False): | |||
| """ | |||
| Train mode or Test mode. This is for PyTorch currently. | |||
| :param model: | |||
| :param test: | |||
| def mode(model, is_test=False): | |||
| """Train mode or Test mode. This is for PyTorch currently. | |||
| :param model: a PyTorch model | |||
| :param is_test: bool, whether in test mode or not. | |||
| """ | |||
| if test: | |||
| if is_test: | |||
| model.eval() | |||
| else: | |||
| model.train() | |||
| def convert_to_torch_tensor(data_list, use_cuda): | |||
| """ | |||
| convert lists into (cuda) Tensors. | |||
| """Convert lists into (cuda) Tensors. | |||
| :param data_list: 2-level lists | |||
| :param use_cuda: bool, whether to use GPU or not | |||
| :return data_list: PyTorch Tensor of shape [batch_size, max_seq_len] | |||
| @@ -103,8 +100,8 @@ def convert_to_torch_tensor(data_list, use_cuda): | |||
| def k_means_1d(x, k, max_iter=100): | |||
| """ | |||
| Perform k-means on 1-D data. | |||
| """Perform k-means on 1-D data. | |||
| :param x: list of int, representing points in 1-D. | |||
| :param k: the number of clusters required. | |||
| :param max_iter: maximum number of iterations | |||
| @@ -132,21 +129,28 @@ def k_means_1d(x, k, max_iter=100): | |||
| def k_means_bucketing(all_inst, buckets): | |||
| """ | |||
| """Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths. | |||
| :param all_inst: 3-level list | |||
| E.g. :: | |||
| [ | |||
| [[word_11, word_12, word_13], [label_11, label_12]], # sample 1 | |||
| [[word_21, word_22, word_23], [label_21, label_22]], # sample 2 | |||
| ... | |||
| ] | |||
| :param buckets: list. The length of the list is the number of buckets. Each element is the maximum length | |||
| threshold of the corresponding bucket (usually None). | |||
| :return data: 2-level list | |||
| :: | |||
| [ | |||
| [index_11, index_12, ...], # bucket 1 | |||
| [index_21, index_22, ...], # bucket 2 | |||
| ... | |||
| ] | |||
| """ | |||
| bucket_data = [[] for _ in buckets] | |||
| num_buckets = len(buckets) | |||
| @@ -160,11 +164,16 @@ def k_means_bucketing(all_inst, buckets): | |||
| class BaseSampler(object): | |||
| """ | |||
| Base class for all samplers. | |||
| """The base class of all samplers. | |||
| """ | |||
| def __init__(self, data_set): | |||
| """ | |||
| :param data_set: multi-level list, of shape [num_example, *] | |||
| """ | |||
| self.data_set_length = len(data_set) | |||
| self.data = data_set | |||
| @@ -176,11 +185,16 @@ class BaseSampler(object): | |||
| class SequentialSampler(BaseSampler): | |||
| """ | |||
| Sample data in the original order. | |||
| """Sample data in the original order. | |||
| """ | |||
| def __init__(self, data_set): | |||
| """ | |||
| :param data_set: multi-level list | |||
| """ | |||
| super(SequentialSampler, self).__init__(data_set) | |||
| def __iter__(self): | |||
| @@ -188,11 +202,16 @@ class SequentialSampler(BaseSampler): | |||
| class RandomSampler(BaseSampler): | |||
| """ | |||
| Sample data in random permutation order. | |||
| """Sample data in random permutation order. | |||
| """ | |||
| def __init__(self, data_set): | |||
| """ | |||
| :param data_set: multi-level list | |||
| """ | |||
| super(RandomSampler, self).__init__(data_set) | |||
| self.order = np.random.permutation(self.data_set_length) | |||
| @@ -201,11 +220,18 @@ class RandomSampler(BaseSampler): | |||
| class Batchifier(object): | |||
| """ | |||
| Wrap random or sequential sampler to generate a mini-batch. | |||
| """Wrap random or sequential sampler to generate a mini-batch. | |||
| """ | |||
| def __init__(self, sampler, batch_size, drop_last=True): | |||
| """ | |||
| :param sampler: a Sampler object | |||
| :param batch_size: int, the size of the mini-batch | |||
| :param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch. | |||
| """ | |||
| super(Batchifier, self).__init__() | |||
| self.sampler = sampler | |||
| self.batch_size = batch_size | |||
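| # Typical usage, as in BaseTester.test below: | |||
| #   iterator = iter(Batchifier(RandomSampler(dev_data), batch_size, drop_last=False)) | |||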
| @@ -223,8 +249,7 @@ class Batchifier(object): | |||
| class BucketBatchifier(Batchifier): | |||
| """ | |||
| Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. | |||
| """Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. | |||
| In sampling, first randomly choose a bucket, then sample data from it. | |||
| The number of buckets is decided dynamically by the variance of sentence lengths. | |||
| """ | |||
| @@ -237,6 +262,7 @@ class BucketBatchifier(Batchifier): | |||
| :param num_buckets: int, number of buckets for grouping these sequences. | |||
| :param drop_last: bool, useless currently. | |||
| :param sampler: Sampler, useless currently. | |||
| """ | |||
| super(BucketBatchifier, self).__init__(sampler, batch_size, drop_last) | |||
| buckets = ([None] * num_buckets) | |||
| @@ -8,8 +8,13 @@ class Loss(object): | |||
| """ | |||
| def __init__(self, args): | |||
| """ | |||
| :param args: None or str, the name of a loss function. | |||
| """ | |||
| if args is None: | |||
| # this is useful when | |||
| # this is useful when Trainer.__init__ performs type check | |||
| self._loss = None | |||
| elif isinstance(args, str): | |||
| self._loss = self._borrow_from_pytorch(args) | |||
| @@ -17,10 +22,19 @@ class Loss(object): | |||
| raise NotImplementedError | |||
| def get(self): | |||
| """ | |||
| :return self._loss: the loss function | |||
| """ | |||
| return self._loss | |||
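| # For example, Loss("cross_entropy").get() returns torch.nn.CrossEntropyLoss(), per _borrow_from_pytorch below. | |||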
| @staticmethod | |||
| def _borrow_from_pytorch(loss_name): | |||
| """Given a name of a loss function, return it from PyTorch. | |||
| :param loss_name: str, the name of a loss function | |||
| :return loss: a PyTorch loss | |||
| """ | |||
| if loss_name == "cross_entropy": | |||
| return torch.nn.CrossEntropyLoss() | |||
| else: | |||
| @@ -1,11 +1,12 @@ | |||
| import warnings | |||
| import numpy as np | |||
| import torch | |||
| def _conver_numpy(x): | |||
| """ | |||
| convert input data to numpy array | |||
| """convert input data to numpy array | |||
| """ | |||
| if isinstance(x, np.ndarray): | |||
| return x | |||
| @@ -17,21 +18,20 @@ def _conver_numpy(x): | |||
| def _check_same_len(*arrays, axis=0): | |||
| """ | |||
| check if input array list has same length for one dimension | |||
| """check if input array list has same length for one dimension | |||
| """ | |||
| lens = set([x.shape[axis] for x in arrays if x is not None]) | |||
| return len(lens) == 1 | |||
| def _label_types(y): | |||
| """ | |||
| determine the type | |||
| "binary" | |||
| "multiclass" | |||
| "multiclass-multioutput" | |||
| "multilabel" | |||
| "unknown" | |||
| """Determine the type | |||
| - "binary" | |||
| - "multiclass" | |||
| - "multiclass-multioutput" | |||
| - "multilabel" | |||
| - "unknown" | |||
| """ | |||
| # never squeeze the first dimension | |||
| y = y.squeeze() if y.shape[0] > 1 else y.resize(1, -1) | |||
| @@ -46,8 +46,8 @@ def _label_types(y): | |||
| def _check_data(y_true, y_pred): | |||
| """ | |||
| check if y_true and y_pred is same type of data e.g both binary or multiclass | |||
| """Check if y_true and y_pred is same type of data e.g both binary or multiclass | |||
| """ | |||
| y_true, y_pred = _conver_numpy(y_true), _conver_numpy(y_pred) | |||
| if not _check_same_len(y_true, y_pred): | |||
| @@ -174,16 +174,13 @@ def classification_report(y_true, y_pred, labels=None, target_names=None, digits | |||
| def accuracy_topk(y_true, y_prob, k=1): | |||
| """ | |||
| Compute accuracy of y_true matching top-k probable | |||
| """Compute accuracy of y_true matching top-k probable | |||
| labels in y_prob. | |||
| Paras: | |||
| y_ture - ndarray, true label, [n_samples] | |||
| y_prob - ndarray, label probabilities, [n_samples, n_classes] | |||
| k - int, k in top-k | |||
| Returns: | |||
| accuracy of top-k | |||
| :param y_true: ndarray, true label, [n_samples] | |||
| :param y_prob: ndarray, label probabilities, [n_samples, n_classes] | |||
| :param k: int, k in top-k | |||
| :return: accuracy of top-k | |||
| """ | |||
| y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1] | |||
| @@ -195,16 +192,14 @@ def accuracy_topk(y_true, y_prob, k=1): | |||
| def pred_topk(y_prob, k=1): | |||
| """ | |||
| Return top-k predicted labels and corresponding probabilities. | |||
| Args: | |||
| y_prob - ndarray, size [n_samples, n_classes], probabilities on labels | |||
| k - int, k of top-k | |||
| Returns: | |||
| y_pred_topk - ndarray, size [n_samples, k], predicted top-k labels | |||
| y_prob_topk - ndarray, size [n_samples, k], probabilities for | |||
| top-k labels | |||
| """Return top-k predicted labels and corresponding probabilities. | |||
| :param y_prob: ndarray, size [n_samples, n_classes], probabilities on labels | |||
| :param k: int, k of top-k | |||
| :return y_pred_topk: ndarray, size [n_samples, k], predicted top-k labels | |||
| :return y_prob_topk: ndarray, size [n_samples, k], probabilities for top-k labels | |||
| """ | |||
| y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1] | |||
| @@ -4,7 +4,6 @@ import torch | |||
| class Optimizer(object): | |||
| """Wrapper of optimizer from framework | |||
| names: arguments (type) | |||
| 1. Adam: lr (float), weight_decay (float) | |||
| 2. AdaGrad | |||
| 3. RMSProp | |||
| @@ -16,20 +15,29 @@ class Optimizer(object): | |||
| """ | |||
| :param optimizer_name: str, the name of the optimizer | |||
| :param kwargs: the arguments | |||
| """ | |||
| self.optim_name = optimizer_name | |||
| self.kwargs = kwargs | |||
| @property | |||
| def name(self): | |||
| """The name of the optimizer. | |||
| :return: str | |||
| """ | |||
| return self.optim_name | |||
| @property | |||
| def params(self): | |||
| """The arguments used to create the optimizer. | |||
| :return: dict of (str, *) | |||
| """ | |||
| return self.kwargs | |||
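| # Example construction, matching Trainer's defaults below: Optimizer("Adam", lr=0.001, weight_decay=0) | |||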
| def construct_from_pytorch(self, model_params): | |||
| """construct a optimizer from framework over given model parameters""" | |||
| """Construct a optimizer from framework over given model parameters.""" | |||
| if self.optim_name in ["SGD", "sgd"]: | |||
| if "lr" in self.kwargs: | |||
| @@ -70,7 +70,7 @@ class Predictor(object): | |||
| def predict(self, network, data): | |||
| """Perform inference using the trained model. | |||
| :param network: a PyTorch model | |||
| :param network: a PyTorch model (cpu) | |||
| :param data: list of list of strings | |||
| :return: list of list of strings, [num_examples, tag_seq_length] | |||
| """ | |||
| @@ -17,12 +17,24 @@ DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||
| # the first vocab in dict with the index = 5 | |||
| def save_pickle(obj, pickle_path, file_name): | |||
| """Save an object into a pickle file. | |||
| :param obj: an object | |||
| :param pickle_path: str, the directory where the pickle file is to be saved | |||
| :param file_name: str, the name of the pickle file. In general, it should end with ".pkl". | |||
| """ | |||
| with open(os.path.join(pickle_path, file_name), "wb") as f: | |||
| _pickle.dump(obj, f) | |||
| print("{} saved in {}".format(file_name, pickle_path)) | |||
| def load_pickle(pickle_path, file_name): | |||
| """Load an object from a given pickle file. | |||
| :param pickle_path: str, the directory where the pickle file is. | |||
| :param file_name: str, the name of the pickle file. | |||
| :return obj: an object stored in the pickle | |||
| """ | |||
| with open(os.path.join(pickle_path, file_name), "rb") as f: | |||
| obj = _pickle.load(f) | |||
| print("{} loaded from {}".format(file_name, pickle_path)) | |||
| @@ -30,7 +42,8 @@ def load_pickle(pickle_path, file_name): | |||
| def pickle_exist(pickle_path, pickle_name): | |||
| """ | |||
| """Check if a given pickle file exists in the directory. | |||
| :param pickle_path: the directory of target pickle file | |||
| :param pickle_name: the filename of target pickle file | |||
| :return: True if file exists else False | |||
| @@ -45,6 +58,19 @@ def pickle_exist(pickle_path, pickle_name): | |||
| class BasePreprocess(object): | |||
| """Base class of all preprocessors. | |||
| Preprocessors are responsible for converting data of strings into data of indices. | |||
| During the pre-processing, the following pickle files will be built: | |||
| - "word2id.pkl", a mapping from words(tokens) to indices | |||
| - "id2word.pkl", a reversed dictionary | |||
| - "label2id.pkl", a dictionary on labels | |||
| - "id2label.pkl", a reversed dictionary on labels | |||
| These four pickle files are expected to be saved in the given pickle directory once they are constructed. | |||
| Preprocessors will check if those files are already in the directory and will reuse them in future calls. | |||
| """ | |||
| def __init__(self): | |||
| self.word2index = None | |||
| self.label2index = None | |||
| @@ -59,6 +85,7 @@ class BasePreprocess(object): | |||
| def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10): | |||
| """Main preprocessing pipeline. | |||
| :param train_dev_data: three-level list, with either single label or multiple labels in a sample. | |||
| :param test_data: three-level list, with either single label or multiple labels in a sample. (optional) | |||
| :param pickle_path: str, the path to save the pickle files. | |||
| @@ -67,6 +94,7 @@ class BasePreprocess(object): | |||
| :param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True. | |||
| :return results: a tuple of datasets after preprocessing. | |||
| """ | |||
| if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): | |||
| self.word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| self.label2index = load_pickle(pickle_path, "class2id.pkl") | |||
| @@ -182,25 +210,31 @@ class SeqLabelPreprocess(BasePreprocess): | |||
| """Preprocess pipeline, including building mapping from words to index, from index to words, | |||
| from labels/classes to index, from index to labels/classes. | |||
| designed for data of three-level lists that have multiple labels in each sample. | |||
| :: | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| """ | |||
| def __init__(self): | |||
| super(SeqLabelPreprocess, self).__init__() | |||
| def build_dict(self, data): | |||
| """ | |||
| Add new words with indices into self.word_dict, new labels with indices into self.label_dict. | |||
| """Add new words with indices into self.word_dict, new labels with indices into self.label_dict. | |||
| :param data: three-level list | |||
| :: | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| :return word2index: dict of {str, int} | |||
| label2index: dict of {str, int} | |||
| """ | |||
| @@ -216,14 +250,17 @@ class SeqLabelPreprocess(BasePreprocess): | |||
| return word2index, label2index | |||
| def to_index(self, data): | |||
| """ | |||
| Convert word strings and label strings into indices. | |||
| """Convert word strings and label strings into indices. | |||
| :param data: three-level list | |||
| :: | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| :return data_index: the same shape as data, but each string is replaced by its corresponding index | |||
| """ | |||
| data_index = [] | |||
| @@ -242,11 +279,14 @@ class ClassPreprocess(BasePreprocess): | |||
| Preprocess pipeline, including building mapping from words to index, from index to words, | |||
| from labels/classes to index, from index to labels/classes. | |||
| designed for data of three-level lists that have a single label in each sample. | |||
| :: | |||
| [ | |||
| [ [word_11, word_12, ...], label_1 ], | |||
| [ [word_21, word_22, ...], label_2 ], | |||
| ... | |||
| ] | |||
| """ | |||
| def __init__(self): | |||
| @@ -269,18 +309,21 @@ class ClassPreprocess(BasePreprocess): | |||
| for word in sent: | |||
| if word not in word2index: | |||
| word2index[word[0]] = len(word2index) | |||
| word2index[word] = len(word2index) | |||
| return word2index, label2index | |||
| def to_index(self, data): | |||
| """ | |||
| Convert word strings and label strings into indices. | |||
| """Convert word strings and label strings into indices. | |||
| :param data: three-level list | |||
| :: | |||
| [ | |||
| [ [word_11, word_12, ...], label_1 ], | |||
| [ [word_21, word_22, ...], label_2 ], | |||
| ... | |||
| ] | |||
| :return data_index: the same shape as data, but each string is replaced by its corresponding index | |||
| """ | |||
| data_index = [] | |||
| @@ -295,14 +338,15 @@ class ClassPreprocess(BasePreprocess): | |||
| def infer_preprocess(pickle_path, data): | |||
| """ | |||
| Preprocess over inference data. | |||
| Transform three-level list of strings into that of index. | |||
| """Preprocess over inference data. Transform three-level list of strings into that of index. | |||
| :: | |||
| [ | |||
| [word_11, word_12, ...], | |||
| [word_21, word_22, ...], | |||
| ... | |||
| ] | |||
| """ | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| data_index = [] | |||
| @@ -38,7 +38,7 @@ class BaseTester(object): | |||
| Obviously, "required_args" is the subset of "default_args". | |||
| The value in "default_args" to the keys in "required_args" is simply for type check. | |||
| """ | |||
| # TODO: required arguments | |||
| # add required arguments here | |||
| required_args = {} | |||
| for req_key in required_args: | |||
| @@ -56,7 +56,7 @@ class BaseTester(object): | |||
| logger.error(msg) | |||
| raise ValueError(msg) | |||
| else: | |||
| # BeseTester doesn't care about extra arguments | |||
| # BaseTester doesn't care about extra arguments | |||
| pass | |||
| print(default_args) | |||
| @@ -69,8 +69,8 @@ class BaseTester(object): | |||
| self.print_every_step = default_args["print_every_step"] | |||
| self._model = None | |||
| self.eval_history = [] | |||
| self.batch_output = [] | |||
| self.eval_history = [] # evaluation results of all batches | |||
| self.batch_output = [] # outputs of all batches | |||
| def test(self, network, dev_data): | |||
| if torch.cuda.is_available() and self.use_cuda: | |||
| @@ -83,10 +83,10 @@ class BaseTester(object): | |||
| self.eval_history.clear() | |||
| self.batch_output.clear() | |||
| iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) | |||
| iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=False)) | |||
| step = 0 | |||
| for batch_x, batch_y in self.make_batch(iterator, dev_data): | |||
| for batch_x, batch_y in self.make_batch(iterator): | |||
| with torch.no_grad(): | |||
| prediction = self.data_forward(network, batch_x) | |||
| eval_results = self.evaluate(prediction, batch_y) | |||
| @@ -99,7 +99,7 @@ class BaseTester(object): | |||
| print_output = "[test step {}] {}".format(step, eval_results) | |||
| logger.info(print_output) | |||
| if self.print_every_step > 0 and step % self.print_every_step == 0: | |||
| print(print_output) | |||
| print(self.make_eval_output(prediction, eval_results)) | |||
| step += 1 | |||
| def mode(self, model, test): | |||
| @@ -115,28 +115,48 @@ class BaseTester(object): | |||
| raise NotImplementedError | |||
| def evaluate(self, predict, truth): | |||
| """Compute evaluation metrics for the model. """ | |||
| """Compute evaluation metrics. | |||
| :param predict: Tensor | |||
| :param truth: Tensor | |||
| :return eval_results: can be anything. It will be stored in self.eval_history | |||
| """ | |||
| raise NotImplementedError | |||
| @property | |||
| def metrics(self): | |||
| """Return a list of metrics. """ | |||
| """Compute and return metrics. | |||
| Use self.eval_history to compute metrics over the whole dev set. | |||
| Please refer to metrics.py for common metric functions. | |||
| :return : variable number of outputs | |||
| """ | |||
| raise NotImplementedError | |||
| def show_matrices(self): | |||
| """This is called by Trainer to print evaluation results on dev set during training. | |||
| def show_metrics(self): | |||
| """Customize evaluation outputs in Trainer. | |||
| Called by Trainer to print evaluation results on dev set during training. | |||
| Use self.metrics to fetch available metrics. | |||
| :return print_str: str | |||
| """ | |||
| raise NotImplementedError | |||
| def make_batch(self, iterator, data): | |||
| def make_batch(self, iterator): | |||
| raise NotImplementedError | |||
| def make_eval_output(self, predictions, eval_results): | |||
| """Customize Tester outputs. | |||
| :param predictions: Tensor | |||
| :param eval_results: Tensor | |||
| :return: str, to be printed. | |||
| """ | |||
| raise NotImplementedError | |||
| class SeqLabelTester(BaseTester): | |||
| """ | |||
| Tester for sequence labeling. | |||
| """Tester for sequence labeling. | |||
| """ | |||
| def __init__(self, **test_args): | |||
| @@ -194,15 +214,15 @@ class SeqLabelTester(BaseTester): | |||
| batch_accuracy = np.mean([x[1] for x in self.eval_history]) | |||
| return batch_loss, batch_accuracy | |||
| def show_matrices(self): | |||
| """ | |||
| This is called by Trainer to print evaluation on dev set. | |||
| def show_metrics(self): | |||
| """This is called by Trainer to print evaluation on dev set. | |||
| :return print_str: str | |||
| """ | |||
| loss, accuracy = self.metrics() | |||
| return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy) | |||
| def make_batch(self, iterator, data): | |||
| def make_batch(self, iterator): | |||
| return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True) | |||
| @@ -211,12 +231,12 @@ class ClassificationTester(BaseTester): | |||
| def __init__(self, **test_args): | |||
| """ | |||
| :param test_args: a dict-like object that has __getitem__ method, \ | |||
| :param test_args: a dict-like object that has __getitem__ method, | |||
| so it can be accessed by "test_args["key_str"]". | |||
| """ | |||
| super(ClassificationTester, self).__init__(**test_args) | |||
| def make_batch(self, iterator, data, max_len=None): | |||
| def make_batch(self, iterator, max_len=None): | |||
| return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len) | |||
| def data_forward(self, network, x): | |||
| @@ -1,10 +1,11 @@ | |||
| import _pickle | |||
| import copy | |||
| import os | |||
| import time | |||
| from datetime import timedelta | |||
| import torch | |||
| import tensorboardX | |||
| from tensorboardX import SummaryWriter | |||
| from fastNLP.core.action import Action | |||
| from fastNLP.core.action import RandomSampler, Batchifier | |||
| @@ -15,16 +16,12 @@ from fastNLP.modules import utils | |||
| from fastNLP.saver.logger import create_logger | |||
| from fastNLP.saver.model_saver import ModelSaver | |||
| DEFAULT_QUEUE_SIZE = 300 | |||
| logger = create_logger(__name__, "./train_test.log") | |||
| class BaseTrainer(object): | |||
| """Operations to train a model, including data loading, SGD, and validation. | |||
| """Operations of training a model, including data loading, gradient descent, and validation. | |||
| Subclasses must implement the following abstract methods: | |||
| - grad_backward | |||
| - get_loss | |||
| """ | |||
| def __init__(self, **kwargs): | |||
| @@ -32,10 +29,10 @@ class BaseTrainer(object): | |||
| :param kwargs: dict of (key, value), or dict-like object. key is str. | |||
| The base trainer requires the following keys: | |||
| - epochs: int, the number of epochs in training | |||
| - validate: bool, whether or not to validate on dev set | |||
| - batch_size: int | |||
| - pickle_path: str, the path to pickle files for pre-processing | |||
| - epochs: int, the number of epochs in training | |||
| - validate: bool, whether or not to validate on dev set | |||
| - batch_size: int | |||
| - pickle_path: str, the path to pickle files for pre-processing | |||
| """ | |||
| super(BaseTrainer, self).__init__() | |||
| @@ -47,7 +44,7 @@ class BaseTrainer(object): | |||
| """ | |||
| default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/", | |||
| "save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1, | |||
| "loss": Loss(None), | |||
| "loss": Loss(None), # used to pass type check | |||
| "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0) | |||
| } | |||
| """ | |||
| @@ -56,7 +53,7 @@ class BaseTrainer(object): | |||
| Obviously, "required_args" is the subset of "default_args". | |||
| The value in "default_args" to the keys in "required_args" is simply for type check. | |||
| """ | |||
| # TODO: required arguments | |||
| # add required arguments here | |||
| required_args = {} | |||
| for req_key in required_args: | |||
| @@ -91,9 +88,12 @@ class BaseTrainer(object): | |||
| self._loss_func = default_args["loss"].get() # return a pytorch loss function or None | |||
| self._optimizer = None | |||
| self._optimizer_proto = default_args["optimizer"] | |||
| self._summary_writer = SummaryWriter(self.pickle_path + 'tensorboard_logs') | |||
| self._graph_summaried = False | |||
| def train(self, network, train_data, dev_data=None): | |||
| """General Training Procedure | |||
| :param network: a model | |||
| :param train_data: three-level list, the training set. | |||
| :param dev_data: three-level list, the validation data (optional) | |||
| @@ -144,12 +144,13 @@ class BaseTrainer(object): | |||
| print("Saved better model selected by validation.") | |||
| logger.info("Saved better model selected by validation.") | |||
| valid_results = validator.show_matrices() | |||
| valid_results = validator.show_metrics() | |||
| print("[epoch {}] {}".format(epoch, valid_results)) | |||
| logger.info("[epoch {}] {}".format(epoch, valid_results)) | |||
| def _train_step(self, data_iterator, network, **kwargs): | |||
| """Training process in one epoch. | |||
| kwargs should contain: | |||
| - n_print: int, print training information every n steps. | |||
| - start: time.time(), the starting time of this step. | |||
| @@ -163,6 +164,11 @@ class BaseTrainer(object): | |||
| loss = self.get_loss(prediction, batch_y) | |||
| self.grad_backward(loss) | |||
| self.update() | |||
| self._summary_writer.add_scalar("loss", loss.item(), global_step=step) | |||
| if not self._graph_summaried: | |||
| self._summary_writer.add_graph(network, batch_x) | |||
| self._graph_summaried = True | |||
| if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0: | |||
| end = time.time() | |||
| @@ -198,21 +204,6 @@ class BaseTrainer(object): | |||
| network_copy = copy.deepcopy(network) | |||
| self.train(network_copy, train_data_cv[i], dev_data_cv[i]) | |||
| def load_train_data(self, pickle_path): | |||
| """ | |||
| For task-specific processing. | |||
| :param pickle_path: | |||
| :return data_train | |||
| """ | |||
| file_path = os.path.join(pickle_path, "data_train.pkl") | |||
| if os.path.exists(file_path): | |||
| with open(file_path, 'rb') as f: | |||
| data = _pickle.load(f) | |||
| else: | |||
| logger.error("cannot find training data {}. invalid input path for training data.".format(file_path)) | |||
| raise RuntimeError("cannot find training data {}".format(file_path)) | |||
| return data | |||
| def make_batch(self, iterator): | |||
| raise NotImplementedError | |||
| @@ -220,14 +211,13 @@ class BaseTrainer(object): | |||
| Action.mode(network, test) | |||
| def define_optimizer(self): | |||
| """ | |||
| Define framework-specific optimizer specified by the models. | |||
| """Define framework-specific optimizer specified by the models. | |||
| """ | |||
| self._optimizer = self._optimizer_proto.construct_from_pytorch(self._model.parameters()) | |||
| def update(self): | |||
| """ | |||
| Perform weight update on a model. | |||
| """Perform weight update on a model. | |||
| For PyTorch, just call optimizer to update. | |||
| """ | |||
| @@ -237,8 +227,8 @@ class BaseTrainer(object): | |||
| raise NotImplementedError | |||
| def grad_backward(self, loss): | |||
| """ | |||
| Compute gradient with link rules. | |||
| """Compute gradient with link rules. | |||
| :param loss: a scalar where back-prop starts | |||
| For PyTorch, just do "loss.backward()" | |||
| @@ -247,8 +237,8 @@ class BaseTrainer(object): | |||
| loss.backward() | |||
| def get_loss(self, predict, truth): | |||
| """ | |||
| Compute loss given prediction and ground truth. | |||
| """Compute loss given prediction and ground truth. | |||
| :param predict: prediction label vector | |||
| :param truth: ground truth label vector | |||
| :return: a scalar | |||
| @@ -256,8 +246,9 @@ class BaseTrainer(object): | |||
| return self._loss_func(predict, truth) | |||
| def define_loss(self): | |||
| """ | |||
| if the model defines a loss, use model's loss. | |||
| """Define a loss for the trainer. | |||
| If the model defines a loss, use model's loss. | |||
| Otherwise, the Trainer must have a loss argument; use it as the loss. | |||
| These two losses cannot be defined at the same time. | |||
| Trainer does not handle loss definition or choose default losses. | |||
| @@ -274,7 +265,8 @@ class BaseTrainer(object): | |||
| logger.info("The model didn't define loss, use Trainer's loss.") | |||
| def best_eval_result(self, validator): | |||
| """ | |||
| """Check if the current epoch yields better validation results. | |||
| :param validator: a Tester instance | |||
| :return: bool, True means the current results on the dev set are the best. | |||
| """ | |||
| @@ -289,15 +281,14 @@ class BaseTrainer(object): | |||
| """ | |||
| if model_name[-4:] != ".pkl": | |||
| model_name += ".pkl" | |||
| ModelSaver(self.pickle_path + model_name).save_pytorch(network) | |||
| ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network) | |||
| def _create_validator(self, valid_args): | |||
| raise NotImplementedError | |||
| class SeqLabelTrainer(BaseTrainer): | |||
| """ | |||
| Trainer for Sequence Labeling | |||
| """Trainer for Sequence Labeling | |||
| """ | |||
| @@ -327,11 +318,11 @@ class SeqLabelTrainer(BaseTrainer): | |||
| return y | |||
| def get_loss(self, predict, truth): | |||
| """ | |||
| Compute loss given prediction and ground truth. | |||
| """Compute loss given prediction and ground truth. | |||
| :param predict: prediction label vector, [batch_size, max_len, tag_size] | |||
| :param truth: ground truth label vector, [batch_size, max_len] | |||
| :return: a scalar | |||
| :return loss: a scalar | |||
| """ | |||
| batch_size, max_len = predict.size(0), predict.size(1) | |||
| assert truth.shape == (batch_size, max_len) | |||
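| Given these shapes, one common way to realize a token-level loss is to flatten the batch and sequence dimensions before a cross-entropy. A minimal sketch (not necessarily the exact computation fastNLP performs, which may route through a CRF when `use_crf` is set): | |||
| ```python | |||
| import torch | |||
| import torch.nn.functional as F | |||
| batch_size, max_len, tag_size = 2, 5, 4 | |||
| predict = torch.randn(batch_size, max_len, tag_size)       # [batch_size, max_len, tag_size] | |||
| truth = torch.randint(0, tag_size, (batch_size, max_len))  # [batch_size, max_len] | |||
| # treat every token position as one classification example | |||
| loss = F.cross_entropy(predict.view(-1, tag_size), truth.view(-1)) | |||
| print(loss.item())  # a scalar, as the docstring promises | |||
| ``` | |||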
| @@ -1,3 +1,5 @@ | |||
| import os | |||
| from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer | |||
| from fastNLP.core.preprocess import load_pickle | |||
| from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
| @@ -31,6 +33,22 @@ FastNLP_MODEL_COLLECTION = { | |||
| "type": "seq_label", | |||
| "config_file_name": "config", | |||
| "config_section_name": "text_class_model" | |||
| }, | |||
| "pos_tag_model": { | |||
| "url": "", | |||
| "class": "sequence_modeling.AdvSeqLabel", | |||
| "pickle": "pos_tag_model_v_0.pkl", | |||
| "type": "seq_label", | |||
| "config_file_name": "pos_tag.config", | |||
| "config_section_name": "pos_tag_model" | |||
| }, | |||
| "text_classify_model": { | |||
| "url": "", | |||
| "class": "cnn_text_classification.CNNText", | |||
| "pickle": "text_class_model_v0.pkl", | |||
| "type": "text_class", | |||
| "config_file_name": "text_classify.cfg", | |||
| "config_section_name": "model" | |||
| } | |||
| } | |||
| @@ -77,7 +95,7 @@ class FastNLP(object): | |||
| print("Restore model class {}".format(str(model_class))) | |||
| model_args = ConfigSection() | |||
| ConfigLoader.load_config(self.model_dir + config_file, {section_name: model_args}) | |||
| ConfigLoader.load_config(os.path.join(self.model_dir, config_file), {section_name: model_args}) | |||
| print("Restore model hyper-parameters {}".format(str(model_args.data))) | |||
| # fetch dictionary size and number of labels from pickle files | |||
| @@ -91,7 +109,7 @@ class FastNLP(object): | |||
| print("Model constructed.") | |||
| # TODO: make this framework-independent | |||
| ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name]["pickle"]) | |||
| ModelLoader.load_pytorch(model, os.path.join(self.model_dir, FastNLP_MODEL_COLLECTION[model_name]["pickle"])) | |||
| print("Model weights loaded.") | |||
| self.model = model | |||
| @@ -259,3 +277,38 @@ def interpret_word_seg_results(char_seq, label_seq): | |||
| else: | |||
| raise ValueError("invalid label {}".format(label[0])) | |||
| return words | |||
| def interpret_cws_pos_results(char_seq, label_seq): | |||
| """Transform model output into user-friendly contents. | |||
| :param char_seq: list of string | |||
| :param label_seq: list of string, the same length as char_seq. | |||
| :return outputs: list of tuples (word, pos_tag) | |||
| """ | |||
| def pos_tag_check(seq): | |||
| """check whether all entries are the same """ | |||
| return len(set(seq)) <= 1 | |||
| word = [] | |||
| word_pos = [] | |||
| outputs = [] | |||
| for char, label in zip(char_seq, label_seq): | |||
| tmp = label.split("-") | |||
| cws_label, pos_tag = tmp[0], tmp[1] | |||
| if cws_label == "B" or cws_label == "M": | |||
| word.append(char) | |||
| word_pos.append(pos_tag) | |||
| elif cws_label == "E": | |||
| word.append(char) | |||
| word_pos.append(pos_tag) | |||
| if not pos_tag_check(word_pos): | |||
| raise RuntimeError("character-wise pos tags inconsistent. ") | |||
| outputs.append(("".join(word), word_pos[0])) | |||
| word.clear() | |||
| word_pos.clear() | |||
| elif cws_label == "S": | |||
| outputs.append((char, pos_tag)) | |||
| return outputs | |||
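| A quick usage sketch for `interpret_cws_pos_results`, with labels of the `{B,M,E,S}-{pos}` form it expects: | |||
| ```python | |||
| from fastNLP.fastnlp import interpret_cws_pos_results | |||
| chars = ["深", "度", "学", "习"] | |||
| labels = ["B-d", "E-d", "B-v", "E-v"] | |||
| print(interpret_cws_pos_results(chars, labels)) | |||
| # [('深度', 'd'), ('学习', 'v')] | |||
| ``` | |||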
| @@ -1,9 +1,8 @@ | |||
| class BaseLoader(object): | |||
| """docstring for BaseLoader""" | |||
| def __init__(self, data_name, data_path): | |||
| def __init__(self, data_path): | |||
| super(BaseLoader, self).__init__() | |||
| self.data_name = data_name | |||
| self.data_path = data_path | |||
| def load(self): | |||
| @@ -25,8 +24,8 @@ class ToyLoader0(BaseLoader): | |||
| For charLM | |||
| """ | |||
| def __init__(self, name, path): | |||
| super(ToyLoader0, self).__init__(name, path) | |||
| def __init__(self, data_path): | |||
| super(ToyLoader0, self).__init__(data_path) | |||
| def load(self): | |||
| with open(self.data_path, 'r') as f: | |||
| @@ -9,7 +9,7 @@ class ConfigLoader(BaseLoader): | |||
| """loader for configuration files""" | |||
| def __init__(self, data_path): | |||
| super(ConfigLoader, self).__init__(data_name, data_path) | |||
| super(ConfigLoader, self).__init__(data_path) | |||
| self.config = self.parse(super(ConfigLoader, self).load()) | |||
| @staticmethod | |||
| @@ -100,7 +100,7 @@ class ConfigSection(object): | |||
| if __name__ == "__main__": | |||
| config = ConfigLoader('configLoader', 'there is no data') | |||
| config = ConfigLoader('there is no data') | |||
| section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()} | |||
| """ | |||
| @@ -6,8 +6,8 @@ from fastNLP.loader.base_loader import BaseLoader | |||
| class DatasetLoader(BaseLoader): | |||
| """"loader for data sets""" | |||
| def __init__(self, data_name, data_path): | |||
| super(DatasetLoader, self).__init__(data_name, data_path) | |||
| def __init__(self, data_path): | |||
| super(DatasetLoader, self).__init__(data_path) | |||
| class POSDatasetLoader(DatasetLoader): | |||
| @@ -31,8 +31,8 @@ class POSDatasetLoader(DatasetLoader): | |||
| to label5. | |||
| """ | |||
| def __init__(self, data_name, data_path): | |||
| super(POSDatasetLoader, self).__init__(data_name, data_path) | |||
| def __init__(self, data_path): | |||
| super(POSDatasetLoader, self).__init__(data_path) | |||
| def load(self): | |||
| assert os.path.exists(self.data_path) | |||
| @@ -84,8 +84,8 @@ class TokenizeDatasetLoader(DatasetLoader): | |||
| Data set loader for tokenization data sets | |||
| """ | |||
| def __init__(self, data_name, data_path): | |||
| super(TokenizeDatasetLoader, self).__init__(data_name, data_path) | |||
| def __init__(self, data_path): | |||
| super(TokenizeDatasetLoader, self).__init__(data_path) | |||
| def load_pku(self, max_seq_len=32): | |||
| """ | |||
| @@ -138,8 +138,8 @@ class TokenizeDatasetLoader(DatasetLoader): | |||
| class ClassDatasetLoader(DatasetLoader): | |||
| """Loader for classification data sets""" | |||
| def __init__(self, data_name, data_path): | |||
| super(ClassDatasetLoader, self).__init__(data_name, data_path) | |||
| def __init__(self, data_path): | |||
| super(ClassDatasetLoader, self).__init__(data_path) | |||
| def load(self): | |||
| assert os.path.exists(self.data_path) | |||
| @@ -177,7 +177,7 @@ class ConllLoader(DatasetLoader): | |||
| :param str data_path: the path to the conll data set | |||
| """ | |||
| super(ConllLoader, self).__init__(data_name, data_path) | |||
| super(ConllLoader, self).__init__(data_path) | |||
| self.data_set = self.parse(self.load()) | |||
| def load(self): | |||
| @@ -209,8 +209,8 @@ class ConllLoader(DatasetLoader): | |||
| class LMDatasetLoader(DatasetLoader): | |||
| def __init__(self, data_name, data_path): | |||
| super(LMDatasetLoader, self).__init__(data_name, data_path) | |||
| def __init__(self, data_path): | |||
| super(LMDatasetLoader, self).__init__(data_path) | |||
| def load(self): | |||
| if not os.path.exists(self.data_path): | |||
| @@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader): | |||
| return text.strip().split() | |||
| if __name__ == "__main__": | |||
| class PeopleDailyCorpusLoader(DatasetLoader): | |||
| """ | |||
| data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines() | |||
| for example in data: | |||
| for w, l in zip(example[0], example[1]): | |||
| print(w, l) | |||
| People Daily Corpus: Chinese word segmentation, POS tag, NER | |||
| """ | |||
| ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku() | |||
| print(ans) | |||
| def __init__(self, data_path): | |||
| super(PeopleDailyCorpusLoader, self).__init__(data_path) | |||
| def load(self): | |||
| with open(self.data_path, "r", encoding="utf-8") as f: | |||
| sents = f.readlines() | |||
| pos_tag_examples = [] | |||
| ner_examples = [] | |||
| for sent in sents: | |||
| inside_ne = False | |||
| sent_pos_tag = [] | |||
| sent_words = [] | |||
| sent_ner = [] | |||
| words = sent.strip().split()[1:]  # drop the leading document-ID token | |||
| for word in words: | |||
| if "[" in word and "]" in word: | |||
| ner_tag = "U" | |||
| print(word) | |||
| elif "[" in word: | |||
| inside_ne = True | |||
| ner_tag = "B" | |||
| word = word[1:] | |||
| elif "]" in word: | |||
| ner_tag = "L" | |||
| word = word[:word.index("]")] | |||
| if inside_ne is True: | |||
| inside_ne = False | |||
| else: | |||
| raise RuntimeError("only ] appears!") | |||
| else: | |||
| if inside_ne is True: | |||
| ner_tag = "I" | |||
| else: | |||
| ner_tag = "O" | |||
| tmp = word.split("/") | |||
| token, pos = tmp[0], tmp[1] | |||
| sent_ner.append(ner_tag) | |||
| sent_pos_tag.append(pos) | |||
| sent_words.append(token) | |||
| pos_tag_examples.append([sent_words, sent_pos_tag]) | |||
| ner_examples.append([sent_words, sent_ner]) | |||
| return pos_tag_examples, ner_examples | |||
| if __name__ == "__main__": | |||
| loader = PeopleDailyCorpusLoader("./") | |||
| pos, ner = loader.load() | |||
| print(pos[:10]) | |||
| print(ner[:10]) | |||
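| For reference, the loader expects People's Daily-style lines: a leading ID token followed by `word/pos` tokens, with square brackets grouping named entities. A hand-check with an illustrative one-line corpus written to a temporary file: | |||
| ```python | |||
| import tempfile | |||
| from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader | |||
| sample = "19980101-01-001-001/m  [中国/ns  政府/n]nt  于/p  近日/t  表示/v\n"  # contents are illustrative | |||
| with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f: | |||
|     f.write(sample) | |||
|     path = f.name | |||
| pos, ner = PeopleDailyCorpusLoader(path).load() | |||
| print(pos[0])  # [['中国', '政府', '于', '近日', '表示'], ['ns', 'n', 'p', 't', 'v']] | |||
| print(ner[0])  # [['中国', '政府', '于', '近日', '表示'], ['B', 'L', 'O', 'O', 'O']] | |||
| ``` | |||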
| @@ -1,8 +1,50 @@ | |||
| import _pickle | |||
| import os | |||
| import numpy as np | |||
| from fastNLP.loader.base_loader import BaseLoader | |||
| class EmbedLoader(BaseLoader): | |||
| """docstring for EmbedLoader""" | |||
| def __init__(self, data_name, data_path): | |||
| super(EmbedLoader, self).__init__(data_name, data_path) | |||
| def __init__(self, data_path): | |||
| super(EmbedLoader, self).__init__(data_path) | |||
| @staticmethod | |||
| def load_embedding(emb_dim, emb_file, word_dict, emb_pkl): | |||
| """Load the pre-trained embedding and combine with the given dictionary. | |||
| :param emb_file: str, the pre-trained embedding. | |||
| The embedding file should have the following format: | |||
| Each line is a word embedding, where a word string is followed by multiple floats. | |||
| Floats are separated by space. The word and the first float are separated by space. | |||
| :param word_dict: dict, a mapping from word to index. | |||
| :param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding. | |||
| :param emb_pkl: str, the embedding pickle file. | |||
| :return embedding_np: numpy array of shape (len(word_dict), emb_dim) | |||
| TODO: fragile code | |||
| """ | |||
| # If the embedding pickle exists, load it and return. | |||
| if os.path.exists(emb_pkl): | |||
| with open(emb_pkl, "rb") as f: | |||
| embedding_np = _pickle.load(f) | |||
| return embedding_np | |||
| # Otherwise, load the pre-trained embedding. | |||
| with open(emb_file, "r", encoding="utf-8") as f: | |||
| # begin with a random embedding | |||
| embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim)) | |||
| for line in f: | |||
| line = line.strip().split() | |||
| if len(line) != emb_dim + 1: | |||
| # skip this line if the embedding dimension does not match | |||
| continue | |||
| if line[0] in word_dict: | |||
| # find the word and replace its embedding with a pre-trained one | |||
| embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]] | |||
| # save and return the result | |||
| with open(emb_pkl, "wb") as f: | |||
| _pickle.dump(embedding_np, f) | |||
| return embedding_np | |||
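| A usage sketch for the static method above; the file names and the tiny vocabulary are illustrative, and the module path is assumed from the repo layout: | |||
| ```python | |||
| from fastNLP.loader.embed_loader import EmbedLoader | |||
| word_dict = {"the": 0, "cat": 1} | |||
| embedding = EmbedLoader.load_embedding( | |||
|     emb_dim=50, | |||
|     emb_file="glove.small.txt",    # one "word f1 ... f50" line per word | |||
|     word_dict=word_dict, | |||
|     emb_pkl="save/embedding.pkl",  # cache: written on the first call, reused afterwards | |||
| ) | |||
| print(embedding.shape)  # (2, 50): one row per entry in word_dict | |||
| ``` | |||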
| @@ -8,8 +8,8 @@ class ModelLoader(BaseLoader): | |||
| Loader for models. | |||
| """ | |||
| def __init__(self, data_name, data_path): | |||
| super(ModelLoader, self).__init__(data_name, data_path) | |||
| def __init__(self, data_path): | |||
| super(ModelLoader, self).__init__(data_path) | |||
| @staticmethod | |||
| def load_pytorch(empty_model, model_path): | |||
| @@ -5,7 +5,7 @@ import torch | |||
| import torch.nn as nn | |||
| # import torch.nn.functional as F | |||
| from fastNLP.modules.encoder.conv_maxpool import ConvMaxpool | |||
| import fastNLP.modules.encoder as encoder | |||
| class CNNText(torch.nn.Module): | |||
| @@ -18,22 +18,22 @@ class CNNText(torch.nn.Module): | |||
| def __init__(self, args): | |||
| super(CNNText, self).__init__() | |||
| class_num = args["num_classes"] | |||
| num_classes = args["num_classes"] | |||
| kernel_nums = [100, 100, 100] | |||
| kernel_sizes = [3, 4, 5] | |||
| embed_num = args["vocab_size"] | |||
| vocab_size = args["vocab_size"] | |||
| embed_dim = 300 | |||
| pretrained_embed = None | |||
| drop_prob = 0.5 | |||
| # no support for pre-trained embedding currently | |||
| self.embed = nn.Embedding(embed_num, embed_dim, padding_idx=0) | |||
| self.conv_pool = ConvMaxpool( | |||
| self.embed = encoder.embedding.Embedding(vocab_size, embed_dim) | |||
| self.conv_pool = encoder.conv_maxpool.ConvMaxpool( | |||
| in_channels=embed_dim, | |||
| out_channels=kernel_nums, | |||
| kernel_sizes=kernel_sizes) | |||
| self.dropout = nn.Dropout(drop_prob) | |||
| self.fc = nn.Linear(sum(kernel_nums), class_num) | |||
| self.fc = encoder.linear.Linear(sum(kernel_nums), num_classes) | |||
| def forward(self, x): | |||
| x = self.embed(x) # [N,L] -> [N,L,C] | |||
| @@ -1,3 +1,4 @@ | |||
| from .CRF import ConditionalRandomField | |||
| from .MLP import MLP | |||
| __all__ = ["ConditionalRandomField"] | |||
| __all__ = ["ConditionalRandomField", "MLP"] | |||
| @@ -2,8 +2,10 @@ from .embedding import Embedding | |||
| from .linear import Linear | |||
| from .lstm import Lstm | |||
| from .conv import Conv | |||
| from .conv_maxpool import ConvMaxpool | |||
| __all__ = ["Lstm", | |||
| "Embedding", | |||
| "Linear", | |||
| "Conv"] | |||
| "Conv", | |||
| "ConvMaxpool"] | |||
| @@ -4,6 +4,7 @@ | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torch.nn.init import xavier_uniform_ | |||
| class ConvMaxpool(nn.Module): | |||
| @@ -21,6 +22,7 @@ class ConvMaxpool(nn.Module): | |||
| if isinstance(kernel_sizes, int): | |||
| out_channels = [out_channels] | |||
| kernel_sizes = [kernel_sizes] | |||
| self.convs = nn.ModuleList([nn.Conv1d( | |||
| in_channels=in_channels, | |||
| out_channels=oc, | |||
| @@ -31,6 +33,9 @@ class ConvMaxpool(nn.Module): | |||
| groups=groups, | |||
| bias=bias) | |||
| for oc, ks in zip(out_channels, kernel_sizes)]) | |||
| for conv in self.convs: | |||
| xavier_uniform_(conv.weight) # weight initialization | |||
| else: | |||
| raise Exception( | |||
| 'Incorrect kernel sizes: should be list, tuple or int') | |||
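| A small shape check for `ConvMaxpool` as configured in `CNNText` above; the output width should be the sum of the per-kernel channel counts (input layout `[N, L, C]` is assumed from `CNNText.forward`): | |||
| ```python | |||
| import torch | |||
| from fastNLP.modules.encoder.conv_maxpool import ConvMaxpool | |||
| conv_pool = ConvMaxpool(in_channels=300, out_channels=[100, 100, 100], kernel_sizes=[3, 4, 5]) | |||
| x = torch.randn(8, 20, 300)  # [N, L, C], as produced by the embedding layer | |||
| print(conv_pool(x).shape)    # expected: torch.Size([8, 300]) after concatenating the pooled maps | |||
| ``` | |||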
| @@ -1,114 +0,0 @@ | |||
| import sys | |||
| sys.path.append("..") | |||
| from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
| from fastNLP.core.trainer import SeqLabelTrainer | |||
| from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader | |||
| from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
| from fastNLP.saver.model_saver import ModelSaver | |||
| from fastNLP.loader.model_loader import ModelLoader | |||
| from fastNLP.core.tester import SeqLabelTester | |||
| from fastNLP.models.sequence_modeling import SeqLabeling | |||
| from fastNLP.core.predictor import Predictor | |||
| data_name = "pku_training.utf8" | |||
| cws_data_path = "/home/zyfeng/data/pku_training.utf8" | |||
| pickle_path = "./save/" | |||
| data_infer_path = "/home/zyfeng/data/pku_test.utf8" | |||
| def infer(): | |||
| # Load infer configuration, the same as test | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
| # fetch dictionary size and number of labels from pickle files | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| test_args["vocab_size"] = len(word2index) | |||
| index2label = load_pickle(pickle_path, "id2class.pkl") | |||
| test_args["num_classes"] = len(index2label) | |||
| # Define the same model | |||
| model = SeqLabeling(test_args) | |||
| # Dump trained parameters into the model | |||
| ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
| print("model loaded!") | |||
| # Data Loader | |||
| raw_data_loader = BaseLoader(data_name, data_infer_path) | |||
| infer_data = raw_data_loader.load_lines() | |||
| # Inference interface | |||
| infer = Predictor(pickle_path) | |||
| results = infer.predict(model, infer_data) | |||
| print(results) | |||
| print("Inference finished!") | |||
| def train_test(): | |||
| # Config Loader | |||
| train_args = ConfigSection() | |||
| test_args = ConfigSection() | |||
| ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args}) | |||
| # Data Loader | |||
| loader = TokenizeDatasetLoader(data_name, cws_data_path) | |||
| train_data = loader.load_pku() | |||
| # Preprocessor | |||
| preprocess = SeqLabelPreprocess() | |||
| data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) | |||
| train_args["vocab_size"] = preprocess.vocab_size | |||
| train_args["num_classes"] = preprocess.num_classes | |||
| # Trainer | |||
| trainer = SeqLabelTrainer(train_args) | |||
| # Model | |||
| model = SeqLabeling(train_args) | |||
| # Start training | |||
| trainer.train(model, data_train, data_dev) | |||
| print("Training finished!") | |||
| # Saver | |||
| saver = ModelSaver("./save/saved_model.pkl") | |||
| saver.save_pytorch(model) | |||
| print("Model saved!") | |||
| # testing with validation set | |||
| test(data_dev) | |||
| def test(test_data): | |||
| # Config Loader | |||
| train_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
| # Define the same model | |||
| model = SeqLabeling(train_args) | |||
| # Dump trained parameters into the model | |||
| ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
| print("model loaded!") | |||
| # Load test configuration | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
| # Tester | |||
| tester = SeqLabelTester(test_args) | |||
| # Start testing | |||
| tester.test(model, test_data) | |||
| # print test results | |||
| print(tester.show_matrices()) | |||
| print("model tested!") | |||
| if __name__ == "__main__": | |||
| train_test() | |||
| @@ -31,4 +31,16 @@ pickle_path = "./save/" | |||
| use_crf = true | |||
| use_cuda = true | |||
| rnn_hidden_units = 100 | |||
| word_emb_dim = 100 | |||
| [model] | |||
| save_output = true | |||
| validate_in_training = true | |||
| save_dev_input = false | |||
| save_loss = true | |||
| batch_size = 640 | |||
| pickle_path = "./save/" | |||
| use_crf = true | |||
| use_cuda = true | |||
| rnn_hidden_units = 100 | |||
| word_emb_dim = 100 | |||
| @@ -27,7 +27,7 @@ data_infer_path = os.path.join(datadir, "infer.utf8") | |||
| def infer(): | |||
| # Config Loader | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) | |||
| ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) | |||
| # fetch dictionary size and number of labels from pickle files | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| @@ -47,7 +47,7 @@ def infer(): | |||
| raise | |||
| # Data Loader | |||
| raw_data_loader = BaseLoader(data_name, data_infer_path) | |||
| raw_data_loader = BaseLoader(data_infer_path) | |||
| infer_data = raw_data_loader.load_lines() | |||
| print('data loaded') | |||
| @@ -63,10 +63,10 @@ def train(): | |||
| # Config Loader | |||
| train_args = ConfigSection() | |||
| test_args = ConfigSection() | |||
| ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) | |||
| ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args}) | |||
| # Data Loader | |||
| loader = TokenizeDatasetLoader(data_name, cws_data_path) | |||
| loader = TokenizeDatasetLoader(cws_data_path) | |||
| train_data = loader.load_pku() | |||
| # Preprocessor | |||
| @@ -100,7 +100,7 @@ def train(): | |||
| def test(): | |||
| # Config Loader | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args}) | |||
| ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) | |||
| # fetch dictionary size and number of labels from pickle files | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| @@ -125,7 +125,7 @@ def test(): | |||
| tester.test(model, dev_data) | |||
| # print test results | |||
| print(tester.show_matrices()) | |||
| print(tester.show_metrics()) | |||
| print("model tested!") | |||
| @@ -1,29 +1,35 @@ | |||
| [train] | |||
| epochs = 10 | |||
| batch_size = 32 | |||
| epochs = 30 | |||
| batch_size = 64 | |||
| pickle_path = "./save/" | |||
| validate = true | |||
| save_best_dev = true | |||
| model_saved_path = "./save/" | |||
| rnn_hidden_units = 100 | |||
| rnn_layers = 2 | |||
| rnn_bi_direction = true | |||
| word_emb_dim = 100 | |||
| dropout = 0.5 | |||
| use_crf = true | |||
| use_cuda = true | |||
| print_every_step = 10 | |||
| [test] | |||
| save_output = true | |||
| validate_in_training = true | |||
| save_dev_input = false | |||
| save_loss = true | |||
| batch_size = 64 | |||
| batch_size = 640 | |||
| pickle_path = "./save/" | |||
| use_crf = true | |||
| use_cuda = true | |||
| [POS_test] | |||
| save_output = true | |||
| validate_in_training = true | |||
| save_dev_input = false | |||
| save_loss = true | |||
| batch_size = 640 | |||
| pickle_path = "./save/" | |||
| rnn_hidden_units = 100 | |||
| rnn_layers = 1 | |||
| rnn_bi_direction = true | |||
| word_emb_dim = 100 | |||
| dropout = 0.5 | |||
| use_crf = true | |||
| use_cuda = true | |||
| rnn_hidden_units = 100 | |||
| word_emb_dim = 100 | |||
| @@ -0,0 +1,146 @@ | |||
| import os | |||
| import sys | |||
| sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) | |||
| from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
| from fastNLP.core.trainer import SeqLabelTrainer | |||
| from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader | |||
| from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
| from fastNLP.saver.model_saver import ModelSaver | |||
| from fastNLP.loader.model_loader import ModelLoader | |||
| from fastNLP.core.tester import SeqLabelTester | |||
| from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
| from fastNLP.core.predictor import SeqLabelInfer | |||
| # if not run from this file's directory, chdir into it | |||
| if len(os.path.dirname(__file__)) != 0: | |||
| os.chdir(os.path.dirname(__file__)) | |||
| datadir = "/home/zyfeng/data/" | |||
| cfgfile = './pos_tag.cfg' | |||
| data_name = "CWS_POS_TAG_NER_people_daily.txt" | |||
| pos_tag_data_path = os.path.join(datadir, data_name) | |||
| pickle_path = "save" | |||
| data_infer_path = os.path.join(datadir, "infer.utf8") | |||
| def infer(): | |||
| # Config Loader | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) | |||
| # fetch dictionary size and number of labels from pickle files | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| test_args["vocab_size"] = len(word2index) | |||
| index2label = load_pickle(pickle_path, "id2class.pkl") | |||
| test_args["num_classes"] = len(index2label) | |||
| # Define the same model | |||
| model = AdvSeqLabel(test_args) | |||
| try: | |||
| ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
| print('model loaded!') | |||
| except Exception as e: | |||
| print('cannot load model!') | |||
| raise | |||
| # Data Loader | |||
| raw_data_loader = BaseLoader(data_infer_path) | |||
| infer_data = raw_data_loader.load_lines() | |||
| print('data loaded') | |||
| # Inference interface | |||
| infer = SeqLabelInfer(pickle_path) | |||
| results = infer.predict(model, infer_data) | |||
| print(results) | |||
| print("Inference finished!") | |||
| def train(): | |||
| # Config Loader | |||
| train_args = ConfigSection() | |||
| test_args = ConfigSection() | |||
| ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args}) | |||
| # Data Loader | |||
| loader = PeopleDailyCorpusLoader(pos_tag_data_path) | |||
| train_data, _ = loader.load() | |||
| # Preprocessor | |||
| preprocessor = SeqLabelPreprocess() | |||
| data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) | |||
| train_args["vocab_size"] = preprocessor.vocab_size | |||
| train_args["num_classes"] = preprocessor.num_classes | |||
| # Trainer | |||
| trainer = SeqLabelTrainer(**train_args.data) | |||
| # Model | |||
| model = AdvSeqLabel(train_args) | |||
| try: | |||
| ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
| print('model parameter loaded!') | |||
| except Exception as e: | |||
| print("No saved model. Continue.") | |||
| pass | |||
| # Start training | |||
| trainer.train(model, data_train, data_dev) | |||
| print("Training finished!") | |||
| # Saver | |||
| saver = ModelSaver("./save/saved_model.pkl") | |||
| saver.save_pytorch(model) | |||
| print("Model saved!") | |||
| def test(): | |||
| # Config Loader | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config").load_config(cfgfile, {"POS_test": test_args}) | |||
| # fetch dictionary size and number of labels from pickle files | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| test_args["vocab_size"] = len(word2index) | |||
| index2label = load_pickle(pickle_path, "id2class.pkl") | |||
| test_args["num_classes"] = len(index2label) | |||
| # load dev data | |||
| dev_data = load_pickle(pickle_path, "data_dev.pkl") | |||
| # Define the same model | |||
| model = AdvSeqLabel(test_args) | |||
| # Dump trained parameters into the model | |||
| ModelLoader.load_pytorch(model, "./save/saved_model.pkl") | |||
| print("model loaded!") | |||
| # Tester | |||
| tester = SeqLabelTester(**test_args.data) | |||
| # Start testing | |||
| tester.test(model, dev_data) | |||
| # print test results | |||
| print(tester.show_metrics()) | |||
| print("model tested!") | |||
| if __name__ == "__main__": | |||
| import argparse | |||
| parser = argparse.ArgumentParser(description='Run a Chinese POS-tagging model') | |||
| parser.add_argument('--mode', help='set the running mode', choices=['train', 'test', 'infer']) | |||
| args = parser.parse_args() | |||
| if args.mode == 'train': | |||
| train() | |||
| elif args.mode == 'test': | |||
| test() | |||
| elif args.mode == 'infer': | |||
| infer() | |||
| else: | |||
| print('no mode specified for model!') | |||
| parser.print_help() | |||
| @@ -1,3 +1,4 @@ | |||
| numpy>=1.14.2 | |||
| torch==0.4.0 | |||
| torchvision>=0.1.8 | |||
| tensorboardX | |||
| @@ -0,0 +1,24 @@ | |||
| #!/usr/bin/env python | |||
| # coding=utf-8 | |||
| from setuptools import setup, find_packages | |||
| with open('README.md') as f: | |||
| readme = f.read() | |||
| with open('LICENSE') as f: | |||
| license = f.read() | |||
| with open('requirements.txt') as f: | |||
| reqs = f.read() | |||
| setup( | |||
| name='fastNLP', | |||
| version='0.0.1', | |||
| description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team', | |||
| long_description=readme, | |||
| license=license, | |||
| author='fudanNLP', | |||
| python_requires='>=3.5', | |||
| packages=find_packages(), | |||
| install_requires=reqs.strip().split('\n'), | |||
| ) | |||
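| With this `setup.py` in place, a development install from a source checkout would presumably be: | |||
| ```shell | |||
| pip install -e . | |||
| ``` | |||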
| @@ -1,9 +1,8 @@ | |||
| import os | |||
| import unittest | |||
| from fastNLP.core.action import Action, Batchifier, SequentialSampler | |||
| class TestAction(unittest.TestCase): | |||
| def test_case_1(self): | |||
| x = [1, 2, 3, 4, 5, 6, 7, 8] | |||
| @@ -1,13 +1,12 @@ | |||
| import os | |||
| import configparser | |||
| import json | |||
| import os | |||
| import unittest | |||
| from fastNLP.loader.config_loader import ConfigSection, ConfigLoader | |||
| from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader | |||
| class TestConfigLoader(unittest.TestCase): | |||
| def test_case_ConfigLoader(self): | |||
| @@ -33,8 +32,8 @@ class TestConfigLoader(unittest.TestCase): | |||
| return dict | |||
| test_arg = ConfigSection() | |||
| ConfigLoader("config", "").load_config(os.path.join("./test/loader", "config"), {"test": test_arg}) | |||
| #ConfigLoader("config", "").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", | |||
| ConfigLoader("config").load_config(os.path.join("./test/loader", "config"), {"test": test_arg}) | |||
| # ConfigLoader("config").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", | |||
| # {"test": test_arg}) | |||
| #dict = read_section_from_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", "test") | |||
| @@ -58,18 +57,18 @@ class TestConfigLoader(unittest.TestCase): | |||
| class TestDatasetLoader(unittest.TestCase): | |||
| def test_case_TokenizeDatasetLoader(self): | |||
| loader = TokenizeDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8") | |||
| loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8") | |||
| data = loader.load_pku(max_seq_len=32) | |||
| print("pass TokenizeDatasetLoader test!") | |||
| def test_case_POSDatasetLoader(self): | |||
| loader = POSDatasetLoader("people", "./test/data_for_tests/people.txt") | |||
| loader = POSDatasetLoader("./test/data_for_tests/people.txt") | |||
| data = loader.load() | |||
| datas = loader.load_lines() | |||
| print("pass POSDatasetLoader test!") | |||
| def test_case_LMDatasetLoader(self): | |||
| loader = LMDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8") | |||
| loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8") | |||
| data = loader.load() | |||
| datas = loader.load_lines() | |||
| print("pass TokenizeDatasetLoader test!") | |||
| @@ -1,138 +0,0 @@ | |||
| import _pickle | |||
| import os | |||
| import numpy as np | |||
| import torch | |||
| from fastNLP.core.preprocess import SeqLabelPreprocess | |||
| from fastNLP.core.tester import SeqLabelTester | |||
| from fastNLP.core.trainer import SeqLabelTrainer | |||
| from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
| class MyNERTrainer(SeqLabelTrainer): | |||
| def __init__(self, train_args): | |||
| super(MyNERTrainer, self).__init__(train_args) | |||
| self.scheduler = None | |||
| def define_optimizer(self): | |||
| """ | |||
| override | |||
| :return: | |||
| """ | |||
| self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001) | |||
| self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5) | |||
| def update(self): | |||
| """ | |||
| override | |||
| :return: | |||
| """ | |||
| self.optimizer.step() | |||
| self.scheduler.step() | |||
| def _create_validator(self, valid_args): | |||
| return MyNERTester(valid_args) | |||
| def best_eval_result(self, validator): | |||
| accuracy = validator.metrics() | |||
| if accuracy > self.best_accuracy: | |||
| self.best_accuracy = accuracy | |||
| return True | |||
| else: | |||
| return False | |||
| class MyNERTester(SeqLabelTester): | |||
| def __init__(self, test_args): | |||
| super(MyNERTester, self).__init__(test_args) | |||
| def _evaluate(self, prediction, batch_y, seq_len): | |||
| """ | |||
| :param prediction: [batch_size, seq_len, num_classes] | |||
| :param batch_y: [batch_size, seq_len] | |||
| :param seq_len: [batch_size] | |||
| :return: | |||
| """ | |||
| summ = 0 | |||
| correct = 0 | |||
| _, indices = torch.max(prediction, 2) | |||
| for p, y, l in zip(indices, batch_y, seq_len): | |||
| summ += l | |||
| correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy()) | |||
| return float(correct / summ) | |||
| def evaluate(self, predict, truth): | |||
| return self._evaluate(predict, truth, self.seq_len) | |||
| def metrics(self): | |||
| return np.mean(self.eval_history) | |||
| def show_matrices(self): | |||
| return "dev accuracy={:.2f}".format(float(self.metrics())) | |||
| def embedding_process(emb_file, word_dict, emb_dim, emb_pkl): | |||
| if os.path.exists(emb_pkl): | |||
| with open(emb_pkl, "rb") as f: | |||
| embedding_np = _pickle.load(f) | |||
| return embedding_np | |||
| with open(emb_file, "r", encoding="utf-8") as f: | |||
| embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim)) | |||
| for line in f: | |||
| line = line.strip().split() | |||
| if len(line) != emb_dim + 1: | |||
| continue | |||
| if line[0] in word_dict: | |||
| embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]] | |||
| with open(emb_pkl, "wb") as f: | |||
| _pickle.dump(embedding_np, f) | |||
| return embedding_np | |||
| def data_load(data_file): | |||
| with open(data_file, "r", encoding="utf-8") as f: | |||
| all_data = [] | |||
| sent = [] | |||
| label = [] | |||
| for line in f: | |||
| line = line.strip().split() | |||
| if not len(line) <= 1: | |||
| sent.append(line[0]) | |||
| label.append(line[1]) | |||
| else: | |||
| all_data.append([sent, label]) | |||
| sent = [] | |||
| label = [] | |||
| return all_data | |||
| data_path = "data_for_tests/people.txt" | |||
| pick_path = "data_for_tests/" | |||
| emb_path = "data_for_tests/emb50.txt" | |||
| save_path = "data_for_tests/" | |||
| if __name__ == "__main__": | |||
| data = data_load(data_path) | |||
| preprocess = SeqLabelPreprocess() | |||
| data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3) | |||
| # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl")) | |||
| emb = None | |||
| args = {"epochs": 20, | |||
| "batch_size": 1, | |||
| "pickle_path": pick_path, | |||
| "validate": True, | |||
| "save_best_dev": True, | |||
| "model_saved_path": save_path, | |||
| "use_cuda": True, | |||
| "vocab_size": preprocess.vocab_size, | |||
| "num_classes": preprocess.num_classes, | |||
| "word_emb_dim": 50, | |||
| "rnn_hidden_units": 100 | |||
| } | |||
| # emb = torch.Tensor(emb).float().cuda() | |||
| networks = AdvSeqLabel(args, emb) | |||
| trainer = MyNERTrainer(args) | |||
| trainer.train(networks, data_train, data_dev) | |||
| print("Training finished!") | |||
| @@ -1,129 +0,0 @@ | |||
| import _pickle | |||
| import os | |||
| import torch | |||
| from fastNLP.core.predictor import SeqLabelInfer | |||
| from fastNLP.core.trainer import SeqLabelTrainer | |||
| from fastNLP.loader.model_loader import ModelLoader | |||
| from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
| class Decode(SeqLabelTrainer): | |||
| def __init__(self, args): | |||
| super(Decode, self).__init__(args) | |||
| def decoder(self, network, sents, model_path): | |||
| self.model = network | |||
| self.model.load_state_dict(torch.load(model_path)) | |||
| out_put = [] | |||
| self.mode(network, test=True) | |||
| for batch_x in sents: | |||
| prediction = self.data_forward(self.model, batch_x) | |||
| seq_tag = self.model.prediction(prediction, batch_x[1]) | |||
| out_put.append(list(seq_tag)[0]) | |||
| return out_put | |||
| def process_sent(sents, word2id): | |||
| sents_num = [] | |||
| for s in sents: | |||
| sent_num = [] | |||
| for c in s: | |||
| if c in word2id: | |||
| sent_num.append(word2id[c]) | |||
| else: | |||
| sent_num.append(word2id["<unk>"]) | |||
| sents_num.append(([sent_num], [len(sent_num)])) # batch_size is 1 | |||
| return sents_num | |||
| def process_tag(sents, tags, id2class): | |||
| Tags = [] | |||
| for ttt in tags: | |||
| Tags.append([id2class[t] for t in ttt]) | |||
| Segs = [] | |||
| PosNers = [] | |||
| for sent, tag in zip(sents, tags): | |||
| word__ = [] | |||
| lll__ = [] | |||
| for c, t in zip(sent, tag): | |||
| t = id2class[t] | |||
| l = t.split("-") | |||
| split_ = l[0] | |||
| pn = l[1] | |||
| if split_ == "S": | |||
| word__.append(c) | |||
| lll__.append(pn) | |||
| word_1 = "" | |||
| elif split_ == "E": | |||
| word_1 += c | |||
| word__.append(word_1) | |||
| lll__.append(pn) | |||
| word_1 = "" | |||
| elif split_ == "B": | |||
| word_1 = "" | |||
| word_1 += c | |||
| else: | |||
| word_1 += c | |||
| Segs.append(word__) | |||
| PosNers.append(lll__) | |||
| return Segs, PosNers | |||
| pickle_path = "data_for_tests/" | |||
| model_path = "data_for_tests/model_best_dev.pkl" | |||
| if __name__ == "__main__": | |||
| with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f: | |||
| id2word = _pickle.load(f) | |||
| with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f: | |||
| word2id = _pickle.load(f) | |||
| with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f: | |||
| id2class = _pickle.load(f) | |||
| sent = ["中共中央总书记、国家主席江泽民", | |||
| "逆向处理输入序列并返回逆序后的序列"] # here is input | |||
| args = {"epochs": 1, | |||
| "batch_size": 1, | |||
| "pickle_path": "data_for_tests/", | |||
| "validate": True, | |||
| "save_best_dev": True, | |||
| "model_saved_path": "data_for_tests/", | |||
| "use_cuda": False, | |||
| "vocab_size": len(word2id), | |||
| "num_classes": len(id2class), | |||
| "word_emb_dim": 50, | |||
| "rnn_hidden_units": 100, | |||
| } | |||
| """ | |||
| network = AdvSeqLabel(args, None) | |||
| decoder_ = Decode(args) | |||
| tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path) | |||
| output_seg, output_pn = process_tag(sent, tags_num, id2class) # here is output | |||
| print(output_seg) | |||
| print(output_pn) | |||
| """ | |||
| # Define the same model | |||
| model = AdvSeqLabel(args, None) | |||
| # Dump trained parameters into the model | |||
| ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl") | |||
| print("model loaded!") | |||
| # Inference interface | |||
| infer = SeqLabelInfer(pickle_path) | |||
| sent = [[ch for ch in s] for s in sent] | |||
| results = infer.predict(model, sent) | |||
| for res in results: | |||
| print(res) | |||
| print("Inference finished!") | |||
| @@ -1,19 +1,13 @@ | |||
| # python: 3.5 | |||
| # pytorch: 0.4 | |||
| ################ | |||
| # Test cross validation. | |||
| ################ | |||
| from fastNLP.loader.preprocess import ClassPreprocess | |||
| from fastNLP.core.loss import Loss | |||
| from fastNLP.core.optimizer import Optimizer | |||
| from fastNLP.core.predictor import ClassificationInfer | |||
| from fastNLP.core.preprocess import ClassPreprocess | |||
| from fastNLP.core.trainer import ClassificationTrainer | |||
| from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
| from fastNLP.models.base_model import BaseModel | |||
| from fastNLP.modules import aggregation | |||
| from fastNLP.modules import encoder | |||
| from fastNLP.modules import decoder | |||
| from fastNLP.modules import encoder | |||
| class ClassificationModel(BaseModel): | |||
| @@ -28,7 +22,7 @@ class ClassificationModel(BaseModel): | |||
| self.enc = encoder.Conv( | |||
| in_channels=300, out_channels=100, kernel_size=3) | |||
| self.agg = aggregation.MaxPool() | |||
| self.dec = decoder.MLP(100, num_classes=num_classes) | |||
| self.dec = decoder.MLP(size_layer=[100, num_classes]) | |||
| def forward(self, x): | |||
| x = self.emb(x) # [N,L] -> [N,L,C] | |||
| @@ -38,18 +32,17 @@ class ClassificationModel(BaseModel): | |||
| return x | |||
| data_dir = 'data' # directory to save data and model | |||
| train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
| data_dir = 'save/' # directory to save data and model | |||
| train_path = './data_for_tests/text_classify.txt' # training set file | |||
| # load dataset | |||
| ds_loader = ClassDatasetLoader("train", train_path) | |||
| ds_loader = ClassDatasetLoader(train_path) | |||
| data = ds_loader.load() | |||
| # pre-process dataset | |||
| pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5) | |||
| # pre = ClassPreprocess(data, data_dir) | |||
| n_classes = pre.num_classes | |||
| vocab_size = pre.vocab_size | |||
| pre = ClassPreprocess() | |||
| train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir) | |||
| n_classes, vocab_size = pre.num_classes, pre.vocab_size | |||
| # construct model | |||
| model_args = { | |||
| @@ -58,22 +51,25 @@ model_args = { | |||
| } | |||
| model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size) | |||
| # train model | |||
| # construct trainer | |||
| train_args = { | |||
| "epochs": 10, | |||
| "batch_size": 50, | |||
| "epochs": 3, | |||
| "batch_size": 16, | |||
| "pickle_path": data_dir, | |||
| "validate": False, | |||
| "save_best_dev": False, | |||
| "model_saved_path": None, | |||
| "use_cuda": True, | |||
| "learn_rate": 1e-3, | |||
| "momentum": 0.9} | |||
| trainer = ClassificationTrainer(train_args) | |||
| # trainer.train(model, ['data_train.pkl', 'data_dev.pkl']) | |||
| trainer.cross_validate(model) | |||
| "loss": Loss("cross_entropy"), | |||
| "optimizer": Optimizer("Adam", lr=0.001) | |||
| } | |||
| trainer = ClassificationTrainer(**train_args) | |||
| # start training | |||
| trainer.train(model, train_data=train_set, dev_data=dev_set) | |||
| # predict using model | |||
| data_infer = [x[0] for x in data] | |||
| infer = ClassificationInfer(data_dir) | |||
| labels_pred = infer.predict(model, data_infer) | |||
| labels_pred = infer.predict(model.cpu(), data_infer) | |||
| print(labels_pred) | |||
| @@ -33,7 +33,7 @@ data_infer_path = args.infer | |||
| def infer(): | |||
| # Load infer configuration, the same as test | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args}) | |||
| ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args}) | |||
| # fetch dictionary size and number of labels from pickle files | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| @@ -49,7 +49,7 @@ def infer(): | |||
| print("model loaded!") | |||
| # Data Loader | |||
| raw_data_loader = BaseLoader("xxx", data_infer_path) | |||
| raw_data_loader = BaseLoader(data_infer_path) | |||
| infer_data = raw_data_loader.load_lines() | |||
| # Inference interface | |||
| @@ -65,11 +65,11 @@ def train_and_test(): | |||
| # Config Loader | |||
| trainer_args = ConfigSection() | |||
| model_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config(config_dir, { | |||
| ConfigLoader("config.cfg").load_config(config_dir, { | |||
| "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args}) | |||
| # Data Loader | |||
| pos_loader = POSDatasetLoader("xxx", data_path) | |||
| pos_loader = POSDatasetLoader(data_path) | |||
| train_data = pos_loader.load_lines() | |||
| # Preprocessor | |||
| @@ -117,7 +117,7 @@ def train_and_test(): | |||
| # Load test configuration | |||
| tester_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args}) | |||
| ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args}) | |||
| # Tester | |||
| tester = SeqLabelTester(save_output=False, | |||
| @@ -134,10 +134,10 @@ def train_and_test(): | |||
| tester.test(model, data_dev) | |||
| # print test results | |||
| print(tester.show_matrices()) | |||
| print(tester.show_metrics()) | |||
| print("model tested!") | |||
| if __name__ == "__main__": | |||
| train_and_test() | |||
| # infer() | |||
| # train_and_test() | |||
| infer() | |||
| @@ -22,7 +22,7 @@ data_infer_path = "data_for_tests/people_infer.txt" | |||
| def infer(): | |||
| # Load infer configuration, the same as test | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
| ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
| # fetch dictionary size and number of labels from pickle files | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| @@ -38,7 +38,7 @@ def infer(): | |||
| print("model loaded!") | |||
| # Data Loader | |||
| raw_data_loader = BaseLoader(data_name, data_infer_path) | |||
| raw_data_loader = BaseLoader(data_infer_path) | |||
| infer_data = raw_data_loader.load_lines() | |||
| """ | |||
| Transform strings into list of list of strings. | |||
| @@ -61,10 +61,10 @@ def infer(): | |||
| def train_test(): | |||
| # Config Loader | |||
| train_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
| ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) | |||
| # Data Loader | |||
| loader = TokenizeDatasetLoader(data_name, cws_data_path) | |||
| loader = TokenizeDatasetLoader(cws_data_path) | |||
| train_data = loader.load_pku() | |||
| # Preprocessor | |||
| @@ -74,7 +74,7 @@ def train_test(): | |||
| train_args["num_classes"] = p.num_classes | |||
| # Trainer | |||
| trainer = SeqLabelTrainer(train_args) | |||
| trainer = SeqLabelTrainer(**train_args.data) | |||
| # Model | |||
| model = SeqLabeling(train_args) | |||
| @@ -99,16 +99,16 @@ def train_test(): | |||
| # Load test configuration | |||
| test_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
| ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) | |||
| # Tester | |||
| tester = SeqLabelTester(test_args) | |||
| tester = SeqLabelTester(**test_args.data) | |||
| # Start testing | |||
| tester.test(model, data_train) | |||
| # print test results | |||
| print(tester.show_matrices()) | |||
| print(tester.show_metrics()) | |||
| print("model tested!") | |||
| @@ -1,9 +1,12 @@ | |||
| import sys | |||
| sys.path.append("..") | |||
| from fastNLP.fastnlp import FastNLP | |||
| from fastNLP.fastnlp import interpret_word_seg_results | |||
| from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results | |||
| PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" | |||
| PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/" | |||
| PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/" | |||
| def word_seg(): | |||
| nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) | |||
| @@ -39,5 +42,44 @@ def test_word_seg_interpret(): | |||
| print(interpret_word_seg_results(chars, labels)) | |||
| def test_interpret_cws_pos_results(): | |||
| foo = [ | |||
| [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'), | |||
| ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'), | |||
| ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')] | |||
| ] | |||
| chars = [x[0] for x in foo[0]] | |||
| labels = [x[1] for x in foo[0]] | |||
| print(interpret_cws_pos_results(chars, labels)) | |||
| def pos_tag(): | |||
| nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES) | |||
| nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model") | |||
| text = ["这是最好的基于深度学习的中文分词系统。", | |||
| "大王叫我来巡山。", | |||
| "我党多年来致力于改善人民生活水平。"] | |||
| results = nlp.run(text) | |||
| for example in results: | |||
| words, labels = [], [] | |||
| for res in example: | |||
| words.append(res[0]) | |||
| labels.append(res[1]) | |||
| print(interpret_cws_pos_results(words, labels)) | |||
| def text_classify(): | |||
| nlp = FastNLP(model_dir=PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES) | |||
| nlp.load("text_classify_model", config_file="text_classify.cfg", section_name="model") | |||
| text = [ | |||
| "世界物联网大会明日在京召开龙头股启动在即", | |||
| "乌鲁木齐市新增一处城市中心旅游目的地", | |||
| "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"] | |||
| results = nlp.run(text) | |||
| print(results) | |||
| """ | |||
| ['finance', 'travel', 'history'] | |||
| """ | |||
| if __name__ == "__main__": | |||
| word_seg() | |||
| text_classify() | |||
| @@ -5,19 +5,19 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader | |||
| from fastNLP.models.sequence_modeling import SeqLabeling | |||
| data_name = "pku_training.utf8" | |||
| cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" | |||
| pickle_path = "data_for_tests" | |||
| def foo(): | |||
| loader = TokenizeDatasetLoader(data_name, "./data_for_tests/cws_pku_utf_8") | |||
| loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8") | |||
| train_data = loader.load_pku() | |||
| train_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
| ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) | |||
| # Preprocessor | |||
| p = SeqLabelPreprocess(train_data, pickle_path) | |||
| p = SeqLabelPreprocess() | |||
| train_data = p.run(train_data) | |||
| train_args["vocab_size"] = p.vocab_size | |||
| train_args["num_classes"] = p.num_classes | |||
| @@ -26,11 +26,11 @@ def foo(): | |||
| valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, | |||
| "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/", | |||
| "use_cuda": True} | |||
| validator = SeqLabelTester(valid_args) | |||
| validator = SeqLabelTester(**valid_args) | |||
| print("start validation.") | |||
| validator.test(model) | |||
| print(validator.show_matrices()) | |||
| validator.test(model, train_data) | |||
| print(validator.show_metrics()) | |||
| if __name__ == "__main__": | |||
| @@ -34,7 +34,7 @@ config_dir = args.config | |||
| def infer(): | |||
| # load dataset | |||
| print("Loading data...") | |||
| ds_loader = ClassDatasetLoader("train", train_data_dir) | |||
| ds_loader = ClassDatasetLoader(train_data_dir) | |||
| data = ds_loader.load() | |||
| unlabeled_data = [x[0] for x in data] | |||
| @@ -69,7 +69,7 @@ def train(): | |||
| # load dataset | |||
| print("Loading data...") | |||
| ds_loader = ClassDatasetLoader("train", train_data_dir) | |||
| ds_loader = ClassDatasetLoader(train_data_dir) | |||
| data = ds_loader.load() | |||
| print(data[0]) | |||