From 897ed8a8d0d4f54033ecd36a2aadfe6f6cd1c5a7 Mon Sep 17 00:00:00 2001
From: Danqing Wang
Date: Thu, 11 Jul 2019 21:11:49 +0800
Subject: [PATCH 1/2] modify readme and update trainer.py

---
 reproduction/Summarization/Baseline/data/dataloader.py | 6 +++---
 reproduction/Summarization/Baseline/tools/data.py      | 7 ++++---
 reproduction/Summarization/Baseline/train.py           | 2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/reproduction/Summarization/Baseline/data/dataloader.py b/reproduction/Summarization/Baseline/data/dataloader.py
index 57702904..47cd0856 100644
--- a/reproduction/Summarization/Baseline/data/dataloader.py
+++ b/reproduction/Summarization/Baseline/data/dataloader.py
@@ -56,7 +56,7 @@ class SummarizationLoader(JsonLoader):
 
         return ds
 
-    def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps, domain=False, tag=False, load_vocab=True):
+    def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps, domain=False, tag=False, load_vocab_file=True):
         """
         :param paths: dict  path for each dataset
         :param vocab_size: int  max_size for vocab
@@ -65,7 +65,7 @@ class SummarizationLoader(JsonLoader):
         :param doc_max_timesteps: int  max sentence number of the document
         :param domain: bool  build vocab for publication, use 'X' for unknown
         :param tag: bool  build vocab for tag, use 'X' for unknown
-        :param load_vocab: bool  build vocab (False) or load vocab (True)
+        :param load_vocab_file: bool  build vocab (False) or load vocab (True)
         :return: DataBundle
             datasets: dict  keys correspond to the paths dict
             vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
@@ -146,7 +146,7 @@ class SummarizationLoader(JsonLoader):
             train_ds = datasets[key]
 
         vocab_dict = {}
-        if load_vocab == False:
+        if load_vocab_file == False:
             logger.info("[INFO] Build new vocab from training dataset!")
             if train_ds == None:
                 raise ValueError("Lack train file to build vocabulary!")

diff --git a/reproduction/Summarization/Baseline/tools/data.py b/reproduction/Summarization/Baseline/tools/data.py
index f7bbaddd..0cbfbb06 100644
--- a/reproduction/Summarization/Baseline/tools/data.py
+++ b/reproduction/Summarization/Baseline/tools/data.py
@@ -36,8 +36,8 @@
 import pickle
 
 from nltk.tokenize import sent_tokenize
-import utils
-from logger import *
+import tools.utils
+from tools.logger import *
 
 # <s> and </s> are used in the data files to segment the abstracts into sentences. They don't receive vocab ids.
 SENTENCE_START = '<s>'
@@ -313,7 +313,8 @@ class Example(object):
         for sent in article_sents:
             article_words = sent.split()
             self.enc_sent_len.append(len(article_words))  # store the length after truncation but before padding
-            self.enc_sent_input.append([vocab.word2id(w) for w in article_words])  # list of word ids; OOVs are represented by the id for UNK token
+            # self.enc_sent_input.append([vocab.word2id(w) for w in article_words])  # list of word ids; OOVs are represented by the id for UNK token
+            self.enc_sent_input.append([vocab.word2id(w.lower()) for w in article_words])  # list of word ids; OOVs are represented by the id for UNK token
         self._pad_encoder_input(vocab.word2id('[PAD]'))
 
         # Store the original strings

diff --git a/reproduction/Summarization/Baseline/train.py b/reproduction/Summarization/Baseline/train.py
index c3a92f67..b3170307 100644
--- a/reproduction/Summarization/Baseline/train.py
+++ b/reproduction/Summarization/Baseline/train.py
@@ -29,7 +29,7 @@ import torch.nn
 os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
 os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
 
-sys.path.append('/remote-home/dqwang/FastNLP/fastNLP/')
+sys.path.append('/remote-home/dqwang/FastNLP/fastNLP_brxx/')
 
 from fastNLP.core.const import Const

From ec08564c1ce3de05d750e4f3ec52a99ab1810413 Mon Sep 17 00:00:00 2001
From: Danqing Wang
Date: Thu, 11 Jul 2019 21:14:53 +0800
Subject: [PATCH 2/2] add rouge in readme

---
 reproduction/Summarization/README.md | 63 ++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/reproduction/Summarization/README.md b/reproduction/Summarization/README.md
index 2cde6304..b584269f 100644
--- a/reproduction/Summarization/README.md
+++ b/reproduction/Summarization/README.md
@@ -39,6 +39,54 @@ Models implemented in FastNLP include:
 
 
 
+### Evaluation
+
+#### FastRougeMetric
+
+FastRougeMetric uses an unofficial Python implementation of ROUGE to compute fast, approximate ROUGE scores during training.
+The source code is available at [https://github.com/pltrdy/rouge](https://github.com/pltrdy/rouge).
+
+In fastNLP, this method is wrapped as the FastRougeMetric class in Metric.py so that the trainer can use it directly.
+The rouge library must be installed with pip beforehand:
+
+    pip install rouge
+
+**Note: due to differences in implementation details, these scores deviate from the official ROUGE results by 1-2 points, so they should only be taken as a rough estimate of the optimization trend during training.**
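+
+Independently of the trainer, the underlying rouge package can also be called directly, which is handy for sanity checks. A minimal sketch (the example sentences are arbitrary):
+
+```python
+from rouge import Rouge
+
+hypothesis = "the cat was found under the bed"
+reference = "the cat was under the bed"
+
+rouge = Rouge()
+# One dict per hypothesis/reference pair, with keys 'rouge-1', 'rouge-2',
+# and 'rouge-l', each holding f/p/r (F1, precision, recall).
+print(rouge.get_scores(hypothesis, reference))
+# Pass lists and avg=True to average the scores over a corpus.
+print(rouge.get_scores([hypothesis], [reference], avg=True))
+```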
+
+
+#### PyRougeMetric
+
+PyRougeMetric uses the official ROUGE 1.5.5 toolkit from the paper [*ROUGE: A Package for Automatic Evaluation of Summaries*](https://www.aclweb.org/anthology/W04-1013).
+
+Since the original ROUGE relies on a Perl interpreter, [pyrouge](https://github.com/bheinzerling/pyrouge) wraps it in Python, and PyRougeMetric wraps pyrouge further into a Metric class that the trainer can use directly.
+
+Using ROUGE 1.5.5 requires installing a series of dependencies with sudo privileges.
+
+1. For installing ROUGE itself on Ubuntu, see [this blog post](https://blog.csdn.net/Hay54/article/details/78744912).
+2. Configure the WordNet exception database:
+```shell
+$ cd ~/rouge/RELEASE-1.5.5/data/WordNet-2.0-Exceptions/
+$ ./buildExeptionDB.pl . exc WordNet-2.0.exc.db
+$ cd ../
+$ ln -s WordNet-2.0-Exceptions/WordNet-2.0.exc.db WordNet-2.0.exc.db
+```
+3. Install pyrouge:
+```shell
+$ git clone https://github.com/bheinzerling/pyrouge
+$ cd pyrouge
+$ python setup.py install
+```
+4. Check that ROUGE is installed correctly:
+```shell
+$ pyrouge_set_rouge_path /absolute/path/to/ROUGE-1.5.5/directory
+$ python -m pyrouge.test
+```
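+
+Once the test passes, pyrouge can also be driven directly to verify the whole pipeline. A minimal sketch; the directories and filename patterns below are hypothetical and must be adapted to wherever your system (model output) and reference summaries live:
+
+```python
+from pyrouge import Rouge155
+
+r = Rouge155()
+r.system_dir = '/path/to/system_summaries'      # hypothetical: model outputs, one file per document
+r.model_dir = '/path/to/reference_summaries'    # hypothetical: gold references
+r.system_filename_pattern = r'doc.(\d+).txt'    # hypothetical naming scheme
+r.model_filename_pattern = 'doc.[A-Z].#ID#.txt'
+
+output = r.convert_and_evaluate()  # runs the Perl ROUGE-1.5.5 script under the hood
+print(output)
+scores = r.output_to_dict(output)  # e.g. scores['rouge_1_f_score']
+```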
+
+
+
 
@@ -56,6 +104,21 @@
 ### Dataset_loader
 
 - SummarizationLoader: reads a preprocessed dataset in jsonl format and returns the following fields
 
+### Train Cmdline
+
+#### Baseline
+
+LSTM + Sequence Labeling:
+
+    python train.py --cuda --gpu <gpu_id> --sentence_encoder deeplstm --sentence_decoder seqlab --save_root <save_dir> --log_root <log_dir> --lr_descent --grad_clip --max_grad_norm 10
+
+Transformer + Sequence Labeling:
+
+    python train.py --cuda --gpu <gpu_id> --sentence_encoder transformer --sentence_decoder seqlab --save_root <save_dir> --log_root <log_dir> --lr_descent --grad_clip --max_grad_norm 10
+
+
+
+#### BertSum