diff --git a/docs/source/user/example.rst b/docs/source/user/example.rst
index ec7a6b74..55588c79 100644
--- a/docs/source/user/example.rst
+++ b/docs/source/user/example.rst
@@ -92,7 +92,7 @@ http://docutils.sf.net/ Bare URLs automatically become links
 Various links
 ===========
 
-:doc:`/user/with_fitlog.rst`
+:doc:`/user/with_fitlog`
 
 :mod:`~fastNLP.core.batch`
 
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 9ccdaf67..868d67b1 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -438,7 +438,7 @@ def _bio_tag_to_spans(tags, ignore_labels=None):
 
 
 class SpanFPreRecMetric(MetricBase):
-    """
+    r"""
     Alias: :class:`fastNLP.SpanFPreRecMetric` :class:`fastNLP.core.metrics.SpanFPreRecMetric`
 
     Computes F, precision and recall span-wise in sequence labeling tasks.
@@ -476,7 +476,7 @@ class SpanFPreRecMetric(MetricBase):
         f1, precision and recall for that label
     :param str f_type: 'micro' or 'macro'. 'micro': accumulate the overall TP, FN and FP counts first, then compute f, precision and recall from them; 'macro':
         compute f, precision and recall for each class separately, then average them (each class's f carries equal weight)
-    :param float beta: the f_beta score, :math:`f_beta = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` .
+    :param float beta: the f_beta score, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` .
         Common values are beta=0.5, 1 and 2: 0.5 weights precision above recall, 1 weights them equally, and 2 weights recall above precision.
 
     """
@@ -699,16 +699,16 @@ def _pred_topk(y_prob, k=1):
 
 
 class SQuADMetric(MetricBase):
-    """
+    r"""
     Alias: :class:`fastNLP.SQuADMetric` :class:`fastNLP.core.metrics.SQuADMetric`
 
     Metric for the SQuAD data set
 
-    :param pred1: mapping for `pred1` in the parameter map; None means the mapping is `pred1`->`pred1`
-    :param pred2: mapping for `pred2` in the parameter map; None means the mapping is `pred2`->`pred2`
-    :param target1: mapping for `target1` in the parameter map; None means the mapping is `target1`->`target1`
-    :param target2: mapping for `target2` in the parameter map; None means the mapping is `target2`->`target2`
-    :param float beta: the f_beta score, :math:`f_beta = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` .
+    :param pred1: mapping for `pred1` in the parameter map; None means the mapping is `pred1` -> `pred1`
+    :param pred2: mapping for `pred2` in the parameter map; None means the mapping is `pred2` -> `pred2`
+    :param target1: mapping for `target1` in the parameter map; None means the mapping is `target1` -> `target1`
+    :param target2: mapping for `target2` in the parameter map; None means the mapping is `target2` -> `target2`
+    :param float beta: the f_beta score, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` .
         Common values are beta=0.5, 1 and 2: 0.5 weights precision above recall, 1 weights them equally, and 2 weights recall above precision.
     :param bool right_open: if True, the start and end pointers mark a half-open interval [start, end); if False, a closed interval [start, end]
     :param bool print_predict_stat: if True, print statistics on whether predicted answers and gold answers are empty; if False, print nothing
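
A quick numeric check of the f_beta formula documented above (a hand-worked example, not part of the patch): with pre = 0.5 and rec = 1.0, beta = 1 gives the familiar F1 of 2/3, while beta = 2 weights recall more heavily and yields 2.5/3 ≈ 0.83.

    def f_beta(pre, rec, beta=1.0):
        # f_{beta} = (1 + beta^2) * (pre * rec) / (beta^2 * pre + rec)
        return (1 + beta ** 2) * pre * rec / (beta ** 2 * pre + rec)

    assert abs(f_beta(0.5, 1.0, beta=1) - 2 / 3) < 1e-9    # balanced F1
    assert abs(f_beta(0.5, 1.0, beta=2) - 2.5 / 3) < 1e-9  # recall-weighted
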
diff --git a/reproduction/matching/model/bert.py b/reproduction/matching/model/bert.py
index 6b13ce2a..9b3a78b2 100644
--- a/reproduction/matching/model/bert.py
+++ b/reproduction/matching/model/bert.py
@@ -1,13 +1,41 @@
+import torch
+import torch.nn as nn
+
+from fastNLP.core.const import Const
 from fastNLP.models import BaseModel
 from fastNLP.modules.encoder.bert import BertModel
 
 
-class BertForSNLI(BaseModel):
+class BertForNLI(BaseModel):
     # TODO: still in progress
 
-    def __init(self):
-        super(BertForSNLI, self).__init__()
+    def __init__(self, class_num=3, bert_dir=None):
+        super(BertForNLI, self).__init__()
+        if bert_dir is not None:
+            self.bert = BertModel.from_pretrained(bert_dir)
+        else:
+            self.bert = BertModel()
+        hidden_size = self.bert.pooler.dense._parameters['bias'].size(-1)  # infer hidden size from the pooler bias
+        self.classifier = nn.Linear(hidden_size, class_num)
+
+    def forward(self, words, seq_len1, seq_len2, target=None):
+        """
+        :param torch.Tensor words: [batch_size, seq_len] input_ids
+        :param torch.Tensor seq_len1: [batch_size, seq_len] token_type_ids
+        :param torch.Tensor seq_len2: [batch_size, seq_len] attention_mask
+        :param torch.Tensor target: [batch]
+        :return: {Const.OUTPUT: logits}, plus {Const.LOSS: loss} when target is given
+        """
+        _, pooled_output = self.bert(words, seq_len1, seq_len2)
+        logits = self.classifier(pooled_output)
+
+        if target is not None:
+            loss_func = torch.nn.CrossEntropyLoss()
+            loss = loss_func(logits, target)
+            return {Const.OUTPUT: logits, Const.LOSS: loss}
+        return {Const.OUTPUT: logits}
+
+    def predict(self, words, seq_len1, seq_len2, target=None):
+        return self.forward(words, seq_len1, seq_len2)
 
-    def forward(self, words, segment_id, seq_len):
-        pass
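
A minimal smoke test of the BertForNLI interface above (a sketch with hypothetical shapes; it assumes the no-argument BertModel() used in the bert_dir=None branch builds a randomly initialised encoder):

    import torch
    from fastNLP.core.const import Const
    from reproduction.matching.model.bert import BertForNLI

    model = BertForNLI()                             # random weights, 3-way classification head
    words = torch.randint(1, 100, (2, 16))           # input_ids, [batch_size=2, seq_len=16]
    seq_len1 = torch.zeros(2, 16, dtype=torch.long)  # token_type_ids
    seq_len2 = torch.ones(2, 16, dtype=torch.long)   # attention_mask
    out = model(words, seq_len1, seq_len2)
    print(out[Const.OUTPUT].shape)                   # expected: torch.Size([2, 3])
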
diff --git a/reproduction/matching/snli.py b/reproduction/matching/snli.py
new file mode 100644
index 00000000..b389aa11
--- /dev/null
+++ b/reproduction/matching/snli.py
@@ -0,0 +1,97 @@
+import os
+
+import torch
+
+from fastNLP.core import Vocabulary, DataSet, Trainer, Tester, Const, Adam, AccuracyMetric
+
+from reproduction.matching.data.SNLIDataLoader import SNLILoader
+from legacy.component.bert_tokenizer import BertTokenizer
+from reproduction.matching.model.bert import BertForNLI
+
+
+def preprocess_data(data: DataSet, bert_dir):
+    """
+    Preprocess a raw SNLI data set into the inputs BERT needs.
+
+    :param data: a DataSet produced by SNLILoader
+    :param bert_dir: directory of the pretrained BERT model, containing vocab.txt
+    :return: the processed DataSet
+    """
+    tokenizer = BertTokenizer.from_pretrained(os.path.join(bert_dir, 'vocab.txt'))
+
+    vocab = Vocabulary(padding=None, unknown=None)
+    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
+        lines = f.readlines()
+    vocab_list = []
+    for line in lines:
+        vocab_list.append(line.strip())
+    vocab.add_word_lst(vocab_list)
+    vocab.build_vocab()
+    vocab.padding = '[PAD]'
+    vocab.unknown = '[UNK]'
+
+    for i in range(2):
+        data.apply(lambda x: tokenizer.tokenize(" ".join(x[Const.INPUTS(i)])),
+                   new_field_name=Const.INPUTS(i))
+    data.apply(lambda x: ['[CLS]'] + x[Const.INPUTS(0)] + ['[SEP]'] + x[Const.INPUTS(1)] + ['[SEP]'],
+               new_field_name=Const.INPUT)
+    data.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1),
+               new_field_name=Const.INPUT_LENS(0))
+    data.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), new_field_name=Const.INPUT_LENS(1))
+
+    max_len = 512
+    data.apply(lambda x: x[Const.INPUT][: max_len], new_field_name=Const.INPUT)
+    data.apply(lambda x: [vocab.to_index(w) for w in x[Const.INPUT]], new_field_name=Const.INPUT)
+    data.apply(lambda x: x[Const.INPUT_LENS(0)][: max_len], new_field_name=Const.INPUT_LENS(0))
+    data.apply(lambda x: x[Const.INPUT_LENS(1)][: max_len], new_field_name=Const.INPUT_LENS(1))
+
+    target_vocab = Vocabulary(padding=None, unknown=None)
+    target_vocab.add_word_lst(['neutral', 'contradiction', 'entailment'])
+    target_vocab.build_vocab()
+    data.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET)
+
+    data.set_input(Const.INPUT, Const.INPUT_LENS(0), Const.INPUT_LENS(1), Const.TARGET)
+    data.set_target(Const.TARGET)
+
+    return data
+
+
+bert_dirs = 'path/to/bert/dir'
+
+# load raw data sets
+train_data = SNLILoader().load('./data/snli/snli_1.0_train.jsonl')
+dev_data = SNLILoader().load('./data/snli/snli_1.0_dev.jsonl')
+test_data = SNLILoader().load('./data/snli/snli_1.0_test.jsonl')
+
+print('successfully loaded data sets!')
+
+train_data = preprocess_data(train_data, bert_dirs)
+dev_data = preprocess_data(dev_data, bert_dirs)
+test_data = preprocess_data(test_data, bert_dirs)
+
+model = BertForNLI(bert_dir=bert_dirs)
+
+trainer = Trainer(
+    train_data=train_data,
+    model=model,
+    optimizer=Adam(lr=2e-5, model_params=model.parameters()),
+    batch_size=torch.cuda.device_count() * 12,
+    n_epochs=4,
+    print_every=-1,
+    dev_data=dev_data,
+    metrics=AccuracyMetric(),
+    metric_key='acc',
+    device=[i for i in range(torch.cuda.device_count())],
+    check_code_level=-1
+)
+trainer.train(load_best_model=True)
+
+tester = Tester(
+    data=test_data,
+    model=model,
+    metrics=AccuracyMetric(),
+    batch_size=torch.cuda.device_count() * 12,
+    device=[i for i in range(torch.cuda.device_count())],
+)
+tester.test()
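
For reference, this is the packing scheme that the apply() calls in preprocess_data implement, traced by hand on a toy premise/hypothesis pair (illustrative values only):

    # premise    (Const.INPUTS(0)): ['a', 'man']  -> m = 2 tokens
    # hypothesis (Const.INPUTS(1)): ['smiles']    -> n = 1 token
    #
    # Const.INPUT:         ['[CLS]', 'a', 'man', '[SEP]', 'smiles', '[SEP]']
    # Const.INPUT_LENS(0): [0, 0, 0, 0, 1, 1]    # token_type_ids: (m + 2) zeros, then (n + 1) ones
    # Const.INPUT_LENS(1): [1, 1, 1, 1, 1, 1]    # attention_mask: all ones (padding comes later)
    #
    # All three fields are then truncated to max_len = 512 and Const.INPUT is mapped to vocabulary ids.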