
1. Add a constraint option to the CRF to restrict transitions, e.g. under BMES tagging, B must not transition to S.
2. Add SpanFPreRecMetric for measuring sequence-labelling performance.
3. Adjust the word-segmentation reproduction task to the new interfaces.
tags/v0.3.0^2
yh committed 5 years ago · parent commit 897c43fc3b
12 changed files with 1244 additions and 103 deletions
  1. fastNLP/api/api.py (+4, -4)
  2. fastNLP/api/processor.py (+56, -10)
  3. fastNLP/core/dataset.py (+1, -1)
  4. fastNLP/core/metrics.py (+367, -9)
  5. fastNLP/core/trainer.py (+2, -3)
  6. fastNLP/modules/decoder/CRF.py (+160, -20)
  7. reproduction/chinese_word_segment/cws_io/cws_reader.py (+26, -5)
  8. reproduction/chinese_word_segment/models/cws_model.py (+23, -6)
  9. reproduction/chinese_word_segment/process/cws_processor.py (+204, -41)
  10. reproduction/pos_tag_model/pos_io/pos_reader.py (+68, -4)
  11. test/core/test_metrics.py (+229, -0)
  12. test/modules/decoder/test_CRF.py (+104, -0)

+4 -4  fastNLP/api/api.py

@@ -9,8 +9,8 @@ from fastNLP.core.dataset import DataSet

from fastNLP.api.model_zoo import load_url
from fastNLP.api.processor import ModelProcessor
from reproduction.chinese_word_segment.cws_io.cws_reader import ConlluCWSReader
from reproduction.pos_tag_model.pos_io.pos_reader import ConlluPOSReader
from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader
from reproduction.pos_tag_model.pos_io.pos_reader import ConllPOSReader
from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
from fastNLP.core.instance import Instance
from fastNLP.core.sampler import SequentialSampler
@@ -95,7 +95,7 @@ class POS(API):
pipeline.append(tag_proc)
pp = Pipeline(pipeline)

reader = ConlluPOSReader()
reader = ConllPOSReader()
te_dataset = reader.load(filepath)

evaluator = SeqLabelEvaluator2('word_seq_origin_len')
@@ -168,7 +168,7 @@ class CWS(API):
pipeline.insert(1, tag_proc)
pp = Pipeline(pipeline)

reader = ConlluCWSReader()
reader = ConllCWSReader()

# te_filename = '/home/hyan/ctb3/test.conllx'
te_dataset = reader.load(filepath)


+56 -10  fastNLP/api/processor.py

@@ -11,6 +11,11 @@ from fastNLP.core.vocabulary import Vocabulary

class Processor(object):
def __init__(self, field_name, new_added_field_name):
"""

:param field_name: which field to process
:param new_added_field_name: if None, field_name is used, i.e. the original field is overwritten
"""
self.field_name = field_name
if new_added_field_name is None:
self.new_added_field_name = field_name
@@ -92,6 +97,11 @@ class FullSpaceToHalfSpaceProcessor(Processor):


class PreAppendProcessor(Processor):
"""
Prepend data (expected to be str) to the given field, which must be a list. The new field is
[data] + instance[field_name]

"""
def __init__(self, data, field_name, new_added_field_name=None):
super(PreAppendProcessor, self).__init__(field_name, new_added_field_name)
self.data = data
@@ -102,6 +112,10 @@ class PreAppendProcessor(Processor):


class SliceProcessor(Processor):
"""
Take only part of a field. Equivalent to instance[field_name][start:end:step]

"""
def __init__(self, start, end, step, field_name, new_added_field_name=None):
super(SliceProcessor, self).__init__(field_name, new_added_field_name)
for o in (start, end, step):
@@ -114,7 +128,17 @@ class SliceProcessor(Processor):


class Num2TagProcessor(Processor):
"""
Convert the numbers in a sentence to a given tag.

"""
def __init__(self, tag, field_name, new_added_field_name=None):
"""

:param tag: str, the tag that numbers are converted to
:param field_name:
:param new_added_field_name:
"""
super(Num2TagProcessor, self).__init__(field_name, new_added_field_name)
self.tag = tag
self.pattern = r'[-+]?([0-9]+[.]?[0-9]*)+[/eE]?[-+]?([0-9]+[.]?[0-9]*)'
@@ -135,6 +159,10 @@ class Num2TagProcessor(Processor):


class IndexerProcessor(Processor):
"""
Given a vocabulary, convert the specified field to indices. The field should be a one-dimensional list, e.g.
['我', '是', xxx]
"""
def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False, is_input=True):

assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
@@ -163,19 +191,19 @@ class IndexerProcessor(Processor):


class VocabProcessor(Processor):
"""Build vocabulary with a field in the data set.
"""
Build a vocabulary from one or more DataSets.

"""

def __init__(self, field_name):
def __init__(self, field_name, min_freq=1, max_size=None):
super(VocabProcessor, self).__init__(field_name, None)
self.vocab = Vocabulary()
self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size)

def process(self, *datasets):
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
self.vocab.update(ins[self.field_name])
dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))

def get_vocab(self):
self.vocab.build_vocab()
@@ -183,6 +211,10 @@ class VocabProcessor(Processor):


class SeqLenProcessor(Processor):
"""
Add a sequence-length field derived from an existing field (the length of its first dimension).

"""
def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True):
super(SeqLenProcessor, self).__init__(field_name, new_added_field_name)
self.is_input = is_input
@@ -195,10 +227,15 @@ class SeqLenProcessor(Processor):
return dataset


from fastNLP.core.utils import _build_args

class ModelProcessor(Processor):
def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32):
"""
Iterate the model and drop the padding from its outputs.
Takes a model; when process() is called with a dataset, this processor feeds the DataSet through Batch into model.predict or model.forward.
The model outputs are added to the dataset, with field names determined by the model's output. Any output whose shape is neither (batch_size,)
nor (batch_size, 1) is unpadded using the sequence-length field.
TODO: this class should drop its dependency on seq_lens.

:param seq_len_field_name:
:param batch_size:
@@ -211,13 +248,18 @@ class ModelProcessor(Processor):
def process(self, dataset):
self.model.eval()
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False)
data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler())

batch_output = defaultdict(list)
if hasattr(self.model, "predict"):
predict_func = self.model.predict
else:
predict_func = self.model.forward
with torch.no_grad():
for batch_x, _ in data_iterator:
prediction = self.model.predict(**batch_x)
seq_lens = batch_x[self.seq_len_field_name].cpu().numpy().tolist()
refined_batch_x = _build_args(predict_func, **batch_x)
prediction = predict_func(**refined_batch_x)
seq_lens = batch_x[self.seq_len_field_name].tolist()

for key, value in prediction.items():
tmp_batch = []
@@ -246,6 +288,10 @@ class ModelProcessor(Processor):


class Index2WordProcessor(Processor):
"""
Convert an index field of the DataSet back to str according to the vocab.

"""
def __init__(self, vocab, field_name, new_added_field_name):
super(Index2WordProcessor, self).__init__(field_name, new_added_field_name)
self.vocab = vocab
@@ -266,5 +312,5 @@ class SetIsTargetProcessor(Processor):
def process(self, dataset):
set_dict = {name: self.default for name in dataset.get_all_fields().keys()}
set_dict.update(self.field_dict)
dataset.set_target(**set_dict)
dataset.set_target(*set_dict.keys())
return dataset

+1 -1  fastNLP/core/dataset.py

@@ -254,7 +254,7 @@ class DataSet(object):
:return results: if new_field_name is not passed, returned values of the function over all instances.
"""
results = [func(ins) for ins in self._inner_iter()]
if len(list(filter(lambda x: x is not None, results))) == 0: # all None
if len(list(filter(lambda x: x is not None, results))) == 0 and not (new_field_name is None): # all None
raise ValueError("{} always return None.".format(get_func_signature(func=func)))

extra_param = {}
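
The practical effect of this one-line change, as a small sketch (ds is assumed to be a DataSet with a 'words' field):

# apply() with a side-effect-only function (returning None for every instance)
# no longer raises, as long as no new_field_name is requested:
from fastNLP.core.vocabulary import Vocabulary

vocab = Vocabulary()
ds.apply(lambda ins: vocab.update(ins['words']))
# ds.apply(lambda ins: None, new_field_name='x') would still raise ValueError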


+367 -9  fastNLP/core/metrics.py

@@ -10,6 +10,7 @@ from fastNLP.core.utils import _build_args
from fastNLP.core.utils import _check_arg_dict_list
from fastNLP.core.utils import get_func_signature
from fastNLP.core.utils import seq_lens_to_masks
from fastNLP.core.vocabulary import Vocabulary


class MetricBase(object):
@@ -62,11 +63,6 @@ class MetricBase(object):
f"Parameter `{func_param}` is not in {get_func_signature(self.evaluate)}. Please check the "
f"initialization parameters, or change its signature.")

# evaluate should not have varargs.
# if func_spect.varargs:
# raise NameError(f"Delete `*{func_spect.varargs}` in {get_func_signature(self.evaluate)}(Do not use "
# f"positional argument.).")

def get_metric(self, reset=True):
raise NotImplemented

@@ -91,10 +87,9 @@ class MetricBase(object):

This method will call self.evaluate method.
Before calling self.evaluate, it will first check the validity of output_dict, target_dict
(1) whether self.evaluate has varargs, which is not supported.
(2) whether params needed by self.evaluate is not included in output_dict,target_dict.
(3) whether params needed by self.evaluate duplicate in pred_dict, target_dict
(4) whether params in output_dict, target_dict are not used by evaluate.(Might cause warning)
(1) whether params needed by self.evaluate is not included in output_dict,target_dict.
(2) whether params needed by self.evaluate duplicate in pred_dict, target_dict
(3) whether params in output_dict, target_dict are not used by evaluate.(Might cause warning)
Besides, before passing params into self.evaluate, this function will filter out params from output_dict and
target_dict which are not used in self.evaluate. (but if **kwargs presented in self.evaluate, no filtering
will be conducted.)
@@ -275,6 +270,369 @@ class AccuracyMetric(MetricBase):
self.total = 0
return evaluate_result

def bmes_tag_to_spans(tags, ignore_labels=None):
"""

:param tags: List[str],
:param ignore_labels: List[str], labels in this list are ignored
:return: List[Tuple[str, List[int, int]]]. [(label,[start, end])]
"""
ignore_labels = set(ignore_labels) if ignore_labels else set()

spans = []
prev_bmes_tag = None
for idx, tag in enumerate(tags):
tag = tag.lower()
bmes_tag, label = tag[:1], tag[2:]
if bmes_tag in ('b', 's'):
spans.append((label, [idx, idx]))
elif bmes_tag in ('m', 'e') and prev_bmes_tag in ('b', 'm') and label==spans[-1][0]:
spans[-1][1][1] = idx
else:
spans.append((label, [idx, idx]))
prev_bmes_tag = bmes_tag
return [(span[0], (span[1][0], span[1][1]))
for span in spans
if span[0] not in ignore_labels
]

def bio_tag_to_spans(tags, ignore_labels=None):
"""

:param tags: List[str],
:param ignore_labels: List[str], labels in this list are ignored
:return: List[Tuple[str, List[int, int]]]. [(label,[start, end])]
"""
ignore_labels = set(ignore_labels) if ignore_labels else set()

spans = []
prev_bio_tag = None
for idx, tag in enumerate(tags):
tag = tag.lower()
bio_tag, label = tag[:1], tag[2:]
if bio_tag == 'b':
spans.append((label, [idx, idx]))
elif bio_tag == 'i' and prev_bio_tag in ('b', 'i') and label==spans[-1][0]:
spans[-1][1][1] = idx
elif bio_tag == 'o': # o tag does not count
pass
else:
spans.append((label, [idx, idx]))
prev_bio_tag = bio_tag
return [(span[0], (span[1][0], span[1][1]))
for span in spans
if span[0] not in ignore_labels
]
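
A quick sketch of what the two helpers return (the tag sequences are invented):

bmes_tag_to_spans(['B-NN', 'E-NN', 'S-VP'])
# -> [('nn', (0, 1)), ('vp', (2, 2))]    # labels are lower-cased
bio_tag_to_spans(['B-PER', 'I-PER', 'O', 'B-LOC'])
# -> [('per', (0, 1)), ('loc', (3, 3))]  # 'O' tokens produce no span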


class SpanFPreRecMetric(MetricBase):
"""
Compute F, precision and recall at the span level for sequence-labelling tasks.
The resulting metric dict is
{
'f': xxx, # named 'f' so that f_beta values can be supported later
'pre': xxx,
'rec':xxx
}
With only_gross=False, per-label statistics are returned as well:
{
'f': xxx,
'pre': xxx,
'rec':xxx,
'f-label': xxx,
'pre-label': xxx,
'rec-label':xxx,
...
}

"""
def __init__(self, tag_vocab, pred=None, target=None, seq_lens=None, encoding_type='bio', ignore_labels=None,
only_gross=True, f_type='micro', beta=1):
"""

:param tag_vocab: Vocabulary of the tags. Supported tags are "B" (without a label) or "B-xxx" (where xxx is a label such as NN in POS tagging).
During decoding, tags that share the same xxx are treated as one label, e.g. ['B-NN', 'E-NN'] is merged into a single 'NN' span.
:param pred: str, the key used to fetch the prediction from the dict passed to evaluate(). If None, 'pred' is used.
:param target: str, the key used to fetch the target from the dict passed to evaluate(). If None, 'target' is used.
:param seq_lens: str, the key used to fetch the sequence lengths from the dict passed to evaluate(). If None, 'seq_lens' is used.
:param encoding_type: str, currently supports bio and bmes.
:param ignore_labels: List[str]. Classes in this list are excluded from the computation; e.g. passing ['NN'] in POS tagging skips
the 'NN' label.
:param only_gross: bool. Whether to compute only the overall f1, precision and recall; if False, per-label f1, pre and rec are
returned as well.
:param f_type: str, 'micro' or 'macro'. 'micro': compute the overall TP, FN and FP first, then f, precision and recall; 'macro':
compute f, precision and recall per class, then average them (all classes weighted equally).
:param beta: float. The f_beta score, f_beta = (1 + beta^2)*(pre*rec)/(beta^2*pre + rec). Common values are beta=0.5, 1, 2: with 0.5
precision outweighs recall; with 1 they weigh equally; with 2 recall outweighs precision.
"""
encoding_type = encoding_type.lower()
if encoding_type not in ('bio', 'bmes'):
raise ValueError("Only support 'bio' or 'bmes' type.")
if not isinstance(tag_vocab, Vocabulary):
raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab)))
if f_type not in ('micro', 'macro'):
raise ValueError("f_type only supports `micro` or `macro`', got {}.".format(f_type))

self.encoding_type = encoding_type
if self.encoding_type == 'bmes':
self.tag_to_span_func = bmes_tag_to_spans
elif self.encoding_type == 'bio':
self.tag_to_span_func = bio_tag_to_spans
self.ignore_labels = ignore_labels
self.f_type = f_type
self.beta = beta
self.beta_square = self.beta**2
self.only_gross = only_gross

super().__init__()
self._init_param_map(pred=pred, target=target, seq_lens=seq_lens)

self.tag_vocab = tag_vocab

self._true_positives = defaultdict(int)
self._false_positives = defaultdict(int)
self._false_negatives = defaultdict(int)

def evaluate(self, pred, target, seq_lens):
"""
Much of the design follows allennlp's span-based measure.
:param pred:
:param target:
:param seq_lens:
:return:
"""
if not isinstance(pred, torch.Tensor):
raise TypeError(f"`pred` in {get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(pred)}.")
if not isinstance(target, torch.Tensor):
raise TypeError(f"`target` in {get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(target)}.")

if not isinstance(seq_lens, torch.Tensor):
raise TypeError(f"`seq_lens` in {get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(seq_lens)}.")

num_classes = pred.size(-1)
if (target >= num_classes).any():
raise ValueError("A gold label passed to SpanBasedF1Metric contains an "
"id >= {}, the number of classes.".format(num_classes))

if pred.size() == target.size() and len(target.size()) == 2:
pass
elif len(pred.size()) == len(target.size()) + 1 and len(target.size()) == 2:
pred = pred.argmax(dim=-1)
else:
raise RuntimeError(f"In {get_func_signature(self.evaluate)}, when pred have "
f"size:{pred.size()}, target should have size: {pred.size()} or "
f"{pred.size()[:-1]}, got {target.size()}.")

batch_size = pred.size(0)
for i in range(batch_size):
pred_tags = pred[i, :seq_lens[i]].tolist()
gold_tags = target[i, :seq_lens[i]].tolist()

pred_str_tags = [self.tag_vocab.to_word(tag) for tag in pred_tags]
gold_str_tags = [self.tag_vocab.to_word(tag) for tag in gold_tags]

pred_spans = self.tag_to_span_func(pred_str_tags, ignore_labels=self.ignore_labels)
gold_spans = self.tag_to_span_func(gold_str_tags, ignore_labels=self.ignore_labels)

for span in pred_spans:
if span in gold_spans:
self._true_positives[span[0]] += 1
gold_spans.remove(span)
else:
self._false_positives[span[0]] += 1
for span in gold_spans:
self._false_negatives[span[0]] += 1

def get_metric(self, reset=True):
evaluate_result = {}
if not self.only_gross or self.f_type=='macro':
tags = set(self._false_negatives.keys())
tags.update(set(self._false_positives.keys()))
tags.update(set(self._true_positives.keys()))
f_sum = 0
pre_sum = 0
rec_sum = 0
for tag in tags:
tp = self._true_positives[tag]
fn = self._false_negatives[tag]
fp = self._false_positives[tag]
f, pre, rec = self._compute_f_pre_rec(tp, fn, fp)
f_sum += f
pre_sum += pre
rec_sum += rec
if not self.only_gross and tag!='': # tag != '' guards against tags that carry no label
f_key = 'f-{}'.format(tag)
pre_key = 'pre-{}'.format(tag)
rec_key = 'rec-{}'.format(tag)
evaluate_result[f_key] = f
evaluate_result[pre_key] = pre
evaluate_result[rec_key] = rec

if self.f_type == 'macro':
evaluate_result['f'] = f_sum/len(tags)
evaluate_result['pre'] = pre_sum/len(tags)
evaluate_result['rec'] = rec_sum/len(tags)

if self.f_type == 'micro':
f, pre, rec = self._compute_f_pre_rec(sum(self._true_positives.values()),
sum(self._false_negatives.values()),
sum(self._false_positives.values()))
evaluate_result['f'] = f
evaluate_result['pre'] = pre
evaluate_result['rec'] = rec

if reset:
self._true_positives = defaultdict(int)
self._false_positives = defaultdict(int)
self._false_negatives = defaultdict(int)

return evaluate_result

def _compute_f_pre_rec(self, tp, fn, fp):
"""

:param tp: int, true positive
:param fn: int, false negative
:param fp: int, false positive
:return: (f, pre, rec)
"""
pre = tp / (fp + tp + 1e-13)
rec = tp / (fn + tp + 1e-13)
f = (1 + self.beta_square) * pre * rec / (self.beta_square * pre + rec + 1e-13)

return f, pre, rec
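
To make the f_type distinction concrete, a worked sketch with invented per-tag counts (the same tp/fn/fp the metric accumulates):

counts = {'NN': (8, 2, 2), 'VP': (1, 3, 1)}  # tag -> (tp, fn, fp)
# macro: average the per-tag scores (ignoring the 1e-13 smoothing)
#   NN: pre = 8/(8+2) = 0.8,  rec = 8/(8+2) = 0.8
#   VP: pre = 1/(1+1) = 0.5,  rec = 1/(1+3) = 0.25
#   macro pre = (0.8 + 0.5)/2 = 0.65
# micro: pool the counts first
#   pre = (8+1)/(8+1 + 2+1) = 9/12 = 0.75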

class BMESF1PreRecMetric(MetricBase):
"""
Compute f1, precision and recall under the BMES tagging scheme. Because predictions may contain illegal tag sequences such as "BS",
they are repaired via the table below. cur_B means the current tag is B; next_B means the following tag is B. cur_B=S means a tag
currently predicted as B is rewritten to S; next_M=B means a following tag predicted as M is rewritten to B.
| | next_B | next_M | next_E | next_S | end |
|:-----:|:-------:|:--------:|:--------:|:-------:|:-------:|
| start | legal | next_M=B | next_E=S | legal | - |
| cur_B | cur_B=S | legal | legal | cur_B=S | cur_B=S |
| cur_M | cur_M=E | legal | legal | cur_M=E | cur_M=E |
| cur_E | legal | next_M=B | next_E=S | legal | legal |
| cur_S | legal | next_M=B | next_E=S | legal | legal |
For example:
the prediction BSEMS is treated as SSSSS.

This metric does not validate the target; make sure the target is legal.
pred should have shape (batch_size, max_len) or (batch_size, max_len, 4).
target should have shape (batch_size, max_len).
seq_lens should have shape (batch_size, ).

"""

def __init__(self, b_idx=0, m_idx=1, e_idx=2, s_idx=3, pred=None, target=None, seq_lens=None):
"""
Declare which idx corresponds to each of the four BMES tags. Any number that is not b_idx, m_idx, e_idx or s_idx is treated as s_idx.

:param b_idx: int, tag idx of the Begin tag.
:param m_idx: int, tag idx of the Middle tag.
:param e_idx: int, tag idx of the End tag.
:param s_idx: int, tag idx of the Single tag.
:param pred: str, the key used to fetch the prediction from the dict passed to evaluate(). If None, 'pred' is used.
:param target: str, the key used to fetch the target from the dict passed to evaluate(). If None, 'target' is used.
:param seq_lens: str, the key used to fetch the sequence lengths from the dict passed to evaluate(). If None, 'seq_lens' is used.
"""
super().__init__()

self._init_param_map(pred=pred, target=target, seq_lens=seq_lens)

self.yt_wordnum = 0
self.yp_wordnum = 0
self.corr_num = 0

self.b_idx = b_idx
self.m_idx = m_idx
self.e_idx = e_idx
self.s_idx = s_idx
# the repair matrix described in __init__
self._valida_matrix = {
-1: [(-1, -1), (1, self.b_idx), (1, self.s_idx), (-1, -1)], # magic start idx
self.b_idx:[(0, self.s_idx), (-1, -1), (-1, -1), (0, self.s_idx), (0, self.s_idx)],
self.m_idx:[(0, self.e_idx), (-1, -1), (-1, -1), (0, self.e_idx), (0, self.e_idx)],
self.e_idx:[(-1, -1), (1, self.b_idx), (1, self.s_idx), (-1, -1), (-1, -1)],
self.s_idx:[(-1, -1), (1, self.b_idx), (1, self.s_idx), (-1, -1), (-1, -1)],
}

def _validate_tags(self, tags):
"""
Given a Tensor of tags, return legal tags.

:param tags: Tensor, shape: (seq_len, )
:return: a list with the tags repaired to be legal
"""
assert len(tags)!=0
assert isinstance(tags, torch.Tensor) and len(tags.size())==1
padded_tags = [-1, *tags.tolist(), -1]
for idx in range(len(padded_tags)-1):
cur_tag = padded_tags[idx]
if cur_tag not in self._valida_matrix:
cur_tag = self.s_idx
if padded_tags[idx+1] not in self._valida_matrix:
padded_tags[idx+1] = self.s_idx
next_tag = padded_tags[idx+1]
shift_tag = self._valida_matrix[cur_tag][next_tag]
if shift_tag[0]!=-1:
padded_tags[idx+shift_tag[0]] = shift_tag[1]

return padded_tags[1:-1]

def evaluate(self, pred, target, seq_lens):
if not isinstance(pred, torch.Tensor):
raise TypeError(f"`pred` in {get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(pred)}.")
if not isinstance(target, torch.Tensor):
raise TypeError(f"`target` in {get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(target)}.")

if not isinstance(seq_lens, torch.Tensor):
raise TypeError(f"`seq_lens` in {get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(seq_lens)}.")

if pred.size() == target.size() and len(target.size()) == 2:
pass
elif len(pred.size()) == len(target.size()) + 1 and len(target.size()) == 2:
pred = pred.argmax(dim=-1)
else:
raise RuntimeError(f"In {get_func_signature(self.evaluate)}, when pred have "
f"size:{pred.size()}, target should have size: {pred.size()} or "
f"{pred.size()[:-1]}, got {target.size()}.")

for idx in range(len(pred)):
seq_len = seq_lens[idx]
target_tags = target[idx][:seq_len].tolist()
pred_tags = pred[idx][:seq_len]
pred_tags = self._validate_tags(pred_tags)
start_idx = 0
for t_idx, (t_tag, p_tag) in enumerate(zip(target_tags, pred_tags)):
if t_tag in (self.s_idx, self.e_idx):
self.yt_wordnum += 1
corr_flag = True
for i in range(start_idx, t_idx+1):
if target_tags[i]!=pred_tags[i]:
corr_flag = False
if corr_flag:
self.corr_num += 1
start_idx = t_idx + 1
if p_tag in (self.s_idx, self.e_idx):
self.yp_wordnum += 1

def get_metric(self, reset=True):
P = self.corr_num / (self.yp_wordnum + 1e-12)
R = self.corr_num / (self.yt_wordnum + 1e-12)
F = 2 * P * R / (P + R + 1e-12)
evaluate_result = {'f': round(F, 6), 'pre':round(P, 6), 'rec': round(R, 6)}
if reset:
self.yp_wordnum = 0
self.yt_wordnum = 0
self.corr_num = 0
return evaluate_result
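
A minimal usage sketch of the new metric under its default tag indices (b=0, m=1, e=2, s=3); the tensors are invented:

import torch
from fastNLP.core.metrics import BMESF1PreRecMetric

metric = BMESF1PreRecMetric()
pred = torch.LongTensor([[0, 2, 3]])    # B E S -> a two-character word plus a single-character word
target = torch.LongTensor([[0, 2, 3]])
metric.evaluate(pred, target, seq_lens=torch.LongTensor([3]))
print(metric.get_metric())              # a perfect match should give {'f': 1.0, 'pre': 1.0, 'rec': 1.0}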


def _prepare_metrics(metrics):
"""


+2 -3  fastNLP/core/trainer.py

@@ -31,9 +31,8 @@ class Trainer(object):

"""
def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50,
validate_every=-1, dev_data=None, use_cuda=False, save_path=None,
optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0,
metric_key=None, sampler=RandomSampler(), use_tqdm=True):
validate_every=-1, dev_data=None, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0),
check_code_level=0, metric_key=None, sampler=RandomSampler(), use_tqdm=True, use_cuda=False):
"""

:param DataSet train_data: the training data


+160 -20  fastNLP/modules/decoder/CRF.py

@@ -19,26 +19,149 @@ def seq_len_to_byte_mask(seq_lens):
mask = broadcast_arange.lt(seq_lens.float().view(-1, 1))
return mask

def allowed_transitions(id2label, encoding_type='bio'):
"""

:param id2label: dict, keys are label indices, values are str tags or tag-labels. A value may be a bare tag such as "B" or "M", or
"B-NN", "M-NN", where the tag and the label must be separated by "-". The id2label dict can usually be obtained via
Vocabulary.get_id2word().
:param encoding_type: str, supports "bio" and "bmes".
:return: List[Tuple(int, int)], where each inner tuple is (from_tag_id, to_tag_id). The result takes start and end into account: in
"BIO", B and O may start a sequence while I may not, so the result contains (start_idx, B_idx) and (start_idx, O_idx) but not
(start_idx, I_idx). start_idx = len(id2label), end_idx = len(id2label) + 1.
"""
num_tags = len(id2label)
start_idx = num_tags
end_idx = num_tags + 1
encoding_type = encoding_type.lower()
allowed_trans = []
id_label_lst = list(id2label.items()) + [(start_idx, 'start'), (end_idx, 'end')]
def split_tag_label(from_label):
from_label = from_label.lower()
if from_label in ['start', 'end']:
from_tag = from_label
from_label = ''
else:
from_tag = from_label[:1]
from_label = from_label[2:]
return from_tag, from_label

for from_id, from_label in id_label_lst:
if from_label in ['<pad>', '<unk>']:
continue
from_tag, from_label = split_tag_label(from_label)
for to_id, to_label in id_label_lst:
if to_label in ['<pad>', '<unk>']:
continue
to_tag, to_label = split_tag_label(to_label)
if is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label):
allowed_trans.append((from_id, to_id))
return allowed_trans
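
For instance, with an unlabeled BMES vocabulary the helper yields exactly the legal pairs, start_idx=4 and end_idx=5 included (following the rules implemented below):

allowed_transitions({0: 'B', 1: 'M', 2: 'E', 3: 'S'}, encoding_type='bmes')
# -> [(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5),
#     (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)]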

def is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label):
"""

:param encoding_type: str, supports "BIO" and "BMES".
:param from_tag: str, a tag such as "B" or "M", plus the two special tags start and end.
:param from_label: str, a label such as "PER" or "LOC".
:param to_tag: str, a tag such as "B" or "M", plus the two special tags start and end.
:param to_label: str, a label such as "PER" or "LOC".
:return: bool, whether the transition is allowed
"""
if to_tag=='start' or from_tag=='end':
return False
encoding_type = encoding_type.lower()
if encoding_type == 'bio':
"""
The first row is to_tag, the first column is from_tag. y: always allowed; -: allowed only when the labels match; n: not allowed.
+-------+---+---+---+-------+-----+
| | B | I | O | start | end |
+-------+---+---+---+-------+-----+
| B | y | - | y | n | y |
+-------+---+---+---+-------+-----+
| I | y | - | y | n | y |
+-------+---+---+---+-------+-----+
| O | y | n | y | n | y |
+-------+---+---+---+-------+-----+
| start | y | n | y | n | n |
+-------+---+---+---+-------+-----+
| end | n | n | n | n | n |
+-------+---+---+---+-------+-----+
"""
if from_tag == 'start':
return to_tag in ('b', 'o')
elif from_tag in ['b', 'i']:
return any([to_tag in ['end', 'b', 'o'], to_tag=='i' and from_label==to_label])
elif from_tag == 'o':
return to_tag in ['end', 'b', 'o']
else:
raise ValueError("Unexpect tag {}. Expect only 'B', 'I', 'O'.".format(from_tag))

elif encoding_type == 'bmes':
"""
The first row is to_tag, the first column is from_tag. y: always allowed; -: allowed only when the labels match; n: not allowed.
+-------+---+---+---+---+-------+-----+
| | B | M | E | S | start | end |
+-------+---+---+---+---+-------+-----+
| B | n | - | - | n | n | n |
+-------+---+---+---+---+-------+-----+
| M | n | - | - | n | n | n |
+-------+---+---+---+---+-------+-----+
| E | y | n | n | y | n | y |
+-------+---+---+---+---+-------+-----+
| S | y | n | n | y | n | y |
+-------+---+---+---+---+-------+-----+
| start | y | n | n | y | n | n |
+-------+---+---+---+---+-------+-----+
| end | n | n | n | n | n | n |
+-------+---+---+---+---+-------+-----+
"""
if from_tag == 'start':
return to_tag in ['b', 's']
elif from_tag == 'b':
return to_tag in ['m', 'e'] and from_label==to_label
elif from_tag == 'm':
return to_tag in ['m', 'e'] and from_label==to_label
elif from_tag in ['e', 's']:
return to_tag in ['b', 's', 'end']
else:
raise ValueError("Unexpect tag type {}. Expect only 'B', 'M', 'E', 'S'.".format(from_tag))
else:
raise ValueError("Only support BIO, BMES encoding type, got {}.".format(encoding_type))


class ConditionalRandomField(nn.Module):
def __init__(self, tag_size, include_start_end_trans=False ,initial_method = None):
def __init__(self, num_tags, include_start_end_trans=False, allowed_transitions=None, initial_method=None):
"""
:param tag_size: int, num of tags
:param include_start_end_trans: bool, whether to include start/end tag

:param num_tags: int, the number of tags.
:param include_start_end_trans: bool, whether to include start/end transition scores.
:param allowed_transitions: List[Tuple[from_tag_id(int), to_tag_id(int)]]. The allowed transitions, obtainable via
allowed_transitions(). If None, all transitions are legal.
:param initial_method:
"""

super(ConditionalRandomField, self).__init__()

self.include_start_end_trans = include_start_end_trans
self.tag_size = tag_size
self.num_tags = num_tags

# the meaning of entry in this matrix is (from_tag_id, to_tag_id) score
self.trans_m = nn.Parameter(torch.randn(tag_size, tag_size))
self.trans_m = nn.Parameter(torch.randn(num_tags, num_tags))
if self.include_start_end_trans:
self.start_scores = nn.Parameter(torch.randn(tag_size))
self.end_scores = nn.Parameter(torch.randn(tag_size))
self.start_scores = nn.Parameter(torch.randn(num_tags))
self.end_scores = nn.Parameter(torch.randn(num_tags))

if allowed_transitions is None:
constrain = torch.zeros(num_tags + 2, num_tags + 2)
else:
constrain = torch.ones(num_tags + 2, num_tags + 2) * -1000
for from_tag_id, to_tag_id in allowed_transitions:
constrain[from_tag_id, to_tag_id] = 0
self._constrain = nn.Parameter(constrain, requires_grad=False)

# self.reset_parameter()
initial_parameter(self, initial_method)

def reset_parameter(self):
nn.init.xavier_normal_(self.trans_m)
if self.include_start_end_trans:
@@ -49,7 +172,7 @@ class ConditionalRandomField(nn.Module):
"""
Computes the (batch_size,) denominator term for the log-likelihood, which is the
sum of the likelihoods across all possible state sequences.
:param logits:FloatTensor, max_len x batch_size x tag_size
:param logits:FloatTensor, max_len x batch_size x num_tags
:param mask:ByteTensor, max_len x batch_size
:return:FloatTensor, batch_size
"""
@@ -72,7 +195,7 @@ class ConditionalRandomField(nn.Module):
def _glod_score(self, logits, tags, mask):
"""
Compute the score for the gold path.
:param logits: FloatTensor, max_len x batch_size x tag_size
:param logits: FloatTensor, max_len x batch_size x num_tags
:param tags: LongTensor, max_len x batch_size
:param mask: ByteTensor, max_len x batch_size
:return:FloatTensor, batch_size
@@ -99,7 +222,7 @@ class ConditionalRandomField(nn.Module):
def forward(self, feats, tags, mask):
"""
Calculate the neg log likelihood
:param feats:FloatTensor, batch_size x max_len x tag_size
:param feats:FloatTensor, batch_size x max_len x num_tags
:param tags:LongTensor, batch_size x max_len
:param mask:ByteTensor batch_size x max_len
:return:FloatTensor, batch_size
@@ -112,13 +235,20 @@ class ConditionalRandomField(nn.Module):

return all_path_score - gold_path_score

def viterbi_decode(self, data, mask, get_score=False):
def viterbi_decode(self, data, mask, get_score=False, unpad=False):
"""
Given a feats matrix, return best decode path and best score.
:param data:FloatTensor, batch_size x max_len x tag_size
:param data:FloatTensor, batch_size x max_len x num_tags
:param mask:ByteTensor batch_size x max_len
:param get_score: bool, whether to output the decode score.
:return: scores, paths
:param unpad: bool, whether to unpad the result.
If False, a batch_size x max_len tensor is returned.
If True, a List[List[int]] is returned; each List[int] holds the labels of one sequence and is already
unpadded to that sample's true length.
:return: if get_score is False, the decoded paths (shape depending on unpad);
if get_score is True, (paths, List[float]); the first element is still the decoded paths (depending on unpad),
and the second is the decoding score of each sequence.

"""
batch_size, seq_len, n_tags = data.size()
data = data.transpose(0, 1).data # L, B, H
@@ -127,19 +257,23 @@ class ConditionalRandomField(nn.Module):
# dp
vpath = data.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long)
vscore = data[0]
transitions = self._constrain.data.clone()
transitions[:n_tags, :n_tags] += self.trans_m.data
if self.include_start_end_trans:
vscore += self.start_scores.view(1, -1)
transitions[n_tags, :n_tags] += self.start_scores.data
transitions[:n_tags, n_tags+1] += self.end_scores.data

vscore += transitions[n_tags, :n_tags]
trans_score = transitions[:n_tags, :n_tags].view(1, n_tags, n_tags).data
for i in range(1, seq_len):
prev_score = vscore.view(batch_size, n_tags, 1)
cur_score = data[i].view(batch_size, 1, n_tags)
trans_score = self.trans_m.view(1, n_tags, n_tags).data
score = prev_score + trans_score + cur_score
best_score, best_dst = score.max(1)
vpath[i] = best_dst
vscore = best_score * mask[i].view(batch_size, 1) + vscore * (1 - mask[i]).view(batch_size, 1)

if self.include_start_end_trans:
vscore += self.end_scores.view(1, -1)
vscore += transitions[:n_tags, n_tags+1].view(1, -1)

# backtrace
batch_idx = torch.arange(batch_size, dtype=torch.long, device=data.device)
@@ -154,7 +288,13 @@ class ConditionalRandomField(nn.Module):
for i in range(seq_len - 1):
last_tags = vpath[idxes[i], batch_idx, last_tags]
ans[idxes[i+1], batch_idx] = last_tags

ans = ans.transpose(0, 1)
if unpad:
paths = []
for idx, seq_len in enumerate(lens):
paths.append(ans[idx, :seq_len+1].tolist())
else:
paths = ans
if get_score:
return ans_score, ans.transpose(0, 1)
return ans.transpose(0, 1)
return paths, ans_score.tolist()
return paths
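
Putting the pieces together, a sketch of a constrained decode (feature tensor and mask are invented):

import torch
from fastNLP.modules.decoder.CRF import ConditionalRandomField, allowed_transitions

trans = allowed_transitions({0: 'B', 1: 'M', 2: 'E', 3: 'S'}, encoding_type='bmes')
crf = ConditionalRandomField(num_tags=4, allowed_transitions=trans)
feats = torch.randn(2, 5, 4)        # batch_size x max_len x num_tags
mask = torch.ones(2, 5).byte()
paths = crf.viterbi_decode(feats, mask, unpad=True)
# paths is a List[List[int]]; forbidden transitions such as B->S carry a -1000
# penalty and are effectively never decoded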

+26 -5  reproduction/chinese_word_segment/cws_io/cws_reader.py

@@ -6,6 +6,13 @@ from fastNLP.io.dataset_loader import DataSetLoader


def cut_long_sentence(sent, max_sample_length=200):
"""
Split a sentence longer than max_sample_length into several pieces; the split only happens at spaces, so the pieces may end up longer or shorter than max_sample_length.

:param sent: str.
:param max_sample_length: int.
:return: list of str.
"""
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
@@ -127,12 +134,26 @@ class POSCWSReader(DataSetLoader):
return dataset


class ConlluCWSReader(object):
# The returned DataSet has two fields: words (list of list, inner lists are characters) and tag (list of str, BMES tags).
class ConllCWSReader(object):
def __init__(self):
pass

def load(self, path, cut_long_sent=False):
"""
The returned DataSet contains only the field raw_sentence, whose content is a str.
The input is assumed to be in conll format, with sentences separated by blank lines and 7 columns per line, e.g.
1 编者按 编者按 NN O 11 nmod:topic
2 : : PU O 11 punct
3 7月 7月 NT DATE 4 compound:nn
4 12日 12日 NT DATE 11 nmod:tmod
5 , , PU O 11 punct

1 这 这 DT O 3 det
2 款 款 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 从 从 P O 5 case
5 外型 外型 NN O 8 nmod:prep
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
@@ -150,10 +171,10 @@ class ConlluCWSReader(object):
ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_one(sample)
res = self.get_char_lst(sample)
if res is None:
continue
line = ' '.join(res)
line = ' '.join(res)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
@@ -163,7 +184,7 @@ class ConlluCWSReader(object):

return ds

def get_one(self, sample):
def get_char_lst(self, sample):
if len(sample)==0:
return None
text = []


+23 -6  reproduction/chinese_word_segment/models/cws_model.py

@@ -9,7 +9,7 @@ from reproduction.chinese_word_segment.utils import seq_lens_to_mask

class CWSBiLSTMEncoder(BaseModel):
def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,
hidden_size=200, bidirectional=True, embed_drop_p=None, num_layers=1):
hidden_size=200, bidirectional=True, embed_drop_p=0.2, num_layers=1):
super().__init__()

self.input_size = 0
@@ -68,6 +68,7 @@ class CWSBiLSTMEncoder(BaseModel):
if not bigrams is None:
bigram_tensor = self.bigram_embedding(bigrams).view(batch_size, max_len, -1)
x_tensor = torch.cat([x_tensor, bigram_tensor], dim=2)
x_tensor = self.embedding_drop(x_tensor)
sorted_lens, sorted_indices = torch.sort(seq_lens, descending=True)
packed_x = nn.utils.rnn.pack_padded_sequence(x_tensor[sorted_indices], sorted_lens, batch_first=True)

@@ -120,10 +121,24 @@ class CWSBiLSTMSegApp(BaseModel):


from fastNLP.modules.decoder.CRF import ConditionalRandomField
from fastNLP.modules.decoder.CRF import allowed_transitions

class CWSBiLSTMCRF(BaseModel):
def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,
hidden_size=200, bidirectional=True, embed_drop_p=None, num_layers=1, tag_size=4):
hidden_size=200, bidirectional=True, embed_drop_p=0.2, num_layers=1, tag_size=4):
"""
Uses the BMES tagging scheme by default.
:param vocab_num:
:param embed_dim:
:param bigram_vocab_num:
:param bigram_embed_dim:
:param num_bigram_per_char:
:param hidden_size:
:param bidirectional:
:param embed_drop_p:
:param num_layers:
:param tag_size:
"""
super(CWSBiLSTMCRF, self).__init__()

self.tag_size = tag_size
@@ -133,10 +148,12 @@ class CWSBiLSTMCRF(BaseModel):

size_layer = [hidden_size, 200, tag_size]
self.decoder_model = MLP(size_layer)
self.crf = ConditionalRandomField(tag_size=tag_size, include_start_end_trans=False)
allowed_trans = allowed_transitions({0:'b', 1:'m', 2:'e', 3:'s'}, encoding_type='bmes')
self.crf = ConditionalRandomField(num_tags=tag_size, include_start_end_trans=False,
allowed_transitions=allowed_trans)


def forward(self, chars, tags, seq_lens, bigrams=None):
def forward(self, chars, target, seq_lens, bigrams=None):
device = self.parameters().__next__().device
chars = chars.to(device).long()
if not bigrams is None:
@@ -147,7 +164,7 @@ class CWSBiLSTMCRF(BaseModel):
masks = seq_lens_to_mask(seq_lens)
feats = self.encoder_model(chars, bigrams, seq_lens)
feats = self.decoder_model(feats)
losses = self.crf(feats, tags, masks)
losses = self.crf(feats, target, masks)

pred_dict = {}
pred_dict['seq_lens'] = seq_lens
@@ -168,5 +185,5 @@ class CWSBiLSTMCRF(BaseModel):
feats = self.decoder_model(feats)
probs = self.crf.viterbi_decode(feats, masks, get_score=False)

return {'pred_tags': probs}
return {'pred': probs}
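
A construction sketch for the updated model (the vocabulary sizes are invented):

model = CWSBiLSTMCRF(vocab_num=5000, embed_dim=100, bigram_vocab_num=50000,
                     bigram_embed_dim=100, num_bigram_per_char=8,
                     hidden_size=200, tag_size=4)
# forward(chars, target, seq_lens, bigrams) computes the CRF loss;
# predict(...) returns {'pred': ...} with the Viterbi-decoded tags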


+204 -41  reproduction/chinese_word_segment/process/cws_processor.py

@@ -2,7 +2,6 @@
import re


from fastNLP.core.field import SeqLabelField
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet
from fastNLP.api.processor import Processor
@@ -11,7 +10,10 @@ from reproduction.chinese_word_segment.process.span_converter import SpanConvert
_SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'

class SpeicalSpanProcessor(Processor):
# This class converts the special spans in a sentence into their corresponding content.
"""
Replace the content of field_name in the DataSet using the registered span_converters.

"""
def __init__(self, field_name, new_added_field_name=None):
super(SpeicalSpanProcessor, self).__init__(field_name, new_added_field_name)

@@ -20,11 +22,12 @@ class SpeicalSpanProcessor(Processor):

def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
def inner_proc(ins):
sentence = ins[self.field_name]
for span_converter in self.span_converters:
sentence = span_converter.find_certain_span_and_replace(sentence)
ins[self.new_added_field_name] = sentence
return sentence
dataset.apply(func=inner_proc, new_field_name=self.new_added_field_name)

return dataset

@@ -34,17 +37,22 @@ class SpeicalSpanProcessor(Processor):
self.span_converters.append(converter)



class CWSCharSegProcessor(Processor):
"""
Split the field field_name of the DataSet into individual characters, e.g. "复旦大学 fudan" becomes ['复', '旦', '大', '学',
' ', 'f', 'u', ...]

"""
def __init__(self, field_name, new_added_field_name):
super(CWSCharSegProcessor, self).__init__(field_name, new_added_field_name)

def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
def inner_proc(ins):
sentence = ins[self.field_name]
chars = self._split_sent_into_chars(sentence)
ins[self.new_added_field_name] = chars
return chars
dataset.apply(func=inner_proc, new_field_name=self.new_added_field_name)

return dataset

@@ -73,6 +81,10 @@ class CWSCharSegProcessor(Processor):


class CWSTagProcessor(Processor):
"""
Generate tags for word segmentation. This is the base class.

"""
def __init__(self, field_name, new_added_field_name=None):
super(CWSTagProcessor, self).__init__(field_name, new_added_field_name)

@@ -107,18 +119,22 @@ class CWSTagProcessor(Processor):

def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
def inner_proc(ins):
sentence = ins[self.field_name]
tag_list = self._generate_tag(sentence)
ins[self.new_added_field_name] = tag_list
dataset.set_target(**{self.new_added_field_name:True})
dataset._set_need_tensor(**{self.new_added_field_name:True})
return tag_list
dataset.apply(func=inner_proc, new_field_name=self.new_added_field_name)
dataset.set_target(self.new_added_field_name)
return dataset

def _tags_from_word_len(self, word_len):
raise NotImplementedError

class CWSBMESTagProcessor(CWSTagProcessor):
"""
Generate the corresponding BMES tags from the field field_name of the DataSet.

"""
def __init__(self, field_name, new_added_field_name=None):
super(CWSBMESTagProcessor, self).__init__(field_name, new_added_field_name)

@@ -137,6 +153,10 @@ class CWSBMESTagProcessor(CWSTagProcessor):
return tag_list

class CWSSegAppTagProcessor(CWSTagProcessor):
"""
Generate the corresponding SegApp tags from the field field_name of the DataSet.

"""
def __init__(self, field_name, new_added_field_name=None):
super(CWSSegAppTagProcessor, self).__init__(field_name, new_added_field_name)

@@ -151,6 +171,10 @@ class CWSSegAppTagProcessor(CWSTagProcessor):


class BigramProcessor(Processor):
"""
Base class for bigram generation.

"""
def __init__(self, field_name, new_added_field_name=None):

super(BigramProcessor, self).__init__(field_name, new_added_field_name)
@@ -158,22 +182,31 @@ class BigramProcessor(Processor):
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))

for ins in dataset:
def inner_proc(ins):
characters = ins[self.field_name]
bigrams = self._generate_bigram(characters)
ins[self.new_added_field_name] = bigrams
return bigrams
dataset.apply(func=inner_proc, new_field_name=self.new_added_field_name)

return dataset


def _generate_bigram(self, characters):
pass


class Pre2Post2BigramProcessor(BigramProcessor):
def __init__(self, field_name, new_added_fielf_name=None):
"""
This bigram processor generates bigrams as follows:
the original character list l = ['a', 'b', 'c'] is padded to L = ['SOS', 'SOS', 'a', 'b', 'c', 'EOS', 'EOS'], and the bigram list is
[L[idx-2], L[idx-1], L[idx+1], L[idx+2], L[idx-2]L[idx], L[idx-1]L[idx], L[idx]L[idx+1], L[idx]L[idx+2], ....]
i.e. every character gets eight bigrams; for 'a' in the example above they are
['SOS', 'SOS', 'b', 'c', 'SOSa', 'SOSa', 'ab', 'ac']
The returned bigram list is flat, but every 8 consecutive elements carry the bigram information of one character.

"""
def __init__(self, field_name, new_added_field_name=None):

super(BigramProcessor, self).__init__(field_name, new_added_fielf_name)
super(BigramProcessor, self).__init__(field_name, new_added_field_name)

def _generate_bigram(self, characters):
bigrams = []
@@ -197,20 +230,102 @@ class Pre2Post2BigramProcessor(BigramProcessor):
# A vocabulary has to be built at this point, but there is a problem:
# (1) with the Processor approach, no dataset is returned in this case, so vocabulary construction is implemented
#     by other means rather than through a Processor.
# TODO how can vocabulary construction and indexing be unified?

class VocabIndexerProcessor(Processor):
"""
Build a Vocabulary from a DataSet and index the data with it. The newly generated index field is stored in
new_added_field_name; if new_added_field_name is not provided, the original field_name is overwritten.

"""
def __init__(self, field_name, new_added_field_name=None, min_freq=1, max_size=None,
verbose=1):
"""

:param field_name: the field to build the vocabulary from and to index
:param new_added_field_name: the name of the generated index field; if not given, field_name is overwritten.
:param min_freq: the minimum occurrence count a word needs to enter the Vocabulary.
:param max_size: the maximum number of words the Vocabulary may hold.
:param verbose: 0: print nothing; 1: print information.
"""
super(VocabIndexerProcessor, self).__init__(field_name, new_added_field_name)
self.min_freq = min_freq
self.max_size = max_size

self.verbose = verbose

def construct_vocab(self, *datasets):
"""
Build the vocabulary from the given DataSets.

:param datasets: DataSet objects used to build the vocabulary
:return:
"""
self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
self.vocab.build_vocab()
if self.verbose:
print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))

def process(self, *datasets, only_index_dataset=None):
"""
If no Vocabulary has been built yet, build one from the DataSets in datasets; otherwise use the existing vocabulary. Once the
vocabulary is available, both datasets and only_index_dataset are indexed.

:param datasets: DataSet objects
:param only_index_dataset: DataSet, or list of DataSet. These are only indexed; they are never used to build the vocabulary.
:return:
"""
if len(datasets)==0 and not hasattr(self,'vocab'):
raise RuntimeError("You have to construct vocabulary first. Or you have to pass datasets to construct it.")
if not hasattr(self, 'vocab'):
self.construct_vocab(*datasets)
else:
if self.verbose:
print("Using constructed vocabulary with {} items.".format(len(self.vocab)))
to_index_datasets = []
if len(datasets)!=0:
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)

if not (only_index_dataset is None):
if isinstance(only_index_dataset, list):
for dataset in only_index_dataset:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)
elif isinstance(only_index_dataset, DataSet):
to_index_datasets.append(only_index_dataset)
else:
raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset)))

for dataset in to_index_datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
new_field_name=self.new_added_field_name)

def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab))
self.vocab = vocab

def delete_vocab(self):
del self.vocab

def get_vocab_size(self):
return len(self.vocab)
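
A usage sketch for the new VocabIndexerProcessor (the dataset variables are assumed to exist):

proc = VocabIndexerProcessor('chars', min_freq=2)
# builds the vocabulary from train_ds and dev_ds, then indexes all three datasets;
# test_ds is only indexed and never contributes to the vocabulary
proc.process(train_ds, dev_ds, only_index_dataset=test_ds)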

class VocabProcessor(Processor):
def __init__(self, field_name, min_count=1, max_vocab_size=None):
def __init__(self, field_name, min_freq=1, max_size=None):

super(VocabProcessor, self).__init__(field_name, None)
self.vocab = Vocabulary(min_freq=min_count, max_size=max_vocab_size)
self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size)

def process(self, *datasets):
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
tokens = ins[self.field_name]
self.vocab.update(tokens)

dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))

def get_vocab(self):
self.vocab.build_vocab()
@@ -220,19 +335,6 @@ class VocabProcessor(Processor):
return len(self.vocab)


class SeqLenProcessor(Processor):
def __init__(self, field_name, new_added_field_name='seq_lens'):

super(SeqLenProcessor, self).__init__(field_name, new_added_field_name)

def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
length = len(ins[self.field_name])
ins[self.new_added_field_name] = length
dataset._set_need_tensor(**{self.new_added_field_name:True})
return dataset

class SegApp2OutputProcessor(Processor):
def __init__(self, chars_field_name='chars_list', tag_field_name='pred_tags', new_added_field_name='output'):
super(SegApp2OutputProcessor, self).__init__(None, None)
@@ -258,7 +360,32 @@ class SegApp2OutputProcessor(Processor):


class BMES2OutputProcessor(Processor):
def __init__(self, chars_field_name='chars_list', tag_field_name='pred_tags', new_added_field_name='output'):
"""
Recover the segmented output from BMES-tagged predictions. Because predictions may contain illegal tag sequences such as "BS",
they are repaired via the table below. cur_B means the current tag is B; next_B means the following tag is B. cur_B=S means a tag
currently predicted as B is rewritten to S; next_M=B means a following tag predicted as M is rewritten to B.
| | next_B | next_M | next_E | next_S | end |
|:-----:|:-------:|:--------:|:--------:|:-------:|:-------:|
| start | legal | next_M=B | next_E=S | legal | - |
| cur_B | cur_B=S | legal | legal | cur_B=S | cur_B=S |
| cur_M | cur_M=E | legal | legal | cur_M=E | cur_M=E |
| cur_E | legal | next_M=B | next_E=S | legal | legal |
| cur_S | legal | next_M=B | next_E=S | legal | legal |
For example:
the prediction BSEMS is treated as SSSSS.

"""
def __init__(self, chars_field_name='chars_list', tag_field_name='pred_tags', new_added_field_name='output',
b_idx=0, m_idx=1, e_idx=2, s_idx=3):
"""

:param chars_field_name: the field holding the characters
:param tag_field_name: the field holding the predictions
:param new_added_field_name: the field where the converted output is stored
:param b_idx: int, tag idx of the Begin tag.
:param m_idx: int, tag idx of the Middle tag.
:param e_idx: int, tag idx of the End tag.
:param s_idx: int, tag idx of the Single tag.
"""
super(BMES2OutputProcessor, self).__init__(None, None)

self.chars_field_name = chars_field_name
@@ -266,19 +393,55 @@ class BMES2OutputProcessor(Processor):

self.new_added_field_name = new_added_field_name

self.b_idx = b_idx
self.m_idx = m_idx
self.e_idx = e_idx
self.s_idx = s_idx
# the repair matrix described in __init__
self._valida_matrix = {
-1: [(-1, -1), (1, self.b_idx), (1, self.s_idx), (-1, -1)], # magic start idx
self.b_idx:[(0, self.s_idx), (-1, -1), (-1, -1), (0, self.s_idx), (0, self.s_idx)],
self.m_idx:[(0, self.e_idx), (-1, -1), (-1, -1), (0, self.e_idx), (0, self.e_idx)],
self.e_idx:[(-1, -1), (1, self.b_idx), (1, self.s_idx), (-1, -1), (-1, -1)],
self.s_idx:[(-1, -1), (1, self.b_idx), (1, self.s_idx), (-1, -1), (-1, -1)],
}

def _validate_tags(self, tags):
"""
Given a list of tags, return legal tags.

:param tags: List[int], length seq_len
:return: a list with the tags repaired to be legal
"""
assert len(tags)!=0
padded_tags = [-1, *tags, -1]
for idx in range(len(padded_tags)-1):
cur_tag = padded_tags[idx]
if cur_tag not in self._valida_matrix:
cur_tag = self.s_idx
if padded_tags[idx+1] not in self._valida_matrix:
padded_tags[idx+1] = self.s_idx
next_tag = padded_tags[idx+1]
shift_tag = self._valida_matrix[cur_tag][next_tag]
if shift_tag[0]!=-1:
padded_tags[idx+shift_tag[0]] = shift_tag[1]

return padded_tags[1:-1]

def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
def inner_proc(ins):
pred_tags = ins[self.tag_field_name]
pred_tags = self._validate_tags(pred_tags)
chars = ins[self.chars_field_name]
words = []
start_idx = 0
for idx, tag in enumerate(pred_tags):
if tag==3:
# restoring the original text is not handled yet
if tag==self.s_idx:
words.extend(chars[start_idx:idx+1])
start_idx = idx + 1
elif tag==2:
elif tag==self.e_idx:
words.append(''.join(chars[start_idx:idx+1]))
start_idx = idx + 1
ins[self.new_added_field_name] = ' '.join(words)
return ' '.join(words)
dataset.apply(func=inner_proc, new_field_name=self.new_added_field_name)

+68 -4  reproduction/pos_tag_model/pos_io/pos_reader.py

@@ -24,8 +24,8 @@ def cut_long_sentence(sent, max_sample_length=200):
return cutted_sentence


class ConlluPOSReader(object):
# The returned DataSet has two fields: words (list of list, inner lists are characters) and tag (list of str, BMES tags).
class ConllPOSReader(object):
# The returned DataSet has two fields: words (list of list, inner lists are characters) and tag (list of str, BIO tags).
def __init__(self):
pass

@@ -70,6 +70,70 @@ class ConlluPOSReader(object):

return ds



class ZhConllPOSReader(object):
# reader for Chinese conll-format data
def __init__(self):
pass

def load(self, path):
"""
The returned DataSet contains the following fields:
words: list of str,
tag: list of str, with BMES tags added; e.g. an original sequence ['VP', 'NN', 'NN', ..] becomes ["S-VP", "B-NN", "M-NN", ..]
The input is assumed to be in conll format, with sentences separated by blank lines and 7 columns per line, e.g.
1 编者按 编者按 NN O 11 nmod:topic
2 : : PU O 11 punct
3 7月 7月 NT DATE 4 compound:nn
4 12日 12日 NT DATE 11 nmod:tmod
5 , , PU O 11 punct

1 这 这 DT O 3 det
2 款 款 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 从 从 P O 5 case
5 外型 外型 NN O 8 nmod:prep
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if res is None:
continue
char_seq = []
pos_seq = []
for word, tag in zip(res[0], res[1]):
char_seq.extend(list(word))
if len(word)==1:
pos_seq.append('S-{}'.format(tag))
elif len(word)>1:
pos_seq.append('B-{}'.format(tag))
for _ in range(len(word)-2):
pos_seq.append('M-{}'.format(tag))
pos_seq.append('E-{}'.format(tag))
else:
raise ValueError("Zero length of word detected.")

ds.append(Instance(words=char_seq,
tag=pos_seq))

return ds

def get_one(self, sample):
if len(sample)==0:
return None
@@ -84,6 +148,6 @@ class ConlluPOSReader(object):
return text, pos_tags

if __name__ == '__main__':
reader = ConlluPOSReader()
reader = ZhConllPOSReader()
d = reader.load('/home/hyan/train.conllx')
print('reader')
print(d)
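
The word-to-character BMES conversion performed by ZhConllPOSReader, traced on a tiny invented sample:

# words: ['编者按', '7月'], tags: ['NN', 'NT']
# '编者按' (3 characters, NN) -> ['B-NN', 'M-NN', 'E-NN']
# '7月'    (2 characters, NT) -> ['B-NT', 'E-NT']
# => words field: ['编', '者', '按', '7', '月']
#    tag field:   ['B-NN', 'M-NN', 'E-NN', 'B-NT', 'E-NT']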

+229 -0  test/core/test_metrics.py

@@ -4,6 +4,7 @@ import numpy as np
import torch

from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.metrics import BMESF1PreRecMetric
from fastNLP.core.metrics import pred_topk, accuracy_topk


@@ -132,6 +133,234 @@ class TestAccuracyMetric(unittest.TestCase):
return
self.fail("No exception caught.")

class TestSpanFPreRecMetric(unittest.TestCase):
def test_case1(self):
from fastNLP.core.metrics import bmes_tag_to_spans
from fastNLP.core.metrics import bio_tag_to_spans

bmes_lst = ['M-8', 'S-2', 'S-0', 'B-9', 'B-6', 'E-5', 'B-7', 'S-2', 'E-7', 'S-8']
bio_lst = ['O-8', 'O-2', 'B-0', 'O-9', 'I-6', 'I-5', 'I-7', 'I-2', 'I-7', 'O-8']
expect_bmes_res = set()
expect_bmes_res.update([('8', (0, 0)), ('2', (1, 1)), ('0', (2, 2)), ('9', (3, 3)), ('6', (4, 4)),
('5', (5, 5)), ('7', (6, 6)), ('2', (7, 7)), ('7', (8, 8)), ('8', (9, 9))])
expect_bio_res = set()
expect_bio_res.update([('7', (8, 8)), ('0', (2, 2)), ('2', (7, 7)), ('5', (5, 5)),
('6', (4, 4)), ('7', (6, 6))])
self.assertSetEqual(expect_bmes_res,set(bmes_tag_to_spans(bmes_lst)))
self.assertSetEqual(expect_bio_res, set(bio_tag_to_spans(bio_lst)))
# Verified against the corresponding allennlp functions; since the tests must not depend on allennlp, only the fixed example above is kept here.
# from allennlp.data.dataset_readers.dataset_utils import bio_tags_to_spans as allen_bio_tags_to_spans
# from allennlp.data.dataset_readers.dataset_utils import bmes_tags_to_spans as allen_bmes_tags_to_spans
# for i in range(1000):
# strs = list(map(str, np.random.randint(100, size=1000)))
# bmes = list('bmes'.upper())
# bmes_strs = [str_ + '-' + tag for tag, str_ in zip(strs, np.random.choice(bmes, size=len(strs)))]
# bio = list('bio'.upper())
# bio_strs = [str_ + '-' + tag for tag, str_ in zip(strs, np.random.choice(bio, size=len(strs)))]
# self.assertSetEqual(set(allen_bmes_tags_to_spans(bmes_strs)),set(bmes_tag_to_spans(bmes_strs)))
# self.assertSetEqual(set(allen_bio_tags_to_spans(bio_strs)), set(bio_tag_to_spans(bio_strs)))

def test_case2(self):
# test tags that carry no label
from fastNLP.core.metrics import bmes_tag_to_spans
from fastNLP.core.metrics import bio_tag_to_spans

bmes_lst = ['B', 'E', 'B', 'S', 'B', 'M', 'E', 'M', 'B', 'E']
bio_lst = ['I', 'B', 'O', 'O', 'I', 'O', 'I', 'B', 'O', 'O']
expect_bmes_res = set()
expect_bmes_res.update([('', (0, 1)), ('', (2, 2)), ('', (3, 3)), ('', (4, 6)), ('', (7, 7)), ('', (8, 9))])
expect_bio_res = set()
expect_bio_res.update([('', (7, 7)), ('', (6, 6)), ('', (4, 4)), ('', (0, 0)), ('', (1, 1))])
self.assertSetEqual(expect_bmes_res,set(bmes_tag_to_spans(bmes_lst)))
self.assertSetEqual(expect_bio_res, set(bio_tag_to_spans(bio_lst)))
# Verified against the corresponding allennlp functions; since the tests must not depend on allennlp, only the fixed example above is kept here.
# from allennlp.data.dataset_readers.dataset_utils import bio_tags_to_spans as allen_bio_tags_to_spans
# from allennlp.data.dataset_readers.dataset_utils import bmes_tags_to_spans as allen_bmes_tags_to_spans
# for i in range(1000):
# bmes = list('bmes'.upper())
# bmes_strs = np.random.choice(bmes, size=1000)
# bio = list('bio'.upper())
# bio_strs = np.random.choice(bio, size=100)
# self.assertSetEqual(set(allen_bmes_tags_to_spans(bmes_strs)),set(bmes_tag_to_spans(bmes_strs)))
# self.assertSetEqual(set(allen_bio_tags_to_spans(bio_strs)), set(bio_tag_to_spans(bio_strs)))

def test_case3(self):
from fastNLP.core.vocabulary import Vocabulary
from collections import Counter
from fastNLP.core.metrics import SpanFPreRecMetric
# check against allennlp that the f metric is computed correctly
#
def generate_allen_tags(encoding_type, number_labels=4):
vocab = {}
for i in range(number_labels):
label = str(i)
for tag in encoding_type:
if tag == 'O':
if tag not in vocab:
vocab['O'] = len(vocab) + 1
continue
vocab['{}-{}'.format(tag, label)] = len(vocab) + 1 # the value actually represents this tag's count
return vocab

number_labels = 4
# bio tag
fastnlp_bio_vocab = Vocabulary(unknown=None, padding=None)
fastnlp_bio_vocab.word_count = Counter(generate_allen_tags('BIO', number_labels))
fastnlp_bio_metric = SpanFPreRecMetric(tag_vocab=fastnlp_bio_vocab, only_gross=False)
bio_sequence = torch.FloatTensor(
[[[-0.9543, -1.4357, -0.2365, 0.2438, 1.0312, -1.4302, 0.3011,
0.0470, 0.0971],
[-0.6638, -0.7116, -1.9804, 0.2787, -0.2732, -0.9501, -1.4523,
0.7987, -0.3970],
[0.2939, 0.8132, -0.0903, -2.8296, 0.2080, -0.9823, -0.1898,
0.6880, 1.4348],
[-0.1886, 0.0067, -0.6862, -0.4635, 2.2776, 0.0710, -1.6793,
-1.6876, -0.8917],
[-0.7663, 0.6377, 0.8669, 0.1237, 1.7628, 0.0313, -1.0824,
1.4217, 0.2622]],

[[0.1529, 0.7474, -0.9037, 1.5287, 0.2771, 0.2223, 0.8136,
1.3592, -0.8973],
[0.4515, -0.5235, 0.3265, -1.1947, 0.8308, 1.8754, -0.4887,
-0.4025, -0.3417],
[-0.7855, 0.1615, -0.1272, -1.9289, -0.5181, 1.9742, -0.9698,
0.2861, -0.3966],
[-0.8291, -0.8823, -1.1496, 0.2164, 1.3390, -0.3964, -0.5275,
0.0213, 1.4777],
[-1.1299, 0.0627, -0.1358, -1.5951, 0.4484, -0.6081, -1.9566,
1.3024, 0.2001]]]
)
bio_target = torch.LongTensor([[5., 0., 3., 3., 3.],
[5., 6., 8., 6., 0.]])
fastnlp_bio_metric({'pred': bio_sequence, 'seq_lens': torch.LongTensor([5, 5])}, {'target': bio_target})
expect_bio_res = {'pre-1': 0.24999999999999373, 'rec-1': 0.499999999999975, 'f-1': 0.33333333333327775,
'pre-2': 0.0, 'rec-2': 0.0, 'f-2': 0.0, 'pre-3': 0.0, 'rec-3': 0.0, 'f-3': 0.0, 'pre-0': 0.0,
'rec-0': 0.0, 'f-0': 0.0, 'pre': 0.12499999999999845, 'rec': 0.12499999999999845,
'f': 0.12499999999994846}
self.assertDictEqual(expect_bio_res, fastnlp_bio_metric.get_metric())

# bmes tag
bmes_sequence = torch.FloatTensor(
[[[0.6536, -0.7179, 0.6579, 1.2503, 0.4176, 0.6696, 0.2352,
-0.4085, 0.4084, -0.4185, 1.4172, -0.9162, -0.2679, 0.3332,
-0.3505, -0.6002],
[0.3238, -1.2378, -1.3304, -0.4903, 1.4518, -0.1868, -0.7641,
1.6199, -0.8877, 0.1449, 0.8995, -0.5810, 0.1041, 0.1002,
0.4439, 0.2514],
[-0.8362, 2.9526, 0.8008, 0.1193, 1.0488, 0.6670, 1.1696,
-1.1006, -0.8540, -0.1600, -0.9519, -0.2749, -0.4948, -1.4753,
0.5802, -0.0516],
[-0.8383, -1.7292, -1.4079, -1.5023, 0.5383, 0.6653, 0.3121,
4.1249, -0.4173, -0.2043, 1.7755, 1.1110, -1.7069, -0.0390,
-0.9242, -0.0333],
[0.9088, -0.4955, -0.5076, 0.3732, 0.0283, -0.0263, -1.0393,
0.7734, 1.0968, 0.4132, -1.3647, -0.5762, 0.6678, 0.8809,
-0.3779, -0.3195]],

[[-0.4638, -0.5939, -0.1052, -0.5573, 0.4600, -1.3484, 0.1753,
0.0685, 0.3663, -0.6789, 0.0097, 1.0327, -0.0212, -0.9957,
-0.1103, 0.4417],
[-0.2903, 0.9205, -1.5758, -1.0421, 0.2921, -0.2142, -0.3049,
-0.0879, -0.4412, -1.3195, -0.0657, -0.2986, 0.7214, 0.0631,
-0.6386, 0.2797],
[0.6440, -0.3748, 1.2912, -0.0170, 0.7447, 1.4075, -0.4947,
0.4123, -0.8447, -0.5502, 0.3520, -0.2832, 0.5019, -0.1522,
1.1237, -1.5385],
[0.2839, -0.7649, 0.9067, -0.1163, -1.3789, 0.2571, -1.3977,
-0.3680, -0.8902, -0.6983, -1.1583, 1.2779, 0.2197, 0.1376,
-0.0591, -0.2461],
[-0.2977, -1.8564, -0.5347, 1.0011, -1.1260, 0.4252, -2.0097,
2.6973, -0.8308, -1.4939, 0.9865, -0.3935, 0.2743, 0.1142,
-0.7344, -1.2046]]]
)
        bmes_target = torch.LongTensor([[9, 6, 1, 9, 15],
                                        [6, 15, 6, 15, 5]])

fastnlp_bmes_vocab = Vocabulary(unknown=None, padding=None)
fastnlp_bmes_vocab.word_count = Counter(generate_allen_tags('BMES', number_labels))
fastnlp_bmes_metric = SpanFPreRecMetric(tag_vocab=fastnlp_bmes_vocab, only_gross=False, encoding_type='bmes')
        fastnlp_bmes_metric({'pred': bmes_sequence, 'seq_lens': torch.LongTensor([5, 5])}, {'target': bmes_target})  # both sequences have length 5

expect_bmes_res = {'f-3': 0.6666666666665778, 'pre-3': 0.499999999999975, 'rec-3': 0.9999999999999001,
'f-0': 0.0, 'pre-0': 0.0, 'rec-0': 0.0, 'f-1': 0.33333333333327775,
'pre-1': 0.24999999999999373, 'rec-1': 0.499999999999975, 'f-2': 0.7499999999999314,
'pre-2': 0.7499999999999812, 'rec-2': 0.7499999999999812, 'f': 0.49999999999994504,
'pre': 0.499999999999995, 'rec': 0.499999999999995}

self.assertDictEqual(fastnlp_bmes_metric.get_metric(), expect_bmes_res)

        # Verified against allennlp already; the code below is commented out because we cannot depend on allennlp.
# from allennlp.data.vocabulary import Vocabulary as allen_Vocabulary
# from allennlp.training.metrics import SpanBasedF1Measure
# allen_bio_vocab = allen_Vocabulary({"tags": generate_allen_tags('BIO', number_labels)},
# non_padded_namespaces=['tags'])
# allen_bio_metric = SpanBasedF1Measure(allen_bio_vocab, 'tags')
# bio_sequence = torch.randn(size=(2, 20, 2 * number_labels + 1))
# bio_target = torch.randint(2 * number_labels + 1, size=(2, 20))
# allen_bio_metric(bio_sequence, bio_target, torch.ones(2, 20))
# fastnlp_bio_vocab = Vocabulary(unknown=None, padding=None)
# fastnlp_bio_vocab.word_count = Counter(generate_allen_tags('BIO', number_labels))
# fastnlp_bio_metric = SpanFPreRecMetric(tag_vocab=fastnlp_bio_vocab, only_gross=False)
#
# def convert_allen_res_to_fastnlp_res(metric_result):
# allen_result = {}
# key_map = {'f1-measure-overall': "f", "recall-overall": "rec", "precision-overall": "pre"}
# for key, value in metric_result.items():
# if key in key_map:
# key = key_map[key]
# else:
# label = key.split('-')[-1]
# if key.startswith('f1'):
# key = 'f-{}'.format(label)
# else:
# key = '{}-{}'.format(key[:3], label)
# allen_result[key] = value
# return allen_result
#
# # print(convert_allen_res_to_fastnlp_res(allen_bio_metric.get_metric()))
# # print(fastnlp_bio_metric.get_metric())
# self.assertDictEqual(convert_allen_res_to_fastnlp_res(allen_bio_metric.get_metric()),
# fastnlp_bio_metric.get_metric())
#
# allen_bmes_vocab = allen_Vocabulary({"tags": generate_allen_tags('BMES', number_labels)})
# allen_bmes_metric = SpanBasedF1Measure(allen_bmes_vocab, 'tags', label_encoding='BMES')
# fastnlp_bmes_vocab = Vocabulary(unknown=None, padding=None)
# fastnlp_bmes_vocab.word_count = Counter(generate_allen_tags('BMES', number_labels))
# fastnlp_bmes_metric = SpanFPreRecMetric(tag_vocab=fastnlp_bmes_vocab, only_gross=False, encoding_type='bmes')
# bmes_sequence = torch.randn(size=(2, 20, 4 * number_labels))
# bmes_target = torch.randint(4 * number_labels, size=(2, 20))
# allen_bmes_metric(bmes_sequence, bmes_target, torch.ones(2, 20))
# fastnlp_bmes_metric({'pred': bmes_sequence, 'seq_lens': torch.LongTensor([20, 20])}, {'target': bmes_target})
#
# # print(convert_allen_res_to_fastnlp_res(allen_bmes_metric.get_metric()))
# # print(fastnlp_bmes_metric.get_metric())
# self.assertDictEqual(convert_allen_res_to_fastnlp_res(allen_bmes_metric.get_metric()),
# fastnlp_bmes_metric.get_metric())

class TestBMESF1PreRecMetric(unittest.TestCase):
def test_case1(self):
seq_lens = torch.LongTensor([4, 2])
pred = torch.randn(2, 4, 4)
target = torch.LongTensor([[0, 1, 2, 3],
[3, 3, 0, 0]])
pred_dict = {'pred': pred}
target_dict = {'target': target, 'seq_lens': seq_lens}

metric = BMESF1PreRecMetric()
metric(pred_dict, target_dict)
metric.get_metric()

def test_case2(self):
        # Two identical sequences should yield {f1: 1, precision: 1, recall: 1}.
seq_lens = torch.LongTensor([4, 2])
target = torch.LongTensor([[0, 1, 2, 3],
[3, 3, 0, 0]])
pred_dict = {'pred': target}
target_dict = {'target': target, 'seq_lens': seq_lens}

metric = BMESF1PreRecMetric()
metric(pred_dict, target_dict)
self.assertDictEqual(metric.get_metric(), {'f1': 0.999999, 'precision': 1.0, 'recall': 1.0})
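        # f1 comes out as 0.999999 rather than exactly 1.0, presumably because of
        # a smoothing constant in the f1 denominator, as with the epsilon offsets above.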

class TestUsefulFunctions(unittest.TestCase):
    # Tests for some of the seemingly useful helper functions in metrics.py


+ 104
- 0
test/modules/decoder/test_CRF.py View File

@@ -0,0 +1,104 @@

import unittest


class TestCRF(unittest.TestCase):
def test_case1(self):
        # Check that allowed_transitions() behaves correctly.
from fastNLP.modules.decoder.CRF import allowed_transitions

id2label = {0: 'B', 1: 'I', 2:'O'}
expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2),
(2, 4), (3, 0), (3, 2)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label)))
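        # Reading the expected set: ids 0-2 are B, I, O, and the two extra ids are
        # the virtual start (3) and end (4) tags that allowed_transitions appends,
        # e.g. (3, 0) permits start->B while (1, 4) permits I->end.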

id2label = {0: 'B', 1:'M', 2:'E', 3:'S'}
expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES')))
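        # Same layout for BMES: ids 0-3 are B, M, E, S with start=4 and end=5, so a
        # sequence may only begin with B or S ((4, 0), (4, 3)), may only end after
        # E or S ((2, 5), (3, 5)), and B/M must continue into M or E.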

id2label = {0: 'B', 1: 'I', 2:'O', 3: '<pad>', 4:"<unk>"}
allowed_transitions(id2label)

labels = ['O']
for label in ['X', 'Y']:
for tag in 'BI':
labels.append('{}-{}'.format(tag, label))
id2label = {idx:label for idx, label in enumerate(labels)}
expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6), (2, 0), (2, 1),
(2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 3), (3, 4), (3, 6), (4, 0), (4, 1), (4, 3),
(4, 4), (4, 6), (5, 0), (5, 1), (5, 3)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label)))

labels = []
for label in ['X', 'Y']:
for tag in 'BMES':
labels.append('{}-{}'.format(tag, label))
id2label = {idx: label for idx, label in enumerate(labels)}
expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4),
(3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0),
(7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES')))
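        # With labels the pattern generalizes: ids 0-3 are B/M/E/S-X, ids 4-7 are
        # B/M/E/S-Y, and 8/9 are start/end, which is why only the E-*/S-* ids
        # (2, 3, 6, 7) transition into 9.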

def test_case2(self):
        # Test that the CRF avoids decoding illegal transitions; verified against allennlp.
pass
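        # A dependency-free sketch of the same check (hedged: it reuses the fastNLP
        # signatures from the commented verification code below, which may differ
        # between versions): decode random logits with a constrained CRF and assert
        # that every adjacent tag pair in the decoded paths is an allowed transition.
        # import torch
        # from fastNLP.modules.decoder.CRF import ConditionalRandomField, \
        #     allowed_transitions, seq_len_to_byte_mask
        # id2label = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
        # constraints = allowed_transitions(id2label, encoding_type='BMES')
        # crf = ConditionalRandomField(num_tags=4, allowed_transitions=constraints)
        # logits = torch.randn(2, 10, 4)
        # mask = seq_len_to_byte_mask(torch.LongTensor([10, 10]))
        # paths, _ = crf.viterbi_decode(logits, mask, get_score=True)
        # allowed = set(constraints)
        # for path in paths:
        #     for from_id, to_id in zip(path[:-1], path[1:]):
        #         self.assertIn((from_id, to_id), allowed)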
# import torch
# from fastNLP.modules.decoder.CRF import seq_len_to_byte_mask
#
# labels = ['O']
# for label in ['X', 'Y']:
# for tag in 'BI':
# labels.append('{}-{}'.format(tag, label))
# id2label = {idx: label for idx, label in enumerate(labels)}
# num_tags = len(id2label)
#
# from allennlp.modules.conditional_random_field import ConditionalRandomField, allowed_transitions
# allen_CRF = ConditionalRandomField(num_tags=num_tags, constraints=allowed_transitions('BIO', id2label),
# include_start_end_transitions=False)
# batch_size = 3
# logits = torch.nn.functional.softmax(torch.rand(size=(batch_size, 20, num_tags))).log()
# trans_m = allen_CRF.transitions
# seq_lens = torch.randint(1, 20, size=(batch_size,))
# seq_lens[-1] = 20
# mask = seq_len_to_byte_mask(seq_lens)
# allen_res = allen_CRF.viterbi_tags(logits, mask)
#
# from fastNLP.modules.decoder.CRF import ConditionalRandomField, allowed_transitions
# fast_CRF = ConditionalRandomField(num_tags=num_tags, allowed_transitions=allowed_transitions(id2label))
# fast_CRF.trans_m = trans_m
# fast_res = fast_CRF.viterbi_decode(logits, mask, get_score=True)
# # score equal
# self.assertListEqual([score for _, score in allen_res], fast_res[1])
# # seq equal
# self.assertListEqual([_ for _, score in allen_res], fast_res[0])
#
#
# labels = []
# for label in ['X', 'Y']:
# for tag in 'BMES':
# labels.append('{}-{}'.format(tag, label))
# id2label = {idx: label for idx, label in enumerate(labels)}
# num_tags = len(id2label)
#
# from allennlp.modules.conditional_random_field import ConditionalRandomField, allowed_transitions
# allen_CRF = ConditionalRandomField(num_tags=num_tags, constraints=allowed_transitions('BMES', id2label),
# include_start_end_transitions=False)
# batch_size = 3
# logits = torch.nn.functional.softmax(torch.rand(size=(batch_size, 20, num_tags))).log()
# trans_m = allen_CRF.transitions
# seq_lens = torch.randint(1, 20, size=(batch_size,))
# seq_lens[-1] = 20
# mask = seq_len_to_byte_mask(seq_lens)
# allen_res = allen_CRF.viterbi_tags(logits, mask)
#
# from fastNLP.modules.decoder.CRF import ConditionalRandomField, allowed_transitions
# fast_CRF = ConditionalRandomField(num_tags=num_tags, allowed_transitions=allowed_transitions(id2label,
# encoding_type='BMES'))
# fast_CRF.trans_m = trans_m
# fast_res = fast_CRF.viterbi_decode(logits, mask, get_score=True)
# # score equal
# self.assertListEqual([score for _, score in allen_res], fast_res[1])
# # seq equal
# self.assertListEqual([_ for _, score in allen_res], fast_res[0])


