@@ -172,3 +172,53 @@ class SeqLenProcessor(Processor): | |||
ins[self.new_added_field_name] = length | |||
dataset.set_need_tensor(**{self.new_added_field_name: True}) | |||
return dataset | |||
from fastNLP.core.batch import Batch | |||
from fastNLP.core.sampler import SequentialSampler | |||
import torch | |||
from collections import defaultdict | |||
class ModelProcessor(Processor):
    """Run a model over a DataSet and store its un-padded predictions as fields."""

    def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32):
        """
        Iterate a model over a dataset and drop the padding from its outputs.

        :param model: object whose ``predict(**batch_x)`` returns a dict that
            maps output names to batched tensors.
        :param seq_len_field_name: name of the batch field holding the real
            (un-padded) length of every sequence.
        :param batch_size: number of instances handed to the model per step.
        """
        super(ModelProcessor, self).__init__(None, None)
        self.batch_size = batch_size
        self.seq_len_field_name = seq_len_field_name
        self.model = model

    def process(self, dataset):
        """
        Predict on every instance of ``dataset`` and attach the results.

        Each model output is cut back to the true sequence length and added to
        the dataset under the output's key; the sequence lengths are re-added
        under ``seq_len_field_name``. All added fields use ``need_tensor=False``
        and ``is_target=False``.

        :param dataset: the DataSet to run the model on.
        :return: the same ``dataset`` object, with the new fields added.
        """
        assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
        data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False)

        # Inference must not run with dropout / batch-norm in training mode.
        # Only torch modules carry that mode; remember it so it can be restored.
        is_module = isinstance(self.model, torch.nn.Module)
        was_training = is_module and self.model.training
        if is_module:
            self.model.eval()

        batch_output = defaultdict(list)
        try:
            with torch.no_grad():
                for batch_x, _ in data_iterator:
                    prediction = self.model.predict(**batch_x)
                    seq_lens = batch_x[self.seq_len_field_name].cpu().numpy().tolist()

                    for key, value in prediction.items():
                        value = value.cpu().numpy()
                        # Keep only the first `seq_len` positions of each row,
                        # i.e. strip the padded tail.
                        batch_output[key].extend(
                            value[idx, :seq_len] for idx, seq_len in enumerate(seq_lens)
                        )

                    batch_output[self.seq_len_field_name].extend(seq_lens)
        finally:
            if was_training:
                self.model.train()

        # TODO: with the current implementation, downstream processors need to
        # know which keys the model's output dict contains.
        for field_name, fields in batch_output.items():
            dataset.add_field(field_name, fields, need_tensor=False, is_target=False)

        return dataset

    def set_model(self, model):
        """Replace the model used by :meth:`process`."""
        self.model = model
@@ -74,10 +74,12 @@ class DataSet(object): | |||
assert name in self.field_arrays | |||
self.field_arrays[name].append(field) | |||
def add_field(self, name, fields, need_tensor=False, is_target=False):
    """
    Store ``fields`` as a new FieldArray under ``name``.

    :param name: key under which the field is stored in ``self.field_arrays``.
    :param fields: the per-instance values of the new field.
    :param need_tensor: forwarded to ``FieldArray``.
    :param is_target: forwarded to ``FieldArray``.
    :raises AssertionError: if the dataset already holds fields and
        ``len(fields)`` differs from ``len(self)``.
    """
    if len(self.field_arrays) != 0:
        # A new field must provide exactly one value per existing instance.
        assert len(self) == len(fields), \
            "Field {!r} has {} entries but the dataset has {} instances.".format(
                name, len(fields), len(self))
    self.field_arrays[name] = FieldArray(name, fields,
                                         need_tensor=need_tensor,
                                         is_target=is_target)
def delete_field(self, name):
    """
    Remove the field stored under ``name`` from ``self.field_arrays``.

    :param name: key of the field to drop.
    """
    del self.field_arrays[name]
@@ -94,14 +94,14 @@ class CWSBiLSTMSegApp(BaseModel): | |||
self.decoder_model = MLP(size_layer) | |||
def forward(self, batch_dict): | |||
def forward(self, chars, seq_lens, bigrams=None): | |||
device = self.parameters().__next__().device | |||
chars = batch_dict['indexed_chars_list'].to(device).long() | |||
if 'indexed_bigrams_list' in batch_dict: | |||
bigrams = batch_dict['indexed_bigrams_list'].to(device).long() | |||
chars = chars.to(device).long() | |||
if not bigrams is None: | |||
bigrams = bigrams.to(device).long() | |||
else: | |||
bigrams = None | |||
seq_lens = batch_dict['seq_lens'].to(device).long() | |||
seq_lens = seq_lens.to(device).long() | |||
feats = self.encoder_model(chars, bigrams, seq_lens) | |||
probs = self.decoder_model(feats) | |||
@@ -112,6 +112,8 @@ class CWSBiLSTMSegApp(BaseModel): | |||
return pred_dict | |||
def predict(self, chars, seq_lens, bigrams=None):
    """
    Predict one tag per position by taking the highest-scoring class.

    :param chars: forwarded to ``forward`` as ``chars``.
    :param seq_lens: forwarded to ``forward`` as ``seq_lens``.
    :param bigrams: optional, forwarded to ``forward`` as ``bigrams``.
    :return: dict with the single key ``'pred_tags'`` holding the indices of
        the maximum along the last dimension of ``forward``'s ``'pred_probs'``.
    """
    pred_dict = self.forward(chars, seq_lens, bigrams)
    pred_probs = pred_dict['pred_probs']
    # max() over the last dim returns (values, indices); only the indices
    # (the winning class ids) are needed.
    _, pred_tags = pred_probs.max(dim=-1)
    return {'pred_tags': pred_tags}
@@ -214,3 +214,27 @@ class SeqLenProcessor(Processor): | |||
ins[self.new_added_field_name] = length | |||
dataset.set_need_tensor(**{self.new_added_field_name:True}) | |||
return dataset | |||
class SegApp2OutputProcessor(Processor):
    """Turn predicted per-character tags into a space-separated word string."""

    def __init__(self, chars_field_name='chars', tag_field_name='pred_tags', new_added_field_name='output'):
        """
        :param chars_field_name: field holding the characters of an instance.
        :param tag_field_name: field holding the predicted tag of each character.
        :param new_added_field_name: field that receives the joined output string.
        """
        super(SegApp2OutputProcessor, self).__init__(None, None)
        self.chars_field_name = chars_field_name
        self.tag_field_name = tag_field_name
        self.new_added_field_name = new_added_field_name

    def process(self, dataset):
        """
        Write the segmented text of every instance into ``new_added_field_name``.

        A tag equal to 1 closes the current word: the characters from the
        current word start up to and including that position form one word.
        Words are joined with single spaces.

        :param dataset: the DataSet whose instances are processed in place.
        :return: the same ``dataset`` object.
        """
        assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
        for ins in dataset:
            pred_tags = ins[self.tag_field_name]
            chars = ins[self.chars_field_name]
            words = []
            start_idx = 0
            for idx, tag in enumerate(pred_tags):
                if tag == 1:
                    # Substituting the original text back in is not handled yet.
                    words.append(''.join(chars[start_idx:idx + 1]))
                    # The next word begins AFTER the character that closed this
                    # one; using `idx` would repeat that character.
                    start_idx = idx + 1
            # NOTE(review): characters after the last tag == 1 are not emitted;
            # confirm whether the tagger always closes the final word.
            ins[self.new_added_field_name] = ' '.join(words)
        return dataset
@@ -61,11 +61,11 @@ bigram_proc(tr_dataset) | |||
# Build the character and bigram vocabularies from the training set.
char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)

# Index characters and bigrams into the fields 'chars' / 'bigrams'.
# 'chars_list' is kept (delete_old_field=False) while 'bigrams_list' is removed
# once it has been indexed.
char_index_proc = IndexerProcessor(char_vocab_proc.get_vocab(), 'chars_list', 'chars',
                                   delete_old_field=False)
bigram_index_proc = IndexerProcessor(bigram_vocab_proc.get_vocab(), 'bigrams_list', 'bigrams',
                                     delete_old_field=True)
# Sequence lengths are computed from the indexed 'chars' field.
seq_len_proc = SeqLenProcessor('chars')

char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)
@@ -184,6 +184,49 @@ pp.add_processor(char_index_proc) | |||
# Last processors appended to the pipeline `pp` before it is applied to the test set.
pp.add_processor(bigram_index_proc) | |||
pp.add_processor(seq_len_proc) | |||
# Load the test split of `ds_name` and run it through the pipeline.
te_filename = '/hdd/fudanNLP/CWS/Multi_Criterion/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name) | |||
te_dataset = reader.load(te_filename) | |||
pp(te_dataset) | |||
# Evaluate the model on the test set in sequential batches of 64 and report the
# scores as percentages.
batch_size = 64 | |||
te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False) | |||
pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher) | |||
print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100, | |||
pre * 100, | |||
rec * 100)) | |||
# TODO: it seems the test pipeline and the inference pipeline need to be told apart here.
# Save the pipeline together with the model.
test_context_dict = {'pipeline': pp, | |||
'model': cws_model} | |||
torch.save(test_context_dict, 'models/test_context.pkl') | |||
# 5. Pipeline for dev.
# 4. Assemble the content that needs to be saved.
from fastNLP.api.processor import ModelProcessor | |||
model_proc = ModelProcessor(cws_model) | |||
# NOTE(review): the assignment below has no right-hand side, so this statement is
# incomplete as written; the intended processor is not visible here — confirm.
index2word_proc = | |||
# A fresh pipeline is built from the same preprocessing processors.
pp = Pipeline() | |||
pp.add_processor(fs2hs_proc) | |||
pp.add_processor(sp_proc) | |||
pp.add_processor(char_proc) | |||
pp.add_processor(bigram_proc) | |||
pp.add_processor(char_index_proc) | |||
pp.add_processor(bigram_index_proc) | |||
pp.add_processor(seq_len_proc) | |||
# NOTE(review): add_processor() is called without a processor argument; presumably
# model_proc and/or index2word_proc were meant to be appended — confirm.
pp.add_processor() | |||
# Load the test split again and run it through the newly assembled pipeline.
te_filename = '/hdd/fudanNLP/CWS/Multi_Criterion/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name) | |||
te_dataset = reader.load(te_filename) | |||
pp(te_dataset) | |||
@@ -195,7 +238,7 @@ print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100, | |||
pre * 100, | |||
rec * 100)) | |||
# TODO: it seems the test pipeline and the inference pipeline need to be told apart here.
# Bundle the pipeline and the model into one dict.
test_context_dict = {'pipeline': pp, | |||
'model': cws_model} | |||
@@ -57,16 +57,17 @@ def decode_iterator(model, batcher): | |||
with torch.no_grad(): | |||
model.eval() | |||
for batch_x, batch_y in batcher: | |||
pred_dict = model(batch_x) | |||
seq_len = pred_dict['seq_lens'].cpu().numpy() | |||
probs = pred_dict['pred_probs'] | |||
_, pred_y = probs.max(dim=-1) | |||
pred_dict = model.predict(**batch_x) | |||
seq_len = batch_x['seq_lens'].cpu().numpy() | |||
pred_y = pred_dict['pred_tags'] | |||
true_y = batch_y['tags'] | |||
pred_y = pred_y.cpu().numpy() | |||
true_y = true_y.cpu().numpy() | |||
true_ys.extend(list(true_y)) | |||
pred_ys.extend(list(pred_y)) | |||
true_ys.extend(true_y.tolist()) | |||
pred_ys.extend(pred_y.tolist()) | |||
seq_lens.extend(list(seq_len)) | |||
model.train() | |||