diff --git a/fastNLP/api/processor.py b/fastNLP/api/processor.py
index 109aa7b6..e79ca953 100644
--- a/fastNLP/api/processor.py
+++ b/fastNLP/api/processor.py
@@ -172,3 +172,53 @@ class SeqLenProcessor(Processor):
             ins[self.new_added_field_name] = length
         dataset.set_need_tensor(**{self.new_added_field_name: True})
         return dataset
+
+
+from fastNLP.core.batch import Batch
+from fastNLP.core.sampler import SequentialSampler
+import torch
+from collections import defaultdict
+
+class ModelProcessor(Processor):
+    def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32):
+        """
+        Run the model over the dataset and drop the padding from its outputs.
+
+        :param seq_len_field_name: name of the field that stores the true sequence lengths
+        :param batch_size: batch size used when iterating over the dataset
+        """
+        super(ModelProcessor, self).__init__(None, None)
+
+        self.batch_size = batch_size
+        self.seq_len_field_name = seq_len_field_name
+        self.model = model
+
+    def process(self, dataset):
+        assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
+        data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False)
+
+        batch_output = defaultdict(list)
+        with torch.no_grad():
+            for batch_x, _ in data_iterator:
+                prediction = self.model.predict(**batch_x)
+                seq_lens = batch_x[self.seq_len_field_name].cpu().numpy().tolist()
+
+                for key, value in prediction.items():
+                    tmp_batch = []
+                    value = value.cpu().numpy()
+                    for idx, seq_len in enumerate(seq_lens):
+                        tmp_batch.append(value[idx, :seq_len])
+                    batch_output[key].extend(tmp_batch)
+
+                batch_output[self.seq_len_field_name].extend(seq_lens)
+
+        # TODO With the current implementation, downstream processors have to know the keys of the model's output
+        for field_name, fields in batch_output.items():
+            dataset.add_field(field_name, fields, need_tensor=False, is_target=False)
+
+        return dataset
+
+    def set_model(self, model):
+        self.model = model
+
+
diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py
index 0b4dfc18..c3186aa2 100644
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -74,10 +74,12 @@ class DataSet(object):
         assert name in self.field_arrays
         self.field_arrays[name].append(field)
 
-    def add_field(self, name, fields):
+    def add_field(self, name, fields, need_tensor=False, is_target=False):
         if len(self.field_arrays) != 0:
             assert len(self) == len(fields)
-        self.field_arrays[name] = FieldArray(name, fields)
+        self.field_arrays[name] = FieldArray(name, fields,
+                                             need_tensor=need_tensor,
+                                             is_target=is_target)
 
     def delete_field(self, name):
         self.field_arrays.pop(name)
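Taken together, the two hunks above let a ModelProcessor write a model's un-padded predictions back into a DataSet through the extended add_field signature. A minimal usage sketch, assuming a trained cws_model and an already indexed te_dataset with 'chars' and 'seq_lens' fields (names taken from the training script further down, not defined here):

    from fastNLP.api.processor import ModelProcessor

    # runs cws_model.predict() batch by batch and strips padding using 'seq_lens'
    model_proc = ModelProcessor(cws_model, seq_len_field_name='seq_lens', batch_size=32)
    te_dataset = model_proc.process(te_dataset)
    # every key returned by predict() (for the segmentation model changed below, 'pred_tags')
    # is stored via dataset.add_field(key, values, need_tensor=False, is_target=False)
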
diff --git a/reproduction/chinese_word_segment/models/cws_model.py b/reproduction/chinese_word_segment/models/cws_model.py
index b46a1940..b8859f7a 100644
--- a/reproduction/chinese_word_segment/models/cws_model.py
+++ b/reproduction/chinese_word_segment/models/cws_model.py
@@ -94,14 +94,14 @@ class CWSBiLSTMSegApp(BaseModel):
 
         self.decoder_model = MLP(size_layer)
 
-    def forward(self, batch_dict):
+    def forward(self, chars, seq_lens, bigrams=None):
         device = self.parameters().__next__().device
-        chars = batch_dict['indexed_chars_list'].to(device).long()
-        if 'indexed_bigrams_list' in batch_dict:
-            bigrams = batch_dict['indexed_bigrams_list'].to(device).long()
+        chars = chars.to(device).long()
+        if bigrams is not None:
+            bigrams = bigrams.to(device).long()
         else:
             bigrams = None
-        seq_lens = batch_dict['seq_lens'].to(device).long()
+        seq_lens = seq_lens.to(device).long()
 
         feats = self.encoder_model(chars, bigrams, seq_lens)
         probs = self.decoder_model(feats)
@@ -112,6 +112,8 @@ class CWSBiLSTMSegApp(BaseModel):
 
         return pred_dict
 
-    def predict(self, batch_dict):
-        pass
-
+    def predict(self, chars, seq_lens, bigrams=None):
+        pred_dict = self.forward(chars, seq_lens, bigrams)
+        pred_probs = pred_dict['pred_probs']
+        _, pred_tags = pred_probs.max(dim=-1)
+        return {'pred_tags': pred_tags}
diff --git a/reproduction/chinese_word_segment/process/cws_processor.py b/reproduction/chinese_word_segment/process/cws_processor.py
index 8363ca75..2aa05bef 100644
--- a/reproduction/chinese_word_segment/process/cws_processor.py
+++ b/reproduction/chinese_word_segment/process/cws_processor.py
@@ -214,3 +214,27 @@ class SeqLenProcessor(Processor):
             ins[self.new_added_field_name] = length
         dataset.set_need_tensor(**{self.new_added_field_name:True})
         return dataset
+
+class SegApp2OutputProcessor(Processor):
+    def __init__(self, chars_field_name='chars', tag_field_name='pred_tags', new_added_field_name='output'):
+        super(SegApp2OutputProcessor, self).__init__(None, None)
+
+        self.chars_field_name = chars_field_name
+        self.tag_field_name = tag_field_name
+
+        self.new_added_field_name = new_added_field_name
+
+    def process(self, dataset):
+        assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
+        for ins in dataset:
+            pred_tags = ins[self.tag_field_name]
+            chars = ins[self.chars_field_name]
+            words = []
+            start_idx = 0
+            for idx, tag in enumerate(pred_tags):
+                if tag == 1:
+                    # Substituting the original text back is not handled yet
+                    words.append(''.join(chars[start_idx:idx + 1]))
+                    start_idx = idx + 1
+            ins[self.new_added_field_name] = ' '.join(words)
+
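In SegApp2OutputProcessor a predicted tag of 1 marks the last character of a word, so the loop cuts the character sequence at every 1 and joins the pieces with spaces. A small self-contained illustration of that conversion (the characters and tags below are made up, not taken from any dataset):

    chars = ['我', '爱', '自', '然', '语', '言']
    pred_tags = [1, 1, 0, 1, 0, 1]   # 1 = word ends at this character
    words, start_idx = [], 0
    for idx, tag in enumerate(pred_tags):
        if tag == 1:
            words.append(''.join(chars[start_idx:idx + 1]))
            start_idx = idx + 1
    print(' '.join(words))           # -> 我 爱 自然 语言
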
diff --git a/reproduction/chinese_word_segment/train_context.py b/reproduction/chinese_word_segment/train_context.py
index 484a0ce5..ce055b0e 100644
--- a/reproduction/chinese_word_segment/train_context.py
+++ b/reproduction/chinese_word_segment/train_context.py
@@ -61,11 +61,11 @@ bigram_proc(tr_dataset)
 char_vocab_proc(tr_dataset)
 bigram_vocab_proc(tr_dataset)
 
-char_index_proc = IndexerProcessor(char_vocab_proc.get_vocab(), 'chars_list', 'indexed_chars_list',
-                                   delete_old_field=True)
-bigram_index_proc = IndexerProcessor(bigram_vocab_proc.get_vocab(), 'bigrams_list','indexed_bigrams_list',
+char_index_proc = IndexerProcessor(char_vocab_proc.get_vocab(), 'chars_list', 'chars',
+                                   delete_old_field=False)
+bigram_index_proc = IndexerProcessor(bigram_vocab_proc.get_vocab(), 'bigrams_list','bigrams',
                                    delete_old_field=True)
-seq_len_proc = SeqLenProcessor('indexed_chars_list')
+seq_len_proc = SeqLenProcessor('chars')
 
 char_index_proc(tr_dataset)
 bigram_index_proc(tr_dataset)
@@ -184,6 +184,49 @@ pp.add_processor(char_index_proc)
 pp.add_processor(bigram_index_proc)
 pp.add_processor(seq_len_proc)
 
+
+
+te_filename = '/hdd/fudanNLP/CWS/Multi_Criterion/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name)
+te_dataset = reader.load(te_filename)
+pp(te_dataset)
+
+batch_size = 64
+te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False)
+pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher)
+print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100,
+                                                 pre * 100,
+                                                 rec * 100))
+
+# TODO It seems the test pipeline and the infer pipeline need to be distinguished here
+
+test_context_dict = {'pipeline': pp,
+                     'model': cws_model}
+torch.save(test_context_dict, 'models/test_context.pkl')
+
+
+# 5. the dev pipeline (pp)
+# 4. assemble the things that need to be saved
+
+from fastNLP.api.processor import ModelProcessor
+
+model_proc = ModelProcessor(cws_model)
+index2word_proc =
+
+pp = Pipeline()
+pp.add_processor(fs2hs_proc)
+pp.add_processor(sp_proc)
+pp.add_processor(char_proc)
+pp.add_processor(bigram_proc)
+pp.add_processor(char_index_proc)
+pp.add_processor(bigram_index_proc)
+pp.add_processor(seq_len_proc)
+
+
+pp.add_processor()
+
+
 te_filename = '/hdd/fudanNLP/CWS/Multi_Criterion/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name)
 te_dataset = reader.load(te_filename)
 pp(te_dataset)
@@ -195,7 +238,7 @@ print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100,
                                                  pre * 100,
                                                  rec * 100))
 
-# TODO It seems the test pipeline and the dev pipeline need to be distinguished here
+# TODO It seems the test pipeline and the infer pipeline need to be distinguished here
 
 test_context_dict = {'pipeline': pp,
                      'model': cws_model}
diff --git a/reproduction/chinese_word_segment/utils.py b/reproduction/chinese_word_segment/utils.py
index 9411c9f2..0296820d 100644
--- a/reproduction/chinese_word_segment/utils.py
+++ b/reproduction/chinese_word_segment/utils.py
@@ -57,16 +57,17 @@ def decode_iterator(model, batcher):
     with torch.no_grad():
         model.eval()
         for batch_x, batch_y in batcher:
-            pred_dict = model(batch_x)
-            seq_len = pred_dict['seq_lens'].cpu().numpy()
-            probs = pred_dict['pred_probs']
-            _, pred_y = probs.max(dim=-1)
+            pred_dict = model.predict(**batch_x)
+            seq_len = batch_x['seq_lens'].cpu().numpy()
+
+            pred_y = pred_dict['pred_tags']
             true_y = batch_y['tags']
 
+            pred_y = pred_y.cpu().numpy()
             true_y = true_y.cpu().numpy()
-            true_ys.extend(list(true_y))
-            pred_ys.extend(list(pred_y))
+            true_ys.extend(true_y.tolist())
+            pred_ys.extend(pred_y.tolist())
             seq_lens.extend(list(seq_len))
         model.train()
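The infer-pipeline block added to train_context.py is left unfinished in this diff (index2word_proc has no right-hand side and the last pp.add_processor() call has no argument). A sketch of how that block could be completed using only the processors introduced above; the import path for SegApp2OutputProcessor and the choice of 'chars_list' as the character field (kept in the dataset because char_index_proc now uses delete_old_field=False) are assumptions, not the author's confirmed intent:

    from fastNLP.api.processor import ModelProcessor
    from reproduction.chinese_word_segment.process.cws_processor import SegApp2OutputProcessor

    model_proc = ModelProcessor(cws_model)            # adds 'pred_tags' (and 'seq_lens') to the dataset
    output_proc = SegApp2OutputProcessor(chars_field_name='chars_list',
                                         tag_field_name='pred_tags',
                                         new_added_field_name='output')

    pp.add_processor(model_proc)
    pp.add_processor(output_proc)
    pp(te_dataset)                                    # te_dataset now carries a space-separated 'output' field

Reading the raw characters from 'chars_list' sidesteps the unfinished index2word_proc, since the indexed 'chars' field alone could not be turned back into text.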