Browse Source

bug fix

tags/v0.2.0
yh 5 years ago
parent
commit
dc7f8ef8d4
6 changed files with 143 additions and 21 deletions
  1. +50
    -0
      fastNLP/api/processor.py
  2. +4
    -2
      fastNLP/core/dataset.py
  3. +10
    -8
      reproduction/chinese_word_segment/models/cws_model.py
  4. +24
    -0
      reproduction/chinese_word_segment/process/cws_processor.py
  5. +48
    -5
      reproduction/chinese_word_segment/train_context.py
  6. +7
    -6
      reproduction/chinese_word_segment/utils.py

+ 50
- 0
fastNLP/api/processor.py View File

@@ -172,3 +172,53 @@ class SeqLenProcessor(Processor):
ins[self.new_added_field_name] = length
dataset.set_need_tensor(**{self.new_added_field_name: True})
return dataset


from fastNLP.core.batch import Batch
from fastNLP.core.sampler import SequentialSampler
import torch
from collections import defaultdict

class ModelProcessor(Processor):
    """Run a model over a dataset and store its un-padded predictions as new fields."""

    def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32):
        """
        Iterate the model over a dataset and drop the padding from its outputs.

        :param model: object exposing ``predict(**batch_x)`` that returns a dict
            mapping output names to tensors (presumably a ``torch.nn.Module``;
            confirm against callers).
        :param seq_len_field_name: name of the batch field holding the true
            length of every instance; used to trim the padded outputs.
        :param batch_size: number of instances fed to the model per step.
        """
        super(ModelProcessor, self).__init__(None, None)

        self.batch_size = batch_size
        self.seq_len_field_name = seq_len_field_name
        self.model = model

    def process(self, dataset):
        """
        Predict on every instance of ``dataset`` and add the results as fields.

        Each entry of the prediction dict is trimmed per instance to its
        sequence length and added to the dataset under the prediction's key.
        The sequence lengths themselves are added under
        ``seq_len_field_name`` unless the model already returned that key.

        :param dataset: the ``DataSet`` to run the model on.
        :return: the same ``dataset`` with the new fields added.
        """
        assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
        data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False)

        # Predict in evaluation mode so that dropout / batch-norm do not
        # perturb the outputs; the previous mode is restored afterwards so
        # callers that are still training are unaffected.
        was_training = getattr(self.model, 'training', False)
        if was_training:
            self.model.eval()

        batch_output = defaultdict(list)
        try:
            with torch.no_grad():
                for batch_x, _ in data_iterator:
                    prediction = self.model.predict(**batch_x)
                    seq_lens = batch_x[self.seq_len_field_name].cpu().numpy().tolist()

                    for key, value in prediction.items():
                        value = value.cpu().numpy()
                        # Keep only the first `seq_len` positions of each row,
                        # i.e. strip the padding added when batching.
                        batch_output[key].extend(
                            value[idx, :seq_len] for idx, seq_len in enumerate(seq_lens)
                        )

                    # Avoid appending to the same list twice (which would make
                    # the field longer than the dataset) when the model itself
                    # already returned an entry under this key.
                    if self.seq_len_field_name not in prediction:
                        batch_output[self.seq_len_field_name].extend(seq_lens)
        finally:
            if was_training:
                self.model.train()

        # TODO: with the current implementation, downstream processors need
        # to know the keys of the model's output dict.
        for field_name, fields in batch_output.items():
            dataset.add_field(field_name, fields, need_tensor=False, is_target=False)

        return dataset

    def set_model(self, model):
        """Replace the model used by :meth:`process`."""
        self.model = model



+ 4
- 2
fastNLP/core/dataset.py View File

@@ -74,10 +74,12 @@ class DataSet(object):
assert name in self.field_arrays
self.field_arrays[name].append(field)

def add_field(self, name, fields):
def add_field(self, name, fields, need_tensor=False, is_target=False):
if len(self.field_arrays) != 0:
assert len(self) == len(fields)
self.field_arrays[name] = FieldArray(name, fields)
self.field_arrays[name] = FieldArray(name, fields,
need_tensor=need_tensor,
is_target=is_target)

def delete_field(self, name):
self.field_arrays.pop(name)


+ 10
- 8
reproduction/chinese_word_segment/models/cws_model.py View File

@@ -94,14 +94,14 @@ class CWSBiLSTMSegApp(BaseModel):
self.decoder_model = MLP(size_layer)


def forward(self, batch_dict):
def forward(self, chars, seq_lens, bigrams=None):
device = self.parameters().__next__().device
chars = batch_dict['indexed_chars_list'].to(device).long()
if 'indexed_bigrams_list' in batch_dict:
bigrams = batch_dict['indexed_bigrams_list'].to(device).long()
chars = chars.to(device).long()
if not bigrams is None:
bigrams = bigrams.to(device).long()
else:
bigrams = None
seq_lens = batch_dict['seq_lens'].to(device).long()
seq_lens = seq_lens.to(device).long()

feats = self.encoder_model(chars, bigrams, seq_lens)
probs = self.decoder_model(feats)
@@ -112,6 +112,8 @@ class CWSBiLSTMSegApp(BaseModel):

return pred_dict

def predict(self, batch_dict):
pass

def predict(self, chars, seq_lens, bigrams=None):
pred_dict = self.forward(chars, seq_lens, bigrams)
pred_probs = pred_dict['pred_probs']
_, pred_tags = pred_probs.max(dim=-1)
return {'pred_tags': pred_tags}

+ 24
- 0
reproduction/chinese_word_segment/process/cws_processor.py View File

@@ -214,3 +214,27 @@ class SeqLenProcessor(Processor):
ins[self.new_added_field_name] = length
dataset.set_need_tensor(**{self.new_added_field_name:True})
return dataset

class SegApp2OutputProcessor(Processor):
    """Convert per-character segmentation tags into a space-separated string of words."""

    def __init__(self, chars_field_name='chars', tag_field_name='pred_tags', new_added_field_name='output'):
        """
        :param chars_field_name: field holding the sequence of characters
            (each element must be a ``str``, since they are joined into words).
        :param tag_field_name: field holding one tag per character; a tag equal
            to ``1`` marks the last character of a word.
        :param new_added_field_name: field the segmented string is written to.
        """
        super(SegApp2OutputProcessor, self).__init__(None, None)

        self.chars_field_name = chars_field_name
        self.tag_field_name = tag_field_name

        self.new_added_field_name = new_added_field_name

    def process(self, dataset):
        """
        Write the segmented text of every instance into ``new_added_field_name``.

        :param dataset: the ``DataSet`` whose instances carry the characters
            and the predicted tags.
        :return: the same ``dataset``, for consistency with the other processors.
        """
        assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
        for ins in dataset:
            pred_tags = ins[self.tag_field_name]
            chars = ins[self.chars_field_name]
            words = []
            start_idx = 0
            for idx, tag in enumerate(pred_tags):
                if tag == 1:
                    # NOTE: substituting the original text back is not handled yet.
                    words.append(''.join(chars[start_idx:idx + 1]))
                    # The next word starts AFTER the boundary character;
                    # using `idx` here would repeat that character.
                    start_idx = idx + 1
            # Keep any characters left after the last predicted boundary
            # instead of silently dropping them from the output.
            remainder = chars[start_idx:]
            if len(remainder) > 0:
                words.append(''.join(remainder))
            ins[self.new_added_field_name] = ' '.join(words)
        return dataset


+ 48
- 5
reproduction/chinese_word_segment/train_context.py View File

@@ -61,11 +61,11 @@ bigram_proc(tr_dataset)
char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)

char_index_proc = IndexerProcessor(char_vocab_proc.get_vocab(), 'chars_list', 'indexed_chars_list',
delete_old_field=True)
bigram_index_proc = IndexerProcessor(bigram_vocab_proc.get_vocab(), 'bigrams_list','indexed_bigrams_list',
char_index_proc = IndexerProcessor(char_vocab_proc.get_vocab(), 'chars_list', 'chars',
delete_old_field=False)
bigram_index_proc = IndexerProcessor(bigram_vocab_proc.get_vocab(), 'bigrams_list','bigrams',
delete_old_field=True)
seq_len_proc = SeqLenProcessor('indexed_chars_list')
seq_len_proc = SeqLenProcessor('chars')

char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)
@@ -184,6 +184,49 @@ pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)
pp.add_processor(seq_len_proc)




te_filename = '/hdd/fudanNLP/CWS/Multi_Criterion/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name)
te_dataset = reader.load(te_filename)
pp(te_dataset)

batch_size = 64
te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False)
pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher)
print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100,
pre * 100,
rec * 100))

# TODO 这里貌似需要区分test pipeline与infer pipeline

test_context_dict = {'pipeline': pp,
'model': cws_model}
torch.save(test_context_dict, 'models/test_context.pkl')


# 5. dev的pp
# 4. 组装需要存下的内容

from fastNLP.api.processor import ModelProcessor

model_proc = ModelProcessor(cws_model)
index2word_proc =

pp = Pipeline()
pp.add_processor(fs2hs_proc)
pp.add_processor(sp_proc)
pp.add_processor(char_proc)
pp.add_processor(bigram_proc)
pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)
pp.add_processor(seq_len_proc)


pp.add_processor()



te_filename = '/hdd/fudanNLP/CWS/Multi_Criterion/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name)
te_dataset = reader.load(te_filename)
pp(te_dataset)
@@ -195,7 +238,7 @@ print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100,
pre * 100,
rec * 100))

# TODO 这里貌似需要区分test pipeline与dev pipeline
# TODO 这里貌似需要区分test pipeline与infer pipeline

test_context_dict = {'pipeline': pp,
'model': cws_model}


+ 7
- 6
reproduction/chinese_word_segment/utils.py View File

@@ -57,16 +57,17 @@ def decode_iterator(model, batcher):
with torch.no_grad():
model.eval()
for batch_x, batch_y in batcher:
pred_dict = model(batch_x)
seq_len = pred_dict['seq_lens'].cpu().numpy()
probs = pred_dict['pred_probs']
_, pred_y = probs.max(dim=-1)
pred_dict = model.predict(**batch_x)
seq_len = batch_x['seq_lens'].cpu().numpy()
pred_y = pred_dict['pred_tags']
true_y = batch_y['tags']

pred_y = pred_y.cpu().numpy()
true_y = true_y.cpu().numpy()

true_ys.extend(list(true_y))
pred_ys.extend(list(pred_y))
true_ys.extend(true_y.tolist())
pred_ys.extend(pred_y.tolist())
seq_lens.extend(list(seq_len))
model.train()



Loading…
Cancel
Save