Browse Source

当前为segapp的方式,但是貌似准确率不行,尝试修改为crf 4tag模式试一试

tags/v0.2.0
yh_cc yunfan 5 years ago
parent
commit
10379e9c74
7 changed files with 85 additions and 51 deletions
  1. +24
    -22
      fastNLP/api/api.py
  2. +1
    -1
      fastNLP/api/model_zoo.py
  3. +1
    -1
      reproduction/chinese_word_segment/models/cws_model.py
  4. +1
    -0
      reproduction/chinese_word_segment/process/cws_processor.py
  5. +34
    -15
      reproduction/chinese_word_segment/testcontext.py
  6. +11
    -7
      reproduction/chinese_word_segment/train_context.py
  7. +13
    -5
      reproduction/chinese_word_segment/utils.py

+ 24
- 22
fastNLP/api/api.py
File diff suppressed because it is too large
View File


+ 1
- 1
fastNLP/api/model_zoo.py View File

@@ -134,5 +134,5 @@ if tqdm is None:


if __name__ == '__main__':
pipeline = load_url('http://10.141.208.102:5000/file/download/infer_context.pkl', model_dir='.')
pipeline = load_url('http://10.141.208.102:5000/file/download/infer_context-4e86fd93.pkl', model_dir='.')
print(type(pipeline))

+ 1
- 1
reproduction/chinese_word_segment/models/cws_model.py View File

@@ -90,7 +90,7 @@ class CWSBiLSTMSegApp(BaseModel):
self.encoder_model = CWSBiLSTMEncoder(vocab_num, embed_dim, bigram_vocab_num, bigram_embed_dim, num_bigram_per_char,
hidden_size, bidirectional, embed_drop_p, num_layers)

size_layer = [hidden_size, 100, tag_size]
size_layer = [hidden_size, 200, tag_size]
self.decoder_model = MLP(size_layer)




+ 1
- 0
reproduction/chinese_word_segment/process/cws_processor.py View File

@@ -194,6 +194,7 @@ class VocabProcessor(Processor):
tokens = ins[self.field_name]
self.vocab.update(tokens)


def get_vocab(self):
    """Build the vocabulary held by this processor and return it.

    ``build_vocab()`` is invoked on ``self.vocab`` on every call, and the
    same vocabulary object is then handed back to the caller.
    """
    vocab = self.vocab
    vocab.build_vocab()
    return vocab


+ 34
- 15
reproduction/chinese_word_segment/testcontext.py View File

@@ -6,23 +6,42 @@ from fastNLP.core.sampler import SequentialSampler
from fastNLP.core.batch import Batch
from reproduction.chinese_word_segment.utils import calculate_pre_rec_f1

ds_name = 'ncc'
def f1():
ds_name = 'pku'

test_dict = torch.load('models/test_context.pkl')
test_dict = torch.load('models/test_context.pkl')


pp = test_dict['pipeline']
model = test_dict['model'].cuda()
pp = test_dict['pipeline']
model = test_dict['model'].cuda()

reader = NaiveCWSReader()
te_filename = '/hdd/fudanNLP/CWS/Multi_Criterion/all_data/{}/{}_raw_data/{}_raw_test.txt'.format(ds_name, ds_name,
ds_name)
te_dataset = reader.load(te_filename)
pp(te_dataset)
reader = NaiveCWSReader()
te_filename = '/hdd/fudanNLP/CWS/Multi_Criterion/all_data/{}/{}_raw_data/{}_raw_test.txt'.format(ds_name, ds_name,
ds_name)
te_dataset = reader.load(te_filename)
pp(te_dataset)

batch_size = 64
te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False)
pre, rec, f1 = calculate_pre_rec_f1(model, te_batcher)
print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100,
pre * 100,
rec * 100))
batch_size = 64
te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False)
pre, rec, f1 = calculate_pre_rec_f1(model, te_batcher)
print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100,
pre * 100,
rec * 100))


def f2():
    """Segment each benchmark corpus with the packaged CWS model and save the result.

    For every dataset name in the list below, the raw text file is read,
    its lines are passed to ``CWS.predict`` and whatever ``predict`` returns
    is written, item by item, to the matching ``*_seg.txt`` file.
    """
    # Imported inside the function, so fastNLP.api is only loaded when f2() runs.
    from fastNLP.api.api import CWS

    raw_template = '/hdd/fudanNLP/CWS/others/benchmark/raw_and_gold/{}_raw.txt'
    seg_template = '/hdd/fudanNLP/CWS/others/benchmark/fastNLP_output/{}_seg.txt'

    cws = CWS('models/maml-cws.pkl')
    datasets = ['msr', 'as', 'pku', 'ctb', 'ncc', 'cityu', 'ckip', 'sxu']
    for dataset in datasets:
        print(dataset)
        # Read with an explicit encoding so the result does not depend on the
        # platform default; the output below is written as UTF-8 as well.
        # NOTE(review): assumes the raw benchmark files are UTF-8 -- confirm.
        with open(raw_template.format(dataset), 'r', encoding='utf-8') as raw_file:
            lines = raw_file.readlines()
        results = cws.predict(lines)

        # Items are written exactly as returned; no newline is appended here.
        with open(seg_template.format(dataset), 'w', encoding='utf-8') as seg_file:
            seg_file.writelines(results)


f1()

+ 11
- 7
reproduction/chinese_word_segment/train_context.py View File

@@ -19,10 +19,15 @@ from reproduction.chinese_word_segment.models.cws_model import CWSBiLSTMSegApp

from reproduction.chinese_word_segment.utils import calculate_pre_rec_f1

ds_name = 'msr'
tr_filename = '/home/hyan/CWS/Mutil_Criterion/all_data/{}/middle_files/{}_train.txt'.format(ds_name,
ds_name = 'pku'
# tr_filename = '/home/hyan/CWS/Mutil_Criterion/all_data/{}/middle_files/{}_train.txt'.format(ds_name,
# ds_name)
# dev_filename = '/home/hyan/CWS/Mutil_Criterion/all_data/{}/middle_files/{}_dev.txt'.format(ds_name,
# ds_name)

tr_filename = '/hdd/fudanNLP/CWS/CWS_semiCRF/all_data/{}/middle_files/{}_train.txt'.format(ds_name,
ds_name)
dev_filename = '/home/hyan/CWS/Mutil_Criterion/all_data/{}/middle_files/{}_dev.txt'.format(ds_name,
dev_filename = '/hdd/fudanNLP/CWS/CWS_semiCRF/all_data/{}/middle_files/{}_dev.txt'.format(ds_name,
ds_name)

reader = NaiveCWSReader()
@@ -189,7 +194,7 @@ pp.add_processor(seq_len_proc)



te_filename = '/home/hyan/CWS/Mutil_Criterion/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name)
te_filename = '/hdd/fudanNLP/CWS/CWS_semiCRF/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name)
te_dataset = reader.load(te_filename)
pp(te_dataset)

@@ -231,9 +236,8 @@ pp.add_processor(output_proc)

# TODO: the test pipeline and the infer pipeline probably need to be distinguished here

infer_context_dict = {'pipeline': pp,
'model': cws_model}
torch.save(infer_context_dict, 'models/infer_context.pkl')
infer_context_dict = {'pipeline': pp}
torch.save(infer_context_dict, 'models/infer_cws.pkl')


# TODO: still need to work out how to substitute the results back into the original text


+ 13
- 5
reproduction/chinese_word_segment/utils.py View File

@@ -34,19 +34,27 @@ def calculate_pre_rec_f1(model, batcher):
yp_wordnum = pred_ys.count(1)
yt_wordnum = true_ys.count(1)
start = 0
for i in range(len(true_ys)):
if true_ys[0]==1 and pred_ys[0]==1:
cor_num += 1
start = 1

for i in range(1, len(true_ys)):
if true_ys[i] == 1:
flag = True
for j in range(start, i + 1):
if true_ys[j] != pred_ys[j]:
flag = False
break
if true_ys[start-1] != pred_ys[start-1]:
flag = False
else:
for j in range(start, i + 1):
if true_ys[j] != pred_ys[j]:
flag = False
break
if flag:
cor_num += 1
start = i + 1
P = cor_num / (float(yp_wordnum) + 1e-6)
R = cor_num / (float(yt_wordnum) + 1e-6)
F = 2 * P * R / (P + R + 1e-6)
print(cor_num, yt_wordnum, yp_wordnum)
return P, R, F




Loading…
Cancel
Save