From db5c5ea45eff78eaa53941c802338e8d8236b3ff Mon Sep 17 00:00:00 2001 From: xuyige Date: Sun, 11 Nov 2018 14:17:16 +0800 Subject: [PATCH] update People Daily DataSet Loader --- fastNLP/loader/dataset_loader.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index 7537c638..e9a6dd75 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -364,6 +364,7 @@ class PeopleDailyCorpusLoader(DataSetLoader): inside_ne = False sent_pos_tag = [] sent_words = [] + sent_word = [] sent_ner = [] words = sent.strip().split()[1:] for word in words: @@ -388,10 +389,23 @@ class PeopleDailyCorpusLoader(DataSetLoader): ner_tag = "O" tmp = word.split("/") token, pos = tmp[0], tmp[1] + + pos_tag = [] + for single_token in token: + if len(token) == 1: + single_pos = "S-" + pos + else: + single_pos = "M-" + pos + pos_tag.append(single_pos) + sent_word.append(single_token) + if len(token) > 1: + pos_tag[0] = "B-" + pos + pos_tag[-1] = "E-" + pos + sent_pos_tag += pos_tag + sent_ner.append(ner_tag) - sent_pos_tag.append(pos) sent_words.append(token) - pos_tag_examples.append([sent_words, sent_pos_tag]) + pos_tag_examples.append([sent_word, sent_pos_tag]) ner_examples.append([sent_words, sent_ner]) # List[List[List[str], List[str]]] return pos_tag_examples, ner_examples