|
@@ -364,6 +364,7 @@ class PeopleDailyCorpusLoader(DataSetLoader): |
|
|
inside_ne = False |
|
|
inside_ne = False |
|
|
sent_pos_tag = [] |
|
|
sent_pos_tag = [] |
|
|
sent_words = [] |
|
|
sent_words = [] |
|
|
|
|
|
sent_word = [] |
|
|
sent_ner = [] |
|
|
sent_ner = [] |
|
|
words = sent.strip().split()[1:] |
|
|
words = sent.strip().split()[1:] |
|
|
for word in words: |
|
|
for word in words: |
|
@@ -388,10 +389,23 @@ class PeopleDailyCorpusLoader(DataSetLoader): |
|
|
ner_tag = "O" |
|
|
ner_tag = "O" |
|
|
tmp = word.split("/") |
|
|
tmp = word.split("/") |
|
|
token, pos = tmp[0], tmp[1] |
|
|
token, pos = tmp[0], tmp[1] |
|
|
|
|
|
|
|
|
|
|
|
pos_tag = [] |
|
|
|
|
|
for single_token in token: |
|
|
|
|
|
if len(token) == 1: |
|
|
|
|
|
single_pos = "S-" + pos |
|
|
|
|
|
else: |
|
|
|
|
|
single_pos = "M-" + pos |
|
|
|
|
|
pos_tag.append(single_pos) |
|
|
|
|
|
sent_word.append(single_token) |
|
|
|
|
|
if len(token) > 1: |
|
|
|
|
|
pos_tag[0] = "B-" + pos |
|
|
|
|
|
pos_tag[-1] = "E-" + pos |
|
|
|
|
|
sent_pos_tag += pos_tag |
|
|
|
|
|
|
|
|
sent_ner.append(ner_tag) |
|
|
sent_ner.append(ner_tag) |
|
|
sent_pos_tag.append(pos) |
|
|
|
|
|
sent_words.append(token) |
|
|
sent_words.append(token) |
|
|
pos_tag_examples.append([sent_words, sent_pos_tag]) |
|
|
|
|
|
|
|
|
pos_tag_examples.append([sent_word, sent_pos_tag]) |
|
|
ner_examples.append([sent_words, sent_ner]) |
|
|
ner_examples.append([sent_words, sent_ner]) |
|
|
# List[List[List[str], List[str]]] |
|
|
# List[List[List[str], List[str]]] |
|
|
return pos_tag_examples, ner_examples |
|
|
return pos_tag_examples, ner_examples |
|
|