@@ -0,0 +1,129 @@ | |||
from fastNLP.loader.dataset_loader import DataSetLoader | |||
from fastNLP.core.instance import Instance | |||
from fastNLP.core.dataset import DataSet | |||
def cut_long_sentence(sent, max_sample_length=200):
    """Split an over-long, whitespace-segmented sentence into shorter pieces.

    The length of a sentence is measured in characters with spaces left out.
    Words are never split: the sentence is only cut at word boundaries.

    :param sent: sentence whose words are separated by whitespace.
    :param max_sample_length: length threshold (spaces not counted) above
        which the sentence is cut.
    :return: list of sentence strings. If ``sent`` is not longer than the
        threshold the list holds ``sent`` itself, unmodified.
    """
    if len(sent.replace(' ', '')) <= max_sample_length:
        return [sent]

    cutted_sentence = []
    chunk = []  # words of the piece currently being assembled
    length = 0  # accumulated length of the words in ``chunk``
    for part in sent.split():
        chunk.append(part)
        length += len(part)
        # A piece is closed only *after* the word that pushes it past the
        # threshold has been added, so a piece can be longer than
        # max_sample_length by up to len(part) - 1 characters.
        if length > max_sample_length:
            cutted_sentence.append(' '.join(chunk))
            chunk = []
            length = 0
    if chunk:
        cutted_sentence.append(' '.join(chunk))
    return cutted_sentence
class NaiveCWSReader(DataSetLoader):
    """Reader for word-segmentation corpora that store one sentence per line.

    The words of a line are already separated by whitespace, e.g.::

        这是 fastNLP , 一个 非常 good 的 包 .

    Alternatively every word may carry a POS tag attached with a splitter::

        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
    """

    def __init__(self, in_word_splitter=None):
        """
        :param in_word_splitter: default separator between a word and its tag
            (e.g. ``'/'``); ``None`` means the words carry no tag.
        """
        super().__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False,
             encoding='utf-8'):
        """Read ``filepath`` into a DataSet with one ``raw_sentence`` per instance.

        Words on a line are separated by tabs or spaces. When a splitter is
        in effect each word is treated as ``word<splitter>tag`` and only the
        part before the first splitter is kept, e.g. ``"也/D".split('/')[0]``.
        Blank lines are skipped.

        :param filepath: path of the corpus file.
        :param in_word_splitter: overrides the splitter given to the
            constructor; ``None`` falls back to the constructor value.
        :param cut_long_sent: if true, each line is passed through
            ``cut_long_sentence`` and every resulting piece becomes its own
            instance.
        :param encoding: text encoding of the file. Given explicitly so that
            reading does not depend on the platform's default encoding.
        :return: the populated DataSet.
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r', encoding=encoding) as f:
            for line in f:
                line = line.strip()
                if not line:  # blank lines are not accepted
                    continue
                if in_word_splitter is not None:
                    # Drop the tag of every word, keeping only the word itself.
                    line = ' '.join(part.split(in_word_splitter)[0]
                                    for part in line.split())
                sents = cut_long_sentence(line) if cut_long_sent else [line]
                for sent in sents:
                    dataset.append(Instance(raw_sentence=sent))
        return dataset
class POSCWSReader(DataSetLoader):
    """Reader for corpora with one token per line and blank lines between sentences.

    Example of the accepted layout::

        迈 N
        向 N
        充 N
        ...
        泽 I-PER
        民 I-PER
        ( N
        一 N
        九 N
        ...
    """

    def __init__(self, in_word_splitter=None):
        """
        :param in_word_splitter: default separator used to strip a suffix
            from the first column of each line; ``None`` keeps the column
            as it is.
        """
        super().__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False,
             encoding='utf-8'):
        """Read ``filepath`` into a DataSet with one ``raw_sentence`` per instance.

        Only the first whitespace-separated column of every non-blank line is
        used. If a splitter is in effect, the column is additionally cut at
        the first occurrence of the splitter. The collected tokens of a
        sentence are joined with single spaces.

        :param filepath: path of the corpus file.
        :param in_word_splitter: overrides the splitter given to the
            constructor; ``None`` falls back to the constructor value.
        :param cut_long_sent: if true, each sentence is passed through
            ``cut_long_sentence`` and every resulting piece becomes its own
            instance.
        :param encoding: text encoding of the file. Given explicitly so that
            reading does not depend on the platform's default encoding.
        :return: the populated DataSet.
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        words = []
        with open(filepath, 'r', encoding=encoding) as f:
            for line in f:
                line = line.strip()
                if not line:  # a blank line closes the current sentence
                    self._append_sentence(dataset, words, cut_long_sent)
                    words = []
                    continue
                token = line.split()[0]
                if in_word_splitter is not None:
                    token = token.split(in_word_splitter)[0]
                words.append(token)
        # The file may not end with a blank line; do not lose the last sentence.
        self._append_sentence(dataset, words, cut_long_sent)
        return dataset

    @staticmethod
    def _append_sentence(dataset, words, cut_long_sent):
        """Add the sentence formed by ``words`` to ``dataset``; no-op if empty."""
        if not words:  # consecutive blank lines produce no sentence
            return
        line = ' '.join(words)
        sents = cut_long_sentence(line) if cut_long_sent else [line]
        for sent in sents:
            dataset.append(Instance(raw_sentence=sent))
@@ -0,0 +1,185 @@ | |||
import re | |||
class SpanConverterBase:
    """Find the spans of a sentence that match a regex and replace them by a tag.

    Subclasses may override :meth:`span_to_special_tag` to choose the
    replacement depending on the matched text.
    """

    def __init__(self, replace_tag, pattern):
        """
        :param replace_tag: string that replaces a matched span by default.
        :param pattern: regular expression handed to ``re.finditer``.
        """
        super().__init__()
        self.replace_tag = replace_tag
        self.pattern = pattern

    def find_certain_span_and_replace(self, sentence):
        """Return ``sentence`` with every matched span replaced.

        :param sentence: input string.
        :return: new string; text outside the matches is copied unchanged and
            each match is substituted by ``span_to_special_tag(match_text)``.
        """
        pieces = []
        cursor = 0  # end of the previous match
        for found in re.finditer(self.pattern, sentence):
            begin, stop = found.span()
            pieces.append(sentence[cursor:begin])
            pieces.append(self.span_to_special_tag(sentence[begin:stop]))
            cursor = stop
        pieces.append(sentence[cursor:])
        return ''.join(pieces)

    def span_to_special_tag(self, span):
        """Return the replacement for the matched text ``span``.

        The base implementation ignores ``span`` and returns ``replace_tag``.
        """
        return self.replace_tag

    def find_certain_span(self, sentence):
        """Return the ``(start, end)`` index pairs of all matches in ``sentence``."""
        return [found.span() for found in re.finditer(self.pattern, sentence)]
class AlphaSpanConverter(SpanConverterBase):
    """Converter whose tag is ``<ALPHA>`` and whose pattern matches ASCII letter runs.

    A run only matches when it is directly followed by one of the characters
    listed in the look-ahead (a character in the range U+4E00-U+9FFF, space,
    ``,``, ``%``, ``.``, ``!``, ``<``, ``-`` or ``"``).
    """

    def __init__(self):
        # Ideally only purely alphabetic spans are handled, while
        # <[a-zA-Z]+> is left alone (because that should be a special tag).
        super().__init__(
            '<ALPHA>',
            '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])',
        )
class DigitSpanConverter(SpanConverterBase):
    """Converter for digit spans; the replacement tag depends on the span's shape.

    The pattern matches a digit followed by any mix of digits and ``.``,
    provided the span is directly followed by one of the characters listed in
    the look-ahead.
    """

    # Characters counted as a decimal point by span_to_special_tag.
    _DECIMAL_POINTS = '.﹒·'

    def __init__(self):
        # Backslashes are doubled so the literal does not rely on the invalid
        # string escape "\d"; the resulting regex text is unchanged.
        pattern = '\\d[\\d\\.]*(?=[\u4e00-\u9fff ,%.!<-])'
        super().__init__('<NUM>', pattern)

    def span_to_special_tag(self, span):
        """Choose the tag for a matched digit span.

        :param span: matched text; must be non-empty.
        :return: ``'<NUM>'`` for spans without a decimal point and for spans
            longer than two characters that start with ``'0'``; ``'<DEC>'``
            for exactly one decimal point that is not the last character;
            ``span`` itself when its only decimal point is the last
            character; ``'<UNKDGT>'`` when there is more than one decimal
            point.
        """
        # NOTE(review): this also maps e.g. "0.5" to <NUM> rather than <DEC>;
        # presumably aimed at codes with a leading zero - confirm the intent.
        if span[0] == '0' and len(span) > 2:
            return '<NUM>'
        # One might have more than one decimal point.
        decimal_point_count = sum(
            1 for char in span if char in self._DECIMAL_POINTS)
        if span[-1] in self._DECIMAL_POINTS:
            # A trailing decimal point means this is not a number.
            return span if decimal_point_count == 1 else '<UNKDGT>'
        if decimal_point_count == 1:
            return '<DEC>'
        if decimal_point_count > 1:
            return '<UNKDGT>'
        return '<NUM>'
class TimeConverter(SpanConverterBase):
    """Converter whose tag is ``<TOC>`` and whose pattern matches clock-like spans.

    The pattern matches one or more digits, a colon-like character, then any
    mix of digits and colon-like characters, provided the span is directly
    followed by one of the characters listed in the look-ahead.
    """

    def __init__(self):
        # Backslashes are doubled so the literal does not rely on the invalid
        # string escape "\d"; the resulting regex text is unchanged.
        # NOTE(review): ':' appears twice in each character class; one of
        # them was presumably meant to be a full-width colon - confirm.
        pattern = '\\d+[::∶][\\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'
        super().__init__('<TOC>', pattern)
class MixNumAlphaConverter(SpanConverterBase):
    """Converter that tags runs mixing letters, digits and joining symbols as ``<MIX>``.

    No regex pattern is used. The sentence is scanned character by character:

    * A run is a maximal stretch of ASCII digits, ASCII letters, ``/``,
      ``(``, ``)``, ``'``, ``′``, ``&`` and ``-``.
    * ``.`` neither starts nor ends a run; when it occurs inside or right
      after a run it is included in the reported span.
    * A run is reported only when it is ended by some other character and it
      mixes categories: letters together with digits, link symbols, slashes
      or brackets, or digits together with link symbols or brackets.
    * A run that reaches the end of the sentence is not reported.
    """

    _DIGITS = '0123456789'
    _LETTERS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    _LINKS = '\'′&-'
    _BRACKETS = '()'

    def __init__(self):
        super().__init__('<MIX>', None)

    def find_certain_span_and_replace(self, sentence):
        """Return ``sentence`` with every mixed run replaced by its tag.

        :param sentence: input string.
        :return: new string; text outside the reported spans is copied
            unchanged and each span is substituted by
            ``span_to_special_tag(span_text)``.
        """
        pieces = []
        prev_end = 0
        for start, end in self._scan_spans(sentence):
            pieces.append(sentence[prev_end:start])
            pieces.append(self.span_to_special_tag(sentence[start:end]))
            prev_end = end
        pieces.append(sentence[prev_end:])
        return ''.join(pieces)

    def find_certain_span(self, sentence):
        """Return the ``(start, end)`` index pairs of all mixed runs in ``sentence``."""
        return self._scan_spans(sentence)

    def _scan_spans(self, sentence):
        """Scan ``sentence`` once and collect the spans of all mixed runs."""
        spans = []
        start = 0
        kinds = set()  # categories seen in the current run; empty = no run
        for idx, char in enumerate(sentence):
            kind = self._char_kind(char)
            if kind == 'dot':
                # A dot leaves the current state untouched.
                continue
            if kind is not None:
                if not kinds:
                    start = idx
                kinds.add(kind)
                continue
            # Any other character ends the current run, if there is one.
            if kinds and self._is_mixed(kinds):
                spans.append((start, idx))
            kinds.clear()
        return spans

    @classmethod
    def _char_kind(cls, char):
        """Classify one character; ``None`` means it ends a run."""
        if char in cls._DIGITS:
            return 'number'
        if char in cls._LINKS:
            return 'link'
        if char == '/':
            return 'slash'
        if char in cls._BRACKETS:
            return 'bracket'
        if char in cls._LETTERS:
            return 'alpha'
        if char == '.':
            return 'dot'
        return None

    @staticmethod
    def _is_mixed(kinds):
        """Tell whether the categories seen in a run make it a mixed span."""
        has_alpha = 'alpha' in kinds
        has_number = 'number' in kinds
        has_link = 'link' in kinds
        has_slash = 'slash' in kinds
        has_bracket = 'bracket' in kinds
        return ((has_alpha and (has_number or has_link
                                or has_slash or has_bracket))
                or (has_number and (has_link or has_bracket)))
class EmailConverter(SpanConverterBase):
    """Converter whose tag is ``<EML>`` and whose pattern matches e-mail-like spans.

    The pattern matches ASCII letters/digits, an ``@``, then any mix of
    letters, digits, ``.``, ``﹒`` and ``@``, provided the span is directly
    followed by one of the characters listed in the look-ahead.
    """

    def __init__(self):
        super().__init__(
            "<EML>",
            '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])',
        )
@@ -1,3 +1,98 @@ | |||
from fastNLP.core.instance import Instance | |||
from fastNLP.core.dataset import DataSet | |||
from fastNLP.api.pipeline import Pipeline | |||
from reproduction.chinese_word_segment.process.cws_processor import * | |||
from reproduction.chinese_word_segment.utils import cut_long_training_sentences | |||
from reproduction.chinese_word_segment.process.span_converter import * | |||
from reproduction.chinese_word_segment.io import NaiveCWSReader | |||
# Paths of the training and development corpora; fill in before running.
tr_filename = ''
dev_filename = ''

reader = NaiveCWSReader()

# Only the training corpus is loaded with long sentences cut into pieces.
tr_dataset = reader.load(tr_filename, cut_long_sent=True)
dev_dataset = reader.load(dev_filename)


# TODO: how should sentences be assembled into a DataSet?
def construct_dataset(sentences):
    """Build a DataSet holding one instance per sentence.

    :param sentences: iterable of sentences.
    :return: DataSet whose instances store the sentence in the
        ``raw_sentence`` field.
    """
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset


# 1. Prepare the processors
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')

sp_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
sp_proc.add_span_converter(AlphaSpanConverter())
sp_proc.add_span_converter(DigitSpanConverter())

char_proc = CWSCharSegProcessor('sentence', 'char_list')

tag_proc = CWSSegAppTagProcessor('sentence', 'tag')

bigram_proc = Pre2Post2BigramProcessor('char_list', 'bigram_list')

char_vocab_proc = VocabProcessor('char_list')
bigram_vocab_proc = VocabProcessor('bigram_list')

# 2. Apply the processors to the training set
fs2hs_proc(tr_dataset)
sp_proc(tr_dataset)
char_proc(tr_dataset)
tag_proc(tr_dataset)
bigram_proc(tr_dataset)
char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)

# The index processors are created from the vocabularies collected above.
char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list')
bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list')

char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)

# 2.1 Process dev_dataset (the vocabulary processors are not run on it)
fs2hs_proc(dev_dataset)
sp_proc(dev_dataset)
char_proc(dev_dataset)
tag_proc(dev_dataset)
bigram_proc(dev_dataset)
char_index_proc(dev_dataset)
bigram_index_proc(dev_dataset)

# 3. The datasets are now ready to be used for training
# TODO: how are pretrained embeddings handled?

# 4. Assemble what needs to be saved
# NOTE(review): tag_proc and the vocabulary processors are not added to the
# pipeline - presumably they are only needed at training time; confirm.
pp = Pipeline()
pp.add_processor(fs2hs_proc)
pp.add_processor(sp_proc)
pp.add_processor(char_proc)
pp.add_processor(bigram_proc)
pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)