@@ -0,0 +1,129 @@ | |||||
from fastNLP.loader.dataset_loader import DataSetLoader | |||||
from fastNLP.core.instance import Instance | |||||
from fastNLP.core.dataset import DataSet | |||||
def cut_long_sentence(sent, max_sample_length=200):
    """Split an over-long, whitespace-separated sentence into shorter pieces.

    The length of ``sent`` is measured with ASCII spaces removed. When that
    length does not exceed ``max_sample_length`` the sentence is returned
    unchanged (not stripped) as the only element of the result.

    Otherwise the whitespace-separated parts of ``sent`` are regrouped
    greedily: parts are accumulated until their total character count
    (separators excluded) exceeds ``max_sample_length``; the group is then
    emitted and a new one is started. A piece is therefore closed only
    *after* the limit has been passed, so it can be longer than
    ``max_sample_length``. Parts inside a piece are joined by a single space.

    :param sent: str, the sentence to cut.
    :param max_sample_length: int, threshold on the number of non-space
        characters.
    :return: list of str, the resulting pieces in their original order.
    """
    # Guard clause: short sentences are passed through untouched.
    if len(sent.replace(' ', '')) <= max_sample_length:
        return [sent]

    cutted_sentence = []
    current_parts = []
    length = 0
    for part in sent.strip().split():
        current_parts.append(part)
        length += len(part)
        if length > max_sample_length:
            cutted_sentence.append(' '.join(current_parts))
            current_parts = []
            length = 0
    # Emit whatever is left after the last cut.
    if current_parts:
        cutted_sentence.append(' '.join(current_parts))
    return cutted_sentence
class NaiveCWSReader(DataSetLoader):
    """Reader for word-segmentation corpora that are already split by spaces.

    Each line is one sentence whose words are separated by whitespace, e.g.

        这是 fastNLP , 一个 非常 good 的 包 .

    or, when every part additionally carries a POS tag,

        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
    """

    def __init__(self, in_word_splitter=None):
        """
        :param in_word_splitter: str or None, default separator between a
            word and its tag inside one part (e.g. ``'/'``); ``None`` means
            the parts are plain words.
        """
        super().__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """Read ``filepath`` into a DataSet with one Instance per sentence.

        Accepted line formats (parts are separated by tabs or spaces):

            这是 fastNLP , 一个 非常 good 的 包 .

        and

            也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY

        If a splitter is in effect the second format is assumed: every part
        is split by the splitter and only the first piece is kept, e.g.
        ``"也/D".split('/')[0]``. Blank lines are skipped.

        :param filepath: path of the text file to read (decoded as UTF-8).
        :param in_word_splitter: str or None; overrides the splitter given
            to the constructor when not None.
        :param cut_long_sent: bool, when True each line is passed through
            ``cut_long_sentence`` and every resulting piece becomes its own
            Instance.
        :return: DataSet whose instances carry a ``raw_sentence`` field.
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        # Explicit encoding: the corpus is Chinese text, so relying on the
        # platform default encoding would make decoding machine-dependent.
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:  # blank lines are not accepted
                    continue
                if in_word_splitter is not None:
                    # Drop the tag from every "word<splitter>tag" part.
                    line = ' '.join(part.split(in_word_splitter)[0]
                                    for part in line.split())
                sents = cut_long_sentence(line) if cut_long_sent else [line]
                for sent in sents:
                    dataset.append(Instance(raw_sentence=sent))
        return dataset
class POSCWSReader(DataSetLoader):
    """Reader for column-style corpora: one token per line, a blank line
    between two sentences, e.g.

        迈 N
        向 N
        充 N
        ...
        泽 I-PER
        民 I-PER

        ( N
        一 N
        九 N
        ...
    """

    def __init__(self, in_word_splitter=None):
        """
        :param in_word_splitter: str or None, default separator applied to
            the first column of each line; only the piece before it is kept.
        """
        super().__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """Read ``filepath`` into a DataSet with one Instance per sentence.

        Only the first whitespace-separated column of every non-blank line is
        used. The tokens of one sentence are joined by single spaces.

        :param filepath: path of the text file to read (decoded as UTF-8).
        :param in_word_splitter: str or None; overrides the splitter given
            to the constructor when not None.
        :param cut_long_sent: bool, when True each sentence is passed through
            ``cut_long_sentence`` and every piece becomes its own Instance.
        :return: DataSet whose instances carry a ``raw_sentence`` field.
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r', encoding='utf-8') as f:
            words = []
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line closes the current sentence; consecutive
                    # blank lines produce nothing.
                    if words:
                        self._append_sentence(dataset, words, cut_long_sent)
                        words = []
                    continue
                token = line.split()[0]
                if in_word_splitter is not None:
                    token = token.split(in_word_splitter)[0]
                words.append(token)
            # The file may not end with a blank line; do not lose the
            # sentence that is still being collected.
            if words:
                self._append_sentence(dataset, words, cut_long_sent)
        return dataset

    @staticmethod
    def _append_sentence(dataset, words, cut_long_sent):
        """Join ``words`` into a sentence and append it (cut if requested)."""
        sentence = ' '.join(words)
        # The flag is a bool; the cutting itself is done by cut_long_sentence.
        sents = cut_long_sentence(sentence) if cut_long_sent else [sentence]
        for sent in sents:
            dataset.append(Instance(raw_sentence=sent))
@@ -0,0 +1,185 @@ | |||||
import re | |||||
class SpanConverterBase:
    """Base class that locates regex-matched spans in a sentence and can
    replace each of them by a special tag."""

    def __init__(self, replace_tag, pattern):
        """
        :param replace_tag: str, the tag returned by ``span_to_special_tag``.
        :param pattern: regular expression handed to ``re.finditer``.
        """
        super().__init__()
        self.replace_tag = replace_tag
        self.pattern = pattern

    def find_certain_span_and_replace(self, sentence):
        """Return ``sentence`` with every match of ``self.pattern`` replaced
        by ``self.span_to_special_tag(<matched text>)``.

        :param sentence: str to process.
        :return: str, the sentence after replacement.
        """
        pieces = []
        prev_end = 0
        for match in re.finditer(self.pattern, sentence):
            start, end = match.span()
            # Untouched text between the previous match and this one.
            pieces.append(sentence[prev_end:start])
            pieces.append(self.span_to_special_tag(sentence[start:end]))
            prev_end = end
        pieces.append(sentence[prev_end:])
        return ''.join(pieces)

    def span_to_special_tag(self, span):
        """Map a matched span to its tag; the base class ignores ``span`` and
        always returns ``self.replace_tag``."""
        return self.replace_tag

    def find_certain_span(self, sentence):
        """Return the ``(start, end)`` index pairs of all matches of
        ``self.pattern`` in ``sentence``, in order of occurrence."""
        return [match.span() for match in re.finditer(self.pattern, sentence)]
class AlphaSpanConverter(SpanConverterBase):
    """Converter that tags runs of ASCII letters with ``<ALPHA>``."""

    def __init__(self):
        # Ideally only purely alphabetic spans are handled; <[a-zA-Z]+> is
        # left alone because it should already be a special tag. The
        # lookahead only accepts a run that is followed by one of the listed
        # delimiter characters.
        super(AlphaSpanConverter, self).__init__(
            replace_tag='<ALPHA>',
            pattern='[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])',
        )
class DigitSpanConverter(SpanConverterBase):
    """Converter that tags digit spans as ``<NUM>``, ``<DEC>`` or
    ``<UNKDGT>`` depending on how many decimal points they contain."""

    def __init__(self):
        replace_tag = '<NUM>'
        # Backslashes are doubled so the literal holds no invalid escape
        # sequence; the resulting regex text is unchanged.
        pattern = '\\d[\\d\\.]*(?=[\u4e00-\u9fff ,%.!<-])'
        super(DigitSpanConverter, self).__init__(replace_tag, pattern)

    def span_to_special_tag(self, span):
        """Choose the tag for a matched digit span.

        :param span: str, the matched text (non-empty).
        :return: ``'<NUM>'`` for spans starting with ``'0'`` that are longer
            than two characters and for spans without a decimal point;
            ``'<DEC>'`` for exactly one decimal point that is not the last
            character; ``'<UNKDGT>'`` for more than one decimal point; the
            span itself when its only decimal point is its last character.
        """
        if span[0] == '0' and len(span) > 2:
            return '<NUM>'
        decimal_marks = ('.', '﹒', '·')
        # A span might contain more than one decimal point.
        decimal_point_count = sum(char in decimal_marks for char in span)
        if span[-1] in decimal_marks:
            # A trailing decimal point means this is not a number.
            if decimal_point_count == 1:
                return span
            return '<UNKDGT>'
        if decimal_point_count == 1:
            return '<DEC>'
        if decimal_point_count > 1:
            return '<UNKDGT>'
        return '<NUM>'
class TimeConverter(SpanConverterBase):
    """Converter that tags clock-like spans (digits joined by colon
    characters) with ``<TOC>``."""

    def __init__(self):
        replace_tag = '<TOC>'
        # Backslashes are doubled so the literal holds no invalid escape
        # sequence; the resulting regex text is unchanged.
        pattern = '\\d+[::∶][\\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'
        super().__init__(replace_tag, pattern)
class MixNumAlphaConverter(SpanConverterBase):
    """Converter that tags runs mixing digits, ASCII letters, link characters
    (``'``, ``′``, ``&``, ``-``), slashes and round brackets with ``<MIX>``.

    No regular expression is used (``pattern`` is None); the sentence is
    scanned character by character instead. A run is a maximal stretch of the
    characters listed above; a ``'.'`` neither ends a run nor counts towards
    its kind. A run is only reported once a different character follows it,
    so a run that reaches the end of the sentence is not reported.
    """

    # Kind of every character that may be part of a run.
    _CHAR_KINDS = {
        char: kind
        for kind, chars in (
            ('number', '0123456789'),
            ('alpha', 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            ('link', "'′&-"),
            ('slash', '/'),
            ('bracket', '()'),
        )
        for char in chars
    }
    # A run qualifies when it contains both kinds of at least one pair.
    _MIXED_PAIRS = (
        ('number', 'alpha'),
        ('link', 'alpha'),
        ('slash', 'alpha'),
        ('link', 'number'),
        ('number', 'bracket'),
        ('bracket', 'alpha'),
    )

    def __init__(self):
        replace_tag = '<MIX>'
        pattern = None
        super().__init__(replace_tag, pattern)

    def find_certain_span_and_replace(self, sentence):
        """Return ``sentence`` with every span reported by
        ``find_certain_span`` replaced by ``span_to_special_tag(<span>)``."""
        pieces = []
        prev_end = 0
        for start, end in self.find_certain_span(sentence):
            pieces.append(sentence[prev_end:start])
            pieces.append(self.span_to_special_tag(sentence[start:end]))
            prev_end = end
        pieces.append(sentence[prev_end:])
        return ''.join(pieces)

    def find_certain_span(self, sentence):
        """Return the ``(start, end)`` index pairs of all qualifying runs in
        ``sentence``, in order of occurrence."""
        spans = []
        run_start = None  # index where the current run began, None if idle
        seen_kinds = set()
        for idx, char in enumerate(sentence):
            kind = self._CHAR_KINDS.get(char)
            if kind is not None:
                if run_start is None:
                    run_start = idx
                seen_kinds.add(kind)
            elif char == '.':
                # Dots are transparent: they neither extend nor close a run.
                continue
            elif run_start is not None:
                if self._is_mixed(seen_kinds):
                    spans.append((run_start, idx))
                run_start = None
                seen_kinds = set()
        return spans

    def _is_mixed(self, seen_kinds):
        """Tell whether the kinds seen in a run make it a mixed span."""
        return any(first in seen_kinds and second in seen_kinds
                   for first, second in self._MIXED_PAIRS)
class EmailConverter(SpanConverterBase):
    """Converter that tags e-mail-like spans with ``<EML>``."""

    def __init__(self):
        # The lookahead only accepts a span that is followed by one of the
        # listed delimiter characters.
        super(EmailConverter, self).__init__(
            "<EML>",
            '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])',
        )
@@ -1,3 +1,98 @@ | |||||
from fastNLP.core.instance import Instance | |||||
from fastNLP.core.dataset import DataSet | |||||
from fastNLP.api.pipeline import Pipeline | |||||
from reproduction.chinese_word_segment.process.cws_processor import * | |||||
from reproduction.chinese_word_segment.utils import cut_long_training_sentences | |||||
from reproduction.chinese_word_segment.process.span_converter import * | |||||
from reproduction.chinese_word_segment.io import NaiveCWSReader | |||||
# Placeholder paths: set them to the training / development corpus files.
tr_filename = ''
dev_filename = ''

reader = NaiveCWSReader()

# Long sentences are cut for the training set only.
tr_dataset = reader.load(tr_filename, cut_long_sent=True)
dev_dataset = reader.load(dev_filename)


# TODO: how should the sentences be assembled into a DataSet?
def construct_dataset(sentences):
    """Wrap raw sentences into a DataSet.

    :param sentences: iterable of sentences.
    :return: DataSet with one Instance per sentence, stored under the
        ``raw_sentence`` field.
    """
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset


# NOTE: the datasets used below are the ones loaded by the reader above;
# construct_dataset is kept for building a DataSet from in-memory sentences.

# 1. Prepare the processors
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')

sp_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
sp_proc.add_span_converter(AlphaSpanConverter())
sp_proc.add_span_converter(DigitSpanConverter())

char_proc = CWSCharSegProcessor('sentence', 'char_list')

tag_proc = CWSSegAppTagProcessor('sentence', 'tag')

bigram_proc = Pre2Post2BigramProcessor('char_list', 'bigram_list')

char_vocab_proc = VocabProcessor('char_list')
bigram_vocab_proc = VocabProcessor('bigram_list')

# 2. Apply the processors
fs2hs_proc(tr_dataset)
sp_proc(tr_dataset)
char_proc(tr_dataset)
tag_proc(tr_dataset)
bigram_proc(tr_dataset)

char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)

# The index processors are created from the vocabularies of the vocab
# processors after those have been run on the training set.
char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list')
bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list')

char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)

# 2.1 Process dev_dataset (the vocab processors are not run on it)
fs2hs_proc(dev_dataset)
sp_proc(dev_dataset)
char_proc(dev_dataset)
tag_proc(dev_dataset)
bigram_proc(dev_dataset)

char_index_proc(dev_dataset)
bigram_index_proc(dev_dataset)

# 3. The datasets are now ready for training
# TODO: how are pretrained embeddings handled?

# 4. Assemble what needs to be saved
pp = Pipeline()
pp.add_processor(fs2hs_proc)
pp.add_processor(sp_proc)
pp.add_processor(char_proc)
pp.add_processor(bigram_proc)
pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)