@@ -0,0 +1,129 @@
from fastNLP.loader.dataset_loader import DataSetLoader
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet


def cut_long_sentence(sent, max_sample_length=200):
    """Cut a whitespace-segmented sentence into chunks whose character count
    (spaces excluded) stays near max_sample_length. Cuts happen only at word
    boundaries, so a chunk may slightly exceed the limit."""
    sent_no_space = sent.replace(' ', '')
    cut_sentences = []
    if len(sent_no_space) > max_sample_length:
        parts = sent.strip().split()
        new_line = ''
        length = 0
        for part in parts:
            length += len(part)
            new_line += part + ' '
            if length > max_sample_length:
                new_line = new_line[:-1]  # drop the trailing space
                cut_sentences.append(new_line)
                length = 0
                new_line = ''
        if new_line != '':
            cut_sentences.append(new_line[:-1])
    else:
        cut_sentences.append(sent)
    return cut_sentences
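
# Illustrative behaviour: a chunk is flushed once its length first exceeds the
# limit, so every chunk except the last is slightly longer than max_sample_length.
#   cut_long_sentence('这是 一个 很 长 的 句子', max_sample_length=5)
#   -> ['这是 一个 很 长', '的 句子']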


class NaiveCWSReader(DataSetLoader):
    """
    This reader assumes the word-segmentation dataset is already separated by
    spaces, e.g.
        这是 fastNLP , 一个 非常 good 的 包 .
    or that every token additionally carries a POS tag, e.g.
        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
    """

    def __init__(self, in_word_splitter=None):
        super().__init__()
        self.in_word_splitter = in_word_splitter
    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """
        Accepted formats (tokens separated by tabs or spaces):
            这是 fastNLP , 一个 非常 good 的 包 .
        and
            也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
        If the splitter is not None, the second format is assumed and every
        token is split on it, keeping the first part, e.g. "也/D".split('/')[0].

        :param filepath: path to the dataset file
        :param in_word_splitter: separator between a word and its tag inside a token
        :param cut_long_sent: whether to cut overly long sentences with cut_long_sentence
        :return: a DataSet with one 'raw_sentence' field per instance
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter

        dataset = DataSet()
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if len(line.replace(' ', '')) == 0:  # skip blank lines
                    continue

                if in_word_splitter is not None:
                    words = []
                    for part in line.split():
                        word = part.split(in_word_splitter)[0]
                        words.append(word)
                    line = ' '.join(words)

                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)

        return dataset
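
# Usage sketch (the file path is hypothetical): every (possibly cut) line
# becomes one Instance carrying a 'raw_sentence' field.
#   reader = NaiveCWSReader(in_word_splitter='/')
#   ds = reader.load('data/cws_train.txt', cut_long_sent=True)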


class POSCWSReader(DataSetLoader):
    """
    Reads data where each line holds one word (optionally followed by a tag)
    and a blank line marks a sentence boundary, e.g.
        迈 N
        向 N
        充 N
        ...
        泽 I-PER
        民 I-PER
        ( N
        一 N
        九 N
        ...
    """

    def __init__(self, in_word_splitter=None):
        super().__init__()
        self.in_word_splitter = in_word_splitter
    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter

        dataset = DataSet()
        with open(filepath, 'r', encoding='utf-8') as f:
            words = []
            for line in f:
                line = line.strip()
                if len(line) == 0:  # blank line marks a sentence boundary
                    if len(words) == 0:  # skip consecutive blank lines
                        continue
                    line = ' '.join(words)
                    if cut_long_sent:
                        sents = cut_long_sentence(line)
                    else:
                        sents = [line]
                    for sent in sents:
                        instance = Instance(raw_sentence=sent)
                        dataset.append(instance)
                    words = []
                else:
                    line = line.split()[0]
                    if in_word_splitter is None:
                        words.append(line)
                    else:
                        words.append(line.split(in_word_splitter)[0])

            if len(words) != 0:
                # flush the last sentence when the file does not end with a blank line
                line = ' '.join(words)
                sents = cut_long_sentence(line) if cut_long_sent else [line]
                for sent in sents:
                    dataset.append(Instance(raw_sentence=sent))

        return dataset
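
# Usage sketch (hypothetical path): tags after the first column are dropped,
# and each blank-line-separated block becomes one sentence.
#   reader = POSCWSReader()
#   ds = reader.load('data/pos_train.txt', cut_long_sent=True)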
@@ -0,0 +1,185 @@
import re


class SpanConverterBase:
    """Finds spans matching a regex pattern and replaces each span with a special tag."""

    def __init__(self, replace_tag, pattern):
        super(SpanConverterBase, self).__init__()
        self.replace_tag = replace_tag
        self.pattern = pattern

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        prev_end = 0
        for match in re.finditer(self.pattern, sentence):
            start, end = match.span()
            span = sentence[start:end]
            replaced_sentence += sentence[prev_end:start] + \
                                 self.span_to_special_tag(span)
            prev_end = end
        replaced_sentence += sentence[prev_end:]
        return replaced_sentence

    def span_to_special_tag(self, span):
        return self.replace_tag

    def find_certain_span(self, sentence):
        spans = []
        for match in re.finditer(self.pattern, sentence):
            spans.append(match.span())
        return spans
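
# Illustrative behaviour, using the AlphaSpanConverter defined below:
#   AlphaSpanConverter().find_certain_span_and_replace('我用iPhone 打电话')
#   -> '我用<ALPHA> 打电话'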


class AlphaSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<ALPHA>'
        # Ideally only purely alphabetic spans are handled, but not <[a-zA-Z]+>
        # (those should already be special tags).
        pattern = r'[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\-"])'
        super(AlphaSpanConverter, self).__init__(replace_tag, pattern)


class DigitSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<NUM>'
        pattern = r'\d[\d.]*(?=[\u4e00-\u9fff ,%.!<-])'
        super(DigitSpanConverter, self).__init__(replace_tag, pattern)

    def span_to_special_tag(self, span):
        if span[0] == '0' and len(span) > 2:
            # spans with a leading zero are tagged as plain numbers
            return '<NUM>'
        decimal_point_count = 0  # a span may contain more than one decimal point
        for char in span:
            if char == '.' or char == '﹒' or char == '·':
                decimal_point_count += 1
        if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
            # a trailing decimal point means the span is not a plain number
            if decimal_point_count == 1:
                return span
            else:
                return '<UNKDGT>'
        if decimal_point_count == 1:
            return '<DEC>'
        elif decimal_point_count > 1:
            return '<UNKDGT>'
        else:
            return '<NUM>'
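
    # Examples of the tag mapping implemented above:
    #   '2018'  -> '<NUM>'     (no decimal point)
    #   '3.14'  -> '<DEC>'     (exactly one decimal point)
    #   '1.2.3' -> '<UNKDGT>'  (more than one decimal point)
    #   '3.'    -> '3.'        (single trailing point: returned unchanged)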


class TimeConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<TOC>'
        pattern = r'\d+[::∶][\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'
        super().__init__(replace_tag, pattern)
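
# Illustrative behaviour (half-width and full-width colons are both accepted):
#   TimeConverter().find_certain_span_and_replace('会议12:30开始')
#   -> '会议<TOC>开始'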


class MixNumAlphaConverter(SpanConverterBase):
    """Replaces spans that mix digits, letters, link characters ('′&-), slashes
    or brackets with a single <MIX> tag. The sentence is scanned character by
    character instead of with a regex, hence pattern is None."""

    def __init__(self):
        replace_tag = '<MIX>'
        pattern = None
        super().__init__(replace_tag, pattern)

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        start = 0
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match(r"[0-9a-zA-Z/()'′&\-]", sentence[idx]):
                if not matching_flag:
                    replaced_sentence += sentence[start:idx]
                    start = idx
                if re.match(r'[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match(r"['′&\-]", sentence[idx]):
                    link_flag = True
                elif re.match(r'/', sentence[idx]):
                    slash_flag = True
                elif re.match(r'[()]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match(r'[.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    # only replace when at least two character classes were mixed
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        span = sentence[start:idx]
                        start = idx
                        replaced_sentence += self.span_to_special_tag(span)
                matching_flag = False
                number_flag = False
                alpha_flag = False
                link_flag = False
                slash_flag = False
                bracket_flag = False

        replaced_sentence += sentence[start:]
        return replaced_sentence
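
    # Illustrative behaviour: a span mixing letters and digits collapses to <MIX>.
    #   MixNumAlphaConverter().find_certain_span_and_replace('我的MP3播放器')
    #   -> '我的<MIX>播放器'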
    def find_certain_span(self, sentence):
        # same scanning logic as find_certain_span_and_replace, but only
        # collects (start, end) offsets instead of rewriting the sentence
        spans = []
        start = 0
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match(r"[0-9a-zA-Z/()'′&\-]", sentence[idx]):
                if not matching_flag:
                    start = idx
                if re.match(r'[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match(r"['′&\-]", sentence[idx]):
                    link_flag = True
                elif re.match(r'/', sentence[idx]):
                    slash_flag = True
                elif re.match(r'[()]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match(r'[.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        spans.append((start, idx))
                        start = idx
                matching_flag = False
                number_flag = False
                alpha_flag = False
                link_flag = False
                slash_flag = False
                bracket_flag = False

        return spans


class EmailConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<EML>'
        pattern = r'[0-9a-zA-Z]+@[.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\-"$])'
        super(EmailConverter, self).__init__(replace_tag, pattern)
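
# Illustrative behaviour:
#   EmailConverter().find_certain_span_and_replace('联系foo@bar.com 即可')
#   -> '联系<EML> 即可'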
@@ -1,3 +1,98 @@
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet
from fastNLP.api.pipeline import Pipeline

from reproduction.chinese_word_segment.process.cws_processor import *
from reproduction.chinese_word_segment.utils import cut_long_training_sentences
from reproduction.chinese_word_segment.process.span_converter import *
from reproduction.chinese_word_segment.io import NaiveCWSReader


tr_filename = ''
dev_filename = ''

reader = NaiveCWSReader()

tr_dataset = reader.load(tr_filename, cut_long_sent=True)
dev_dataset = reader.load(dev_filename)

# TODO how to assemble these into one DataSet
# Alternative: build a DataSet directly from an in-memory list of raw sentences
# (e.g. tr_dataset = construct_dataset(tr_sentences), given such a list exists).
def construct_dataset(sentences):
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset


# 1. Prepare the processors
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')

sp_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
sp_proc.add_span_converter(AlphaSpanConverter())
sp_proc.add_span_converter(DigitSpanConverter())

char_proc = CWSCharSegProcessor('sentence', 'char_list')
tag_proc = CWSSegAppTagProcessor('sentence', 'tag')

bigram_proc = Pre2Post2BigramProcessor('char_list', 'bigram_list')

char_vocab_proc = VocabProcessor('char_list')
bigram_vocab_proc = VocabProcessor('bigram_list')

# 2. Run the processors over the training set
fs2hs_proc(tr_dataset)
sp_proc(tr_dataset)

char_proc(tr_dataset)
tag_proc(tr_dataset)
bigram_proc(tr_dataset)

char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)

char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list')
bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list')

char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)

# 2.1 Run the same processors over dev_dataset. The vocabulary processors are
# not rerun here, so dev is indexed with the vocabularies built on training data.
fs2hs_proc(dev_dataset)
sp_proc(dev_dataset)
char_proc(dev_dataset)
tag_proc(dev_dataset)
bigram_proc(dev_dataset)
char_index_proc(dev_dataset)
bigram_index_proc(dev_dataset)

# 3. The datasets are now ready for training
# TODO how are pretrained embeddings handled?

# 4. Assemble the parts that need to be saved
pp = Pipeline()
pp.add_processor(fs2hs_proc)
pp.add_processor(sp_proc)
pp.add_processor(char_proc)
pp.add_processor(bigram_proc)
pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)
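
# tag_proc and the two VocabProcessors are left out of the pipeline above;
# presumably only the feature and index processors are needed at prediction
# time. A sketch of persisting the assembled pipeline (assuming the processors
# are picklable; torch.save serializes arbitrary Python objects):
#   import torch
#   torch.save(pp, 'cws_pipeline.pkl')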