@@ -73,16 +73,16 @@ class FullSpaceToHalfSpaceProcessor(Processor):
                 if char in self.convert_map:
                     char = self.convert_map[char]
                 new_sentence[idx] = char
-            ins[self.field_name].text = ''.join(new_sentence)
+            ins[self.field_name] = ''.join(new_sentence)
         return dataset


 class IndexerProcessor(Processor):
-    def __init__(self, vocab, field_name):
+    def __init__(self, vocab, field_name, new_added_field_name):
         assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
-        super(IndexerProcessor, self).__init__(field_name, None)
+        super(IndexerProcessor, self).__init__(field_name, new_added_field_name)
         self.vocab = vocab

     def set_vocab(self, vocab):
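For reference, the conversion this processor applies can be sketched standalone. The offsets below are the standard Unicode full-width-to-ASCII correspondence; the processor itself builds its convert_map internally, so this is illustrative only:

# Minimal standalone sketch of the full-width -> half-width mapping
# (illustrative; the real convert_map lives inside fastNLP).
def full_to_half(sentence):
    chars = []
    for char in sentence:
        code = ord(char)
        if code == 0x3000:               # ideographic (full-width) space -> ASCII space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:   # full-width ASCII variants -> ASCII
            code -= 0xFEE0
        chars.append(chr(code))
    return ''.join(chars)

assert full_to_half('ＦａｓｔＮＬＰ　１２３') == 'FastNLP 123'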
@@ -93,9 +93,9 @@ class IndexerProcessor(Processor):
     def process(self, dataset):
         assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
         for ins in dataset:
-            tokens = ins[self.field_name].content
+            tokens = ins[self.field_name]
             index = [self.vocab.to_index(token) for token in tokens]
-            ins[self.field_name]._index = index
+            ins[self.new_added_field_name] = index
         return dataset
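With the new signature, indexing writes into a separate field instead of mutating the token field in place. A hedged usage sketch, where the field names 'token_list' and 'word_seq' are assumptions, not names from this diff:

# vocab and dataset are assumed to exist; vocab would normally come from a VocabProcessor.
indexer = IndexerProcessor(vocab, 'token_list', 'word_seq')
dataset = indexer.process(dataset)
# each instance now keeps its raw tokens in 'token_list' and their ids in 'word_seq'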
@@ -110,7 +110,7 @@ class VocabProcessor(Processor):
         for dataset in datasets:
             assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
             for ins in dataset:
-                tokens = ins[self.field_name].content
+                tokens = ins[self.field_name]
                 self.vocab.update(tokens)

     def get_vocab(self):
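VocabProcessor now reads tokens straight from the field, so building a vocabulary and handing it to the indexer looks roughly like this (the constructor argument and the multi-dataset process signature are inferred from the hunk above; the field names are assumptions):

vocab_proc = VocabProcessor('token_list')       # field name assumed
vocab_proc.process(tr_dataset, dev_dataset)     # accumulate counts over both splits
indexer = IndexerProcessor(vocab_proc.get_vocab(), 'token_list', 'word_seq')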
@@ -5,9 +5,8 @@ import re
 from fastNLP.core.field import SeqLabelField
 from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.core.dataset import DataSet
 from fastNLP.api.processor import Processor
-from reproduction.chinese_word_segment.process.span_converter import *
+from reproduction.chinese_word_segment.process.span_converter import SpanConverter

 _SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'
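_SPECIAL_TAG_PATTERN matches the placeholder tags that span converters substitute into a sentence, e.g.:

import re
re.findall('<[a-zA-Z]+>', 'Tel: <NUM>, name: <ALPHA>')  # -> ['<NUM>', '<ALPHA>']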
@@ -25,11 +24,7 @@ class SpeicalSpanProcessor(Processor):
             sentence = ins[self.field_name].text
             for span_converter in self.span_converters:
                 sentence = span_converter.find_certain_span_and_replace(sentence)
-            if self.new_added_field_name!=self.field_name:
-                new_text_field = TextField(sentence, is_target=False)
-                ins[self.new_added_field_name] = new_text_field
-            else:
-                ins[self.field_name].text = sentence
+            ins[self.new_added_field_name] = sentence

         return dataset
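The simplified branch relies on the span-converter protocol: any object exposing find_certain_span_and_replace(sentence) -> sentence can be registered. A toy converter, purely illustrative and not the library implementation:

import re

class ToyDigitSpanConverter:
    # Illustrative stand-in for the converters in span_converter.py.
    def find_certain_span_and_replace(self, sentence):
        return re.sub(r'\d+', '<NUM>', sentence)

sentence = '电话 12345'
for converter in [ToyDigitSpanConverter()]:
    sentence = converter.find_certain_span_and_replace(sentence)
# sentence == '电话 <NUM>'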
@@ -1,13 +1,12 @@
 from fastNLP.core.instance import Instance
 from fastNLP.core.dataset import DataSet
 from fastNLP.api.pipeline import Pipeline
 from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
-from reproduction.chinese_word_segment.process.cws_processor import *
 from reproduction.chinese_word_segment.utils import cut_long_training_sentences
-from reproduction.chinese_word_segment.process.span_converter import *
-from reproduction.chinese_word_segment.io import NaiveCWSReader
+from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter
+from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader

 tr_filename = ''
@@ -15,9 +14,8 @@ dev_filename = ''

 reader = NaiveCWSReader()

-tr_dataset = reader.load(tr_filename, cut=True)
-de_dataset = reader.load(dev_filename)
+tr_sentences = reader.load(tr_filename, cut_long_sent=True)
+dev_sentences = reader.load(dev_filename)

 # TODO how to assemble these into a DataSet
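The TODO asks how to assemble the raw sentences into a DataSet. A plausible shape for construct_dataset, whose body is not part of this diff, using the Instance and DataSet imports already present in the script:

# Hedged sketch: wrap each raw sentence in an Instance under the
# 'raw_sentence' field that the downstream processors expect.
def construct_dataset(sentences):
    dataset = DataSet()
    for sentence in sentences:
        dataset.append(Instance(raw_sentence=sentence))
    return dataset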
@@ -32,7 +30,7 @@ def construct_dataset(sentences):

 tr_dataset = construct_dataset(tr_sentences)
-dev_dataset = construct_dataset(dev_sentence)
+dev_dataset = construct_dataset(dev_sentences)

 # 1. prepare the processors
 fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')
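Downstream, the prepared processors would be chained together. Assuming Pipeline exposes an add_processor method and is callable on a DataSet (both hedged, neither shown in this diff), the wiring would look roughly like:

pipeline = Pipeline()
pipeline.add_processor(fs2hs_proc)
# ... append the span-converter, tokenizer, and vocab/indexer processors in order ...
tr_dataset = pipeline(tr_dataset)
dev_dataset = pipeline(dev_dataset)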