@@ -73,16 +73,16 @@ class FullSpaceToHalfSpaceProcessor(Processor):
                 if char in self.convert_map:
                     char = self.convert_map[char]
                 new_sentence[idx] = char
-            ins[self.field_name].text = ''.join(new_sentence)
+            ins[self.field_name] = ''.join(new_sentence)
         return dataset


 class IndexerProcessor(Processor):
-    def __init__(self, vocab, field_name):
+    def __init__(self, vocab, field_name, new_added_field_name):
         assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))

-        super(IndexerProcessor, self).__init__(field_name, None)
+        super(IndexerProcessor, self).__init__(field_name, new_added_field_name)
         self.vocab = vocab

     def set_vocab(self, vocab):
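
Note: with the extra `new_added_field_name` argument, the indexer writes its output into a separate field instead of mutating the input field in place. A minimal usage sketch under the new signature (the field names 'sentence' and 'word_index' are illustrative, not taken from this diff):

    from fastNLP.core.vocabulary import Vocabulary
    from fastNLP.api.processor import IndexerProcessor

    vocab = Vocabulary()
    vocab.update(['今', '天', '天', '气'])  # character tokens, as in the CWS setting

    # reads tokens from 'sentence', writes the id list into the new 'word_index' field
    indexer = IndexerProcessor(vocab, field_name='sentence', new_added_field_name='word_index')
    dataset = indexer.process(dataset)
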
@@ -93,9 +93,9 @@ class IndexerProcessor(Processor):
     def process(self, dataset):
         assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
         for ins in dataset:
-            tokens = ins[self.field_name].content
+            tokens = ins[self.field_name]
             index = [self.vocab.to_index(token) for token in tokens]
-            ins[self.field_name]._index = index
+            ins[self.new_added_field_name] = index

         return dataset
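
Note: the substantive change in `process` is that instances now hold plain Python values. The token list comes straight from `ins[self.field_name]` with no `.content` accessor, and the indices land in an ordinary, visible field rather than a Field object's private `_index`. Per instance this amounts to (illustrative field names):

    ins['word_index'] = [self.vocab.to_index(token) for token in ins['sentence']]
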
@@ -110,7 +110,7 @@ class VocabProcessor(Processor):
         for dataset in datasets:
             assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
             for ins in dataset:
-                tokens = ins[self.field_name].content
+                tokens = ins[self.field_name]
                 self.vocab.update(tokens)

     def get_vocab(self):
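
Note: `VocabProcessor` gets the same plain-value treatment. A typical two-pass flow, assuming `process` accepts several datasets as its `for dataset in datasets` loop suggests (field names illustrative):

    vocab_proc = VocabProcessor('sentence')
    vocab_proc.process(tr_dataset, dev_dataset)   # pass 1: collect tokens into the vocabulary

    indexer = IndexerProcessor(vocab_proc.get_vocab(), 'sentence', 'word_index')
    tr_dataset = indexer.process(tr_dataset)      # pass 2: map tokens to ids
    dev_dataset = indexer.process(dev_dataset)
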
@@ -5,9 +5,8 @@ import re
 from fastNLP.core.field import SeqLabelField
 from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.core.dataset import DataSet
 from fastNLP.api.processor import Processor
-from reproduction.chinese_word_segment.process.span_converter import *
+from reproduction.chinese_word_segment.process.span_converter import SpanConverter


 _SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'
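
Note: the explicit import makes the dependency visible; the only interface this module uses is `SpanConverter.find_certain_span_and_replace`. For orientation, a hypothetical converter might look roughly like this (invented example; the real base class may require constructor arguments such as a replace tag and a pattern):

    import re

    class EmailSpanConverter(SpanConverter):  # hypothetical, not part of this PR
        def find_certain_span_and_replace(self, sentence):
            # collapse every e-mail-like span into one special tag
            # matching _SPECIAL_TAG_PATTERN above
            return re.sub(r'\S+@\S+', '<eml>', sentence)
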
@@ -25,11 +24,7 @@ class SpeicalSpanProcessor(Processor):
             sentence = ins[self.field_name].text
             for span_converter in self.span_converters:
                 sentence = span_converter.find_certain_span_and_replace(sentence)
-            if self.new_added_field_name!=self.field_name:
-                new_text_field = TextField(sentence, is_target=False)
-                ins[self.new_added_field_name] = new_text_field
-            else:
-                ins[self.field_name].text = sentence
+            ins[self.new_added_field_name] = sentence

         return dataset
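
Note: with fields holding plain values, the write side collapses to a single assignment whether or not `new_added_field_name` equals `field_name`, so the `TextField` branch disappears. A hedged usage sketch (the constructor signature and the converter wiring are assumed from `Processor.__init__` and the `self.span_converters` loop above, neither is shown in this diff):

    proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
    proc.span_converters = [AlphaSpanConverter(), DigitSpanConverter()]  # assumed wiring; the class may expose an add method instead
    dataset = proc.process(dataset)
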
@@ -1,13 +1,12 @@
 from fastNLP.core.instance import Instance
 from fastNLP.core.dataset import DataSet
 from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
 from reproduction.chinese_word_segment.process.cws_processor import *
-from reproduction.chinese_word_segment.utils import cut_long_training_sentences
-from reproduction.chinese_word_segment.process.span_converter import *
-from reproduction.chinese_word_segment.io import NaiveCWSReader
+from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter
+from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader


 tr_filename = ''
@@ -15,9 +14,8 @@ dev_filename = ''

 reader = NaiveCWSReader()

-tr_dataset = reader.load(tr_filename, cut=True)
-de_dataset = reader.load(dev_filename)
+tr_sentences = reader.load(tr_filename, cut_long_sent=True)
+dev_sentences = reader.load(dev_filename)

 # TODO: how to assemble these sentences into a DataSet
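
Note: one possible shape for the `construct_dataset` helper that this TODO asks about, using the `Instance` and `DataSet` classes imported at the top of the script (a sketch, not the PR's actual implementation):

    def construct_dataset(sentences):
        dataset = DataSet()
        for sentence in sentences:
            # one raw sentence per instance; downstream processors derive the other fields
            dataset.append(Instance(raw_sentence=sentence))
        return dataset

The `raw_sentence` field name matches what `FullSpaceToHalfSpaceProcessor('raw_sentence')` reads below.
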
@@ -32,7 +30,7 @@ def construct_dataset(sentences):
 tr_dataset = construct_dataset(tr_sentences)
-dev_dataset = construct_dataset(dev_sentence)
+dev_dataset = construct_dataset(dev_sentences)

 # 1. prepare the processors
 fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')
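
Note: since `Pipeline` is imported at the top, the processors prepared here would presumably be chained in order. A minimal sketch, assuming `Pipeline` exposes `add_processor` and is callable on a `DataSet` (everything past `fs2hs_proc` is illustrative):

    pipe = Pipeline()
    pipe.add_processor(fs2hs_proc)
    # ... add the span-converter, tagging, and indexer processors in the same order ...
    tr_dataset = pipe(tr_dataset)
    dev_dataset = pipe(dev_dataset)
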