diff --git a/fastNLP/api/processor.py b/fastNLP/api/processor.py
index a01810ac..300dd8ac 100644
--- a/fastNLP/api/processor.py
+++ b/fastNLP/api/processor.py
@@ -73,16 +73,16 @@ class FullSpaceToHalfSpaceProcessor(Processor):
                 if char in self.convert_map:
                     char = self.convert_map[char]
                 new_sentence[idx] = char
-            ins[self.field_name].text = ''.join(new_sentence)
+            ins[self.field_name] = ''.join(new_sentence)
         return dataset
 
 
 class IndexerProcessor(Processor):
-    def __init__(self, vocab, field_name):
+    def __init__(self, vocab, field_name, new_added_field_name):
 
         assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
 
-        super(IndexerProcessor, self).__init__(field_name, None)
+        super(IndexerProcessor, self).__init__(field_name, new_added_field_name)
         self.vocab = vocab
 
     def set_vocab(self, vocab):
@@ -93,9 +93,9 @@ class IndexerProcessor(Processor):
     def process(self, dataset):
         assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
         for ins in dataset:
-            tokens = ins[self.field_name].content
+            tokens = ins[self.field_name]
             index = [self.vocab.to_index(token) for token in tokens]
-            ins[self.field_name]._index = index
+            ins[self.new_added_field_name] = index
 
         return dataset
 
@@ -110,7 +110,7 @@ class VocabProcessor(Processor):
         for dataset in datasets:
             assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
             for ins in dataset:
-                tokens = ins[self.field_name].content
+                tokens = ins[self.field_name]
                 self.vocab.update(tokens)
 
     def get_vocab(self):
diff --git a/reproduction/chinese_word_segment/process/cws_processor.py b/reproduction/chinese_word_segment/process/cws_processor.py
index 3e6b9c3b..c025895f 100644
--- a/reproduction/chinese_word_segment/process/cws_processor.py
+++ b/reproduction/chinese_word_segment/process/cws_processor.py
@@ -5,9 +5,8 @@ import re
 from fastNLP.core.field import SeqLabelField
 from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.core.dataset import DataSet
-
 from fastNLP.api.processor import Processor
-from reproduction.chinese_word_segment.process.span_converter import *
+from reproduction.chinese_word_segment.process.span_converter import SpanConverter
 
 
 _SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'
@@ -25,11 +24,7 @@ class SpeicalSpanProcessor(Processor):
             sentence = ins[self.field_name].text
             for span_converter in self.span_converters:
                 sentence = span_converter.find_certain_span_and_replace(sentence)
-            if self.new_added_field_name!=self.field_name:
-                new_text_field = TextField(sentence, is_target=False)
-                ins[self.new_added_field_name] = new_text_field
-            else:
-                ins[self.field_name].text = sentence
+            ins[self.new_added_field_name] = sentence
 
         return dataset
 
diff --git a/reproduction/chinese_word_segment/train_context.py b/reproduction/chinese_word_segment/train_context.py
index 691a97a6..de6513d3 100644
--- a/reproduction/chinese_word_segment/train_context.py
+++ b/reproduction/chinese_word_segment/train_context.py
@@ -1,13 +1,12 @@
 from fastNLP.core.instance import Instance
 from fastNLP.core.dataset import DataSet
-
-
 from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
+
 from reproduction.chinese_word_segment.process.cws_processor import *
-from reproduction.chinese_word_segment.utils import cut_long_training_sentences
-from reproduction.chinese_word_segment.process.span_converter import *
-from reproduction.chinese_word_segment.io import NaiveCWSReader
+from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter
+from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader
 
 
 
 tr_filename = ''
@@ -15,9 +14,8 @@ dev_filename = ''
 
 reader = NaiveCWSReader()
 
-tr_dataset = reader.load(tr_filename, cut=True)
-de_dataset = reader.load(dev_filename)
-
+tr_sentences = reader.load(tr_filename, cut_long_sent=True)
+dev_sentences = reader.load(dev_filename)
 
 # TODO how to assemble these into a DataSet
 
@@ -32,7 +30,7 @@ def construct_dataset(sentences):
 
 tr_dataset = construct_dataset(tr_sentences)
-dev_dataset = construct_dataset(dev_sentence)
+dev_dataset = construct_dataset(dev_sentences)
 
 
 # 1. prepare the processors
 fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')
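
For context, a minimal usage sketch of the processor contract after this change (illustrative, not part of the patch): processors now read plain values from ins[field_name] and write their output to ins[new_added_field_name], instead of mutating TextField.text or the old field's _index attribute. The token_list/token_index_list field names and the sample data below are hypothetical; VocabProcessor taking the field name in its constructor, and DataSet.append/Instance(**fields), are assumed from their use elsewhere in this branch.

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.api.processor import VocabProcessor, IndexerProcessor

# Hypothetical pre-tokenized data; field names are illustrative.
dataset = DataSet()
for tokens in [['this', 'is', 'an', 'example'], ['another', 'one']]:
    dataset.append(Instance(token_list=tokens))

# 1. Collect a vocabulary over the token field.
vocab_proc = VocabProcessor('token_list')  # assumed constructor signature
vocab_proc.process(dataset)

# 2. Index the tokens into a NEW field, rather than stashing indices in the
#    old field's _index attribute (which this patch removes).
indexer_proc = IndexerProcessor(vocab_proc.get_vocab(), 'token_list',
                                new_added_field_name='token_index_list')
indexer_proc.process(dataset)
# Each instance now carries both 'token_list' and 'token_index_list'.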