Modify the processors to adapt to yesterday's refactoring

tags/v0.2.0
yh_cc, 6 years ago
parent commit 25a53ac5c9

3 changed files with 15 additions and 22 deletions
  1. fastNLP/api/processor.py (+6, -6)
  2. reproduction/chinese_word_segment/process/cws_processor.py (+2, -7)
  3. reproduction/chinese_word_segment/train_context.py (+7, -9)

fastNLP/api/processor.py (+6, -6)

@@ -73,16 +73,16 @@ class FullSpaceToHalfSpaceProcessor(Processor):
                 if char in self.convert_map:
                     char = self.convert_map[char]
                 new_sentence[idx] = char
-            ins[self.field_name].text = ''.join(new_sentence)
+            ins[self.field_name] = ''.join(new_sentence)
         return dataset
 
 
 class IndexerProcessor(Processor):
-    def __init__(self, vocab, field_name):
+    def __init__(self, vocab, field_name, new_added_field_name):
 
         assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
 
-        super(IndexerProcessor, self).__init__(field_name, None)
+        super(IndexerProcessor, self).__init__(field_name, new_added_field_name)
         self.vocab = vocab
 
     def set_vocab(self, vocab):
@@ -93,9 +93,9 @@ class IndexerProcessor(Processor):
     def process(self, dataset):
         assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
         for ins in dataset:
-            tokens = ins[self.field_name].content
+            tokens = ins[self.field_name]
            index = [self.vocab.to_index(token) for token in tokens]
-            ins[self.field_name]._index = index
+            ins[self.new_added_field_name] = index
 
         return dataset
 
@@ -110,7 +110,7 @@ class VocabProcessor(Processor):
         for dataset in datasets:
             assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
             for ins in dataset:
-                tokens = ins[self.field_name].content
+                tokens = ins[self.field_name]
                 self.vocab.update(tokens)
 
     def get_vocab(self):


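The processor.py changes all point the same direction: processors now read plain values from a field and write results into an explicitly named new field, instead of reaching into TextField internals (.text, .content, ._index). A minimal usage sketch of the updated IndexerProcessor, assuming a pre-built DataSet whose 'words' field holds token lists ('words' and 'word_ids' are illustrative names, not fields this commit defines):

    from fastNLP.core.vocabulary import Vocabulary
    from fastNLP.api.processor import IndexerProcessor

    # Build a vocabulary with the same update() call the VocabProcessor
    # hunk above uses, then index the token field into a new field.
    vocab = Vocabulary()
    for ins in dataset:                  # `dataset` is an assumed, pre-built DataSet
        vocab.update(ins['words'])

    indexer = IndexerProcessor(vocab, field_name='words',
                               new_added_field_name='word_ids')
    dataset = indexer.process(dataset)
    # afterwards each instance carries both ins['words'] (tokens)
    # and ins['word_ids'] (vocabulary indices)
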
reproduction/chinese_word_segment/process/cws_processor.py (+2, -7)

@@ -5,9 +5,8 @@ import re
 from fastNLP.core.field import SeqLabelField
 from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.core.dataset import DataSet
-
 from fastNLP.api.processor import Processor
-from reproduction.chinese_word_segment.process.span_converter import *
+from reproduction.chinese_word_segment.process.span_converter import SpanConverter
 
 _SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'
 
@@ -25,11 +24,7 @@ class SpeicalSpanProcessor(Processor):
             sentence = ins[self.field_name].text
             for span_converter in self.span_converters:
                 sentence = span_converter.find_certain_span_and_replace(sentence)
-            if self.new_added_field_name!=self.field_name:
-                new_text_field = TextField(sentence, is_target=False)
-                ins[self.new_added_field_name] = new_text_field
-            else:
-                ins[self.field_name].text = sentence
+            ins[self.new_added_field_name] = sentence
 
         return dataset


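With the deleted branch gone, SpeicalSpanProcessor unconditionally writes the converted string to new_added_field_name rather than sometimes mutating a TextField in place. A sketch of the intended call pattern, assuming the constructor follows the Processor base signature (field_name, new_added_field_name) and that converters are registered via an add_span_converter method that fills the self.span_converters list the hunk iterates over:

    from reproduction.chinese_word_segment.process.cws_processor import SpeicalSpanProcessor
    from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter

    # Replace alphabetic and digit spans in the raw sentence with
    # placeholder tags, writing the result to a new plain-string field.
    span_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
    span_proc.add_span_converter(AlphaSpanConverter())
    span_proc.add_span_converter(DigitSpanConverter())
    dataset = span_proc.process(dataset)   # `dataset` as in the sketch above
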
reproduction/chinese_word_segment/train_context.py (+7, -9)

@@ -1,13 +1,12 @@
 
 from fastNLP.core.instance import Instance
 from fastNLP.core.dataset import DataSet
 
 from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
 
 from reproduction.chinese_word_segment.process.cws_processor import *
-from reproduction.chinese_word_segment.process.span_converter import *
-from reproduction.chinese_word_segment.io import NaiveCWSReader
-
+from reproduction.chinese_word_segment.utils import cut_long_training_sentences
+from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter
+from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader
 
 tr_filename = ''
@@ -15,9 +14,8 @@ dev_filename = ''
 
 reader = NaiveCWSReader()
 
-tr_dataset = reader.load(tr_filename, cut=True)
-de_dataset = reader.load(dev_filename)
-
+tr_sentences = reader.load(tr_filename, cut_long_sent=True)
+dev_sentences = reader.load(dev_filename)
 
 
 # TODO how to assemble these sentences into a DataSet
@@ -32,7 +30,7 @@ def construct_dataset(sentences):
 
 
 tr_dataset = construct_dataset(tr_sentences)
-dev_dataset = construct_dataset(dev_sentence)
+dev_dataset = construct_dataset(dev_sentences)
 
 # 1. prepare the processors
 fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')

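Downstream of fs2hs_proc, the script would chain the remaining processors the same way; a Pipeline can then bundle them so the identical preprocessing is replayed at inference time. A sketch under the assumption that Pipeline exposes add_processor() and runs its processors in order when called on a DataSet:

    from fastNLP.api.pipeline import Pipeline

    pipeline = Pipeline()
    pipeline.add_processor(fs2hs_proc)   # full-width -> half-width normalization
    pipeline.add_processor(span_proc)    # span conversion, from the sketch above
    tr_dataset = pipeline(tr_dataset)
    dev_dataset = pipeline(dev_dataset)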
