Browse Source

修改processor适配昨天的sao操作 (Modify the processors to adapt to yesterday's refactoring changes)

tags/v0.2.0
yh_cc 6 years ago
parent
commit
25a53ac5c9
3 changed files with 15 additions and 22 deletions
  1. +6
    -6
      fastNLP/api/processor.py
  2. +2
    -7
      reproduction/chinese_word_segment/process/cws_processor.py
  3. +7
    -9
      reproduction/chinese_word_segment/train_context.py

+ 6
- 6
fastNLP/api/processor.py View File

@@ -73,16 +73,16 @@ class FullSpaceToHalfSpaceProcessor(Processor):
if char in self.convert_map:
char = self.convert_map[char]
new_sentence[idx] = char
ins[self.field_name].text = ''.join(new_sentence)
ins[self.field_name] = ''.join(new_sentence)
return dataset


class IndexerProcessor(Processor):
def __init__(self, vocab, field_name):
def __init__(self, vocab, field_name, new_added_field_name):

assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))

super(IndexerProcessor, self).__init__(field_name, None)
super(IndexerProcessor, self).__init__(field_name, new_added_field_name)
self.vocab = vocab

def set_vocab(self, vocab):
@@ -93,9 +93,9 @@ class IndexerProcessor(Processor):
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
tokens = ins[self.field_name].content
tokens = ins[self.field_name]
index = [self.vocab.to_index(token) for token in tokens]
ins[self.field_name]._index = index
ins[self.new_added_field_name] = index

return dataset

@@ -110,7 +110,7 @@ class VocabProcessor(Processor):
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
tokens = ins[self.field_name].content
tokens = ins[self.field_name]
self.vocab.update(tokens)

def get_vocab(self):


+ 2
- 7
reproduction/chinese_word_segment/process/cws_processor.py View File

@@ -5,9 +5,8 @@ import re
from fastNLP.core.field import SeqLabelField
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet

from fastNLP.api.processor import Processor
from reproduction.chinese_word_segment.process.span_converter import *
from reproduction.chinese_word_segment.process.span_converter import SpanConverter

_SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'

@@ -25,11 +24,7 @@ class SpeicalSpanProcessor(Processor):
sentence = ins[self.field_name].text
for span_converter in self.span_converters:
sentence = span_converter.find_certain_span_and_replace(sentence)
if self.new_added_field_name!=self.field_name:
new_text_field = TextField(sentence, is_target=False)
ins[self.new_added_field_name] = new_text_field
else:
ins[self.field_name].text = sentence
ins[self.new_added_field_name] = sentence

return dataset



+ 7
- 9
reproduction/chinese_word_segment/train_context.py View File

@@ -1,13 +1,12 @@

from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet


from fastNLP.api.pipeline import Pipeline
from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor

from reproduction.chinese_word_segment.process.cws_processor import *
from reproduction.chinese_word_segment.utils import cut_long_training_sentences
from reproduction.chinese_word_segment.process.span_converter import *
from reproduction.chinese_word_segment.io import NaiveCWSReader
from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter
from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader


tr_filename = ''
@@ -15,9 +14,8 @@ dev_filename = ''

reader = NaiveCWSReader()

tr_dataset = reader.load(tr_filename, cut=True)
de_dataset = reader.load(dev_filename)

tr_sentences = reader.load(tr_filename, cut_long_sent=True)
dev_sentences = reader.load(dev_filename)


# TODO 如何组建成为一个Dataset
@@ -32,7 +30,7 @@ def construct_dataset(sentences):


tr_dataset = construct_dataset(tr_sentences)
dev_dataset = construct_dataset(dev_sentence)
dev_dataset = construct_dataset(dev_sentences)

# 1. 准备processor
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')


Loading…
Cancel
Save