|
@@ -2,6 +2,8 @@ |
|
|
from fastNLP.core.dataset import DataSet |
|
|
from fastNLP.core.dataset import DataSet |
|
|
from fastNLP.core.vocabulary import Vocabulary |
|
|
from fastNLP.core.vocabulary import Vocabulary |
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
|
|
|
|
|
|
class Processor: |
|
|
class Processor: |
|
|
def __init__(self, field_name, new_added_field_name): |
|
|
def __init__(self, field_name, new_added_field_name): |
|
|
self.field_name = field_name |
|
|
self.field_name = field_name |
|
@@ -64,6 +66,7 @@ class FullSpaceToHalfSpaceProcessor(Processor): |
|
|
if self.change_space: |
|
|
if self.change_space: |
|
|
FHs += FH_SPACE |
|
|
FHs += FH_SPACE |
|
|
self.convert_map = {k: v for k, v in FHs} |
|
|
self.convert_map = {k: v for k, v in FHs} |
|
|
|
|
|
|
|
|
def process(self, dataset): |
|
|
def process(self, dataset): |
|
|
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) |
|
|
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) |
|
|
for ins in dataset: |
|
|
for ins in dataset: |
|
@@ -77,6 +80,37 @@ class FullSpaceToHalfSpaceProcessor(Processor): |
|
|
return dataset |
|
|
return dataset |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MapFieldProcessor(Processor):
    """Apply a caller-supplied function to one field of every instance.

    For each instance in the dataset, the value stored under ``field_name``
    is passed to ``func`` and the result is written to the instance under
    ``self.new_added_field_name``.
    """

    def __init__(self, func, field_name, new_added_field_name=None):
        """Store the mapping function and the field names.

        :param func: callable invoked with the value of ``field_name``; its
            return value becomes the new field value.
        :param field_name: name of the field to read from each instance.
        :param new_added_field_name: name of the field to write to.
            NOTE(review): how the base class treats ``None`` is not visible
            from this block -- confirm against ``Processor.__init__``.
        """
        super().__init__(field_name, new_added_field_name)
        self.func = func

    def process(self, dataset):
        """Map ``self.func`` over ``field_name`` of each instance.

        :param dataset: iterable of instances supporting item get/set.
        :return: the same ``dataset`` object, with its instances updated.
        """
        for instance in dataset:
            instance[self.new_added_field_name] = self.func(instance[self.field_name])
        return dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Num2TagProcessor(Processor):
    """Replace number-like tokens of a field with a fixed tag.

    A token is replaced when ``self.pattern`` is found anywhere inside it
    (``re.search``). The pattern is: an optional sign, one or more digits,
    one or more of ``.``, ``/`` or ``e``, an optional sign, then zero or
    more digits. A bare run of digits with none of those separators does
    not match.
    """

    def __init__(self, tag, field_name, new_added_field_name=None):
        """Store the replacement tag and the number pattern.

        :param tag: value substituted for every token that matches.
        :param field_name: name of the field holding the tokens.
        :param new_added_field_name: name of the field to write to.
            NOTE(review): how the base class treats ``None`` is not visible
            from this block -- confirm against ``Processor.__init__``.
        """
        super().__init__(field_name, new_added_field_name)
        self.tag = tag
        self.pattern = r'[-+]?[0-9]+[\./e]+[-+]?[0-9]*'

    def process(self, dataset):
        """Write a tagged copy of ``field_name`` into each instance.

        :param dataset: iterable of instances supporting item get/set; the
            value under ``field_name`` is iterated token by token, and each
            token is handed to ``re.search``.
        :return: the same ``dataset`` object, with its instances updated.
        """
        for instance in dataset:
            tokens = instance[self.field_name]
            # Keep a token unchanged unless the number pattern occurs in it.
            instance[self.new_added_field_name] = [
                self.tag if re.search(self.pattern, token) is not None else token
                for token in tokens
            ]
        return dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class IndexerProcessor(Processor): |
|
|
class IndexerProcessor(Processor): |
|
|
def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False): |
|
|
def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False): |
|
|
|
|
|
|
|
|