
* update most processors to use dataset.apply

* fix failed tests
FengZiYjun · 6 years ago · commit 337e3035b3 · tags/v0.3.0
5 changed files with 79 additions and 47 deletions
  1. fastNLP/api/processor.py  (+22 −40)
  2. fastNLP/core/trainer.py  (+3 −2)
  3. test/api/test_pipeline.py  (+6 −0)
  4. test/api/test_processor.py  (+45 −2)
  5. test/core/test_trainer.py  (+3 −3)
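
The common refactor in this commit replaces hand-written instance loops with DataSet.apply, which runs a callable over every instance and stores the return values under new_field_name. A minimal before/after sketch of the pattern (field names are illustrative, not from the diff):

    from fastNLP.core.dataset import DataSet

    ds = DataSet({"words": [["a", "b"], ["c", "d", "e"]]})

    # before: mutate each instance by hand
    for ins in ds:
        ins["seq_len"] = len(ins["words"])

    # after: let the DataSet drive the loop and create the field
    ds.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")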

fastNLP/api/processor.py  (+22 −40)

@@ -77,14 +77,17 @@ class FullSpaceToHalfSpaceProcessor(Processor):


     def process(self, dataset):
         assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
-        for ins in dataset:
+
+        def inner_proc(ins):
             sentence = ins[self.field_name]
-            new_sentence = [None] * len(sentence)
+            new_sentence = [""] * len(sentence)
             for idx, char in enumerate(sentence):
                 if char in self.convert_map:
                     char = self.convert_map[char]
                 new_sentence[idx] = char
-            ins[self.field_name] = ''.join(new_sentence)
+            return "".join(new_sentence)
+
+        dataset.apply(inner_proc, new_field_name=self.field_name)
         return dataset
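
Passing new_field_name=self.field_name makes apply overwrite the field in place, preserving the old in-loop assignment semantics. A hedged usage sketch (the full-width input below is illustrative, and the expected output assumes convert_map covers these characters):

    from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
    from fastNLP.core.dataset import DataSet

    ds = DataSet({"word": ["ＡＢＣ１２３"]})          # full-width characters
    ds = FullSpaceToHalfSpaceProcessor("word")(ds)  # processors are callable, as in the tests below
    print(ds.field_arrays["word"].content)          # expected: ["ABC123"]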




@@ -94,9 +97,7 @@ class PreAppendProcessor(Processor):
         self.data = data

     def process(self, dataset):
-        for ins in dataset:
-            sent = ins[self.field_name]
-            ins[self.new_added_field_name] = [self.data] + sent
+        dataset.apply(lambda ins: [self.data] + ins[self.field_name], new_field_name=self.new_added_field_name)
         return dataset




@@ -108,9 +109,7 @@ class SliceProcessor(Processor):
         self.slice = slice(start, end, step)

     def process(self, dataset):
-        for ins in dataset:
-            sent = ins[self.field_name]
-            ins[self.new_added_field_name] = sent[self.slice]
+        dataset.apply(lambda ins: ins[self.field_name][self.slice], new_field_name=self.new_added_field_name)
         return dataset
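
For one-expression transforms like PreAppendProcessor and SliceProcessor, the loop collapses into a single lambda. The slice semantics are easy to sanity-check in plain Python:

    s = slice(10, 20, 2)
    assert list(range(30))[s] == [10, 12, 14, 16, 18]  # 5 elements, matching the new test below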




@@ -121,14 +120,17 @@ class Num2TagProcessor(Processor):
         self.pattern = r'[-+]?([0-9]+[.]?[0-9]*)+[/eE]?[-+]?([0-9]+[.]?[0-9]*)'

     def process(self, dataset):
-        for ins in dataset:
+
+        def inner_proc(ins):
             s = ins[self.field_name]
             new_s = [None] * len(s)
             for i, w in enumerate(s):
                 if re.search(self.pattern, w) is not None:
                     w = self.tag
                 new_s[i] = w
-            ins[self.new_added_field_name] = new_s
+            return new_s
+
+        dataset.apply(inner_proc, new_field_name=self.new_added_field_name)
         return dataset
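
The number regex is worth a sanity check, since re.search matches anywhere in a token; integers, decimals, and scientific notation are all tagged, while purely alphabetic tokens pass through:

    import re

    pattern = r'[-+]?([0-9]+[.]?[0-9]*)+[/eE]?[-+]?([0-9]+[.]?[0-9]*)'
    for tok in ["99.9982", "2134.0", "0.002", "234", "abc"]:
        print(tok, bool(re.search(pattern, tok)))  # only "abc" prints False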




@@ -149,11 +151,8 @@ class IndexerProcessor(Processor):


     def process(self, dataset):
         assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
-        for ins in dataset:
-            tokens = ins[self.field_name]
-            index = [self.vocab.to_index(token) for token in tokens]
-            ins[self.new_added_field_name] = index
-
+        dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
+                      new_field_name=self.new_added_field_name)
         if self.is_input:
             dataset.set_input(self.new_added_field_name)
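
Marking the new index field with set_input is what later exposes it to batching as model input. The indexing step itself relies only on Vocabulary.to_index, which this file already uses; a small sketch:

    from fastNLP import Vocabulary

    vocab = Vocabulary()
    vocab.update(["the", "cat", "sat"])
    vocab.build_vocab()
    print([vocab.to_index(w) for w in ["the", "cat", "dog"]])  # "dog" is expected to map to the unknown index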


@@ -167,6 +166,7 @@ class VocabProcessor(Processor):
"""Build vocabulary with a field in the data set. """Build vocabulary with a field in the data set.


""" """

def __init__(self, field_name): def __init__(self, field_name):
super(VocabProcessor, self).__init__(field_name, None) super(VocabProcessor, self).__init__(field_name, None)
self.vocab = Vocabulary() self.vocab = Vocabulary()
@@ -175,8 +175,7 @@ class VocabProcessor(Processor):
         for dataset in datasets:
             assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
             for ins in dataset:
-                tokens = ins[self.field_name]
-                self.vocab.update(tokens)
+                self.vocab.update(ins[self.field_name])

     def get_vocab(self):
         self.vocab.build_vocab()
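
Since process iterates over datasets, a single VocabProcessor can accumulate one shared vocabulary across several splits before get_vocab finalizes it. A sketch, assuming process accepts multiple DataSet arguments as the loop suggests (train_ds and dev_ds are illustrative):

    vocab_proc = VocabProcessor("words")
    vocab_proc.process(train_ds, dev_ds)  # updates one shared Vocabulary
    vocab = vocab_proc.get_vocab()        # build_vocab() is called here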
@@ -190,9 +189,7 @@ class SeqLenProcessor(Processor):


     def process(self, dataset):
         assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
-        for ins in dataset:
-            length = len(ins[self.field_name])
-            ins[self.new_added_field_name] = length
+        dataset.apply(lambda ins: len(ins[self.field_name]), new_field_name=self.new_added_field_name)
         if self.is_input:
             dataset.set_input(self.new_added_field_name)
         return dataset
@@ -225,7 +222,7 @@ class ModelProcessor(Processor):
             for key, value in prediction.items():
                 tmp_batch = []
                 value = value.cpu().numpy()
-                if len(value.shape) == 1 or (len(value.shape)==2 and value.shape[1]==1):
+                if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
                     batch_output[key].extend(value.tolist())
                 else:
                     for idx, seq_len in enumerate(seq_lens):
@@ -236,7 +233,7 @@ class ModelProcessor(Processor):


         # TODO: with the current implementation, downstream processors need to know the keys of the model's output
         for field_name, fields in batch_output.items():
-            dataset.add_field(field_name, fields, need_tensor=False, is_target=False)
+            dataset.add_field(field_name, fields, is_input=True, is_target=False)

         return dataset
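
Besides dropping the obsolete need_tensor flag, this line changes behavior: model outputs are now flagged as input fields, so downstream processors can consume them. A toy sketch of the new add_field call (field names illustrative):

    ds = DataSet({"x": [[1], [2], [3]]})
    ds.add_field("pred", [0, 1, 0], is_input=True, is_target=False)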


@@ -254,23 +251,8 @@ class Index2WordProcessor(Processor):
         self.vocab = vocab

     def process(self, dataset):
-        for ins in dataset:
-            new_sent = [self.vocab.to_word(w) for w in ins[self.field_name]]
-            ins[self.new_added_field_name] = new_sent
-        return dataset
-
-
-class SetTensorProcessor(Processor):
-    # TODO: remove it. It is strange.
-    def __init__(self, field_dict, default=False):
-        super(SetTensorProcessor, self).__init__(None, None)
-        self.field_dict = field_dict
-        self.default = default
-
-    def process(self, dataset):
-        set_dict = {name: self.default for name in dataset.get_all_fields().keys()}
-        set_dict.update(self.field_dict)
-        dataset._set_need_tensor(**set_dict)
+        dataset.apply(lambda ins: [self.vocab.to_word(w) for w in ins[self.field_name]],
+                      new_field_name=self.new_added_field_name)
         return dataset
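
SetTensorProcessor is deleted outright here; its need_tensor concept no longer exists in the DataSet API. Index2WordProcessor, meanwhile, is the inverse of IndexerProcessor, so the two can round-trip a field. A sketch, assuming the constructor order (vocab, field_name, new_added_field_name) matches IndexerProcessor's:

    ds = IndexerProcessor(vocab, "words", "word_ids")(ds)
    ds = Index2WordProcessor(vocab, "word_ids", "words_back")(ds)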






fastNLP/core/trainer.py  (+3 −2)

@@ -3,11 +3,11 @@ import time
 from datetime import datetime
 from datetime import timedelta

+import numpy as np
 import torch
 from tensorboardX import SummaryWriter
 from torch import nn
 from tqdm.autonotebook import tqdm
-import numpy as np

 from fastNLP.core.batch import Batch
 from fastNLP.core.dataset import DataSet
@@ -201,7 +201,7 @@ class Trainer(object):
                 results['best_step'] = self.best_dev_step
             if load_best_model:
                 model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])
-                self._load_model(self.model, model_name)
+                # self._load_model(self.model, model_name)
                 print("Reloaded the best model.")
         finally:
             self._summary_writer.close()
@@ -361,6 +361,7 @@ class Trainer(object):
         torch.save(model, model_name)

     def _load_model(self, model, model_name, only_param=False):
+        # TODO: is something wrong here?
         if self.save_path is not None:
             model_name = os.path.join(self.save_path, model_name)
         if only_param:
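
For context, the load half of the save/load pair that the new TODO questions typically looks like the following; this is a sketch assembled from the surrounding context lines, not part of this diff:

    import os
    import torch

    def _load_model(self, model, model_name, only_param=False):
        if self.save_path is not None:
            model_name = os.path.join(self.save_path, model_name)
        if only_param:
            model.load_state_dict(torch.load(model_name))  # restore parameters only
        else:
            model = torch.load(model_name)                 # unpickle the whole module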


test/api/test_pipeline.py  (+6 −0)

@@ -0,0 +1,6 @@
+import unittest
+
+
+class TestPipeline(unittest.TestCase):
+    def test_case(self):
+        pass

test/api/test_processor.py  (+45 −2)

@@ -1,6 +1,9 @@
+import random
 import unittest

-from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
+from fastNLP import Vocabulary
+from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor, PreAppendProcessor, SliceProcessor, Num2TagProcessor, \
+    IndexerProcessor, VocabProcessor, SeqLenProcessor
 from fastNLP.core.dataset import DataSet




@@ -9,4 +12,44 @@ class TestProcessor(unittest.TestCase):
ds = DataSet({"word": ["00, u1, u), (u2, u2"]}) ds = DataSet({"word": ["00, u1, u), (u2, u2"]})
proc = FullSpaceToHalfSpaceProcessor("word") proc = FullSpaceToHalfSpaceProcessor("word")
ds = proc(ds) ds = proc(ds)
self.assertTrue(ds.field_arrays["word"].content, ["00, u1, u), (u2, u2"])
self.assertEqual(ds.field_arrays["word"].content, ["00, u1, u), (u2, u2"])

def test_PreAppendProcessor(self):
ds = DataSet({"word": [["1234", "3456"], ["8789", "3464"]]})
proc = PreAppendProcessor(data="abc", field_name="word")
ds = proc(ds)
self.assertEqual(ds.field_arrays["word"].content, [["abc", "1234", "3456"], ["abc", "8789", "3464"]])

def test_SliceProcessor(self):
ds = DataSet({"xx": [[random.randint(0, 10) for _ in range(30)]] * 40})
proc = SliceProcessor(10, 20, 2, "xx", new_added_field_name="yy")
ds = proc(ds)
self.assertEqual(len(ds.field_arrays["yy"].content[0]), 5)

def test_Num2TagProcessor(self):
ds = DataSet({"num": [["99.9982", "2134.0"], ["0.002", "234"]]})
proc = Num2TagProcessor("<num>", "num")
ds = proc(ds)
for data in ds.field_arrays["num"].content:
for d in data:
self.assertEqual(d, "<num>")

def test_VocabProcessor_and_IndexerProcessor(self):
ds = DataSet({"xx": [[str(random.randint(0, 10)) for _ in range(30)]] * 40})
vocab_proc = VocabProcessor("xx")
vocab_proc(ds)
vocab = vocab_proc.vocab
self.assertTrue(isinstance(vocab, Vocabulary))
self.assertTrue(len(vocab) > 5)

proc = IndexerProcessor(vocab, "xx", "yy")
ds = proc(ds)
for data in ds.field_arrays["yy"].content[0]:
self.assertTrue(isinstance(data, int))

def test_SeqLenProcessor(self):
ds = DataSet({"xx": [[str(random.randint(0, 10)) for _ in range(30)]] * 10})
proc = SeqLenProcessor("xx", "len")
ds = proc(ds)
for data in ds.field_arrays["len"].content:
self.assertEqual(data, 30)
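
These tests exercise each migrated processor end to end, and the first one also fixes a latent bug: assertTrue silently ignored its second argument, so the old assertion could never fail; assertEqual actually compares. Assuming pytest is available, the file can be run standalone with python -m pytest test/api/test_processor.py -v.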

test/core/test_trainer.py  (+3 −3)

@@ -1,10 +1,10 @@
+import time
 import unittest

 import numpy as np
 import torch.nn.functional as F
 from torch import nn
-import time
-from fastNLP.core.utils import CheckError

 from fastNLP.core.dataset import DataSet
 from fastNLP.core.instance import Instance
 from fastNLP.core.losses import BCELoss
@@ -83,7 +83,7 @@ class TrainerTestGround(unittest.TestCase):


         model = Model()

-        with self.assertRaises(NameError):
+        with self.assertRaises(RuntimeError):
             trainer = Trainer(
                 train_data=dataset,
                 model=model

