
* update the tutorials; they now live under ./tutorial

* remove unused codes in metrics.py
* add tests for DataSet
* add tests for FieldArray
* add tests for metrics.py
* fix predictor, add tests for predictor
* fix bucket sampler, add tests for bucket sampler
tags/v0.2.0^2
FengZiYjun committed 5 years ago
commit f26f11608b
12 changed files with 1316 additions and 128 deletions
  1. +0     -1    fastNLP/core/__init__.py
  2. +4     -1    fastNLP/core/dataset.py
  3. +2     -114  fastNLP/core/metrics.py
  4. +2     -2    fastNLP/core/predictor.py
  5. +1     -1    fastNLP/core/sampler.py
  6. +0     -7    fastNLP/core/vocabulary.py
  7. +23    -0    test/core/test_dataset.py
  8. +22    -0    test/core/test_fieldarray.py
  9. +13    -0    test/core/test_metrics.py
  10. +29   -1    test/core/test_predictor.py
  11. +11   -1    test/core/test_sampler.py
  12. +1209 -0    tutorials/fastnlp_tutorial_1204.ipynb

+0 -1  fastNLP/core/__init__.py

@@ -3,7 +3,6 @@ from .dataset import DataSet
 from .fieldarray import FieldArray
 from .instance import Instance
 from .losses import Loss
-from .metrics import Evaluator, ClassifyEvaluator, SNLIEvaluator, SeqLabelEvaluator
 from .optimizer import Optimizer
 from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler
 from .tester import Tester
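Any downstream code that did `from fastNLP.core import Evaluator` (or any of the other removed names) now fails at import time. The new tests import the surviving metric utilities straight from the metrics module instead — a minimal sketch of the replacement imports, using only the names the tests below actually use:

# the Evaluator family is gone from fastNLP.core; import metric utilities directly
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.metrics import accuracy_score, precision_score, recall_score, f1_score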


+4 -1  fastNLP/core/dataset.py

@@ -1,4 +1,5 @@
 import _pickle as pickle
+
 import numpy as np

 from fastNLP.core.fieldarray import FieldArray
@@ -66,10 +67,12 @@ class DataSet(object):
     def __init__(self, dataset, idx):
         self.dataset = dataset
         self.idx = idx
+
     def __getitem__(self, item):
         assert item in self.dataset.field_arrays, "no such field:{} in instance {}".format(item, self.dataset[self.idx])
         assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx)
         return self.dataset.field_arrays[item][self.idx]
+
     def __repr__(self):
         return self.dataset[self.idx].__repr__()


@@ -339,6 +342,6 @@ class DataSet(object):
         pickle.dump(self, f)

     @staticmethod
-    def load(self, path):
+    def load(path):
         with open(path, 'rb') as f:
             return pickle.load(f)

+2 -114  fastNLP/core/metrics.py

@@ -304,118 +304,6 @@ def _prepare_metrics(metrics):
 return _metrics


-class Evaluator(object):
-    def __init__(self):
-        pass
-
-    def __call__(self, predict, truth):
-        """
-
-        :param predict: list of tensors, the network outputs from all batches.
-        :param truth: list of dict, the ground truths from all batch_y.
-        :return:
-        """
-        raise NotImplementedError
-
-
-class ClassifyEvaluator(Evaluator):
-    def __init__(self):
-        super(ClassifyEvaluator, self).__init__()
-
-    def __call__(self, predict, truth):
-        y_prob = [torch.nn.functional.softmax(y_logit, dim=-1) for y_logit in predict]
-        y_prob = torch.cat(y_prob, dim=0)
-        y_pred = torch.argmax(y_prob, dim=-1)
-        y_true = torch.cat(truth, dim=0)
-        acc = float(torch.sum(y_pred == y_true)) / len(y_true)
-        return {"accuracy": acc}
-
-
-class SeqLabelEvaluator(Evaluator):
-    def __init__(self):
-        super(SeqLabelEvaluator, self).__init__()
-
-    def __call__(self, predict, truth, **_):
-        """
-
-        :param predict: list of List, the network outputs from all batches.
-        :param truth: list of dict, the ground truths from all batch_y.
-        :return accuracy:
-        """
-        total_correct, total_count = 0., 0.
-        for x, y in zip(predict, truth):
-            x = torch.tensor(x)
-            y = y.to(x)  # make sure they are on the same device
-            mask = (y > 0)
-            correct = torch.sum(((x == y) * mask).long())
-            total_correct += float(correct)
-            total_count += float(torch.sum(mask.long()))
-        accuracy = total_correct / total_count
-        return {"accuracy": float(accuracy)}
-
-
-class SeqLabelEvaluator2(Evaluator):
-    # the evaluator above is believed to be wrong
-    def __init__(self, seq_lens_field_name='word_seq_origin_len'):
-        super(SeqLabelEvaluator2, self).__init__()
-        self.end_tagidx_set = set()
-        self.seq_lens_field_name = seq_lens_field_name
-
-    def __call__(self, predict, truth, **_):
-        """
-
-        :param predict: list of batch, the network outputs from all batches.
-        :param truth: list of dict, the ground truths from all batch_y.
-        :return accuracy:
-        """
-        seq_lens = _[self.seq_lens_field_name]
-        corr_count = 0
-        pred_count = 0
-        truth_count = 0
-        for x, y, seq_len in zip(predict, truth, seq_lens):
-            x = x.cpu().numpy()
-            y = y.cpu().numpy()
-            for idx, s_l in enumerate(seq_len):
-                x_ = x[idx]
-                y_ = y[idx]
-                x_ = x_[:s_l]
-                y_ = y_[:s_l]
-                flag = True
-                start = 0
-                for idx_i, (x_i, y_i) in enumerate(zip(x_, y_)):
-                    if x_i in self.end_tagidx_set:
-                        truth_count += 1
-                        for j in range(start, idx_i + 1):
-                            if y_[j] != x_[j]:
-                                flag = False
-                                break
-                        if flag:
-                            corr_count += 1
-                        flag = True
-                        start = idx_i + 1
-                    if y_i in self.end_tagidx_set:
-                        pred_count += 1
-        P = corr_count / (float(pred_count) + 1e-6)
-        R = corr_count / (float(truth_count) + 1e-6)
-        F = 2 * P * R / (P + R + 1e-6)
-
-        return {"P": P, 'R': R, 'F': F}
-
-
-class SNLIEvaluator(Evaluator):
-    def __init__(self):
-        super(SNLIEvaluator, self).__init__()
-
-    def __call__(self, predict, truth):
-        y_prob = [torch.nn.functional.softmax(y_logit, dim=-1) for y_logit in predict]
-        y_prob = torch.cat(y_prob, dim=0)
-        y_pred = torch.argmax(y_prob, dim=-1)
-        truth = [t['truth'] for t in truth]
-        y_true = torch.cat(truth, dim=0).view(-1)
-        acc = float(torch.sum(y_pred == y_true)) / y_true.size(0)
-        return {"accuracy": acc}


 def _conver_numpy(x):
     """convert input data to numpy array


@@ -467,11 +355,11 @@ def _check_data(y_true, y_pred):
     type_true, y_true = _label_types(y_true)
     type_pred, y_pred = _label_types(y_pred)

-    type_set = set(['binary', 'multiclass'])
+    type_set = {'binary', 'multiclass'}
     if type_true in type_set and type_pred in type_set:
         return type_true if type_true == type_pred else 'multiclass', y_true, y_pred

-    type_set = set(['multiclass-multioutput', 'multilabel'])
+    type_set = {'multiclass-multioutput', 'multilabel'}
     if type_true in type_set and type_pred in type_set:
         return type_true if type_true == type_pred else 'multiclass-multioutput', y_true, y_pred
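With the Evaluator classes removed, the scikit-learn-style helpers that remain in metrics.py are the low-level entry points; the new TestUsefulFunctions case below only checks that they run. A minimal multi-class sketch of their call signatures, taken from that test:

import numpy as np
from fastNLP.core.metrics import accuracy_score, precision_score, recall_score, f1_score

y_true = np.random.randint(0, 3, size=(10, 1))  # 3-class labels
y_pred = np.random.randint(0, 3, size=(10, 1))
acc = accuracy_score(y_true, y_pred)
per_class_p = precision_score(y_true, y_pred, average=None)  # average=None: one score per class
per_class_r = recall_score(y_true, y_pred, average=None)
per_class_f = f1_score(y_true, y_pred, average=None)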




+2 -2  fastNLP/core/predictor.py

@@ -23,13 +23,13 @@ class Predictor(object):


         :param network: a PyTorch model (cpu)
         :param data: a DataSet object.
-        :return: list of list of strings, [num_examples, tag_seq_length]
+        :return: list of batch outputs
         """
         # turn on the testing mode; clean up the history
         self.mode(network, test=True)
         batch_output = []

-        data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False)
+        data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)

         for batch_x, _ in data_iterator:
             with torch.no_grad():
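The fix swaps a keyword Batch does not accept (use_cuda) for its as_numpy flag, so predict() can actually construct its iterator instead of raising a TypeError. A minimal end-to-end sketch, condensed from the new test_predictor.py below (the toy Linear(2, 1) model and two-feature inputs come from that test):

from fastNLP.core.dataset import DataSet
from fastNLP.core.predictor import Predictor
from fastNLP.modules.encoder.linear import Linear

data = DataSet({"x": [[1.0, 1.0], [-1.0, -1.0]] * 2})
data.set_input("x")                                 # only input fields are fed to the model
outputs = Predictor().predict(Linear(2, 1), data)   # list of prediction tensors (the test asserts one per example)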


+1 -1  fastNLP/core/sampler.py

@@ -55,7 +55,7 @@ class BucketSampler(BaseSampler):


     def __call__(self, data_set):

-        seq_lens = data_set[self.seq_lens_field_name].content
+        seq_lens = data_set.get_fields()[self.seq_lens_field_name].content
         total_sample_num = len(seq_lens)

         bucket_indexes = []
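Indexing a DataSet directly with a field name was not what the old line assumed; per the fixed line, get_fields() maps field names to FieldArrays, so `.content` yields the raw sequence lengths. A minimal sketch of bucket sampling over a toy DataSet, mirroring the new test_BucketSampler test below:

import random
from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import BucketSampler

ds = DataSet({"x": [[0] * random.randint(1, 10)] * 10, "y": [[5, 6]] * 10})
ds.apply(lambda ins: len(ins["x"]), new_field_name="seq_len")  # lengths the sampler buckets on
sampler = BucketSampler(num_buckets=3, batch_size=16, seq_lens_field_name="seq_len")
indices = sampler(ds)  # a reordering of all 10 instance indices
assert len(indices) == 10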


+0 -7  fastNLP/core/vocabulary.py

@@ -1,12 +1,5 @@
 from collections import Counter


-def isiterable(p_object):
-    try:
-        _ = iter(p_object)
-    except TypeError:
-        return False
-    return True
-

 def check_build_vocab(func):
     """A decorator to make sure the indexing is built before used.


+23 -0  test/core/test_dataset.py

@@ -1,3 +1,4 @@
+import os
 import unittest

 from fastNLP.core.dataset import DataSet
@@ -90,6 +91,18 @@ class TestDataSet(unittest.TestCase):
         self.assertTrue("rx" in ds.field_arrays)
         self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])

+        ds.apply(lambda ins: len(ins["y"]), new_field_name="y")
+        self.assertEqual(ds.field_arrays["y"].content[0], 2)
+
+        res = ds.apply(lambda ins: len(ins["x"]))
+        self.assertTrue(isinstance(res, list) and len(res) > 0)
+        self.assertEqual(res[0], 4)
+
+    def test_drop(self):
+        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20})
+        ds.drop(lambda ins: len(ins["y"]) < 3)
+        self.assertEqual(len(ds), 20)
+
     def test_contains(self):
         ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
         self.assertTrue("x" in ds)
@@ -125,9 +138,19 @@ class TestDataSet(unittest.TestCase):
         ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
         self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target])

+    def test_save_load(self):
+        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
+        ds.save("./my_ds.pkl")
+        self.assertTrue(os.path.exists("./my_ds.pkl"))
+
+        ds_1 = DataSet.load("./my_ds.pkl")
+        os.remove("my_ds.pkl")
+        # passing without an exception is all this test checks
+

 class TestDataSetIter(unittest.TestCase):
     def test__repr__(self):
         ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
         for iter in ds:
             self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4],\n'y': [5, 6]}")


+22 -0  test/core/test_fieldarray.py

@@ -75,3 +75,25 @@ class TestFieldArray(unittest.TestCase):
         indices = [0, 1, 3, 4, 6]
         for a, b in zip(fa[indices], x[indices]):
             self.assertListEqual(a.tolist(), b.tolist())
+
+    def test_append(self):
+        with self.assertRaises(Exception):
+            fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
+            fa.append(0)
+
+        with self.assertRaises(Exception):
+            fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True)
+            fa.append([1, 2, 3, 4, 5])
+
+        with self.assertRaises(Exception):
+            fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
+            fa.append([])
+
+        with self.assertRaises(Exception):
+            fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
+            fa.append(["str", 0, 0, 0, 1.89])
+
+        fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
+        fa.append([1.2, 2.3, 3.4, 4.5, 5.6])
+        self.assertEqual(len(fa), 3)
+        self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6])

+13 -0  test/core/test_metrics.py

@@ -4,6 +4,7 @@ import numpy as np
 import torch

 from fastNLP.core.metrics import AccuracyMetric
+from fastNLP.core.metrics import accuracy_score, recall_score, precision_score, f1_score


 class TestAccuracyMetric(unittest.TestCase):
@@ -132,3 +133,15 @@ class TestAccuracyMetric(unittest.TestCase):
             print(e)
             return
         self.assertTrue(True, False), "No exception catches."


+class TestUsefulFunctions(unittest.TestCase):
+    # exercise a few useful-looking helper functions in metrics.py
+    def test_case_1(self):
+        # multi-class
+        _ = accuracy_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)))
+        _ = precision_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None)
+        _ = recall_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None)
+        _ = f1_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None)
+
+        # running without an exception is enough

+29 -1  test/core/test_predictor.py

@@ -1,6 +1,34 @@
 import unittest

+import numpy as np
+import torch
+
+from fastNLP.core.dataset import DataSet
+from fastNLP.core.instance import Instance
+from fastNLP.core.predictor import Predictor
+from fastNLP.modules.encoder.linear import Linear
+
+
+def prepare_fake_dataset():
+    mean = np.array([-3, -3])
+    cov = np.array([[1, 0], [0, 1]])
+    class_A = np.random.multivariate_normal(mean, cov, size=(1000,))
+
+    mean = np.array([3, 3])
+    cov = np.array([[1, 0], [0, 1]])
+    class_B = np.random.multivariate_normal(mean, cov, size=(1000,))
+
+    data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
+                       [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
+    return data_set
+
+
 class TestPredictor(unittest.TestCase):
     def test(self):
-        pass
+        predictor = Predictor()
+        model = Linear(2, 1)
+        data = prepare_fake_dataset()
+        data.set_input("x")
+        ans = predictor.predict(model, data)
+        self.assertEqual(len(ans), 2000)
+        self.assertTrue(isinstance(ans[0], torch.Tensor))

+11 -1  test/core/test_sampler.py

@@ -1,9 +1,11 @@
+import random
 import unittest

 import torch

+from fastNLP.core.dataset import DataSet
 from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler, \
-    k_means_1d, k_means_bucketing, simple_sort_bucketing
+    k_means_1d, k_means_bucketing, simple_sort_bucketing, BucketSampler


 class TestSampler(unittest.TestCase):
@@ -40,3 +42,11 @@ class TestSampler(unittest.TestCase):
     def test_simple_sort_bucketing(self):
         _ = simple_sort_bucketing([21, 3, 25, 7, 9, 22, 4, 6, 28, 10])
         assert len(_) == 10
+
+    def test_BucketSampler(self):
+        sampler = BucketSampler(num_buckets=3, batch_size=16, seq_lens_field_name="seq_len")
+        data_set = DataSet({"x": [[0] * random.randint(1, 10)] * 10, "y": [[5, 6]] * 10})
+        data_set.apply(lambda ins: len(ins["x"]), new_field_name="seq_len")
+        indices = sampler(data_set)
+        self.assertEqual(len(indices), 10)
+        # running without an exception is enough; correctness is not checked

+1209 -0  tutorials/fastnlp_tutorial_1204.ipynb
(File diff suppressed because it is too large.)

