Browse Source

* 更新教程,放在./tutorial

* remove unused codes in metrics.py
* add tests for DataSet
* add tests for FieldArray
* add tests for metrics.py
* fix predictor, add tests for predictor
* fix bucket sampler, add tests for bucket sampler
tags/v0.2.0^2
FengZiYjun 5 years ago
parent
commit
f26f11608b
12 changed files with 1316 additions and 128 deletions
  1. +0
    -1
      fastNLP/core/__init__.py
  2. +4
    -1
      fastNLP/core/dataset.py
  3. +2
    -114
      fastNLP/core/metrics.py
  4. +2
    -2
      fastNLP/core/predictor.py
  5. +1
    -1
      fastNLP/core/sampler.py
  6. +0
    -7
      fastNLP/core/vocabulary.py
  7. +23
    -0
      test/core/test_dataset.py
  8. +22
    -0
      test/core/test_fieldarray.py
  9. +13
    -0
      test/core/test_metrics.py
  10. +29
    -1
      test/core/test_predictor.py
  11. +11
    -1
      test/core/test_sampler.py
  12. +1209
    -0
      tutorials/fastnlp_tutorial_1204.ipynb

+ 0
- 1
fastNLP/core/__init__.py View File

@@ -3,7 +3,6 @@ from .dataset import DataSet
from .fieldarray import FieldArray
from .instance import Instance
from .losses import Loss
from .metrics import Evaluator, ClassifyEvaluator, SNLIEvaluator, SeqLabelEvaluator
from .optimizer import Optimizer
from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler
from .tester import Tester


+ 4
- 1
fastNLP/core/dataset.py View File

@@ -1,4 +1,5 @@
import _pickle as pickle

import numpy as np

from fastNLP.core.fieldarray import FieldArray
@@ -66,10 +67,12 @@ class DataSet(object):
def __init__(self, dataset, idx):
    """Lightweight view of a single instance inside *dataset*.

    :param dataset: the owning dataset  # presumably a fastNLP DataSet — TODO confirm
    :param idx: integer row index into the dataset's field arrays
    """
    self.dataset = dataset  # back-reference to the owning dataset
    self.idx = idx          # row index this view points at

def __getitem__(self, item):
    """Return the value of field *item* for this instance.

    :param item: field name to look up
    :raises AssertionError: if the field does not exist or the index is out of range
    """
    assert item in self.dataset.field_arrays, "no such field:{} in instance {}".format(item, self.dataset[self.idx])
    assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx)
    return self.dataset.field_arrays[item][self.idx]

def __repr__(self):
    # Delegate to the repr of the full instance object at this index.
    return self.dataset[self.idx].__repr__()

@@ -339,6 +342,6 @@ class DataSet(object):
pickle.dump(self, f)

@staticmethod
def load(path):
    """Deserialize a DataSet previously written by :meth:`save`.

    :param path: path to the pickle file on disk
    :return: the loaded DataSet object
    """
    # NOTE(review): unpickling is unsafe on untrusted input — only call
    # this on files produced by save().
    with open(path, 'rb') as f:
        return pickle.load(f)

+ 2
- 114
fastNLP/core/metrics.py View File

@@ -304,118 +304,6 @@ def _prepare_metrics(metrics):
return _metrics


class Evaluator(object):
    """Base class for evaluation callables.

    Concrete evaluators implement ``__call__`` to turn a whole epoch of
    network outputs and ground truths into a dict of metric values.
    """

    def __init__(self):
        pass

    def __call__(self, predict, truth):
        """Compute metrics over all batches of one evaluation run.

        :param predict: list of tensors, the network outputs from all batches.
        :param truth: list of dict, the ground truths from all batch_y.
        :return: dict mapping metric name to value (subclass responsibility)
        :raises NotImplementedError: always, on the base class
        """
        raise NotImplementedError


class ClassifyEvaluator(Evaluator):
    """Accuracy evaluator for single-label classification outputs."""

    def __init__(self):
        super(ClassifyEvaluator, self).__init__()

    def __call__(self, predict, truth):
        """Compute overall accuracy from per-batch logits and labels.

        :param predict: list of 2-D logit tensors, one per batch
        :param truth: list of 1-D label tensors, one per batch
        :return: {"accuracy": float}
        """
        # Softmax is monotonic, so argmax over probabilities equals argmax
        # over raw logits; kept to mirror the original computation exactly.
        probs = torch.cat(
            [torch.nn.functional.softmax(batch, dim=-1) for batch in predict], dim=0
        )
        predictions = torch.argmax(probs, dim=-1)
        labels = torch.cat(truth, dim=0)
        n_correct = float(torch.sum(predictions == labels))
        return {"accuracy": n_correct / len(labels)}


class SeqLabelEvaluator(Evaluator):
    """Token-level accuracy for sequence labelling, ignoring padding (label 0)."""

    def __init__(self):
        super(SeqLabelEvaluator, self).__init__()

    def __call__(self, predict, truth, **_):
        """Compute masked token accuracy over all batches.

        :param predict: list of List, the network outputs from all batches.
        :param truth: list of label tensors, the ground truths from all batch_y.
        :return: {"accuracy": float}
        """
        n_correct = 0.0
        n_tokens = 0.0
        for pred_batch, gold_batch in zip(predict, truth):
            pred_batch = torch.tensor(pred_batch)
            # Move the gold labels onto the same device/dtype as the predictions.
            gold_batch = gold_batch.to(pred_batch)
            # Positions whose gold label is 0 are treated as padding and skipped.
            mask = gold_batch > 0
            n_correct += float(torch.sum(((pred_batch == gold_batch) * mask).long()))
            n_tokens += float(torch.sum(mask.long()))
        # NOTE(review): raises ZeroDivisionError if no non-padding token exists
        # in the whole evaluation set — same as the original behavior.
        return {"accuracy": float(n_correct / n_tokens)}


class SeqLabelEvaluator2(Evaluator):
    # The evaluator above is probably wrong.  (original comment, translated)
    def __init__(self, seq_lens_field_name='word_seq_origin_len'):
        super(SeqLabelEvaluator2, self).__init__()
        # Tag indices that mark the END of a span.  Empty by default, so
        # P/R/F stay 0 unless a caller populates this set externally.
        self.end_tagidx_set = set()
        # Name of the field (in the extra kwargs) holding true sequence lengths.
        self.seq_lens_field_name = seq_lens_field_name

    def __call__(self, predict, truth, **_):
        """Span-level precision/recall/F1 for sequence labelling.

        :param predict: list of batch, the network outputs from all batches.
        :param truth: list of dict, the ground truths from all batch_y.
        :return: dict with keys "P", "R", "F"
        """
        # Sequence lengths are read from the collected keyword arguments.
        seq_lens = _[self.seq_lens_field_name]
        corr_count = 0
        pred_count = 0
        truth_count = 0
        for x, y, seq_len in zip(predict, truth, seq_lens):
            x = x.cpu().numpy()
            y = y.cpu().numpy()
            for idx, s_l in enumerate(seq_len):
                x_ = x[idx]
                y_ = y[idx]
                # Truncate both sequences to the true (unpadded) length.
                x_ = x_[:s_l]
                y_ = y_[:s_l]
                flag = True
                start = 0
                for idx_i, (x_i, y_i) in enumerate(zip(x_, y_)):
                    if x_i in self.end_tagidx_set:
                        truth_count += 1
                        # A span counts as correct only when every tag since
                        # the previous span boundary matches.
                        for j in range(start, idx_i + 1):
                            if y_[j] != x_[j]:
                                flag = False
                                break
                        if flag:
                            corr_count += 1
                        flag = True
                        start = idx_i + 1
                    if y_i in self.end_tagidx_set:
                        pred_count += 1
        # Small epsilons guard against division by zero when no spans occur.
        # NOTE(review): x comes from `predict` yet feeds truth_count (and vice
        # versa) — naming looks swapped; preserved as-is, TODO confirm intent.
        P = corr_count / (float(pred_count) + 1e-6)
        R = corr_count / (float(truth_count) + 1e-6)
        F = 2 * P * R / (P + R + 1e-6)

        return {"P": P, 'R': R, 'F': F}


class SNLIEvaluator(Evaluator):
    """Accuracy evaluator for SNLI-style batches whose truths are dicts."""

    def __init__(self):
        super(SNLIEvaluator, self).__init__()

    def __call__(self, predict, truth):
        """Compute accuracy from per-batch logits and dict-wrapped labels.

        :param predict: list of 2-D logit tensors, one per batch
        :param truth: list of dicts; each dict's 'truth' entry is a label tensor
        :return: {"accuracy": float}
        """
        probs = torch.cat(
            [torch.nn.functional.softmax(batch, dim=-1) for batch in predict], dim=0
        )
        predictions = torch.argmax(probs, dim=-1)
        # Pull the label tensor out of every truth dict, then flatten.
        labels = torch.cat([t['truth'] for t in truth], dim=0).view(-1)
        n_correct = float(torch.sum(predictions == labels))
        return {"accuracy": n_correct / labels.size(0)}


def _conver_numpy(x):
"""convert input data to numpy array

@@ -467,11 +355,11 @@ def _check_data(y_true, y_pred):
type_true, y_true = _label_types(y_true)
type_pred, y_pred = _label_types(y_pred)

type_set = set(['binary', 'multiclass'])
type_set = {'binary', 'multiclass'}
if type_true in type_set and type_pred in type_set:
return type_true if type_true == type_pred else 'multiclass', y_true, y_pred

type_set = set(['multiclass-multioutput', 'multilabel'])
type_set = {'multiclass-multioutput', 'multilabel'}
if type_true in type_set and type_pred in type_set:
return type_true if type_true == type_pred else 'multiclass-multioutput', y_true, y_pred



+ 2
- 2
fastNLP/core/predictor.py View File

@@ -23,13 +23,13 @@ class Predictor(object):

:param network: a PyTorch model (cpu)
:param data: a DataSet object.
:return: list of list of strings, [num_examples, tag_seq_length]
:return: list of batch outputs
"""
# turn on the testing mode; clean up the history
self.mode(network, test=True)
batch_output = []

data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False)
data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)

for batch_x, _ in data_iterator:
with torch.no_grad():


+ 1
- 1
fastNLP/core/sampler.py View File

@@ -55,7 +55,7 @@ class BucketSampler(BaseSampler):

def __call__(self, data_set):

seq_lens = data_set[self.seq_lens_field_name].content
seq_lens = data_set.get_fields()[self.seq_lens_field_name].content
total_sample_num = len(seq_lens)

bucket_indexes = []


+ 0
- 7
fastNLP/core/vocabulary.py View File

@@ -1,12 +1,5 @@
from collections import Counter

def isiterable(p_object):
    """Return True when *p_object* supports iteration, False otherwise."""
    try:
        iter(p_object)
    except TypeError:
        return False
    return True


def check_build_vocab(func):
"""A decorator to make sure the indexing is built before used.


+ 23
- 0
test/core/test_dataset.py View File

@@ -1,3 +1,4 @@
import os
import unittest

from fastNLP.core.dataset import DataSet
@@ -90,6 +91,18 @@ class TestDataSet(unittest.TestCase):
self.assertTrue("rx" in ds.field_arrays)
self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])

ds.apply(lambda ins: len(ins["y"]), new_field_name="y")
self.assertEqual(ds.field_arrays["y"].content[0], 2)

res = ds.apply(lambda ins: len(ins["x"]))
self.assertTrue(isinstance(res, list) and len(res) > 0)
self.assertTrue(res[0], 4)

def test_drop(self):
    # drop() removes every instance matching the predicate; half of the
    # "y" fields have length 2 (< 3), so 20 of the 40 instances remain.
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20})
    ds.drop(lambda ins: len(ins["y"]) < 3)
    self.assertEqual(len(ds), 20)

def test_contains(self):
ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
self.assertTrue("x" in ds)
@@ -125,9 +138,19 @@ class TestDataSet(unittest.TestCase):
ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target])

def test_save_load(self):
    # Round-trip a DataSet through save()/load().
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    ds.save("./my_ds.pkl")
    self.assertTrue(os.path.exists("./my_ds.pkl"))

    ds_1 = DataSet.load("./my_ds.pkl")
    os.remove("my_ds.pkl")
    # Completing without an exception is enough.  (original note, translated)


class TestDataSetIter(unittest.TestCase):
    """Tests iteration over a DataSet and the repr of its instance views."""

    def test__repr__(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        # Renamed the loop variable: the original `iter` shadowed the builtin.
        for instance in ds:
            self.assertEqual(repr(instance), "{'x': [1, 2, 3, 4],\n'y': [5, 6]}")


+ 22
- 0
test/core/test_fieldarray.py View File

@@ -75,3 +75,25 @@ class TestFieldArray(unittest.TestCase):
indices = [0, 1, 3, 4, 6]
for a, b in zip(fa[indices], x[indices]):
self.assertListEqual(a.tolist(), b.tolist())

def test_append(self):
    # Appending a scalar to a FieldArray built from lists must fail.
    with self.assertRaises(Exception):
        fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
        fa.append(0)

    # Appending a list to a FieldArray built from scalars must fail.
    with self.assertRaises(Exception):
        fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True)
        fa.append([1, 2, 3, 4, 5])

    # Appending an empty list must fail.
    with self.assertRaises(Exception):
        fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
        fa.append([])

    # Appending a list that mixes strings and numbers must fail.
    with self.assertRaises(Exception):
        fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
        fa.append(["str", 0, 0, 0, 1.89])

    # A well-typed row of matching shape is accepted and retrievable.
    fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
    fa.append([1.2, 2.3, 3.4, 4.5, 5.6])
    self.assertEqual(len(fa), 3)
    self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6])

+ 13
- 0
test/core/test_metrics.py View File

@@ -4,6 +4,7 @@ import numpy as np
import torch

from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.metrics import accuracy_score, recall_score, precision_score, f1_score


class TestAccuracyMetric(unittest.TestCase):
@@ -132,3 +133,15 @@ class TestAccuracyMetric(unittest.TestCase):
print(e)
return
self.assertTrue(True, False), "No exception catches."


class TestUsefulFunctions(unittest.TestCase):
    # Smoke tests for some useful-looking functions in metrics.py.
    # (original comment, translated)
    def test_case_1(self):
        # multi-class
        _ = accuracy_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)))
        _ = precision_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None)
        _ = recall_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None)
        _ = f1_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None)

        # Completing without an exception is enough.  (original note, translated)

+ 29
- 1
test/core/test_predictor.py View File

@@ -1,6 +1,34 @@
import unittest

import numpy as np
import torch

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.predictor import Predictor
from fastNLP.modules.encoder.linear import Linear


def prepare_fake_dataset():
    """Build a toy 2-class dataset: 1000 2-D Gaussian points per class.

    Class A is centred at (-3, -3) with label [0.0]; class B is centred at
    (3, 3) with label [1.0].  Both use the identity covariance.
    """
    identity_cov = np.array([[1, 0], [0, 1]])
    class_a = np.random.multivariate_normal(np.array([-3, -3]), identity_cov, size=(1000,))
    class_b = np.random.multivariate_normal(np.array([3, 3]), identity_cov, size=(1000,))

    instances = [Instance(x=[float(p[0]), float(p[1])], y=[0.0]) for p in class_a]
    instances += [Instance(x=[float(p[0]), float(p[1])], y=[1.0]) for p in class_b]
    return DataSet(instances)


class TestPredictor(unittest.TestCase):
    def test(self):
        """Predict on a 2000-instance toy dataset; check output size and type."""
        # Removed a stray leftover `pass` that preceded the real test body.
        predictor = Predictor()
        model = Linear(2, 1)
        data = prepare_fake_dataset()
        data.set_input("x")
        ans = predictor.predict(model, data)
        self.assertEqual(len(ans), 2000)
        self.assertTrue(isinstance(ans[0], torch.Tensor))

+ 11
- 1
test/core/test_sampler.py View File

@@ -1,9 +1,11 @@
import random
import unittest

import torch

from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler, \
k_means_1d, k_means_bucketing, simple_sort_bucketing
k_means_1d, k_means_bucketing, simple_sort_bucketing, BucketSampler


class TestSampler(unittest.TestCase):
@@ -40,3 +42,11 @@ class TestSampler(unittest.TestCase):
def test_simple_sort_bucketing(self):
_ = simple_sort_bucketing([21, 3, 25, 7, 9, 22, 4, 6, 28, 10])
assert len(_) == 10

def test_BucketSampler(self):
    # Build 10 instances with an "x" field and a derived "seq_len" field,
    # then check that the sampler yields exactly one index per instance.
    # NOTE(review): randint is evaluated once, so all 10 rows share the same
    # length — possibly unintended; TODO confirm.
    sampler = BucketSampler(num_buckets=3, batch_size=16, seq_lens_field_name="seq_len")
    data_set = DataSet({"x": [[0] * random.randint(1, 10)] * 10, "y": [[5, 6]] * 10})
    data_set.apply(lambda ins: len(ins["x"]), new_field_name="seq_len")
    indices = sampler(data_set)
    self.assertEqual(len(indices), 10)
    # Running through is enough; bucketing quality is not verified.
    # (original note, translated)

+ 1209
- 0
tutorials/fastnlp_tutorial_1204.ipynb
File diff suppressed because it is too large
View File


Loading…
Cancel
Save