* remove unused code in metrics.py * add tests for DataSet * add tests for FieldArray * add tests for metrics.py * fix predictor, add tests for predictor * fix bucket sampler, add tests for bucket sampler tags/v0.2.0^2
@@ -3,7 +3,6 @@ from .dataset import DataSet | |||||
from .fieldarray import FieldArray | from .fieldarray import FieldArray | ||||
from .instance import Instance | from .instance import Instance | ||||
from .losses import Loss | from .losses import Loss | ||||
from .metrics import Evaluator, ClassifyEvaluator, SNLIEvaluator, SeqLabelEvaluator | |||||
from .optimizer import Optimizer | from .optimizer import Optimizer | ||||
from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler | from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler | ||||
from .tester import Tester | from .tester import Tester | ||||
@@ -1,4 +1,5 @@ | |||||
import _pickle as pickle | import _pickle as pickle | ||||
import numpy as np | import numpy as np | ||||
from fastNLP.core.fieldarray import FieldArray | from fastNLP.core.fieldarray import FieldArray | ||||
@@ -66,10 +67,12 @@ class DataSet(object): | |||||
def __init__(self, dataset, idx):
    """Bind this object to one position of ``dataset``.

    :param dataset: the owning container; the sibling ``__getitem__`` reads
        its ``field_arrays`` mapping and ``__repr__`` indexes it with ``idx``.
    :param idx: the position inside ``dataset`` this object stands for.
    """
    self.dataset = dataset
    self.idx = idx
def __getitem__(self, item):
    """Return the value stored under field ``item`` at position ``self.idx``.

    :param item: a key of ``self.dataset.field_arrays``.
    :return: ``self.dataset.field_arrays[item][self.idx]``.
    :raises AssertionError: if ``item`` is not a known field, or if
        ``self.idx`` is not smaller than the length of that field array.
    """
    field_arrays = self.dataset.field_arrays
    # The checks stay as ``assert`` so callers that catch AssertionError keep
    # working. NOTE(review): asserts are stripped under ``python -O``.
    assert item in field_arrays, "no such field:{} in instance {}".format(item, self.dataset[self.idx])
    assert self.idx < len(field_arrays[item]), "index:{} out of range".format(self.idx)
    return field_arrays[item][self.idx]
def __repr__(self):
    """Return the repr of ``self.dataset[self.idx]``, the element this object points at."""
    return repr(self.dataset[self.idx])
@@ -339,6 +342,6 @@ class DataSet(object): | |||||
pickle.dump(self, f) | pickle.dump(self, f) | ||||
@staticmethod | @staticmethod | ||||
def load(self, path): | |||||
def load(path): | |||||
with open(path, 'rb') as f: | with open(path, 'rb') as f: | ||||
return pickle.load(f) | return pickle.load(f) |
@@ -304,118 +304,6 @@ def _prepare_metrics(metrics): | |||||
return _metrics | return _metrics | ||||
class Evaluator(object):
    """Base class for evaluators: callables that take predictions and ground truths."""

    def __init__(self):
        pass

    def __call__(self, predict, truth):
        """Evaluate ``predict`` against ``truth``; must be overridden.

        :param predict: list of tensors, the network outputs from all batches.
        :param truth: list of dict, the ground truths from all batch_y.
        :return: nothing in this base class.
        :raises NotImplementedError: always.
        """
        raise NotImplementedError
class ClassifyEvaluator(Evaluator):
    """Accuracy of the arg-max class over all batches."""

    def __init__(self):
        super(ClassifyEvaluator, self).__init__()

    def __call__(self, predict, truth):
        """Compute classification accuracy.

        :param predict: list of logit tensors, one per batch; they are
            concatenated along dim 0 and arg-maxed along the last dim.
        :param truth: list of label tensors, one per batch, concatenated
            along dim 0.
        :return: ``{"accuracy": float}``.
        """
        # Softmax is strictly increasing within each row, so the arg-max of
        # the logits is the arg-max of the probabilities. Skipping softmax
        # avoids extra work and rounding-induced ties.
        y_pred = torch.argmax(torch.cat(predict, dim=0), dim=-1)
        y_true = torch.cat(truth, dim=0)
        acc = float(torch.sum(y_pred == y_true)) / len(y_true)
        return {"accuracy": acc}
class SeqLabelEvaluator(Evaluator):
    """Token-level accuracy over the positions whose gold label is greater than 0."""

    def __init__(self):
        super(SeqLabelEvaluator, self).__init__()

    def __call__(self, predict, truth, **_):
        """Compute masked token accuracy.

        :param predict: list of List, the network outputs from all batches.
        :param truth: one tensor of gold labels per batch (``.to`` is called
            on each element).
        :param _: extra keyword arguments are accepted and ignored.
        :return: ``{"accuracy": float}``; 0.0 when no position passes the mask.
        """
        total_correct, total_count = 0., 0.
        for x, y in zip(predict, truth):
            x = torch.tensor(x)
            y = y.to(x)  # give y the same device and dtype as x
            # Only positions with a gold label > 0 are scored.
            # NOTE(review): presumably index 0 is the padding label - confirm.
            mask = (y > 0)
            correct = torch.sum(((x == y) & mask).long())
            total_correct += float(correct)
            total_count += float(torch.sum(mask.long()))
        # Guard the division: with nothing to score the original raised
        # ZeroDivisionError.
        accuracy = total_correct / total_count if total_count > 0 else 0.0
        return {"accuracy": float(accuracy)}
class SeqLabelEvaluator2(Evaluator):
    """Segment-level precision / recall / F for sequence labelling.

    A segment ends at every position whose tag index is in
    ``end_tagidx_set``. The set starts out empty and nothing in this class
    fills it, so the caller has to populate it; with an empty set every score
    is 0.
    """
    # NOTE (translated from the original comment): the evaluator defined above
    # this one is probably wrong.

    def __init__(self, seq_lens_field_name='word_seq_origin_len'):
        """
        :param seq_lens_field_name: keyword under which ``__call__`` receives
            the per-batch sequence lengths.
        """
        super(SeqLabelEvaluator2, self).__init__()
        self.end_tagidx_set = set()
        self.seq_lens_field_name = seq_lens_field_name

    def __call__(self, predict, truth, **kwargs):
        """Count matching segments and derive P / R / F.

        :param predict: list of batch, the network outputs from all batches
            (``.cpu().numpy()`` is called on each element).
        :param truth: the ground truths, one tensor per batch
            (``.cpu().numpy()`` is called on each element).
        :param kwargs: must contain ``self.seq_lens_field_name``, an iterable
            giving the sequence lengths of every batch.
        :return: ``{"P": precision, "R": recall, "F": f-score}``.
        :raises KeyError: if the sequence-length keyword is missing.
        """
        seq_lens = kwargs[self.seq_lens_field_name]
        corr_count = 0
        pred_count = 0
        truth_count = 0
        for x, y, seq_len in zip(predict, truth, seq_lens):
            x = x.cpu().numpy()
            y = y.cpu().numpy()
            for idx, s_l in enumerate(seq_len):
                # Cut both sequences to their real length.
                x_ = x[idx][:s_l]
                y_ = y[idx][:s_l]
                start = 0  # first position of the predicted segment in progress
                for idx_i, (x_i, y_i) in enumerate(zip(x_, y_)):
                    if x_i in self.end_tagidx_set:
                        # x is the prediction, so this closes a *predicted*
                        # segment. The original incremented truth_count here
                        # and pred_count below, which swapped P and R.
                        pred_count += 1
                        # Correct when every tag of the segment matches.
                        if all(p == t for p, t in zip(x_[start:idx_i + 1], y_[start:idx_i + 1])):
                            corr_count += 1
                        start = idx_i + 1
                    if y_i in self.end_tagidx_set:
                        truth_count += 1
        # The small epsilons keep the divisions defined when a count is 0.
        P = corr_count / (float(pred_count) + 1e-6)
        R = corr_count / (float(truth_count) + 1e-6)
        F = 2 * P * R / (P + R + 1e-6)
        return {"P": P, 'R': R, 'F': F}
class SNLIEvaluator(Evaluator):
    """Accuracy of the arg-max class, with labels read from the ``'truth'`` key of each batch dict."""

    def __init__(self):
        super(SNLIEvaluator, self).__init__()

    def __call__(self, predict, truth):
        """Compute classification accuracy.

        :param predict: list of logit tensors, one per batch.
        :param truth: list of dicts, one per batch; ``t['truth']`` holds the
            label tensor, which is flattened after concatenation.
        :return: ``{"accuracy": float}``.
        """
        # Arg-max over the logits equals arg-max over their softmax, so the
        # softmax pass is skipped.
        y_pred = torch.argmax(torch.cat(predict, dim=0), dim=-1)
        y_true = torch.cat([t['truth'] for t in truth], dim=0).view(-1)
        acc = float(torch.sum(y_pred == y_true)) / y_true.size(0)
        return {"accuracy": acc}
def _conver_numpy(x): | def _conver_numpy(x): | ||||
"""convert input data to numpy array | """convert input data to numpy array | ||||
@@ -467,11 +355,11 @@ def _check_data(y_true, y_pred): | |||||
type_true, y_true = _label_types(y_true) | type_true, y_true = _label_types(y_true) | ||||
type_pred, y_pred = _label_types(y_pred) | type_pred, y_pred = _label_types(y_pred) | ||||
type_set = set(['binary', 'multiclass']) | |||||
type_set = {'binary', 'multiclass'} | |||||
if type_true in type_set and type_pred in type_set: | if type_true in type_set and type_pred in type_set: | ||||
return type_true if type_true == type_pred else 'multiclass', y_true, y_pred | return type_true if type_true == type_pred else 'multiclass', y_true, y_pred | ||||
type_set = set(['multiclass-multioutput', 'multilabel']) | |||||
type_set = {'multiclass-multioutput', 'multilabel'} | |||||
if type_true in type_set and type_pred in type_set: | if type_true in type_set and type_pred in type_set: | ||||
return type_true if type_true == type_pred else 'multiclass-multioutput', y_true, y_pred | return type_true if type_true == type_pred else 'multiclass-multioutput', y_true, y_pred | ||||
@@ -23,13 +23,13 @@ class Predictor(object): | |||||
:param network: a PyTorch model (cpu) | :param network: a PyTorch model (cpu) | ||||
:param data: a DataSet object. | :param data: a DataSet object. | ||||
:return: list of list of strings, [num_examples, tag_seq_length] | |||||
:return: list of batch outputs | |||||
""" | """ | ||||
# turn on the testing mode; clean up the history | # turn on the testing mode; clean up the history | ||||
self.mode(network, test=True) | self.mode(network, test=True) | ||||
batch_output = [] | batch_output = [] | ||||
data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False) | |||||
data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) | |||||
for batch_x, _ in data_iterator: | for batch_x, _ in data_iterator: | ||||
with torch.no_grad(): | with torch.no_grad(): | ||||
@@ -55,7 +55,7 @@ class BucketSampler(BaseSampler): | |||||
def __call__(self, data_set): | def __call__(self, data_set): | ||||
seq_lens = data_set[self.seq_lens_field_name].content | |||||
seq_lens = data_set.get_fields()[self.seq_lens_field_name].content | |||||
total_sample_num = len(seq_lens) | total_sample_num = len(seq_lens) | ||||
bucket_indexes = [] | bucket_indexes = [] | ||||
@@ -1,12 +1,5 @@ | |||||
from collections import Counter | from collections import Counter | ||||
def isiterable(p_object):
    """Tell whether ``iter(p_object)`` succeeds.

    :param p_object: any object.
    :return: ``False`` if ``iter`` raises ``TypeError``, ``True`` otherwise.
    """
    try:
        iter(p_object)
    except TypeError:
        return False
    return True
def check_build_vocab(func): | def check_build_vocab(func): | ||||
"""A decorator to make sure the indexing is built before used. | """A decorator to make sure the indexing is built before used. | ||||
@@ -1,3 +1,4 @@ | |||||
import os | |||||
import unittest | import unittest | ||||
from fastNLP.core.dataset import DataSet | from fastNLP.core.dataset import DataSet | ||||
@@ -90,6 +91,18 @@ class TestDataSet(unittest.TestCase): | |||||
self.assertTrue("rx" in ds.field_arrays) | self.assertTrue("rx" in ds.field_arrays) | ||||
self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1]) | self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1]) | ||||
ds.apply(lambda ins: len(ins["y"]), new_field_name="y") | |||||
self.assertEqual(ds.field_arrays["y"].content[0], 2) | |||||
res = ds.apply(lambda ins: len(ins["x"])) | |||||
self.assertTrue(isinstance(res, list) and len(res) > 0) | |||||
self.assertTrue(res[0], 4) | |||||
def test_drop(self):
    """``DataSet.drop`` with a length predicate leaves 20 of the 40 rows."""
    # "y" alternates between a 2-element and a 4-element list, so the
    # predicate is true for exactly half of the rows.
    # NOTE(review): both halves hold 20 rows, so this length check cannot tell
    # whether ``drop`` removed the matching or the non-matching rows.
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20})
    ds.drop(lambda ins: len(ins["y"]) < 3)
    self.assertEqual(len(ds), 20)
def test_contains(self): | def test_contains(self): | ||||
ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) | ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) | ||||
self.assertTrue("x" in ds) | self.assertTrue("x" in ds) | ||||
@@ -125,9 +138,19 @@ class TestDataSet(unittest.TestCase): | |||||
ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) | ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) | ||||
self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target]) | self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target]) | ||||
def test_save_load(self):
    """Round-trip a DataSet through ``save`` and ``DataSet.load``."""
    import tempfile  # local import: only this test needs it

    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    # A throw-away directory keeps the working directory clean and removes
    # the pickle even when ``load`` or an assertion fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "my_ds.pkl")
        ds.save(path)
        self.assertTrue(os.path.exists(path))
        ds_1 = DataSet.load(path)
    # The original only checked that the round trip runs; also check that
    # the loaded object has the same number of rows.
    self.assertEqual(len(ds_1), len(ds))
class TestDataSetIter(unittest.TestCase): | class TestDataSetIter(unittest.TestCase): | ||||
def test__repr__(self): | def test__repr__(self): | ||||
ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) | ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) | ||||
for iter in ds: | for iter in ds: | ||||
self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4],\n'y': [5, 6]}") | self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4],\n'y': [5, 6]}") | ||||
@@ -75,3 +75,25 @@ class TestFieldArray(unittest.TestCase): | |||||
indices = [0, 1, 3, 4, 6] | indices = [0, 1, 3, 4, 6] | ||||
for a, b in zip(fa[indices], x[indices]): | for a, b in zip(fa[indices], x[indices]): | ||||
self.assertListEqual(a.tolist(), b.tolist()) | self.assertListEqual(a.tolist(), b.tolist()) | ||||
def test_append(self):
    """``FieldArray.append`` raises for the four bad values below and accepts a matching row."""

    def new_list_field():
        # The final part of this test makes this same constructor call
        # outside any ``assertRaises``, so it is expected to succeed. Building
        # the field before entering ``assertRaises`` makes sure it is
        # ``append`` that raises.
        return FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)

    # A scalar appended to a field whose rows are lists.
    fa = new_list_field()
    with self.assertRaises(Exception):
        fa.append(0)

    # A list appended to a field whose rows are scalars.
    # NOTE(review): construction stays inside the block as in the original,
    # because nothing here shows that this constructor call succeeds; if it
    # raises, the check passes without ``append`` being reached.
    with self.assertRaises(Exception):
        fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True)
        fa.append([1, 2, 3, 4, 5])

    # An empty list.
    fa = new_list_field()
    with self.assertRaises(Exception):
        fa.append([])

    # A row that contains a string.
    fa = new_list_field()
    with self.assertRaises(Exception):
        fa.append(["str", 0, 0, 0, 1.89])

    # A well-formed row is appended as the third element.
    fa = new_list_field()
    fa.append([1.2, 2.3, 3.4, 4.5, 5.6])
    self.assertEqual(len(fa), 3)
    self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6])
@@ -4,6 +4,7 @@ import numpy as np | |||||
import torch | import torch | ||||
from fastNLP.core.metrics import AccuracyMetric | from fastNLP.core.metrics import AccuracyMetric | ||||
from fastNLP.core.metrics import accuracy_score, recall_score, precision_score, f1_score | |||||
class TestAccuracyMetric(unittest.TestCase): | class TestAccuracyMetric(unittest.TestCase): | ||||
@@ -132,3 +133,15 @@ class TestAccuracyMetric(unittest.TestCase): | |||||
print(e) | print(e) | ||||
return | return | ||||
self.assertTrue(True, False), "No exception catches." | self.assertTrue(True, False), "No exception catches." | ||||
class TestUsefulFunctions(unittest.TestCase):
    # Exercises a few functions in metrics.py that look useful.

    def test_case_1(self):
        """Smoke test: the four score functions run on random multi-class labels."""

        def random_labels():
            # ten labels drawn from {0, 1, 2}, shaped (10, 1)
            return np.random.randint(0, 3, size=(10, 1))

        # multi-class
        accuracy_score(random_labels(), random_labels())
        precision_score(random_labels(), random_labels(), average=None)
        recall_score(random_labels(), random_labels(), average=None)
        f1_score(random_labels(), random_labels(), average=None)
        # Running without an exception is all that is checked.
@@ -1,6 +1,34 @@ | |||||
import unittest | import unittest | ||||
import numpy as np | |||||
import torch | |||||
from fastNLP.core.dataset import DataSet | |||||
from fastNLP.core.instance import Instance | |||||
from fastNLP.core.predictor import Predictor | |||||
from fastNLP.modules.encoder.linear import Linear | |||||
def prepare_fake_dataset():
    """Build a toy two-class DataSet of 2000 instances.

    1000 points are drawn from a 2-D normal distribution with mean (-3, -3)
    and get ``y=[0.0]``; 1000 more are drawn with mean (3, 3) and get
    ``y=[1.0]``. Both use the identity covariance. Each instance stores the
    point as ``x=[float, float]``.

    :return: a DataSet holding the class-A instances followed by the class-B ones.
    """
    cov = np.array([[1, 0], [0, 1]])  # shared by both classes
    class_A = np.random.multivariate_normal(np.array([-3, -3]), cov, size=(1000,))
    class_B = np.random.multivariate_normal(np.array([3, 3]), cov, size=(1000,))

    instances = [Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A]
    instances += [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]
    return DataSet(instances)
class TestPredictor(unittest.TestCase):
    def test(self):
        """Run ``Predictor.predict`` on a linear model over the fake data set."""
        predictor = Predictor()
        model = Linear(2, 1)  # 2 input features -> 1 output
        data = prepare_fake_dataset()
        data.set_input("x")
        ans = predictor.predict(model, data)
        # 2000 is the number of instances prepare_fake_dataset builds.
        # NOTE(review): predict is documented as returning a list of batch
        # outputs, so this presumably relies on the Predictor's default batch
        # size being 1 - confirm.
        self.assertEqual(len(ans), 2000)
        self.assertIsInstance(ans[0], torch.Tensor)
@@ -1,9 +1,11 @@ | |||||
import random | |||||
import unittest | import unittest | ||||
import torch | import torch | ||||
from fastNLP.core.dataset import DataSet | |||||
from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler, \ | from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler, \ | ||||
k_means_1d, k_means_bucketing, simple_sort_bucketing | |||||
k_means_1d, k_means_bucketing, simple_sort_bucketing, BucketSampler | |||||
class TestSampler(unittest.TestCase): | class TestSampler(unittest.TestCase): | ||||
@@ -40,3 +42,11 @@ class TestSampler(unittest.TestCase): | |||||
def test_simple_sort_bucketing(self):
    """``simple_sort_bucketing`` returns ten items for ten input lengths."""
    result = simple_sort_bucketing([21, 3, 25, 7, 9, 22, 4, 6, 28, 10])
    # A unittest assertion reports both values on failure and is not
    # stripped under ``python -O`` the way a bare ``assert`` is.
    self.assertEqual(len(result), 10)
def test_BucketSampler(self):
    """``BucketSampler`` returns one index per instance of the data set."""
    sampler = BucketSampler(num_buckets=3, batch_size=16, seq_lens_field_name="seq_len")
    # Draw a separate length for every row. The original
    # ``[[0] * random.randint(1, 10)] * 10`` called randint once and repeated
    # that single list, so every sequence had the same length.
    # A private, seeded generator keeps the test reproducible without
    # touching the global random state.
    rng = random.Random(0)
    data_set = DataSet({"x": [[0] * rng.randint(1, 10) for _ in range(10)], "y": [[5, 6]] * 10})
    data_set.apply(lambda ins: len(ins["x"]), new_field_name="seq_len")
    indices = sampler(data_set)
    self.assertEqual(len(indices), 10)
    # Smoke test only: the quality of the bucketing is not verified.