hummingbird
/
fastNLP

 
			
							import unittest
from collections import Counter

import numpy as np
import torch
from fastNLP import AccuracyMetric
from fastNLP.core.metrics import (ClassifyFPreRecMetric, CMRC2018Metric,
                                  ConfusionMatrixMetric, SpanFPreRecMetric,
                                  _accuracy_topk, _pred_topk)
from fastNLP.core.vocabulary import Vocabulary


def _generate_tags(encoding_type, number_labels=4):
    """

    :param encoding_type: 例如BIOES, BMES, BIO等
    :param number_labels: 多少个label，大于1
    :return:
    """
    vocab = {}
    for i in range(number_labels):
        label = str(i)
        for tag in encoding_type:
            if tag == 'O':
                if tag not in vocab:
                    vocab['O'] = len(vocab) + 1
                continue
            vocab['{}-{}'.format(tag, label)] = len(vocab) + 1  # 其实表达的是这个的count
    return vocab


def _convert_res_to_fastnlp_res(metric_result):
    allen_result = {}
    key_map = {'f1-measure-overall': "f", "recall-overall": "rec", "precision-overall": "pre"}
    for key, value in metric_result.items():
        if key in key_map:
            key = key_map[key]
        else:
            label = key.split('-')[-1]
            if key.startswith('f1'):
                key = 'f-{}'.format(label)
            else:
                key = '{}-{}'.format(key[:3], label)
        allen_result[key] = round(value, 6)
    return allen_result


class TestConfusionMatrixMetric(unittest.TestCase):
    """Smoke tests for ``ConfusionMatrixMetric``.

    Most cases only check that a call either completes or raises; the
    resulting metric is printed rather than compared against a reference.
    """

    def test_ConfusionMatrixMetric1(self):
        # (1) default key names: a (4, 3) prediction against a (4,) target
        metric = ConfusionMatrixMetric()
        metric(pred_dict={"pred": torch.zeros(4, 3)},
               target_dict={"target": torch.zeros(4)})
        print(metric.get_metric())

    def test_ConfusionMatrixMetric2(self):
        # (2) with corrupted size: a (4, 3, 2) prediction against a (4,) target
        with self.assertRaises(Exception):
            bad_pred = {"pred": torch.zeros(4, 3, 2)}
            bad_target = {"target": torch.zeros(4)}
            metric = ConfusionMatrixMetric()
            metric(pred_dict=bad_pred, target_dict=bad_target)
            print(metric.get_metric())

    def test_ConfusionMatrixMetric3(self):
        # (3) the second batch is corrupted size
        with self.assertRaises(Exception):
            metric = ConfusionMatrixMetric()
            batches = (
                (torch.zeros(4, 3, 2), torch.zeros(4, 3)),
                (torch.zeros(4, 3, 2), torch.zeros(4)),
            )
            for batch_pred, batch_target in batches:
                metric(pred_dict={"pred": batch_pred},
                       target_dict={"target": batch_target})
            print(metric.get_metric())

    def test_ConfusionMatrixMetric4(self):
        # (4) check reset (only verifies that get_metric() returns a dict)
        metric = ConfusionMatrixMetric()
        metric(pred_dict={"pred": torch.randn(4, 3, 2)},
               target_dict={"target": torch.ones(4, 3)})
        result = metric.get_metric()
        self.assertIsInstance(result, dict)
        print(result)

    def test_ConfusionMatrixMetric5(self):
        # (5) check numpy array is not acceptable
        with self.assertRaises(Exception):
            metric = ConfusionMatrixMetric()
            np_pred = np.zeros((4, 3, 2))
            np_target = np.zeros((4, 3))
            metric(pred_dict={"pred": np_pred}, target_dict={"target": np_target})

    def test_ConfusionMatrixMetric6(self):
        # (6) check map, match: custom key names supplied to the constructor
        metric = ConfusionMatrixMetric(pred='predictions', target='targets')
        metric(pred_dict={"predictions": torch.randn(4, 3, 2)},
               target_dict={"targets": torch.zeros(4, 3)})
        print(metric.get_metric())

    def test_ConfusionMatrixMetric7(self):
        # (7) check map, include unused: an extra 'unused' entry is tolerated
        metric = ConfusionMatrixMetric(pred='prediction', target='targets')
        outputs = {"prediction": torch.zeros(4, 3, 2), 'unused': 1}
        golds = {"targets": torch.zeros(4, 3)}
        metric(pred_dict=outputs, target_dict=golds)

    def test_ConfusionMatrixMetric8(self):
        # (8) check _fast_metric: keys do not match the default names, must raise
        with self.assertRaises(Exception):
            metric = ConfusionMatrixMetric()
            outputs = {"predictions": torch.zeros(4, 3, 2), "seq_len": torch.ones(3) * 3}
            golds = {"targets": torch.zeros(4, 3)}
            metric(pred_dict=outputs, target_dict=golds)
            print(metric.get_metric())

    def test_duplicate(self):
        # Potential bug in 0.4.1: duplicated parameter names must not be a problem.
        # The dicts carry the mapped keys and also the default 'pred' / 'target'.
        metric = ConfusionMatrixMetric(pred='predictions', target='targets')
        outputs = {
            "predictions": torch.zeros(4, 3, 2),
            "seq_len": torch.ones(4) * 3,
            'pred': 0,
        }
        golds = {"targets": torch.zeros(4, 3), 'target': 0}
        metric(pred_dict=outputs, target_dict=golds)
        print(metric.get_metric())

    def test_seq_len(self):
        # Feed the same batch twice; seq_len is modified in place between calls.
        batch_size = 256
        lengths = torch.zeros(batch_size).long()
        lengths[0] = 2
        outputs = {'pred': torch.ones(batch_size, 2)}
        golds = {'target': torch.ones(batch_size, 2), 'seq_len': lengths}
        metric = ConfusionMatrixMetric()

        metric(pred_dict=outputs, target_dict=golds)
        metric.get_metric(reset=False)

        lengths[1:] = 1
        metric(pred_dict=outputs, target_dict=golds)
        metric.get_metric()

    def test_vocab(self):
        # Passing a Vocabulary to the metric must work as well.
        vocab = Vocabulary()
        vocab.update("this is a word list".split())

        metric = ConfusionMatrixMetric(vocab=vocab)
        metric(pred_dict={"pred": torch.zeros(4, 3)},
               target_dict={"target": torch.zeros(4)})
        print(metric.get_metric())


class TestAccuracyMetric(unittest.TestCase):
    """Tests for ``AccuracyMetric``.

    Error cases use ``assertRaises`` so that a missing exception actually
    fails the test; success cases assert on the returned accuracy directly
    instead of hiding assertion failures behind ``except Exception``.
    """

    def test_AccuracyMetric1(self):
        # (1) only input, targets passed
        pred_dict = {"pred": torch.zeros(4, 3)}
        target_dict = {'target': torch.zeros(4)}
        metric = AccuracyMetric()

        metric(pred_dict=pred_dict, target_dict=target_dict)
        print(metric.get_metric())

    def test_AccuracyMetric2(self):
        # (2) with corrupted size
        with self.assertRaises(Exception):
            pred_dict = {"pred": torch.zeros(4, 3, 2)}
            target_dict = {'target': torch.zeros(4)}
            metric = AccuracyMetric()

            metric(pred_dict=pred_dict, target_dict=target_dict)
            print(metric.get_metric())

    def test_AccuracyMetric3(self):
        # (3) the second batch is corrupted size
        with self.assertRaises(Exception):
            metric = AccuracyMetric()
            pred_dict = {"pred": torch.zeros(4, 3, 2)}
            target_dict = {'target': torch.zeros(4, 3)}
            metric(pred_dict=pred_dict, target_dict=target_dict)

            pred_dict = {"pred": torch.zeros(4, 3, 2)}
            target_dict = {'target': torch.zeros(4)}
            metric(pred_dict=pred_dict, target_dict=target_dict)

            print(metric.get_metric())

    def test_AccuaryMetric4(self):
        # (4) accuracy must equal a manual argmax comparison
        metric = AccuracyMetric()
        pred_dict = {"pred": torch.randn(4, 3, 2)}
        target_dict = {'target': torch.ones(4, 3)}
        metric(pred_dict=pred_dict, target_dict=target_dict)
        ans = torch.argmax(pred_dict["pred"], dim=2).to(target_dict["target"]) == target_dict["target"]
        res = metric.get_metric()
        self.assertIsInstance(res, dict)
        self.assertIn("acc", res)
        self.assertAlmostEqual(res["acc"], float(ans.float().mean()), places=3)

    def test_AccuaryMetric5(self):
        # (5) same comparison, reading the metric with reset=False
        metric = AccuracyMetric()
        pred_dict = {"pred": torch.randn(4, 3, 2)}
        target_dict = {'target': torch.zeros(4, 3)}
        metric(pred_dict=pred_dict, target_dict=target_dict)
        res = metric.get_metric(reset=False)
        ans = (torch.argmax(pred_dict["pred"], dim=2).float() == target_dict["target"]).float().mean()
        self.assertAlmostEqual(res["acc"], float(ans), places=4)

    def test_AccuaryMetric6(self):
        # (6) check numpy array is not acceptable
        with self.assertRaises(Exception):
            metric = AccuracyMetric()
            pred_dict = {"pred": np.zeros((4, 3, 2))}
            target_dict = {'target': np.zeros((4, 3))}
            metric(pred_dict=pred_dict, target_dict=target_dict)

    def test_AccuaryMetric7(self):
        # (7) check map, match
        metric = AccuracyMetric(pred='predictions', target='targets')
        pred_dict = {"predictions": torch.randn(4, 3, 2)}
        target_dict = {'targets': torch.zeros(4, 3)}
        metric(pred_dict=pred_dict, target_dict=target_dict)
        res = metric.get_metric()
        ans = (torch.argmax(pred_dict["predictions"], dim=2).float() == target_dict["targets"]).float().mean()
        self.assertAlmostEqual(res["acc"], float(ans), places=4)

    def test_AccuaryMetric8(self):
        # (8) check map, match: all-zero logits against all-zero targets.
        # The assertion is no longer wrapped in try/except, which used to
        # swallow an AssertionError and let the test pass unconditionally.
        metric = AccuracyMetric(pred='predictions', target='targets')
        pred_dict = {"predictions": torch.zeros(4, 3, 2)}
        target_dict = {'targets': torch.zeros(4, 3)}
        metric(pred_dict=pred_dict, target_dict=target_dict)
        self.assertDictEqual(metric.get_metric(), {'acc': 1})

    def test_AccuaryMetric9(self):
        # (9) check map, include unused: the extra 'unused' entry is ignored
        metric = AccuracyMetric(pred='prediction', target='targets')
        pred_dict = {"prediction": torch.zeros(4, 3, 2), 'unused': 1}
        target_dict = {'targets': torch.zeros(4, 3)}
        metric(pred_dict=pred_dict, target_dict=target_dict)
        self.assertDictEqual(metric.get_metric(), {'acc': 1})

    def test_AccuaryMetric10(self):
        # (10) check _fast_metric: the keys do not match the default names and
        # the fast path does not apply (two entries in pred_dict), so the call
        # is expected to raise - same setup as the ConfusionMatrixMetric test.
        with self.assertRaises(Exception):
            metric = AccuracyMetric()
            pred_dict = {"predictions": torch.zeros(4, 3, 2), "seq_len": torch.ones(3) * 3}
            target_dict = {'targets': torch.zeros(4, 3)}
            metric(pred_dict=pred_dict, target_dict=target_dict)
            self.assertDictEqual(metric.get_metric(), {'acc': 1})

    def test_duplicate(self):
        # Potential bug in 0.4.1: duplicated parameter names must not be a problem.
        # The dicts carry the mapped keys and also the default 'pred' / 'target'.
        metric = AccuracyMetric(pred='predictions', target='targets')
        pred_dict = {"predictions": torch.zeros(4, 3, 2), "seq_len": torch.ones(4) * 3, 'pred': 0}
        target_dict = {'targets': torch.zeros(4, 3), 'target': 0}
        metric(pred_dict=pred_dict, target_dict=target_dict)

    def test_seq_len(self):
        # Feed the same batch twice; seq_len is modified in place between calls.
        N = 256
        seq_len = torch.zeros(N).long()
        seq_len[0] = 2
        pred = {'pred': torch.ones(N, 2)}
        target = {'target': torch.ones(N, 2), 'seq_len': seq_len}
        metric = AccuracyMetric()
        metric(pred_dict=pred, target_dict=target)
        self.assertDictEqual(metric.get_metric(), {'acc': 1.})
        seq_len[1:] = 1
        metric(pred_dict=pred, target_dict=target)
        self.assertDictEqual(metric.get_metric(), {'acc': 1.})


class SpanFPreRecMetricTest(unittest.TestCase):
    """Tests for ``SpanFPreRecMetric`` and the tag-to-span helper functions."""

    def test_case1(self):
        # Tag sequences with labels: spans returned by the helpers must match.
        from fastNLP.core.metrics import _bmes_tag_to_spans
        from fastNLP.core.metrics import _bio_tag_to_spans

        bmes_lst = ['M-8', 'S-2', 'S-0', 'B-9', 'B-6', 'E-5', 'B-7', 'S-2', 'E-7', 'S-8']
        bio_lst = ['O-8', 'O-2', 'B-0', 'O-9', 'I-6', 'I-5', 'I-7', 'I-2', 'I-7', 'O-8']
        expect_bmes_res = set()
        expect_bmes_res.update([('8', (0, 1)), ('2', (1, 2)), ('0', (2, 3)), ('9', (3, 4)), ('6', (4, 5)),
                                ('5', (5, 6)), ('7', (6, 7)), ('2', (7, 8)), ('7', (8, 9)), ('8', (9, 10))])
        expect_bio_res = set()
        expect_bio_res.update([('7', (8, 9)), ('0', (2, 3)), ('2', (7, 8)), ('5', (5, 6)),
                               ('6', (4, 5)), ('7', (6, 7))])
        self.assertSetEqual(expect_bmes_res, set(_bmes_tag_to_spans(bmes_lst)))
        self.assertSetEqual(expect_bio_res, set(_bio_tag_to_spans(bio_lst)))

    def test_case2(self):
        # Tag sequences without labels
        from fastNLP.core.metrics import _bmes_tag_to_spans
        from fastNLP.core.metrics import _bio_tag_to_spans

        bmes_lst = ['B', 'E', 'B', 'S', 'B', 'M', 'E', 'M', 'B', 'E']
        bio_lst = ['I', 'B', 'O', 'O', 'I', 'O', 'I', 'B', 'O', 'O']
        expect_bmes_res = set()
        expect_bmes_res.update([('', (0, 2)), ('', (2, 3)), ('', (3, 4)), ('', (4, 7)), ('', (7, 8)), ('', (8, 10))])
        expect_bio_res = set()
        expect_bio_res.update([('', (7, 8)), ('', (6, 7)), ('', (4, 5)), ('', (0, 1)), ('', (1, 2))])
        self.assertSetEqual(expect_bmes_res, set(_bmes_tag_to_spans(bmes_lst)))
        self.assertSetEqual(expect_bio_res, set(_bio_tag_to_spans(bio_lst)))

    def test_case3(self):
        # Fixed logits and targets for a BIO vocabulary; per-label and gross
        # scores are compared against hard-coded expected values.
        number_labels = 4
        # bio tag
        fastnlp_bio_vocab = Vocabulary(unknown=None, padding=None)
        fastnlp_bio_vocab.word_count = Counter(_generate_tags('BIO', number_labels))
        fastnlp_bio_metric = SpanFPreRecMetric(tag_vocab=fastnlp_bio_vocab, only_gross=False)
        bio_sequence = torch.FloatTensor([[[-0.4424, -0.4579, -0.7376,  1.8129,  0.1316,  1.6566, -1.2169,
          -0.3782,  0.8240],
         [-1.2348, -0.1876, -0.1462, -0.4834, -0.6692, -0.9735,  1.1563,
          -0.3562, -1.4116],
         [ 1.6550, -0.9555,  0.3782, -1.3160, -1.5835, -0.3443, -1.7858,
           2.0023,  0.7075],
         [-0.3772, -0.5447, -1.5631,  1.1614,  1.4598, -1.2764,  0.5186,
           0.3832, -0.1540],
         [-0.1011,  0.0600,  1.1090, -0.3545,  0.1284,  1.1484, -1.0120,
          -1.3508, -0.9513],
         [ 1.8948,  0.8627, -2.1359,  1.3740, -0.7499,  1.5019,  0.6919,
          -0.0842, -0.4294]],

        [[-0.2802,  0.6941, -0.4788, -0.3845,  1.7752,  1.2950, -1.9490,
          -1.4138, -0.8853],
         [-1.3752, -0.5457, -0.5305,  0.4018,  0.2934,  0.7931,  2.3845,
          -1.0726,  0.0364],
         [ 0.3621,  0.2609,  0.1269, -0.5950,  0.7212,  0.5959,  1.6264,
          -0.8836, -0.9320],
         [ 0.2003, -1.0758, -1.1560, -0.6472, -1.7549,  0.1264,  0.6044,
          -1.6857,  1.1571],
         [ 1.4277, -0.4915,  0.4496,  2.2027,  0.0730, -3.1792, -0.5125,
          -0.5837,  1.0184],
         [ 1.9495,  1.7145, -0.2143, -0.1230, -0.2205,  0.8250,  0.4943,
          -0.9025,  0.0864]]])
        bio_target = torch.LongTensor([[3, 6, 0, 8, 2, 4],
                                        [4, 1, 7, 0, 4, 7]])
        fastnlp_bio_metric({'pred': bio_sequence, 'seq_len': torch.LongTensor([6, 6])}, {'target': bio_target})
        expect_bio_res = {'pre-1': 0.333333, 'rec-1': 0.333333, 'f-1': 0.333333, 'pre-2': 0.5, 'rec-2': 0.5,
                          'f-2': 0.5, 'pre-0': 0.0, 'rec-0': 0.0, 'f-0': 0.0, 'pre-3': 0.0, 'rec-3': 0.0,
                          'f-3': 0.0, 'pre': 0.222222, 'rec': 0.181818, 'f': 0.2}

        self.assertDictEqual(expect_bio_res, fastnlp_bio_metric.get_metric())

    def test_case4(self):
        # bmes tag: compare the metric with a hand-written reference evaluation
        # on random (unseeded) targets and predictions.
        def _generate_samples():
            """Create 3 random sentences of 9 BMES-tagged words each.

            Returns the index matrix (zero-filled past each sentence length),
            the raw tag lists, the sentence lengths and the vocabulary.
            """
            target = []
            seq_len = []
            vocab = Vocabulary(unknown=None, padding=None)
            for _ in range(3):
                target_i = []
                seq_len_i = 0
                for _ in range(1, 10):
                    word_len = np.random.randint(1, 5)
                    seq_len_i += word_len
                    if word_len == 1:
                        target_i.append('S')
                    else:
                        target_i.append('B')
                        target_i.extend(['M'] * (word_len - 2))
                        target_i.append('E')
                vocab.add_word_lst(target_i)
                target.append(target_i)
                seq_len.append(seq_len_i)
            target_ = np.zeros((3, max(seq_len)))
            for i in range(3):
                target_i = [vocab.to_index(t) for t in target[i]]
                target_[i, :seq_len[i]] = target_i
            return target_, target, seq_len, vocab

        def get_eval(raw_target, pred, vocab, seq_len):
            """Reference word-level precision / recall / f over BMES tags."""
            pred = pred.argmax(dim=-1).tolist()
            tp = 0
            gold = 0
            seg = 0
            pred_target = []
            for i in range(len(seq_len)):
                tags = [vocab.to_word(p) for p in pred[i][:seq_len[i]]]
                # Group the predicted tags into [start, end] spans: B/S always
                # open a span, M/E extend the current one only after B or M.
                spans = []
                prev_bmes_tag = None
                for idx, tag in enumerate(tags):
                    if tag in ('B', 'S'):
                        spans.append([idx, idx])
                    elif tag in ('M', 'E') and prev_bmes_tag in ('B', 'M'):
                        spans[-1][1] = idx
                    else:
                        spans.append([idx, idx])
                    prev_bmes_tag = tag
                # Re-emit the spans as a well-formed BMES sequence.
                tmp = []
                for span in spans:
                    if span[1] - span[0] > 0:
                        tmp.extend(['B'] + ['M'] * (span[1] - span[0] - 1) + ['E'])
                    else:
                        tmp.append('S')
                pred_target.append(tmp)
            for i in range(len(seq_len)):
                raw_pred = pred_target[i]
                start = 0
                for j in range(seq_len[i]):
                    if raw_target[i][j] in ('E', 'S'):
                        # A gold word ends here; it counts as a true positive
                        # when every tag of the word was predicted identically.
                        flag = True
                        for k in range(start, j + 1):
                            if raw_target[i][k] != raw_pred[k]:
                                flag = False
                                break
                        if flag:
                            tp += 1
                        start = j + 1
                        gold += 1
                    if raw_pred[j] in ('E', 'S'):
                        seg += 1

            pre = round(tp / seg, 6)
            rec = round(tp / gold, 6)
            # Predictions are random, so tp may be 0; avoid ZeroDivisionError.
            # NOTE(review): 0.0 is presumably what the metric reports for
            # this case - confirm.
            f = round(2 * pre * rec / (pre + rec), 6) if pre + rec > 0 else 0.0
            return {'f': f, 'pre': pre, 'rec': rec}

        target, raw_target, seq_len, vocab = _generate_samples()
        pred = torch.randn(3, max(seq_len), 4)

        expected_metric = get_eval(raw_target, pred, vocab, seq_len)
        metric = SpanFPreRecMetric(vocab, encoding_type='bmes')
        metric({'pred': pred, 'seq_len': torch.LongTensor(seq_len)}, {'target': torch.from_numpy(target)})
        metric_value = metric.get_metric()
        for key, value in expected_metric.items():
            self.assertAlmostEqual(value, metric_value[key], places=5)

    def test_auto_encoding_type_infer(self):
        # Check that the encoding type can be inferred from the tag vocabulary
        vocabs = {}
        import random
        for encoding_type in ['bio', 'bioes', 'bmeso']:
            vocab = Vocabulary(unknown=None, padding=None)
            for _ in range(random.randint(10, 100)):
                label = str(random.randint(1, 10))
                for tag in encoding_type:
                    if tag != 'o':
                        vocab.add_word(f'{tag}-{label}')
                    else:
                        vocab.add_word('o')
            vocabs[encoding_type] = vocab
        for e in ['bio', 'bioes', 'bmeso']:
            with self.subTest(e=e):
                metric = SpanFPreRecMetric(tag_vocab=vocabs[e])
                # unittest assertion instead of a bare assert: it is not
                # stripped under -O and reports both values on failure
                self.assertEqual(metric.encoding_type, e)

        bmes_vocab = _generate_tags('bmes')
        vocab = Vocabulary()
        for tag in bmes_vocab:
            vocab.add_word(tag)
        metric = SpanFPreRecMetric(vocab)
        self.assertEqual(metric.encoding_type, 'bmes')

        # A vocabulary from which no encoding type can be inferred
        vocab = Vocabulary()
        for i in range(10):
            vocab.add_word(str(i))
        with self.assertRaises(Exception):
            SpanFPreRecMetric(vocab)

    def test_encoding_type(self):
        # Check that an error is raised when tag_vocab and encoding_type disagree
        vocabs = {}
        import random
        from itertools import product
        for encoding_type in ['bio', 'bioes', 'bmeso']:
            vocab = Vocabulary(unknown=None, padding=None)
            for _ in range(random.randint(10, 100)):
                label = str(random.randint(1, 10))
                for tag in encoding_type:
                    if tag != 'o':
                        vocab.add_word(f'{tag}-{label}')
                    else:
                        vocab.add_word('o')
            vocabs[encoding_type] = vocab
        for e1, e2 in product(['bio', 'bioes', 'bmeso'], ['bio', 'bioes', 'bmeso']):
            with self.subTest(e1=e1, e2=e2):
                if e1 == e2:
                    SpanFPreRecMetric(vocabs[e1], encoding_type=e2)
                else:
                    # Skip pairs where every tag prefix of e1 also occurs in
                    # e2; those are not asserted to fail here.
                    if set(e1) <= set(e2):
                        continue
                    with self.assertRaises(AssertionError):
                        SpanFPreRecMetric(vocabs[e1], encoding_type=e2)
        for encoding_type in ['bio', 'bioes', 'bmeso']:
            with self.assertRaises(AssertionError):
                SpanFPreRecMetric(vocabs[encoding_type], encoding_type='bmes')

        with self.assertWarns(Warning):
            vocab = Vocabulary(unknown=None, padding=None).add_word_lst(list('bmes'))
            SpanFPreRecMetric(vocab, encoding_type='bmeso')
            vocab = Vocabulary().add_word_lst(list('bmes'))
            SpanFPreRecMetric(vocab, encoding_type='bmeso')


class TestCMRC2018Metric(unittest.TestCase):
    """Checks ``CMRC2018Metric`` on a two-sample batch with forced spans."""

    def test_case1(self):
        # Check that the scores are computed correctly
        import torch

        raw_chars = [list("abcsdef"), list("123456s789")]
        answers = [["abc"] * 3, ["12"] * 3]
        context_len = torch.LongTensor([3, 6])

        longest = max(len(chars) for chars in raw_chars)
        pred_start = torch.randn(2, longest)
        pred_end = torch.randn(2, longest)
        # Overwrite one start and one end score per sample with a huge value.
        # Sample 0: positions 0..2 (exactly "abc").
        # Sample 1: positions 1..3 (the original comment says this picks "234").
        forced_spans = ((0, 2), (1, 3))
        for row, (start_pos, end_pos) in enumerate(forced_spans):
            pred_start[row, start_pos] = 1000
            pred_end[row, end_pos] = 1000

        metric = CMRC2018Metric()
        metric.evaluate(answers=answers, raw_chars=raw_chars, pred_start=pred_start,
                        pred_end=pred_end, context_len=context_len)

        self.assertDictEqual(metric.get_metric(), {'f1': 70.0, 'em': 50.0})


class TestUsefulFunctions(unittest.TestCase):
    # Exercises a few helper functions in metrics.py that look useful
    def test_case_1(self):
        # multi-class
        shape = (10, 1)
        first_arg = np.random.randint(0, 3, size=shape)
        second_arg = np.random.randint(0, 3, size=shape)
        _accuracy_topk(first_arg, second_arg, k=3)
        _pred_topk(np.random.randint(0, 3, size=shape))

        # Running without an error is all that is checked


class TestClassfiyFPreRecMetric(unittest.TestCase):
    """Checks ``ClassifyFPreRecMetric`` against pre-computed reference scores."""

    def test_case_1(self):
        # 32 samples, 5 classes; the predicted class is the argmax of each row.
        pred = torch.tensor([[-0.4375, -0.1779, -1.0985, -1.1592,  0.4910],
        [ 1.3410,  0.2889, -0.8667, -1.8580,  0.3029],
        [ 0.7459, -1.1957,  0.3231,  0.0308, -0.1847],
        [ 1.1439, -0.0057,  0.8203,  0.0312, -1.0051],
        [-0.4870,  0.3215, -0.8290,  0.9221,  0.4683],
        [ 0.9078,  1.0674, -0.5629,  0.3895,  0.8917],
        [-0.7743, -0.4041, -0.9026,  0.2112,  1.0892],
        [ 1.8232, -1.4188, -2.5615, -2.4187,  0.5907],
        [-1.0592,  0.4164, -0.1192,  1.4238, -0.9258],
        [-1.1137,  0.5773,  2.5778,  0.5398, -0.3323],
        [-0.3868, -0.5165,  0.2286, -1.3876,  0.5561],
        [-0.3304,  1.3619, -1.5744,  0.4902, -0.7661],
        [ 1.8387,  0.5234,  0.4269,  1.3748, -1.2793],
        [ 0.6692,  0.2571,  1.2425, -0.5894, -0.0184],
        [ 0.4165,  0.4084, -0.1280,  1.4489, -2.3058],
        [-0.5826, -0.5469,  1.5898, -0.2786, -0.9882],
        [-1.5548, -2.2891,  0.2983, -1.2145, -0.1947],
        [-0.7222,  2.3543, -0.5801, -0.0640, -1.5614],
        [-1.4978,  1.9297, -1.3652, -0.2358,  2.5566],
        [ 0.1561, -0.0316,  0.9331,  1.0363,  2.3949],
        [ 0.2650, -0.8459,  1.3221,  0.1321, -1.1900],
        [ 0.0664, -1.2353, -0.5242, -1.4491,  1.3300],
        [-0.2744,  0.0941,  0.7157,  0.1404,  1.2046],
        [ 0.9341, -0.6652,  1.4512,  0.9608, -0.3623],
        [-1.1641,  0.0873,  0.1163, -0.2068, -0.7002],
        [ 1.4775, -2.0025, -0.5634, -0.1589,  0.0247],
        [ 1.0151,  1.0304, -0.1042, -0.6955, -0.0629],
        [-0.3119, -0.4558,  0.7757,  0.0758, -1.6297],
        [ 1.0654,  0.0313, -0.7716,  0.1194,  0.6913],
        [-0.8088, -0.6648, -0.5018, -0.0230, -0.8207],
        [-0.7753, -0.3508,  1.6163,  0.7158,  1.5207],
        [ 0.8692,  0.7718, -0.6734,  0.6515,  0.0641]])
        target = torch.tensor([0, 2, 4, 1, 4, 0, 1, 3, 3, 3, 1, 3, 4, 4, 3, 4, 0, 2, 4, 4, 3, 4, 4, 3,
        0, 3, 0, 0, 0, 1, 3, 1])

        # macro-averaged gross scores
        metric = ClassifyFPreRecMetric(f_type='macro')
        metric.evaluate(pred, target)
        result_dict = metric.get_metric()
        f1_score = 0.1882051282051282
        recall = 0.1619047619047619
        pre = 0.23928571428571427

        ground_truth = {'f': f1_score, 'pre': pre, 'rec': recall}
        for keys in ['f', 'pre', 'rec']:
            self.assertAlmostEqual(result_dict[keys], ground_truth[keys], delta=0.000001)

        # micro-averaged gross scores
        metric = ClassifyFPreRecMetric(f_type='micro')
        metric.evaluate(pred, target)
        result_dict = metric.get_metric()
        f1_score = 0.21875
        recall = 0.21875
        pre = 0.21875

        ground_truth = {'f': f1_score, 'pre': pre, 'rec': recall}
        for keys in ['f', 'pre', 'rec']:
            self.assertAlmostEqual(result_dict[keys], ground_truth[keys], delta=0.000001)

        # per-class scores (only_gross=False)
        metric = ClassifyFPreRecMetric(only_gross=False, f_type='macro')
        metric.evaluate(pred, target)
        result_dict = metric.get_metric(reset=True)
        ground_truth = {
            '0': {'f1-score': 0.13333333333333333, 'precision': 0.125, 'recall': 0.14285714285714285, 'support': 7},
            '1': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 5},
            '2': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 2},
            '3': {'f1-score': 0.30769230769230765, 'precision': 0.5, 'recall': 0.2222222222222222, 'support': 9},
            '4': {'f1-score': 0.5, 'precision': 0.5714285714285714, 'recall': 0.4444444444444444, 'support': 9},
            'macro avg': {'f1-score': 0.1882051282051282, 'precision': 0.23928571428571427, 'recall': 0.1619047619047619, 'support': 32},
            'micro avg': {'f1-score': 0.21875, 'precision': 0.21875, 'recall': 0.21875, 'support': 32},
            'weighted avg': {'f1-score': 0.2563301282051282, 'precision': 0.3286830357142857, 'recall': 0.21875, 'support': 32},
        }
        field_names = {"pre": "precision", "rec": "recall", "f": "f1-score"}
        for key, value in result_dict.items():
            # Gross scores were verified above. The previous condition
            # (keys=="f" or "pre" or "rec") was always truthy, so none of
            # the per-class values were ever compared.
            if key in ("f", "pre", "rec"):
                continue
            # assumes per-class keys look like "<f|pre|rec>-<label>" - TODO confirm
            prefix, _, label = key.partition('-')
            self.assertAlmostEqual(value, ground_truth[label][field_names[prefix]], delta=0.000001)