# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase

import numpy as np
import sklearn.metrics
import torch
from mmengine.evaluator import Evaluator
from mmengine.registry import init_default_scope

from mmpretrain.evaluation.metrics import AveragePrecision, MultiLabelMetric
from mmpretrain.structures import DataSample

init_default_scope('mmpretrain')


class TestMultiLabel(TestCase):

    def test_calculate(self):
        """Test using the metric from static method."""

        y_true = [[0], [1, 3], [0, 1, 2], [3]]
        y_pred = [[0, 3], [0, 2], [1, 2], [2, 3]]
        y_true_binary = np.array([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        y_pred_binary = np.array([
            [1, 0, 0, 1],
            [1, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 1, 1],
        ])
        y_pred_score = np.array([
            [0.8, 0, 0, 0.6],
            [0.2, 0, 0.6, 0],
            [0, 0.9, 0.6, 0],
            [0, 0, 0.2, 0.3],
        ])

        # Test with sequence of category indexes
        res = MultiLabelMetric.calculate(
            y_pred,
            y_true,
            pred_indices=True,
            target_indices=True,
            num_classes=4)
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, y_pred_binary, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, y_pred_binary, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, y_pred_binary, average='macro') * 100
        self.assertTensorEqual(precision, expect_precision)
        self.assertTensorEqual(recall, expect_recall)
        self.assertTensorEqual(f1_score, expect_f1)
        self.assertTensorEqual(support, 7)

        # Test with onehot input
        res = MultiLabelMetric.calculate(y_pred_binary,
                                         torch.from_numpy(y_true_binary))
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        # Expected values come from sklearn
        self.assertTensorEqual(precision, expect_precision)
        self.assertTensorEqual(recall, expect_recall)
        self.assertTensorEqual(f1_score, expect_f1)
        self.assertTensorEqual(support, 7)

        # Test with topk argument
        res = MultiLabelMetric.calculate(
            y_pred_score,
            y_true,
            target_indices=True,
            topk=1,
            num_classes=4)
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        # Expected values come from sklearn
        top1_y_pred = np.array([
            [1, 0, 0, 0],
            [0, 0, 1, 0],
            [0, 1, 0, 0],
            [0, 0, 0, 1],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        self.assertTensorEqual(precision, expect_precision)
        self.assertTensorEqual(recall, expect_recall)
        self.assertTensorEqual(f1_score, expect_f1)
        self.assertTensorEqual(support, 7)

        # Test with thr argument
        res = MultiLabelMetric.calculate(
            y_pred_score,
            y_true,
            target_indices=True,
            thr=0.25,
            num_classes=4)
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        # Expected values come from sklearn
        thr_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        self.assertTensorEqual(precision, expect_precision)
        self.assertTensorEqual(recall, expect_recall)
        self.assertTensorEqual(f1_score, expect_f1)
        self.assertTensorEqual(support, 7)

        # Test with an invalid target type
        with self.assertRaisesRegex(TypeError, " is not"):
            MultiLabelMetric.calculate(y_pred, 'hi', num_classes=10)

        # Test with an invalid `average` argument
        with self.assertRaisesRegex(AssertionError,
                                    'Invalid `average` argument,'):
            MultiLabelMetric.calculate(
                y_pred, y_true, average='m', num_classes=10)

        y_true_binary = np.array([[1, 0, 0, 0], [0, 1, 0, 1]])
        y_pred_binary = np.array([[1, 0, 0, 1], [1, 0, 1, 0], [0, 1, 1, 0]])
        # Test with mismatched pred and target sizes
        with self.assertRaisesRegex(AssertionError, 'The size of pred'):
            MultiLabelMetric.calculate(y_pred_binary, y_true_binary)

        # Test with an invalid target type
        with self.assertRaisesRegex(TypeError,
                                    'The `pred` and `target` must'):
            MultiLabelMetric.calculate(y_pred_binary, 5)

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        y_true = [[0], [1, 3], [0, 1, 2], [3]]
        y_true_binary = torch.tensor([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        y_pred_score = torch.tensor([
            [0.8, 0, 0, 0.6],
            [0.2, 0, 0.6, 0],
            [0, 0.9, 0.6, 0],
            [0, 0, 0.2, 0.3],
        ])

        pred = [
            DataSample(num_classes=4).set_pred_score(i).set_gt_label(j)
            for i, j in zip(y_pred_score, y_true)
        ]

        # Test with default arguments
        evaluator = Evaluator(dict(type='MultiLabelMetric'))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn (default threshold 0.5)
        thr05_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr05_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr05_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr05_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision'], expect_precision)
        self.assertEqual(res['multi-label/recall'], expect_recall)
        self.assertEqual(res['multi-label/f1-score'], expect_f1)

        # Test with topk argument
        evaluator = Evaluator(dict(type='MultiLabelMetric', topk=1))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn
        top1_y_pred = np.array([
            [1, 0, 0, 0],
            [0, 0, 1, 0],
            [0, 1, 0, 0],
            [0, 0, 0, 1],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision_top1'], expect_precision)
        self.assertEqual(res['multi-label/recall_top1'], expect_recall)
        self.assertEqual(res['multi-label/f1-score_top1'], expect_f1)

        # Test with both thr and topk arguments
        evaluator = Evaluator(dict(type='MultiLabelMetric', thr=0.25, topk=1))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn
        thr_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision_thr-0.25'],
                         expect_precision)
        self.assertEqual(res['multi-label/recall_thr-0.25'], expect_recall)
        self.assertEqual(res['multi-label/f1-score_thr-0.25'], expect_f1)

        # Test with average='micro'
        evaluator = Evaluator(dict(type='MultiLabelMetric', average='micro'))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr05_y_pred, average='micro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr05_y_pred, average='micro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr05_y_pred, average='micro') * 100
        self.assertAlmostEqual(
            res['multi-label/precision_micro'], expect_precision, places=4)
        self.assertAlmostEqual(
            res['multi-label/recall_micro'], expect_recall, places=4)
        self.assertAlmostEqual(
            res['multi-label/f1-score_micro'], expect_f1, places=4)

        # Test with average=None
        evaluator = Evaluator(dict(type='MultiLabelMetric', average=None))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr05_y_pred, average=None) * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr05_y_pred, average=None) * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr05_y_pred, average=None) * 100
        np.testing.assert_allclose(res['multi-label/precision_classwise'],
                                   expect_precision)
        np.testing.assert_allclose(res['multi-label/recall_classwise'],
                                   expect_recall)
        np.testing.assert_allclose(res['multi-label/f1-score_classwise'],
                                   expect_f1)

        # Test with gt_score
        pred = [
            DataSample(num_classes=4).set_pred_score(i).set_gt_score(j)
            for i, j in zip(y_pred_score, y_true_binary)
        ]

        evaluator = Evaluator(
            dict(type='MultiLabelMetric', items=['support']))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        self.assertEqual(res['multi-label/support'], 7)

    def assertTensorEqual(self,
                          tensor: torch.Tensor,
                          value: float,
                          msg=None,
                          **kwarg):
        tensor = tensor.to(torch.float32)
        if tensor.dim() == 0:
            tensor = tensor.unsqueeze(0)
        value = torch.FloatTensor([value])
        try:
            torch.testing.assert_allclose(tensor, value, **kwarg)
        except AssertionError as e:
            self.fail(self._formatMessage(msg, str(e) + str(tensor)))


class TestAveragePrecision(TestCase):

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        y_pred = torch.tensor([
            [0.9, 0.8, 0.3, 0.2],
            [0.1, 0.2, 0.2, 0.1],
            [0.7, 0.5, 0.9, 0.3],
            [0.8, 0.1, 0.1, 0.2],
        ])
        y_true = torch.tensor([
            [1, 1, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [1, 0, 0, 0],
        ])

        pred = [
            DataSample(num_classes=4).set_pred_score(i).set_gt_score(j)
            for i, j in zip(y_pred, y_true)
        ]

        # Test with the default macro average
        evaluator = Evaluator(dict(type='AveragePrecision'))
        evaluator.process(pred)
        res = evaluator.evaluate(5)
        self.assertIsInstance(res, dict)
        self.assertAlmostEqual(res['multi-label/mAP'], 70.83333, places=4)

        # Test with average mode None
        evaluator = Evaluator(dict(type='AveragePrecision', average=None))
        evaluator.process(pred)
        res = evaluator.evaluate(5)
        self.assertIsInstance(res, dict)
        aps = res['multi-label/AP_classwise']
        self.assertAlmostEqual(aps[0], 100., places=4)
        self.assertAlmostEqual(aps[1], 83.3333, places=4)
        self.assertAlmostEqual(aps[2], 100, places=4)
        self.assertAlmostEqual(aps[3], 0, places=4)

        # Test with gt_label without score
        pred = [
            DataSample(num_classes=4).set_pred_score(i).set_gt_label(j)
            for i, j in zip(y_pred, [[0, 1], [1], [2], [0]])
        ]
        evaluator = Evaluator(dict(type='AveragePrecision'))
        evaluator.process(pred)
        res = evaluator.evaluate(5)
        self.assertAlmostEqual(res['multi-label/mAP'], 70.83333, places=4)

    def test_calculate(self):
        """Test using the metric from static method."""
        y_true = np.array([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        y_pred = np.array([
            [0.9, 0.8, 0.3, 0.2],
            [0.1, 0.2, 0.2, 0.1],
            [0.7, 0.5, 0.9, 0.3],
            [0.8, 0.1, 0.1, 0.2],
        ])

        ap_score = AveragePrecision.calculate(y_pred, y_true)
        expect_ap = sklearn.metrics.average_precision_score(y_true,
                                                            y_pred) * 100
        self.assertTensorEqual(ap_score, expect_ap)

        # Test with an invalid `average` argument
        with self.assertRaisesRegex(AssertionError,
                                    'Invalid `average` argument,'):
            AveragePrecision.calculate(y_pred, y_true, average='m')

        y_true = np.array([[1, 0, 0, 0], [0, 1, 0, 1]])
        y_pred = np.array([[1, 0, 0, 1], [1, 0, 1, 0], [0, 1, 1, 0]])
        # Test with mismatched pred and target shapes
        with self.assertRaisesRegex(AssertionError,
                                    'Both `pred` and `target`'):
            AveragePrecision.calculate(y_pred, y_true)

        # Test with an invalid target type
        with self.assertRaisesRegex(TypeError, " is not an"):
            AveragePrecision.calculate(y_pred, 5)

    def assertTensorEqual(self,
                          tensor: torch.Tensor,
                          value: float,
                          msg=None,
                          **kwarg):
        tensor = tensor.to(torch.float32)
        if tensor.dim() == 0:
            tensor = tensor.unsqueeze(0)
        value = torch.FloatTensor([value])
        try:
            torch.testing.assert_allclose(tensor, value, **kwarg)
        except AssertionError as e:
            self.fail(self._formatMessage(msg, str(e) + str(tensor)))