# mmpretrain/tests/test_metrics/test_multi_label.py

# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase

import numpy as np
import sklearn.metrics
import torch
from mmengine.evaluator import Evaluator

from mmcls.engine import ClsDataSample
from mmcls.metrics import AveragePrecision, MultiLabelMetric
from mmcls.utils import register_all_modules

register_all_modules()


class TestMultiLabel(TestCase):

def test_calculate(self):
"""Test using the metric from static method."""
y_true = [[0], [1, 3], [0, 1, 2], [3]]
y_pred = [[0, 3], [0, 2], [1, 2], [2, 3]]
y_true_binary = np.array([
[1, 0, 0, 0],
[0, 1, 0, 1],
[1, 1, 1, 0],
[0, 0, 0, 1],
])
y_pred_binary = np.array([
[1, 0, 0, 1],
[1, 0, 1, 0],
[0, 1, 1, 0],
[0, 0, 1, 1],
])
y_pred_score = np.array([
[0.8, 0, 0, 0.6],
[0.2, 0, 0.6, 0],
[0, 0.9, 0.6, 0],
[0, 0, 0.2, 0.3],
])
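# ``y_pred``/``y_true`` hold predicted/true category indices, the ``*_binary``
# arrays are their multi-hot equivalents, and ``y_pred_score`` holds the
# per-class scores used by the ``topk``/``thr`` tests below.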
# Test with sequence of category indexes
res = MultiLabelMetric.calculate(
y_pred,
y_true,
pred_indices=True,
target_indices=True,
num_classes=4)
self.assertIsInstance(res, tuple)
precision, recall, f1_score, support = res
expect_precision = sklearn.metrics.precision_score(
y_true_binary, y_pred_binary, average='macro') * 100
expect_recall = sklearn.metrics.recall_score(
y_true_binary, y_pred_binary, average='macro') * 100
expect_f1 = sklearn.metrics.f1_score(
y_true_binary, y_pred_binary, average='macro') * 100
self.assertTensorEqual(precision, expect_precision)
self.assertTensorEqual(recall, expect_recall)
self.assertTensorEqual(f1_score, expect_f1)
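# ``support`` is the total number of positive ground-truth labels
# (``y_true_binary`` sums to 7).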
self.assertTensorEqual(support, 7)
# Test with onehot input
res = MultiLabelMetric.calculate(y_pred_binary,
torch.from_numpy(y_true_binary))
self.assertIsInstance(res, tuple)
precision, recall, f1_score, support = res
# Expected values come from sklearn
self.assertTensorEqual(precision, expect_precision)
self.assertTensorEqual(recall, expect_recall)
self.assertTensorEqual(f1_score, expect_f1)
self.assertTensorEqual(support, 7)
# Test with topk argument
res = MultiLabelMetric.calculate(
y_pred_score, y_true, target_indices=True, topk=1, num_classes=4)
self.assertIsInstance(res, tuple)
precision, recall, f1_score, support = res
# Expected values come from sklearn
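# ``topk=1`` keeps only the highest-scoring class of each sample as a
# positive prediction, i.e. the row-wise argmax of ``y_pred_score``: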
top1_y_pred = np.array([
[1, 0, 0, 0],
[0, 0, 1, 0],
[0, 1, 0, 0],
[0, 0, 0, 1],
])
expect_precision = sklearn.metrics.precision_score(
y_true_binary, top1_y_pred, average='macro') * 100
expect_recall = sklearn.metrics.recall_score(
y_true_binary, top1_y_pred, average='macro') * 100
expect_f1 = sklearn.metrics.f1_score(
y_true_binary, top1_y_pred, average='macro') * 100
self.assertTensorEqual(precision, expect_precision)
self.assertTensorEqual(recall, expect_recall)
self.assertTensorEqual(f1_score, expect_f1)
self.assertTensorEqual(support, 7)
# Test with thr argument
res = MultiLabelMetric.calculate(
y_pred_score, y_true, target_indices=True, thr=0.25, num_classes=4)
self.assertIsInstance(res, tuple)
precision, recall, f1_score, support = res
# Expected values come from sklearn
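# ``thr=0.25`` marks every class whose score exceeds 0.25 as positive: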
thr_y_pred = np.array([
[1, 0, 0, 1],
[0, 0, 1, 0],
[0, 1, 1, 0],
[0, 0, 0, 1],
])
expect_precision = sklearn.metrics.precision_score(
y_true_binary, thr_y_pred, average='macro') * 100
expect_recall = sklearn.metrics.recall_score(
y_true_binary, thr_y_pred, average='macro') * 100
expect_f1 = sklearn.metrics.f1_score(
y_true_binary, thr_y_pred, average='macro') * 100
self.assertTensorEqual(precision, expect_precision)
self.assertTensorEqual(recall, expect_recall)
self.assertTensorEqual(f1_score, expect_f1)
self.assertTensorEqual(support, 7)
# Test with a target of an invalid type (string)
with self.assertRaisesRegex(TypeError, "<class 'str'> is not"):
MultiLabelMetric.calculate(y_pred, 'hi', num_classes=10)
# Test with an invalid ``average`` argument
with self.assertRaisesRegex(AssertionError,
'Invalid `average` argument,'):
MultiLabelMetric.calculate(
y_pred, y_true, average='m', num_classes=10)
y_true_binary = np.array([[1, 0, 0, 0], [0, 1, 0, 1]])
y_pred_binary = np.array([[1, 0, 0, 1], [1, 0, 1, 0], [0, 1, 1, 0]])
# Test with mismatched pred and target sizes
with self.assertRaisesRegex(AssertionError, 'The size of pred'):
MultiLabelMetric.calculate(y_pred_binary, y_true_binary)
# Test with a target of an invalid type (plain int)
with self.assertRaisesRegex(TypeError, 'The `pred` and `target` must'):
MultiLabelMetric.calculate(y_pred_binary, 5)

def test_evaluate(self):
fake_data_batch = [{
'inputs': None,
'data_sample': ClsDataSample()
} for _ in range(4)]
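# The batch itself carries empty data samples; the metric reads predictions
# and ground truth from the ``pred`` samples constructed below.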
y_true = [[0], [1, 3], [0, 1, 2], [3]]
y_true_binary = torch.tensor([
[1, 0, 0, 0],
[0, 1, 0, 1],
[1, 1, 1, 0],
[0, 0, 0, 1],
])
y_pred_score = torch.tensor([
[0.8, 0, 0, 0.6],
[0.2, 0, 0.6, 0],
[0, 0.9, 0.6, 0],
[0, 0, 0.2, 0.3],
])
pred = [
ClsDataSample(num_classes=4).set_pred_score(i).set_gt_label(j)
for i, j in zip(y_pred_score, y_true)
]
# Test with default argument
evaluator = Evaluator(dict(type='MultiLabelMetric'))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(4)
self.assertIsInstance(res, dict)
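# Without ``thr``/``topk``, the default threshold of 0.5 is used, so only
# scores above 0.5 count as positive predictions: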
thr05_y_pred = np.array([
[1, 0, 0, 1],
[0, 0, 1, 0],
[0, 1, 1, 0],
[0, 0, 0, 0],
])
expect_precision = sklearn.metrics.precision_score(
y_true_binary, thr05_y_pred, average='macro') * 100
expect_recall = sklearn.metrics.recall_score(
y_true_binary, thr05_y_pred, average='macro') * 100
expect_f1 = sklearn.metrics.f1_score(
y_true_binary, thr05_y_pred, average='macro') * 100
self.assertEqual(res['multi-label/precision'], expect_precision)
self.assertEqual(res['multi-label/recall'], expect_recall)
self.assertEqual(res['multi-label/f1-score'], expect_f1)
# Test with topk argument
evaluator = Evaluator(dict(type='MultiLabelMetric', topk=1))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(4)
self.assertIsInstance(res, dict)
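# ``topk=1`` again keeps only the argmax class of each sample: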
top1_y_pred = np.array([
[1, 0, 0, 0],
[0, 0, 1, 0],
[0, 1, 0, 0],
[0, 0, 0, 1],
])
expect_precision = sklearn.metrics.precision_score(
y_true_binary, top1_y_pred, average='macro') * 100
expect_recall = sklearn.metrics.recall_score(
y_true_binary, top1_y_pred, average='macro') * 100
expect_f1 = sklearn.metrics.f1_score(
y_true_binary, top1_y_pred, average='macro') * 100
self.assertEqual(res['multi-label/precision_top1'], expect_precision)
self.assertEqual(res['multi-label/recall_top1'], expect_recall)
self.assertEqual(res['multi-label/f1-score_top1'], expect_f1)
# Test with both thr and topk arguments
evaluator = Evaluator(dict(type='MultiLabelMetric', thr=0.25, topk=1))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(4)
self.assertIsInstance(res, dict)
# Expected values come from sklearn
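# With both ``thr`` and ``topk`` passed, the result keys checked here carry
# the ``thr-0.25`` suffix and match the 0.25-thresholded predictions: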
thr_y_pred = np.array([
[1, 0, 0, 1],
[0, 0, 1, 0],
[0, 1, 1, 0],
[0, 0, 0, 1],
])
expect_precision = sklearn.metrics.precision_score(
y_true_binary, thr_y_pred, average='macro') * 100
expect_recall = sklearn.metrics.recall_score(
y_true_binary, thr_y_pred, average='macro') * 100
expect_f1 = sklearn.metrics.f1_score(
y_true_binary, thr_y_pred, average='macro') * 100
self.assertEqual(res['multi-label/precision_thr-0.25'],
expect_precision)
self.assertEqual(res['multi-label/recall_thr-0.25'], expect_recall)
self.assertEqual(res['multi-label/f1-score_thr-0.25'], expect_f1)
# Test with average micro
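# ``micro`` averaging pools true/false positives and negatives across all
# classes before computing precision, recall and F1-score.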
evaluator = Evaluator(dict(type='MultiLabelMetric', average='micro'))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(4)
self.assertIsInstance(res, dict)
# Expected values come from sklearn
expect_precision = sklearn.metrics.precision_score(
y_true_binary, thr05_y_pred, average='micro') * 100
expect_recall = sklearn.metrics.recall_score(
y_true_binary, thr05_y_pred, average='micro') * 100
expect_f1 = sklearn.metrics.f1_score(
y_true_binary, thr05_y_pred, average='micro') * 100
self.assertAlmostEqual(
res['multi-label/precision_micro'], expect_precision, places=4)
self.assertAlmostEqual(
res['multi-label/recall_micro'], expect_recall, places=4)
self.assertAlmostEqual(
res['multi-label/f1-score_micro'], expect_f1, places=4)
# Test with average None
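# ``average=None`` reports one value per class under ``*_classwise`` keys
# instead of a single aggregated scalar.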
evaluator = Evaluator(dict(type='MultiLabelMetric', average=None))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(4)
self.assertIsInstance(res, dict)
# Expected values come from sklearn
expect_precision = sklearn.metrics.precision_score(
y_true_binary, thr05_y_pred, average=None) * 100
expect_recall = sklearn.metrics.recall_score(
y_true_binary, thr05_y_pred, average=None) * 100
expect_f1 = sklearn.metrics.f1_score(
y_true_binary, thr05_y_pred, average=None) * 100
np.testing.assert_allclose(res['multi-label/precision_classwise'],
expect_precision)
np.testing.assert_allclose(res['multi-label/recall_classwise'],
expect_recall)
np.testing.assert_allclose(res['multi-label/f1-score_classwise'],
expect_f1)
# Test with gt_score
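# The ground truth is now given as multi-hot score vectors via
# ``set_gt_score`` rather than as label indices.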
pred = [
ClsDataSample(num_classes=4).set_pred_score(i).set_gt_score(j)
for i, j in zip(y_pred_score, y_true_binary)
]
evaluator = Evaluator(dict(type='MultiLabelMetric', items=['support']))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(4)
self.assertIsInstance(res, dict)
self.assertEqual(res['multi-label/support'], 7)

def assertTensorEqual(self,
tensor: torch.Tensor,
value: float,
msg=None,
**kwarg):
tensor = tensor.to(torch.float32)
if tensor.dim() == 0:
tensor = tensor.unsqueeze(0)
value = torch.FloatTensor([value])
try:
torch.testing.assert_allclose(tensor, value, **kwarg)
except AssertionError as e:
self.fail(self._formatMessage(msg, str(e) + str(tensor)))


class TestAveragePrecision(TestCase):

def test_evaluate(self):
"""Test using the metric in the same way as Evalutor."""
y_pred = torch.tensor([
[0.9, 0.8, 0.3, 0.2],
[0.1, 0.2, 0.2, 0.1],
[0.7, 0.5, 0.9, 0.3],
[0.8, 0.1, 0.1, 0.2],
])
y_true = torch.tensor([
[1, 1, 0, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[1, 0, 0, 0],
])
fake_data_batch = [{
'inputs': None,
'data_sample': ClsDataSample()
} for _ in range(4)]
pred = [
ClsDataSample(num_classes=4).set_pred_score(i).set_gt_score(j)
for i, j in zip(y_pred, y_true)
]
# Test with default macro average
evaluator = Evaluator(dict(type='AveragePrecision'))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(5)
self.assertIsInstance(res, dict)
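# mAP is the mean of the per-class APs checked individually further below:
# (100 + 83.33 + 100 + 0) / 4 ≈ 70.83 (class 3 has no positive samples).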
self.assertAlmostEqual(res['multi-label/mAP'], 70.83333, places=4)
# Test with average mode None
evaluator = Evaluator(dict(type='AveragePrecision', average=None))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(5)
self.assertIsInstance(res, dict)
aps = res['multi-label/AP_classwise']
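# e.g. class 1: its two positives rank 1st and 3rd by score, so
# AP = (1/1 + 2/3) / 2 ≈ 83.33; class 3 has no positives, so its AP is 0.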
self.assertAlmostEqual(aps[0], 100., places=4)
self.assertAlmostEqual(aps[1], 83.3333, places=4)
self.assertAlmostEqual(aps[2], 100, places=4)
self.assertAlmostEqual(aps[3], 0, places=4)
# Test with gt_label without score
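# The same targets expressed as label indices should yield the identical mAP.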
pred = [
ClsDataSample(num_classes=4).set_pred_score(i).set_gt_label(j)
for i, j in zip(y_pred, [[0, 1], [1], [2], [0]])
]
evaluator = Evaluator(dict(type='AveragePrecision'))
evaluator.process(fake_data_batch, pred)
res = evaluator.evaluate(5)
self.assertAlmostEqual(res['multi-label/mAP'], 70.83333, places=4)

def test_calculate(self):
"""Test using the metric from static method."""
y_true = np.array([
[1, 0, 0, 0],
[0, 1, 0, 1],
[1, 1, 1, 0],
[0, 0, 0, 1],
])
y_pred = np.array([
[0.9, 0.8, 0.3, 0.2],
[0.1, 0.2, 0.2, 0.1],
[0.7, 0.5, 0.9, 0.3],
[0.8, 0.1, 0.1, 0.2],
])
ap_score = AveragePrecision.calculate(y_pred, y_true)
expect_ap = sklearn.metrics.average_precision_score(y_true,
y_pred) * 100
self.assertTensorEqual(ap_score, expect_ap)
# Test with an invalid ``average`` argument
with self.assertRaisesRegex(AssertionError,
'Invalid `average` argument,'):
AveragePrecision.calculate(y_pred, y_true, average='m')
y_true = np.array([[1, 0, 0, 0], [0, 1, 0, 1]])
y_pred = np.array([[1, 0, 0, 1], [1, 0, 1, 0], [0, 1, 1, 0]])
# Test with mismatched pred and target sizes
with self.assertRaisesRegex(AssertionError,
'Both `pred` and `target`'):
AveragePrecision.calculate(y_pred, y_true)
# Test with a target of an invalid type (plain int)
with self.assertRaisesRegex(TypeError, "<class 'int'> is not an"):
AveragePrecision.calculate(y_pred, 5)

def assertTensorEqual(self,
tensor: torch.Tensor,
value: float,
msg=None,
**kwarg):
tensor = tensor.to(torch.float32)
if tensor.dim() == 0:
tensor = tensor.unsqueeze(0)
value = torch.FloatTensor([value])
try:
torch.testing.assert_allclose(tensor, value, **kwarg)
except AssertionError as e:
self.fail(self._formatMessage(msg, str(e) + str(tensor)))