# mmpretrain/tests/test_metrics/test_multi_label.py
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase

import numpy as np
import sklearn.metrics
import torch
from mmengine.evaluator import Evaluator

from mmcls.engine import ClsDataSample
from mmcls.metrics import AveragePrecision, MultiLabelMetric
from mmcls.utils import register_all_modules

register_all_modules()


class TestMultiLabel(TestCase):

    def test_calculate(self):
        """Test using the metric from static method."""
        y_true = [[0], [1, 3], [0, 1, 2], [3]]
        y_pred = [[0, 3], [0, 2], [1, 2], [2, 3]]

        y_true_binary = np.array([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        y_pred_binary = np.array([
            [1, 0, 0, 1],
            [1, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 1, 1],
        ])
        y_pred_score = np.array([
            [0.8, 0, 0, 0.6],
            [0.2, 0, 0.6, 0],
            [0, 0.9, 0.6, 0],
            [0, 0, 0.2, 0.3],
        ])
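
        # y_true_binary / y_pred_binary are the one-hot forms of y_true /
        # y_pred above; y_pred_score holds the confidence scores used by
        # the ``topk`` and ``thr`` tests below.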

        # Test with sequence of category indexes
        res = MultiLabelMetric.calculate(
            y_pred,
            y_true,
            pred_indices=True,
            target_indices=True,
            num_classes=4)
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        # Expected values come from sklearn
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, y_pred_binary, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, y_pred_binary, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, y_pred_binary, average='macro') * 100
        self.assertTensorEqual(precision, expect_precision)
        self.assertTensorEqual(recall, expect_recall)
        self.assertTensorEqual(f1_score, expect_f1)
        # ``support`` is the total number of positive ground-truth labels.
        self.assertTensorEqual(support, 7)

        # Test with onehot input
        res = MultiLabelMetric.calculate(y_pred_binary,
                                         torch.from_numpy(y_true_binary))
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        # Expected values come from sklearn
        self.assertTensorEqual(precision, expect_precision)
        self.assertTensorEqual(recall, expect_recall)
        self.assertTensorEqual(f1_score, expect_f1)
        self.assertTensorEqual(support, 7)

        # Test with topk argument
        res = MultiLabelMetric.calculate(
            y_pred_score, y_true, target_indices=True, topk=1, num_classes=4)
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        # Expected values come from sklearn
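        # With topk=1, only the highest-scoring class of each sample in
        # y_pred_score counts as a positive prediction (per-row argmax).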
        top1_y_pred = np.array([
            [1, 0, 0, 0],
            [0, 0, 1, 0],
            [0, 1, 0, 0],
            [0, 0, 0, 1],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        self.assertTensorEqual(precision, expect_precision)
        self.assertTensorEqual(recall, expect_recall)
        self.assertTensorEqual(f1_score, expect_f1)
        self.assertTensorEqual(support, 7)

        # Test with thr argument
        res = MultiLabelMetric.calculate(
            y_pred_score, y_true, target_indices=True, thr=0.25,
            num_classes=4)
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        # Expected values come from sklearn
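        # With thr=0.25, every class whose score in y_pred_score exceeds
        # 0.25 counts as a positive prediction.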
        thr_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        self.assertTensorEqual(precision, expect_precision)
        self.assertTensorEqual(recall, expect_recall)
        self.assertTensorEqual(f1_score, expect_f1)
        self.assertTensorEqual(support, 7)

        # Test with invalid target type
        with self.assertRaisesRegex(TypeError, "<class 'str'> is not"):
            MultiLabelMetric.calculate(y_pred, 'hi', num_classes=10)

        # Test with invalid average argument
        with self.assertRaisesRegex(AssertionError,
                                    'Invalid `average` argument,'):
            MultiLabelMetric.calculate(
                y_pred, y_true, average='m', num_classes=10)

        y_true_binary = np.array([[1, 0, 0, 0], [0, 1, 0, 1]])
        y_pred_binary = np.array([[1, 0, 0, 1], [1, 0, 1, 0], [0, 1, 1, 0]])

        # Test with mismatched pred and target sizes
        with self.assertRaisesRegex(AssertionError, 'The size of pred'):
            MultiLabelMetric.calculate(y_pred_binary, y_true_binary)

        # Test with unsupported target type
        with self.assertRaisesRegex(TypeError, 'The `pred` and `target` must'):
            MultiLabelMetric.calculate(y_pred_binary, 5)

    def test_evaluate(self):
        fake_data_batch = [{
            'inputs': None,
            'data_sample': ClsDataSample()
        } for _ in range(4)]

        y_true = [[0], [1, 3], [0, 1, 2], [3]]
        y_true_binary = torch.tensor([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        y_pred_score = torch.tensor([
            [0.8, 0, 0, 0.6],
            [0.2, 0, 0.6, 0],
            [0, 0.9, 0.6, 0],
            [0, 0, 0.2, 0.3],
        ])

        pred = [
            ClsDataSample(num_classes=4).set_pred_score(i).set_gt_label(j)
            for i, j in zip(y_pred_score, y_true)
        ]

        # Test with default argument
        evaluator = Evaluator(dict(type='MultiLabelMetric'))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
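
        # Without an explicit `thr` or `topk`, the metric binarizes the
        # prediction scores at 0.5, so the last sample (max score 0.3)
        # gets no positive prediction.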
        thr05_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr05_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr05_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr05_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision'], expect_precision)
        self.assertEqual(res['multi-label/recall'], expect_recall)
        self.assertEqual(res['multi-label/f1-score'], expect_f1)

        # Test with topk argument
        evaluator = Evaluator(dict(type='MultiLabelMetric', topk=1))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        top1_y_pred = np.array([
            [1, 0, 0, 0],
            [0, 0, 1, 0],
            [0, 1, 0, 0],
            [0, 0, 0, 1],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, top1_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision_top1'], expect_precision)
        self.assertEqual(res['multi-label/recall_top1'], expect_recall)
        self.assertEqual(res['multi-label/f1-score_top1'], expect_f1)

        # Test with both thr and topk arguments
        evaluator = Evaluator(dict(type='MultiLabelMetric', thr=0.25, topk=1))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn
        thr_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision_thr-0.25'],
                         expect_precision)
        self.assertEqual(res['multi-label/recall_thr-0.25'], expect_recall)
        self.assertEqual(res['multi-label/f1-score_thr-0.25'], expect_f1)

        # Test with average micro
        evaluator = Evaluator(dict(type='MultiLabelMetric', average='micro'))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr05_y_pred, average='micro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr05_y_pred, average='micro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr05_y_pred, average='micro') * 100
        self.assertAlmostEqual(
            res['multi-label/precision_micro'], expect_precision, places=4)
        self.assertAlmostEqual(
            res['multi-label/recall_micro'], expect_recall, places=4)
        self.assertAlmostEqual(
            res['multi-label/f1-score_micro'], expect_f1, places=4)

        # Test with average None
        evaluator = Evaluator(dict(type='MultiLabelMetric', average=None))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn
        expect_precision = sklearn.metrics.precision_score(
            y_true_binary, thr05_y_pred, average=None) * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true_binary, thr05_y_pred, average=None) * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true_binary, thr05_y_pred, average=None) * 100
        np.testing.assert_allclose(res['multi-label/precision_classwise'],
                                   expect_precision)
        np.testing.assert_allclose(res['multi-label/recall_classwise'],
                                   expect_recall)
        np.testing.assert_allclose(res['multi-label/f1-score_classwise'],
                                   expect_f1)

        # Test with gt_score (one-hot ground-truth scores instead of label
        # indices); ``support`` counts the positive ground-truth labels
        # over all classes, which is 7 here.
        pred = [
            ClsDataSample(num_classes=4).set_pred_score(i).set_gt_score(j)
            for i, j in zip(y_pred_score, y_true_binary)
        ]

        evaluator = Evaluator(dict(type='MultiLabelMetric', items=['support']))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)
        self.assertEqual(res['multi-label/support'], 7)

    def assertTensorEqual(self,
                          tensor: torch.Tensor,
                          value: float,
                          msg=None,
                          **kwarg):
        tensor = tensor.to(torch.float32)
        if tensor.dim() == 0:
            tensor = tensor.unsqueeze(0)
        value = torch.FloatTensor([value])
        try:
            torch.testing.assert_allclose(tensor, value, **kwarg)
        except AssertionError as e:
            self.fail(self._formatMessage(msg, str(e) + str(tensor)))


class TestAveragePrecision(TestCase):

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        y_pred = torch.tensor([
            [0.9, 0.8, 0.3, 0.2],
            [0.1, 0.2, 0.2, 0.1],
            [0.7, 0.5, 0.9, 0.3],
            [0.8, 0.1, 0.1, 0.2],
        ])
        y_true = torch.tensor([
            [1, 1, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [1, 0, 0, 0],
        ])

        fake_data_batch = [{
            'inputs': None,
            'data_sample': ClsDataSample()
        } for _ in range(4)]
        pred = [
            ClsDataSample(num_classes=4).set_pred_score(i).set_gt_score(j)
            for i, j in zip(y_pred, y_true)
        ]

        # Test with default macro average
        evaluator = Evaluator(dict(type='AveragePrecision'))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(5)
        self.assertIsInstance(res, dict)
        self.assertAlmostEqual(res['multi-label/mAP'], 70.83333, places=4)

        # Test with average mode None
        evaluator = Evaluator(dict(type='AveragePrecision', average=None))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(5)
        self.assertIsInstance(res, dict)
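        # Class-wise AP: classes 0 and 2 rank all of their positives above
        # every negative (AP = 100); class 1 ranks one positive below a
        # negative (AP = (1 + 2/3) / 2 * 100); class 3 has no positive
        # samples, so its AP is 0. Their mean gives the 70.83 mAP above.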
        aps = res['multi-label/AP_classwise']
        self.assertAlmostEqual(aps[0], 100., places=4)
        self.assertAlmostEqual(aps[1], 83.3333, places=4)
        self.assertAlmostEqual(aps[2], 100, places=4)
        self.assertAlmostEqual(aps[3], 0, places=4)

        # Test with gt_label without score
        pred = [
            ClsDataSample(num_classes=4).set_pred_score(i).set_gt_label(j)
            for i, j in zip(y_pred, [[0, 1], [1], [2], [0]])
        ]
        evaluator = Evaluator(dict(type='AveragePrecision'))
        evaluator.process(fake_data_batch, pred)
        res = evaluator.evaluate(5)
        self.assertAlmostEqual(res['multi-label/mAP'], 70.83333, places=4)

    def test_calculate(self):
        """Test using the metric from static method."""
        y_true = np.array([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        y_pred = np.array([
            [0.9, 0.8, 0.3, 0.2],
            [0.1, 0.2, 0.2, 0.1],
            [0.7, 0.5, 0.9, 0.3],
            [0.8, 0.1, 0.1, 0.2],
        ])

        ap_score = AveragePrecision.calculate(y_pred, y_true)
        expect_ap = sklearn.metrics.average_precision_score(y_true,
                                                            y_pred) * 100
        self.assertTensorEqual(ap_score, expect_ap)

        # Test with invalid average argument
        with self.assertRaisesRegex(AssertionError,
                                    'Invalid `average` argument,'):
            AveragePrecision.calculate(y_pred, y_true, average='m')

        y_true = np.array([[1, 0, 0, 0], [0, 1, 0, 1]])
        y_pred = np.array([[1, 0, 0, 1], [1, 0, 1, 0], [0, 1, 1, 0]])

        # Test with mismatched pred and target shapes
        with self.assertRaisesRegex(AssertionError,
                                    'Both `pred` and `target`'):
            AveragePrecision.calculate(y_pred, y_true)

        # Test with invalid target type
        with self.assertRaisesRegex(TypeError, "<class 'int'> is not an"):
            AveragePrecision.calculate(y_pred, 5)

    def assertTensorEqual(self,
                          tensor: torch.Tensor,
                          value: float,
                          msg=None,
                          **kwarg):
        tensor = tensor.to(torch.float32)
        if tensor.dim() == 0:
            tensor = tensor.unsqueeze(0)
        value = torch.FloatTensor([value])
        try:
            torch.testing.assert_allclose(tensor, value, **kwarg)
        except AssertionError as e:
            self.fail(self._formatMessage(msg, str(e) + str(tensor)))