# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase

import numpy as np
import sklearn.metrics
import torch
from mmengine.evaluator import Evaluator

from mmcls.structures import ClsDataSample
from mmcls.utils import register_all_modules
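
# Register all mmcls components (including the metric classes used below) so
# that the Evaluator can build them from their string `type` names.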
register_all_modules()


class TestVOCMultiLabel(TestCase):

    def test_evaluate(self):
        # prepare input data
        y_true_label = [[0], [1, 3], [0, 1, 2], [3]]
        y_true_difficult = [[0], [2], [1], []]
        y_pred_score = torch.tensor([
            [0.8, 0, 0, 0.6],
            [0.2, 0, 0.6, 0],
            [0, 0.9, 0.6, 0],
            [0, 0, 0.2, 0.3],
        ])

        # generate data samples
        pred = [
            ClsDataSample(num_classes=4).set_pred_score(i).set_gt_label(j)
            for i, j in zip(y_pred_score, y_true_label)
        ]
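        # VOC-style annotations may mark labels as "difficult"; the metric
        # reads them from the `gt_label_difficult` metainfo field set below.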
        for sample, difficult_label in zip(pred, y_true_difficult):
            sample.set_metainfo({'gt_label_difficult': difficult_label})

        # 1. Test with default argument
        evaluator = Evaluator(dict(type='VOCMultiLabelMetric'))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # generate sklearn input
        y_true = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
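        # The -1 entry marks the only position that is difficult but not a
        # ground-truth label; by default the metric ignores it. Zeroing that
        # position in both the targets and the 0.5-thresholded predictions
        # (`thr05_y_pred`) turns it into a true negative, which sklearn's
        # precision/recall/f1 likewise do not count.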
        ignored_index = y_true == -1
        y_true[ignored_index] = 0
        thr05_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ])
        thr05_y_pred[ignored_index] = 0

        expect_precision = sklearn.metrics.precision_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true, thr05_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision'], expect_precision)
        self.assertEqual(res['multi-label/recall'], expect_recall)
        # The f1-score is only compared approximately because torch and
        # sklearn differ in floating-point precision.
        self.assertAlmostEqual(res['multi-label/f1-score'], expect_f1, 5)

        # 2. Test with `difficult_as_positive`=False argument
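        # Here difficult labels that are not ground truth count as negatives
        # instead of being ignored, so `y_true` below has 0 where the default
        # case had -1.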
        evaluator = Evaluator(
            dict(type='VOCMultiLabelMetric', difficult_as_positive=False))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # generate sklearn input
        y_true = np.array([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        thr05_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ])

        expect_precision = sklearn.metrics.precision_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true, thr05_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision'], expect_precision)
        self.assertEqual(res['multi-label/recall'], expect_recall)
        # The f1-score is only compared approximately because torch and
        # sklearn differ in floating-point precision.
        self.assertAlmostEqual(res['multi-label/f1-score'], expect_f1, 5)

        # 3. Test with `difficult_as_positive`=True argument
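        # Here difficult labels that are not ground truth count as positives,
        # so `y_true` below has 1 where the default case had -1.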
        evaluator = Evaluator(
            dict(type='VOCMultiLabelMetric', difficult_as_positive=True))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # generate sklearn input
        y_true = np.array([
            [1, 0, 0, 0],
            [0, 1, 1, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        thr05_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ])

        expect_precision = sklearn.metrics.precision_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true, thr05_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision'], expect_precision)
        self.assertEqual(res['multi-label/recall'], expect_recall)
        # The f1-score is only compared approximately because torch and
        # sklearn differ in floating-point precision.
        self.assertAlmostEqual(res['multi-label/f1-score'], expect_f1, 5)


class TestVOCAveragePrecision(TestCase):

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        # prepare input data
        y_true_difficult = [[0], [2], [1], []]
        y_pred_score = torch.tensor([
            [0.8, 0.1, 0, 0.6],
            [0.2, 0.2, 0.7, 0],
            [0.1, 0.9, 0.6, 0.1],
            [0, 0, 0.2, 0.3],
        ])
        y_true_label = [[0], [1, 3], [0, 1, 2], [3]]
        y_true = torch.tensor([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])

        # generate data samples
        pred = [
            ClsDataSample(num_classes=4).set_pred_score(i).set_gt_score(
                j).set_gt_label(k)
            for i, j, k in zip(y_pred_score, y_true, y_true_label)
        ]
        for sample, difficult_label in zip(pred, y_true_difficult):
            sample.set_metainfo({'gt_label_difficult': difficult_label})

        # 1. Test with default
        evaluator = Evaluator(dict(type='VOCAveragePrecision'))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # prepare inputs for sklearn for this case
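        # These are the per-class (transposed) scores and targets. Under the
        # default behaviour the difficult entry of class 2 (sample 1) is
        # ignored, so that class only has three entries.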
        y_pred_score = [[0.8, 0.2, 0.1, 0], [0.1, 0.2, 0.9, 0], [0, 0.6, 0.2],
                        [0.6, 0, 0.1, 0.3]]
        y_true = [[1, 0, 1, 0], [0, 1, 1, 0], [0, 1, 0], [0, 1, 0, 1]]
        expected_res = []
        for pred_per_class, gt_per_class in zip(y_pred_score, y_true):
            expected_res.append(
                sklearn.metrics.average_precision_score(
                    gt_per_class, pred_per_class))
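
        # mAP is the mean of the per-class average precisions, reported as a
        # percentage.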
        self.assertAlmostEqual(
            res['multi-label/mAP'],
            sum(expected_res) * 100 / len(expected_res),
            places=4)

        # 2. Test with `difficult_as_positive`=False argument
        evaluator = Evaluator(
            dict(type='VOCAveragePrecision', difficult_as_positive=False))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # prepare inputs for sklearn for this case
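        # The difficult entry of class 2 is now kept as a negative target.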
        y_pred_score = [[0.8, 0.2, 0.1, 0], [0.1, 0.2, 0.9, 0],
                        [0, 0.7, 0.6, 0.2], [0.6, 0, 0.1, 0.3]]
        y_true = [[1, 0, 1, 0], [0, 1, 1, 0], [0, 0, 1, 0], [0, 1, 0, 1]]
        expected_res = []
        for pred_per_class, gt_per_class in zip(y_pred_score, y_true):
            expected_res.append(
                sklearn.metrics.average_precision_score(
                    gt_per_class, pred_per_class))

        self.assertAlmostEqual(
            res['multi-label/mAP'],
            sum(expected_res) * 100 / len(expected_res),
            places=4)

        # 3. Test with `difficult_as_positive`=True argument
        evaluator = Evaluator(
            dict(type='VOCAveragePrecision', difficult_as_positive=True))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # prepare inputs for sklearn for this case
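        # The difficult entry of class 2 is now kept as a positive target.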
        y_pred_score = [[0.8, 0.2, 0.1, 0], [0.1, 0.2, 0.9, 0],
                        [0, 0.7, 0.6, 0.2], [0.6, 0, 0.1, 0.3]]
        y_true = [[1, 0, 1, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 1, 0, 1]]
        expected_res = []
        for pred_per_class, gt_per_class in zip(y_pred_score, y_true):
            expected_res.append(
                sklearn.metrics.average_precision_score(
                    gt_per_class, pred_per_class))

        self.assertAlmostEqual(
            res['multi-label/mAP'],
            sum(expected_res) * 100 / len(expected_res),
            places=4)