diff --git a/docs/en/_static/css/readthedocs.css b/docs/en/_static/css/readthedocs.css
index 577a67a8..fbf96d37 100644
--- a/docs/en/_static/css/readthedocs.css
+++ b/docs/en/_static/css/readthedocs.css
@@ -25,3 +25,9 @@ article.pytorch-article section table code {
 table.autosummary td {
   width: 50%
 }
+
+img.align-center {
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+}
diff --git a/docs/en/_static/image/confusion-matrix.png b/docs/en/_static/image/confusion-matrix.png
new file mode 100755
index 00000000..a1dc7ba6
Binary files /dev/null and b/docs/en/_static/image/confusion-matrix.png differ
diff --git a/docs/en/advanced_guides/evaluation.md b/docs/en/advanced_guides/evaluation.md
index 422d9f23..565a36d7 100644
--- a/docs/en/advanced_guides/evaluation.md
+++ b/docs/en/advanced_guides/evaluation.md
@@ -1 +1,103 @@
-# Custom evaluation metrics (TODO)
+# Customize Evaluation Metrics
+
+## Use metrics in MMClassification
+
+In MMClassification, we provide multiple metrics for both single-label classification and multi-label
+classification:
+
+**Single-label Classification**:
+
+- [`Accuracy`](mmcls.evaluation.Accuracy)
+- [`SingleLabelMetric`](mmcls.evaluation.SingleLabelMetric), including precision, recall, f1-score and
+  support.
+
+**Multi-label Classification**:
+
+- [`AveragePrecision`](mmcls.evaluation.AveragePrecision), also known as AP (mAP when averaged over categories).
+- [`MultiLabelMetric`](mmcls.evaluation.MultiLabelMetric), including precision, recall, f1-score and
+  support.
+
+To use these metrics during validation and testing, we need to modify the `val_evaluator` and `test_evaluator`
+fields in the config file.
+
+Here are several examples:
+
+1. Calculate top-1 and top-5 accuracy during both validation and testing.
+
+   ```python
+   val_evaluator = dict(type='Accuracy', topk=(1, 5))
+   test_evaluator = val_evaluator
+   ```
+
+2. Calculate top-1 accuracy, top-5 accuracy, precision and recall during both validation and testing.
+
+   ```python
+   val_evaluator = [
+       dict(type='Accuracy', topk=(1, 5)),
+       dict(type='SingleLabelMetric', items=['precision', 'recall']),
+   ]
+   test_evaluator = val_evaluator
+   ```
+
+3. Calculate mAP (mean AveragePrecision), CP (Class-wise mean Precision), CR (Class-wise mean Recall), CF
+   (Class-wise mean F1-score), OP (Overall mean Precision), OR (Overall mean Recall) and OF1 (Overall mean
+   F1-score).
+
+   ```python
+   val_evaluator = [
+       dict(type='AveragePrecision'),
+       dict(type='MultiLabelMetric', average='macro'),  # class-wise mean
+       dict(type='MultiLabelMetric', average='micro'),  # overall mean
+   ]
+   test_evaluator = val_evaluator
+   ```
+
+## Add new metrics
+
+MMClassification supports implementing customized evaluation metrics for users who need deeper customization.
+
+You need to create a new file under `mmcls/evaluation/metrics`, for example
+`mmcls/evaluation/metrics/my_metric.py`, and implement the new metric class `MyMetric` there, inheriting from
+[`BaseMetric` in MMEngine](mmengine.evaluator.metrics.BaseMetric).
+
+Override the data processing method `process` and the metric calculation method `compute_metrics`, and add the
+class to the `METRICS` registry, as in the skeleton below.
+
+```python
+from typing import Dict, List, Sequence
+
+from mmengine.evaluator import BaseMetric
+
+from mmcls.registry import METRICS
+
+
+@METRICS.register_module()
+class MyMetric(BaseMetric):
+
+    def process(self, data_batch: Sequence[Dict], data_samples: Sequence[Dict]):
+        """The processed results should be stored in ``self.results``, which
+        will be used to compute the metrics when all batches have been
+        processed. ``data_batch`` stores the batch data from the dataloader,
+        and ``data_samples`` stores the batch outputs from the model.
+        """
+        ...
+
+    def compute_metrics(self, results: List):
+        """Compute the metrics from the processed results and return the
+        evaluation results.
+        """
+        ...
+```
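+
+As a toy, filled-in version of the skeleton above, the metric below records the score the model assigns to the
+ground-truth category and reports its average. It is only a sketch: the `gt_label` and `pred_label` field names
+are assumptions for illustration, so check the data sample format your model actually produces before reusing it.
+
+```python
+from typing import Dict, List, Sequence
+
+from mmengine.evaluator import BaseMetric
+
+from mmcls.registry import METRICS
+
+
+@METRICS.register_module()
+class MyMetric(BaseMetric):
+    """A toy metric: the average score assigned to the ground-truth category."""
+
+    default_prefix = 'my-metric'
+
+    def process(self, data_batch: Sequence[Dict], data_samples: Sequence[Dict]):
+        for data_sample in data_samples:
+            # The field names below are assumptions, adapt them to the data
+            # samples produced by your model.
+            gt_label = data_sample['gt_label']['label']
+            pred_score = data_sample['pred_label']['score']
+            # Only store what ``compute_metrics`` needs.
+            self.results.append(pred_score[gt_label].mean().item())
+
+    def compute_metrics(self, results: List) -> Dict:
+        return {'gt-score': sum(results) / len(results)}
+```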
+
+Then, import it in `mmcls/evaluation/metrics/__init__.py` to add it into the `mmcls.evaluation` package.
+
+```python
+# In mmcls/evaluation/metrics/__init__.py
+...
+from .my_metric import MyMetric
+
+__all__ = [..., 'MyMetric']
+```
+
+Finally, use `MyMetric` in the `val_evaluator` and `test_evaluator` fields of the config files.
+
+```python
+val_evaluator = dict(type='MyMetric', ...)
+test_evaluator = val_evaluator
+```
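+
+Before launching a full training run, you can sanity-check a custom metric by calling `process` and
+`compute_metrics` by hand. The snippet below is a rough sketch that reuses the assumed field names from the toy
+example above.
+
+```python
+import torch
+
+metric = MyMetric()
+metric.process(None, [{
+    'gt_label': {'label': torch.tensor([1])},          # assumed field names
+    'pred_label': {'score': torch.tensor([0.1, 0.7, 0.2])},
+}])
+print(metric.compute_metrics(metric.results))  # ≈ {'gt-score': 0.7}
+```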
+
+```{note}
+More details can be found in {external+mmengine:doc}`MMEngine Documentation: Evaluation `.
+```
diff --git a/docs/zh_CN/_static/css/readthedocs.css b/docs/zh_CN/_static/css/readthedocs.css
index 577a67a8..fbf96d37 100644
--- a/docs/zh_CN/_static/css/readthedocs.css
+++ b/docs/zh_CN/_static/css/readthedocs.css
@@ -25,3 +25,9 @@ article.pytorch-article section table code {
 table.autosummary td {
   width: 50%
 }
+
+img.align-center {
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+}
diff --git a/docs/zh_CN/_static/image/confusion-matrix.png b/docs/zh_CN/_static/image/confusion-matrix.png
new file mode 120000
index 00000000..7b0b3772
--- /dev/null
+++ b/docs/zh_CN/_static/image/confusion-matrix.png
@@ -0,0 +1 @@
+../../../en/_static/image/confusion-matrix.png
\ No newline at end of file
diff --git a/docs/zh_CN/advanced_guides/evaluation.md b/docs/zh_CN/advanced_guides/evaluation.md
index af3a57d2..82529786 100644
--- a/docs/zh_CN/advanced_guides/evaluation.md
+++ b/docs/zh_CN/advanced_guides/evaluation.md
@@ -1 +1,4 @@
 # 自定义评估指标(待更新)
+
+请参见[英文文档](https://mmclassification.readthedocs.io/en/dev-1.x/advanced_guides/evaluation.html),如果你有兴
+趣参与中文文档的翻译,欢迎在 [讨论区](https://github.com/open-mmlab/mmclassification/discussions/1027)进行报名。
diff --git a/mmcls/evaluation/metrics/multi_label.py b/mmcls/evaluation/metrics/multi_label.py
index 62498038..e0a4b936 100644
--- a/mmcls/evaluation/metrics/multi_label.py
+++ b/mmcls/evaluation/metrics/multi_label.py
@@ -13,38 +13,58 @@ from .single_label import _precision_recall_f1_support, to_tensor
 
 @METRICS.register_module()
 class MultiLabelMetric(BaseMetric):
-    """A collection of metrics for multi-label multi-class classification task
-    based on confusion matrix.
+    r"""A collection of precision, recall, f1-score and support for
+    multi-label tasks.
 
-    It includes precision, recall, f1-score and support.
+    The collection of metrics is for multi-label classification.
+    And all these metrics are based on the confusion matrix of every category:
+
+    .. image:: ../../_static/image/confusion-matrix.png
+       :width: 60%
+       :align: center
+
+    All metrics can be formulated using the variables above:
+
+    **Precision** is the fraction of correct predictions in all predictions:
+
+    .. math::
+        \text{Precision} = \frac{TP}{TP+FP}
+
+    **Recall** is the fraction of correct predictions in all targets:
+
+    .. math::
+        \text{Recall} = \frac{TP}{TP+FN}
+
+    **F1-score** is the harmonic mean of the precision and recall:
+
+    .. math::
+        \text{F1-score} = \frac{2\times\text{Recall}\times\text{Precision}}{\text{Recall}+\text{Precision}}
+
+    **Support** is the number of samples:
+
+    .. math::
+        \text{Support} = TP + TN + FN + FP
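+
+    For example, if 100 samples belong to a category, the model predicts 80
+    samples as positive for it and 60 of those predictions are correct
+    (TP=60, FP=20, FN=40), then the precision is 60/80=0.75, the recall is
+    60/100=0.6 and the f1-score is 2*0.75*0.6/(0.75+0.6)≈0.67.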
 
     Args:
-        thr (float, optional): Predictions with scores under the thresholds
-            are considered as negative. Defaults to None.
+        thr (float, optional): Predictions with scores under the threshold
+            are considered as negative. If None, the ``topk`` predictions will
+            be considered as positive. If ``topk`` is also None, use
+            ``thr=0.5`` as the default. Defaults to None.
         topk (int, optional): Predictions with the k-th highest scores are
-            considered as positive. Defaults to None.
-        items (Sequence[str]): The detailed metric items to evaluate. Here is
-            the available options:
+            considered as positive. If None, use ``thr`` to determine positive
+            predictions. If both ``thr`` and ``topk`` are not None, use
+            ``thr``. Defaults to None.
+        items (Sequence[str]): The detailed metric items to evaluate, select
+            from "precision", "recall", "f1-score" and "support".
+            Defaults to ``('precision', 'recall', 'f1-score')``.
+        average (str | None): How to calculate the final metrics from the
+            confusion matrix of every category. It supports three modes:
 
-            - `"precision"`: The ratio tp / (tp + fp) where tp is the
-              number of true positives and fp the number of false
-              positives.
-            - `"recall"`: The ratio tp / (tp + fn) where tp is the number
-              of true positives and fn the number of false negatives.
-            - `"f1-score"`: The f1-score is the harmonic mean of the
-              precision and recall.
-            - `"support"`: The total number of positive of each category
-              in the target.
-
-            Defaults to ('precision', 'recall', 'f1-score').
-        average (str | None): The average method. It supports three average
-            modes:
-
-            - `"macro"`: Calculate metrics for each category, and calculate
-              the mean value over all categories.
-            - `"micro"`: Calculate metrics globally by counting the total
-              true positives, false negatives and false positives.
-            - `None`: Return scores of all categories.
+            - `"macro"`: Calculate metrics for each category, and calculate
+              the mean value over all categories.
+            - `"micro"`: Average the confusion matrix over all categories and
+              calculate metrics on the mean confusion matrix.
+            - `None`: Calculate metrics of every category and output directly.
 
             Defaults to "macro".
         collect_device (str): Device name used for collecting results from
@@ -261,15 +281,16 @@ class MultiLabelMetric(BaseMetric):
             target_indices (bool): Whether the ``target`` is a sequence of
                 category index labels. If True, ``num_classes`` must be set.
                 Defaults to False.
-            average (str | None): The average method. It supports three average
+            average (str | None): How to calculate the final metrics from
+                the confusion matrix of every category. It supports three
                 modes:
 
-                - `"macro"`: Calculate metrics for each category, and
-                  calculate the mean value over all categories.
-                - `"micro"`: Calculate metrics globally by counting the
-                  total true positives, false negatives and false
-                  positives.
-                - `None`: Return scores of all categories.
+                - `"macro"`: Calculate metrics for each category, and calculate
+                  the mean value over all categories.
+                - `"micro"`: Average the confusion matrix over all categories
+                  and calculate metrics on the mean confusion matrix.
+                - `None`: Calculate metrics of every category and output
+                  directly.
 
                 Defaults to "macro".
             thr (float, optional): Predictions with scores under the thresholds
@@ -402,14 +423,25 @@ def _average_precision(pred: torch.Tensor,
 
 @METRICS.register_module()
 class AveragePrecision(BaseMetric):
-    """Calculate the average precision with respect of classes.
+    r"""Calculate the average precision with respect to classes.
+
+    AveragePrecision (AP) summarizes a precision-recall curve as the weighted
+    mean of maximum precisions obtained for any r'>r, where r is the recall:
+
+    .. math::
+        \text{AP} = \sum_n (R_n - R_{n-1}) P_n
+
+    Note that no approximation is involved since the curve is piecewise
+    constant.
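+
+    For example, if the model scores three samples in descending order and
+    their ground-truth labels are positive, negative and positive in turn,
+    taking the top 1, 2 and 3 predictions as positive gives precision-recall
+    pairs (1, 0.5), (0.5, 0.5) and (2/3, 1), and the AP is
+    0.5*1 + 0*0.5 + 0.5*2/3 ≈ 0.83.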
 
     Args:
-        average (str | None): The average method. It supports two modes:
+        average (str | None): How to calculate the final metrics from
+            every category. It supports two modes:
 
-            - `"macro"`: Calculate metrics for each category, and calculate
-              the mean value over all categories.
-            - `None`: Return scores of all categories.
+            - `"macro"`: Calculate metrics for each category, and calculate
+              the mean value over all categories. The result of this mode
+              is also called **mAP**.
+            - `None`: Calculate metrics of every category and output directly.
 
             Defaults to "macro".
         collect_device (str): Device name used for collecting results from
@@ -529,15 +561,6 @@ class AveragePrecision(BaseMetric):
                 average: Optional[str] = 'macro') -> torch.Tensor:
         r"""Calculate the average precision for a single class.
 
-        AP summarizes a precision-recall curve as the weighted mean of maximum
-        precisions obtained for any r'>r, where r is the recall:
-
-        .. math::
-            \text{AP} = \sum_n (R_n - R_{n-1}) P_n
-
-        Note that no approximation is involved since the curve is piecewise
-        constant.
-
         Args:
             pred (torch.Tensor | np.ndarray): The model predictions with shape
                 ``(N, num_classes)``.
@@ -545,9 +568,11 @@ class AveragePrecision(BaseMetric):
                 with shape ``(N, num_classes)``.
            average (str | None): The average method. It supports two modes:
 
-                - `"macro"`: Calculate metrics for each category, and
-                  calculate the mean value over all categories.
-                - `None`: Return scores of all categories.
+                - `"macro"`: Calculate metrics for each category, and calculate
+                  the mean value over all categories. The result of this mode
+                  is also called mAP.
+                - `None`: Calculate metrics of every category and output
+                  directly.
 
                 Defaults to "macro".
 
diff --git a/mmcls/evaluation/metrics/single_label.py b/mmcls/evaluation/metrics/single_label.py
index b18b649f..4f17cac8 100644
--- a/mmcls/evaluation/metrics/single_label.py
+++ b/mmcls/evaluation/metrics/single_label.py
@@ -54,15 +54,25 @@ def _precision_recall_f1_support(pred_positive, gt_positive, average):
 
 @METRICS.register_module()
 class Accuracy(BaseMetric):
-    """Top-k accuracy evaluation metric.
+    r"""Accuracy evaluation metric.
+
+    For either binary classification or multi-class classification, the
+    accuracy is the fraction of correct predictions in all predictions:
+
+    .. math::
+
+        \text{Accuracy} = \frac{N_{\text{correct}}}{N_{\text{all}}}
 
     Args:
-        topk (int | Sequence[int]): If the predictions in ``topk``
-            matches the target, the predictions will be regarded as
-            correct ones. Defaults to 1.
-        thrs (Sequence[float | None] | float | None): Predictions with scores
-            under the thresholds are considered negative. None means no
-            thresholds. Defaults to 0.
+        topk (int | Sequence[int]): If the ground truth label matches one of
+            the best **k** predictions, the sample will be regarded as a
+            correct prediction. If the parameter is a tuple, accuracies for
+            all of the top-k values will be calculated and output together.
+            Defaults to 1.
+        thrs (Sequence[float | None] | float | None): If a float, predictions
+            with scores lower than the threshold will be regarded as negative
+            predictions. If None, no threshold is applied. If the parameter is
+            a tuple, accuracies based on all of the thresholds will be
+            calculated and output together. Defaults to 0.
         collect_device (str): Device name used for collecting results from
             different ranks during distributed training. Must be 'cpu' or
             'gpu'. Defaults to 'cpu'.
- - `"micro"`: Calculate metrics globally by counting the total - true positives, false negatives and false positives. + - `"macro"`: Calculate metrics for each category, and calculate + the mean value over all categories. + - `"micro"`: Average the confusion matrix over all categories and + calculate metrics on the mean confusion matrix. + - `None`: Calculate metrics of every category and output directly. Defaults to "macro". - num_classes (Optional, int): The number of classes. Defaults to None. + num_classes (int, optional): The number of classes. Defaults to None. collect_device (str): Device name used for collecting results from different ranks during distributed training. Must be 'cpu' or 'gpu'. Defaults to 'cpu'. @@ -343,7 +371,7 @@ class SingleLabelMetric(BaseMetric): 'single-label/recall_classwise': [18.5, 18.5, 17.0, 20.0, 18.0], 'single-label/f1-score_classwise': [19.7, 18.6, 17.1, 19.7, 17.0] } - """ + """ # noqa: E501 default_prefix: Optional[str] = 'single-label' def __init__(self, @@ -483,14 +511,16 @@ class SingleLabelMetric(BaseMetric): the thresholds are considered negative. It's only used when ``pred`` is scores. None means no thresholds. Defaults to (0., ). - average (str, optional): The average method. If None, the scores - for each class are returned. And it supports two average modes: + average (str | None): How to calculate the final metrics from + the confusion matrix of every category. It supports three + modes: - - `"macro"`: Calculate metrics for each category, and - calculate the mean value over all categories. - - `"micro"`: Calculate metrics globally by counting the - total true positives, false negatives and false - positives. + - `"macro"`: Calculate metrics for each category, and calculate + the mean value over all categories. + - `"micro"`: Average the confusion matrix over all categories + and calculate metrics on the mean confusion matrix. + - `None`: Calculate metrics of every category and output + directly. Defaults to "macro". num_classes (Optional, int): The number of classes. If the ``pred``