From 312f264ecd0b21b114d1fcc8a64004efde16ca59 Mon Sep 17 00:00:00 2001
From: Mashiro <57566630+HAOCHENYE@users.noreply.github.com>
Date: Wed, 22 Jun 2022 19:49:20 +0800
Subject: [PATCH] [Feature] Add autocast wrapper (#307)

* add autocast wrapper

* fix docstring

* fix docstring

* fix compare version

* fix unit test

* fix incompatible arguments

* fix as comment

* fix unit test

* rename auto_cast to autocast
---
 .../optim/optimizer/amp_optimizer_wrapper.py |  3 +-
 mmengine/runner/__init__.py                  |  3 +-
 mmengine/runner/amp.py                       | 87 +++++++++++++++++++
 mmengine/runner/loops.py                     | 27 ++++--
 mmengine/runner/runner.py                    | 12 ++-
 tests/test_model/test_averaged_model.py     |  2 +-
 tests/test_runner/test_amp.py               | 57 ++++++++++++
 tests/test_runner/test_runner.py            | 51 ++++++++++-
 8 files changed, 227 insertions(+), 15 deletions(-)
 create mode 100644 mmengine/runner/amp.py
 create mode 100644 tests/test_runner/test_amp.py

diff --git a/mmengine/optim/optimizer/amp_optimizer_wrapper.py b/mmengine/optim/optimizer/amp_optimizer_wrapper.py
index 4bbd76a9..c7e18a8f 100644
--- a/mmengine/optim/optimizer/amp_optimizer_wrapper.py
+++ b/mmengine/optim/optimizer/amp_optimizer_wrapper.py
@@ -120,5 +120,6 @@ class AmpOptimWrapper(OptimWrapper):
         Args:
             model (nn.Module): The training model.
         """
-        with super().optim_context(model), torch.cuda.amp.autocast():
+        from mmengine.runner.amp import autocast
+        with super().optim_context(model), autocast():
             yield
diff --git a/mmengine/runner/__init__.py b/mmengine/runner/__init__.py
index 043c56fd..fce566d9 100644
--- a/mmengine/runner/__init__.py
+++ b/mmengine/runner/__init__.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from .amp import autocast
 from .base_loop import BaseLoop
 from .checkpoint import (CheckpointLoader, find_latest_checkpoint,
                          get_deprecated_model_names, get_external_models,
@@ -13,5 +14,5 @@ __all__ = [
     'get_external_models', 'get_mmcls_models', 'get_deprecated_model_names',
     'CheckpointLoader', 'load_checkpoint', 'weights_to_cpu', 'get_state_dict',
     'save_checkpoint', 'EpochBasedTrainLoop', 'IterBasedTrainLoop', 'ValLoop',
-    'TestLoop', 'Runner', 'find_latest_checkpoint'
+    'TestLoop', 'Runner', 'find_latest_checkpoint', 'autocast'
 ]
diff --git a/mmengine/runner/amp.py b/mmengine/runner/amp.py
new file mode 100644
index 00000000..6278ac1b
--- /dev/null
+++ b/mmengine/runner/amp.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from contextlib import contextmanager
+
+import torch
+
+from mmengine.utils import TORCH_VERSION, digit_version
+
+
+@contextmanager
+def autocast(enabled: bool = True, **kwargs):
+    """A wrapper of ``torch.autocast`` and ``torch.cuda.amp.autocast``.
+
+    PyTorch 1.6.0 provides ``torch.cuda.amp.autocast`` for running in
+    mixed precision, and PyTorch 1.10.0 updates it to ``torch.autocast``.
+    The two interfaces take different arguments, and ``torch.autocast``
+    additionally supports running on cpu.
+
+    This function provides a unified interface by wrapping
+    ``torch.autocast`` and ``torch.cuda.amp.autocast``, resolving two
+    compatibility issues: ``torch.cuda.amp.autocast`` cannot run mixed
+    precision on cpu, and the two contexts take different arguments. We
+    suggest users call this function in their code to achieve maximum
+    compatibility across PyTorch versions.
+
+    Note:
+        ``autocast`` requires pytorch version >= 1.5.0. If pytorch version
+        < 1.10.0 and cuda is not available, it will raise an error with
+        ``enabled=True``, since ``torch.cuda.amp.autocast`` only supports
+        cuda mode.
+
+    Examples:
+        >>> # case1: 1.10 > PyTorch version >= 1.5.0
+        >>> with autocast():
+        >>>     # run in mixed precision context
+        >>>     pass
+        >>> with autocast(device_type='cpu'):
+        >>>     # raise error, torch.cuda.amp.autocast only supports cuda
+        >>>     # mode.
+        >>>     pass
+        >>> # case2: PyTorch version >= 1.10.0
+        >>> with autocast():
+        >>>     # default cuda mixed precision context
+        >>>     pass
+        >>> with autocast(device_type='cpu'):
+        >>>     # cpu mixed precision context
+        >>>     pass
+        >>> with autocast(
+        >>>         device_type='cuda', enabled=True, cache_enabled=True):
+        >>>     # enable mixed precision context with more specific
+        >>>     # arguments
+        >>>     pass
+
+    Args:
+        enabled (bool): Whether autocasting should be enabled in the region.
+            Defaults to True.
+        kwargs (dict): Arguments of torch.autocast except for ``enabled``.
+    """
+    assert digit_version(TORCH_VERSION) >= digit_version('1.5.0'), (
+        'The minimum pytorch version requirement of mmengine is 1.5.0, but '
+        f'got {TORCH_VERSION}')
+
+    if (digit_version('1.5.0') <= digit_version(TORCH_VERSION) <
+            digit_version('1.10.0')):
+        # If pytorch version is between 1.5.0 and 1.10.0, the default value
+        # of dtype for `torch.cuda.amp.autocast` is torch.float16.
+        assert not kwargs, (
+            f'autocast under pytorch {TORCH_VERSION} only accepts the '
+            '`enabled` argument.')
+        if torch.cuda.is_available():
+            with torch.cuda.amp.autocast(enabled=enabled):
+                yield
+        else:
+            if not enabled:
+                # If `enabled` is False, yield an empty context and perform
+                # all calculations under fp32.
+                yield
+            else:
+                raise RuntimeError(
+                    'If pytorch version is between 1.5.0 and 1.10, '
+                    '`autocast` is only available in gpu mode')
+
+    elif digit_version(TORCH_VERSION) >= digit_version('1.10.0'):
+        if torch.cuda.is_available():
+            kwargs.setdefault('device_type', 'cuda')
+        else:
+            kwargs.setdefault('device_type', 'cpu')
+
+        with torch.autocast(enabled=enabled, **kwargs):
+            yield
diff --git a/mmengine/runner/loops.py b/mmengine/runner/loops.py
index c8501760..1a6f2651 100644
--- a/mmengine/runner/loops.py
+++ b/mmengine/runner/loops.py
@@ -9,6 +9,7 @@ from torch.utils.data import DataLoader
 from mmengine.evaluator import Evaluator
 from mmengine.registry import LOOPS
 from mmengine.utils import is_list_of
+from .amp import autocast
 from .base_loop import BaseLoop
 
 
@@ -269,10 +270,15 @@ class ValLoop(BaseLoop):
         dataloader (Dataloader or dict): A dataloader object or a dict to
             build a dataloader.
         evaluator (Evaluator or dict or list): Used for computing metrics.
+        fp16 (bool): Whether to enable fp16 validation. Defaults to
+            False.
     """
 
-    def __init__(self, runner, dataloader: Union[DataLoader, Dict],
-                 evaluator: Union[Evaluator, Dict, List]) -> None:
+    def __init__(self,
+                 runner,
+                 dataloader: Union[DataLoader, Dict],
+                 evaluator: Union[Evaluator, Dict, List],
+                 fp16: bool = False) -> None:
         super().__init__(runner, dataloader)
 
         if isinstance(evaluator, dict) or is_list_of(evaluator, dict):
@@ -288,6 +294,7 @@ class ValLoop(BaseLoop):
                 f'Dataset {self.dataloader.dataset.__class__.__name__} has no '
                 'metainfo. ``dataset_meta`` in evaluator, metric and '
                 'visualizer will be None.')
+        self.fp16 = fp16
 
     def run(self):
         """Launch validation."""
@@ -313,7 +320,8 @@ class ValLoop(BaseLoop):
         self.runner.call_hook(
             'before_val_iter', batch_idx=idx, data_batch=data_batch)
         # outputs should be sequence of BaseDataElement
-        outputs = self.runner.model.val_step(data_batch)
+        with autocast(enabled=self.fp16):
+            outputs = self.runner.model.val_step(data_batch)
         self.evaluator.process(data_batch, outputs)
         self.runner.call_hook(
             'after_val_iter',
@@ -331,10 +339,15 @@ class TestLoop(BaseLoop):
         dataloader (Dataloader or dict): A dataloader object or a dict to
             build a dataloader.
         evaluator (Evaluator or dict or list): Used for computing metrics.
+        fp16 (bool): Whether to enable fp16 testing. Defaults to
+            False.
     """
 
-    def __init__(self, runner, dataloader: Union[DataLoader, Dict],
-                 evaluator: Union[Evaluator, Dict, List]):
+    def __init__(self,
+                 runner,
+                 dataloader: Union[DataLoader, Dict],
+                 evaluator: Union[Evaluator, Dict, List],
+                 fp16: bool = False):
         super().__init__(runner, dataloader)
 
         if isinstance(evaluator, dict) or is_list_of(evaluator, dict):
@@ -350,6 +363,7 @@ class TestLoop(BaseLoop):
                 f'Dataset {self.dataloader.dataset.__class__.__name__} has no '
                 'metainfo. ``dataset_meta`` in evaluator, metric and '
                 'visualizer will be None.')
+        self.fp16 = fp16
 
     def run(self) -> None:
         """Launch test."""
@@ -374,7 +388,8 @@ class TestLoop(BaseLoop):
         self.runner.call_hook(
             'before_test_iter', batch_idx=idx, data_batch=data_batch)
         # predictions should be sequence of BaseDataElement
-        predictions = self.runner.model.test_step(data_batch)
+        with autocast(enabled=self.fp16):
+            predictions = self.runner.model.test_step(data_batch)
         self.evaluator.process(data_batch, predictions)
         self.runner.call_hook(
             'after_test_iter',
diff --git a/mmengine/runner/runner.py b/mmengine/runner/runner.py
index 5f5a033d..637540d6 100644
--- a/mmengine/runner/runner.py
+++ b/mmengine/runner/runner.py
@@ -97,13 +97,15 @@ class Runner:
         val_cfg (dict, optional): A dict to build a validation loop. If it does
             not provide "type" key, :class:`ValLoop` will be used by default.
             If ``val_cfg`` specified, :attr:`val_dataloader` should also be
-            specified. Defaults to None.
-            See :meth:`build_val_loop` for more details.
+            specified. If ``ValLoop`` is built with ``fp16=True``,
+            ``runner.val()`` will be performed under fp16 precision.
+            Defaults to None. See :meth:`build_val_loop` for more details.
         test_cfg (dict, optional): A dict to build a test loop. If it does
             not provide "type" key, :class:`TestLoop` will be used by default.
             If ``test_cfg`` specified, :attr:`test_dataloader` should also be
-            specified. Defaults to None.
-            See :meth:`build_test_loop` for more details.
+            specified. If ``TestLoop`` is built with ``fp16=True``,
+            ``runner.test()`` will be performed under fp16 precision.
+            Defaults to None. See :meth:`build_test_loop` for more details.
         auto_scale_lr (dict, Optional): Config to scale the learning rate
             automatically. It includes ``base_batch_size`` and ``enable``.
            ``base_batch_size`` is the batch size that the optimizer lr is
@@ -1424,6 +1426,7 @@ class Runner:
                     evaluator=self._val_evaluator))
         else:
             loop = ValLoop(
+                **loop_cfg,
                 runner=self,
                 dataloader=self._val_dataloader,
                 evaluator=self._val_evaluator)  # type: ignore
@@ -1465,6 +1468,7 @@ class Runner:
                     evaluator=self._test_evaluator))
         else:
             loop = TestLoop(
+                **loop_cfg,
                 runner=self,
                 dataloader=self._test_dataloader,
                 evaluator=self._test_evaluator)  # type: ignore
diff --git a/tests/test_model/test_averaged_model.py b/tests/test_model/test_averaged_model.py
index 9afef4e6..8151902c 100644
--- a/tests/test_model/test_averaged_model.py
+++ b/tests/test_model/test_averaged_model.py
@@ -24,7 +24,7 @@ class TestAveragedModel(TestCase):
         averaged_params = [
             torch.zeros_like(param) for param in model.parameters()
         ]
-        n_updates = 10
+        n_updates = 2
         for i in range(n_updates):
             for p, p_avg in zip(model.parameters(), averaged_params):
                 p.detach().add_(torch.randn_like(p))
diff --git a/tests/test_runner/test_amp.py b/tests/test_runner/test_amp.py
new file mode 100644
index 00000000..8ac0dd55
--- /dev/null
+++ b/tests/test_runner/test_amp.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import torch
+import torch.nn as nn
+
+from mmengine.runner import autocast
+from mmengine.utils import TORCH_VERSION, digit_version
+
+
+class TestAmp(unittest.TestCase):
+
+    def test_autocast(self):
+        if not torch.cuda.is_available():
+            if digit_version(TORCH_VERSION) < digit_version('1.10.0'):
+                # `torch.cuda.amp.autocast` is only supported in gpu mode;
+                # without cuda, enabling `autocast` raises an error and no
+                # extra arguments are accepted.
+                with self.assertRaisesRegex(RuntimeError,
+                                            'If pytorch version is '):
+                    with autocast():
+                        pass
+
+                with autocast(enabled=False):
+                    layer = nn.Conv2d(1, 1, 1)
+                    res = layer(torch.randn(1, 1, 1, 1))
+                    self.assertEqual(res.dtype, torch.float32)
+
+            else:
+                with autocast(device_type='cpu'):
+                    # torch.autocast supports cpu mode.
+                    layer = nn.Conv2d(1, 1, 1)
+                    res = layer(torch.randn(1, 1, 1, 1))
+                    self.assertIn(res.dtype, (torch.bfloat16, torch.float16))
+                    with autocast(enabled=False):
+                        res = layer(torch.randn(1, 1, 1, 1))
+                        self.assertEqual(res.dtype, torch.float32)
+
+        else:
+            if digit_version(TORCH_VERSION) < digit_version('1.10.0'):
+                devices = ['cuda']
+            else:
+                devices = ['cpu', 'cuda']
+            for device in devices:
+                # Pass `device_type` explicitly for the cpu iteration (only
+                # reachable on torch >= 1.10) so cpu autocast is actually
+                # entered; a bare `autocast()` would default to cuda here.
+                device_kwargs = {} if device == 'cuda' else dict(
+                    device_type='cpu')
+                with autocast(**device_kwargs):
+                    # torch.autocast supports cpu and cuda mode.
+                    layer = nn.Conv2d(1, 1, 1).to(device)
+                    res = layer(torch.randn(1, 1, 1, 1).to(device))
+                    self.assertIn(res.dtype, (torch.bfloat16, torch.float16))
+                    with autocast(enabled=False, **device_kwargs):
+                        res = layer(torch.randn(1, 1, 1, 1).to(device))
+                        self.assertEqual(res.dtype, torch.float32)
+                # Test that fp32 is kept when autocast is disabled.
+                with autocast(enabled=False, **device_kwargs):
+                    layer = nn.Conv2d(1, 1, 1).to(device)
+                    res = layer(torch.randn(1, 1, 1, 1).to(device))
+                    self.assertEqual(res.dtype, torch.float32)
diff --git a/tests/test_runner/test_runner.py b/tests/test_runner/test_runner.py
index b14bf403..43fed5b3 100644
--- a/tests/test_runner/test_runner.py
+++ b/tests/test_runner/test_runner.py
@@ -31,7 +31,7 @@ from mmengine.runner import (BaseLoop, EpochBasedTrainLoop, IterBasedTrainLoop,
                              Runner, TestLoop, ValLoop)
 from mmengine.runner.loops import _InfiniteDataloaderIterator
 from mmengine.runner.priority import Priority, get_priority
-from mmengine.utils import is_list_of
+from mmengine.utils import TORCH_VERSION, digit_version, is_list_of
 from mmengine.visualization import Visualizer
 
 
@@ -55,7 +55,6 @@ class ToyModel(BaseModel):
             outputs = dict(loss=loss)
             return outputs
         elif mode == 'predict':
-            outputs = dict(log_vars=dict(a=1, b=0.5))
             return outputs
 
 
@@ -1273,7 +1272,31 @@ class TestRunner(TestCase):
         cfg.pop('test_cfg')
         cfg.pop('test_evaluator')
         runner = Runner.from_cfg(cfg)
+
+        # Test default fp32 `autocast` context.
+        predictions = []
+
+        def get_outputs_callback(module, inputs, outputs):
+            predictions.append(outputs)
+
+        runner.model.register_forward_hook(get_outputs_callback)
         runner.val()
+        self.assertEqual(predictions[0].dtype, torch.float32)
+        predictions.clear()
+
+        # Test fp16 `autocast` context.
+        cfg.experiment_name = 'test_val3'
+        cfg.val_cfg = dict(fp16=True)
+        runner = Runner.from_cfg(cfg)
+        runner.model.register_forward_hook(get_outputs_callback)
+        if (digit_version(TORCH_VERSION) < digit_version('1.10.0')
+                and not torch.cuda.is_available()):
+            with self.assertRaisesRegex(RuntimeError, 'If pytorch version'):
+                runner.val()
+        else:
+            runner.val()
+            self.assertIn(predictions[0].dtype,
+                          (torch.float16, torch.bfloat16))
 
     def test_test(self):
         cfg = copy.deepcopy(self.epoch_based_cfg)
@@ -1303,7 +1326,31 @@
         cfg.pop('val_cfg')
         cfg.pop('val_evaluator')
         runner = Runner.from_cfg(cfg)
+
+        # Test default fp32 `autocast` context.
+        predictions = []
+
+        def get_outputs_callback(module, inputs, outputs):
+            predictions.append(outputs)
+
+        runner.model.register_forward_hook(get_outputs_callback)
         runner.test()
+        self.assertEqual(predictions[0].dtype, torch.float32)
+        predictions.clear()
+
+        # Test fp16 `autocast` context.
+        cfg.experiment_name = 'test_test3'
+        cfg.test_cfg = dict(fp16=True)
+        runner = Runner.from_cfg(cfg)
+        runner.model.register_forward_hook(get_outputs_callback)
+        if (digit_version(TORCH_VERSION) < digit_version('1.10.0')
+                and not torch.cuda.is_available()):
+            with self.assertRaisesRegex(RuntimeError, 'If pytorch version'):
+                runner.test()
+        else:
+            runner.test()
+            self.assertIn(predictions[0].dtype,
+                          (torch.float16, torch.bfloat16))
 
     def test_register_hook(self):
         cfg = copy.deepcopy(self.epoch_based_cfg)
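
Usage note (appended for review, not part of the diff): a minimal sketch of the new wrapper's behavior, assuming a cpu-only environment with torch >= 1.10.0; the layer and tensor shapes are arbitrary placeholders.

    import torch
    import torch.nn as nn

    from mmengine.runner import autocast

    layer = nn.Conv2d(1, 1, 1)

    # On torch >= 1.10.0 the wrapper forwards to `torch.autocast` and fills
    # in `device_type` automatically ('cuda' if available, else 'cpu').
    with autocast(device_type='cpu'):
        out = layer(torch.randn(1, 1, 1, 1))
        print(out.dtype)  # torch.bfloat16 (cpu autocast default dtype)

    # With `enabled=False` the context is a no-op and fp32 is kept. On
    # 1.5.0 <= torch < 1.10.0 without cuda, `enabled=True` raises instead.
    with autocast(enabled=False):
        out = layer(torch.randn(1, 1, 1, 1))
        assert out.dtype == torch.float32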
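
For the loop-level change, a sketch of how a downstream user would opt in to fp16 validation after this patch; `MyModel`, `val_dataloader`, and `val_evaluator` are hypothetical placeholders standing in for a project's real model and configs.

    from mmengine.runner import Runner

    runner = Runner(
        model=MyModel(),                # placeholder BaseModel subclass
        work_dir='./work_dir',
        val_dataloader=val_dataloader,  # placeholder dataloader cfg
        val_evaluator=val_evaluator,    # placeholder evaluator cfg
        val_cfg=dict(fp16=True),        # new in this PR: fp16 validation
    )
    runner.val()  # val_step now runs under autocast(enabled=True)

`test_cfg=dict(fp16=True)` works the same way for `runner.test()`.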
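
The `AmpOptimWrapper` change means training-side AMP also routes through the same wrapper, so it inherits the version handling above. A config sketch, assuming the standard mmengine optimizer-wrapper fields:

    # Training under AMP; `AmpOptimWrapper.optim_context` now calls the
    # unified `autocast` instead of `torch.cuda.amp.autocast` directly.
    optim_wrapper = dict(
        type='AmpOptimWrapper',
        optimizer=dict(type='SGD', lr=0.01, momentum=0.9))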