[Feature] support modified resnet structure used in oCLIP (#1458)

* support modified ResNet in CLIP and oCLIP * update unit test for TestCLIPBottleneck; update docs * Apply suggestions from code review * fix Co-authored-by: Tong Gao <gaotongxiao@gmail.com>
2025-06-03 21:54:47 +08:00 · 2022-11-03 17:54:15 +08:00 · 2022-11-03 17:54:15 +08:00 · f1dd437d8d
commit f1dd437d8d
parent 1c06edc68f
6 changed files with 228 additions and 1 deletions
--- a/mmocr/models/common/backbones/__init__.py
+++ b/mmocr/models/common/backbones/__init__.py
@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .clip_resnet import CLIPResNet
 from .unet import UNet
-__all__ = ['UNet']
+__all__ = ['UNet', 'CLIPResNet']
--- a/mmocr/models/common/backbones/clip_resnet.py
+++ b/mmocr/models/common/backbones/clip_resnet.py
@ -0,0 +1,100 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch.nn as nn
 from mmdet.models.backbones import ResNet
 from mmdet.models.backbones.resnet import Bottleneck
 from mmocr.registry import MODELS
 class CLIPBottleneck(Bottleneck):
    """Bottleneck for CLIPResNet.

    It is a Bottleneck variant used in the ResNet variant of CLIP. After the
    second convolution layer, there is an additional average pooling layer
    with kernel_size 2 and stride 2, which is added as a plugin when the
    input stride > 1. The stride of each convolution layer is always set to 1.

    The ``plugins`` list passed by the caller is never modified; when the
    pooling plugin is needed, a new list is built with the pooling plugin
    placed first, followed by the caller's plugins in their original order.

    Args:
        **kwargs: Keyword arguments for
            :class:`mmdet.models.backbones.resnet.Bottleneck`. ``stride``
            (default 1) only decides whether the pooling plugin is added;
            the parent class always receives ``stride=1``.
    """

    def __init__(self, **kwargs):
        requested_stride = kwargs.get('stride', 1)
        # Downsampling is delegated to the pooling plugin below, so the
        # convolutions of the parent block always run with stride 1.
        kwargs['stride'] = 1
        if requested_stride > 1:
            pool_plugin = dict(
                cfg=dict(type='mmocr.AvgPool2d', kernel_size=2),
                position='after_conv2')
            # Build a fresh list rather than inserting into the one we were
            # given: a caller that reuses a single ``plugins`` list for
            # several blocks would otherwise see the pooling plugin leak
            # into blocks created with stride 1.
            user_plugins = kwargs.get('plugins') or []
            kwargs['plugins'] = [pool_plugin, *user_plugins]
        super().__init__(**kwargs)
@MODELS.register_module()
class CLIPResNet(ResNet):
    """ResNet variant used in `oCLIP <https://github.com/bytedance/oclip>`_.

    It is also the official structure in
    `CLIP <https://github.com/openai/CLIP>`_.

    Compared with the ResNetV1d structure, CLIPResNet replaces the max
    pooling layer at the end of the input stem with an average pooling layer
    (done in :meth:`_make_stem_layer` when ``deep_stem`` is enabled).

    Its building block is :class:`CLIPBottleneck`: after the second
    convolution layer there is an additional average pooling layer with
    kernel_size 2 and stride 2, added as a plugin when the input stride > 1,
    while the stride of each convolution layer is always set to 1.

    Args:
        depth (int): Depth of resnet, options are [50]. Defaults to 50.
        strides (sequence(int)): Strides of the first block of each stage.
            Defaults to (1, 2, 2, 2).
        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
            Defaults to True.
        avg_down (bool): Use AvgPool instead of stride conv at
            the downsampling stage in the bottleneck. Defaults to True.
        **kwargs: Keyword arguments for
            :class:`mmdet.models.backbones.resnet.ResNet`.
    """

    # Only the 50-layer layout is registered for this backbone.
    arch_settings = {50: (CLIPBottleneck, (3, 4, 6, 3))}

    def __init__(self,
                 depth=50,
                 strides=(1, 2, 2, 2),
                 deep_stem=True,
                 avg_down=True,
                 **kwargs):
        # Collect the CLIP-specific defaults first, then forward everything
        # to the parent constructor as keyword arguments.
        resnet_cfg = dict(
            depth=depth,
            strides=strides,
            deep_stem=deep_stem,
            avg_down=avg_down)
        resnet_cfg.update(kwargs)
        super().__init__(**resnet_cfg)

    def _make_stem_layer(self, in_channels: int, stem_channels: int):
        """Build the stem layer for CLIPResNet used in
        `CLIP <https://github.com/openai/CLIP>`_.

        The parent class builds the stem first. When ``deep_stem`` is
        enabled, the ``maxpool`` attribute is then replaced by an average
        pooling layer with kernel_size 2; otherwise the parent's stem is
        left untouched.

        Args:
            in_channels (int): Number of input channels.
            stem_channels (int): Number of output channels.
        """
        super()._make_stem_layer(in_channels, stem_channels)
        if not self.deep_stem:
            return
        # The attribute keeps the name ``maxpool`` although it now holds an
        # average pooling layer.
        self.maxpool = nn.AvgPool2d(kernel_size=2)
--- a/mmocr/models/common/plugins/__init__.py
+++ b/mmocr/models/common/plugins/__init__.py
@ -0,0 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .common import AvgPool2d
 __all__ = ['AvgPool2d']
--- a/mmocr/models/common/plugins/common.py
+++ b/mmocr/models/common/plugins/common.py
@ -0,0 +1,40 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
 from mmocr.registry import MODELS
@MODELS.register_module()
class AvgPool2d(nn.Module):
    """Registry-buildable wrapper around :class:`torch.nn.AvgPool2d`.

    Applies a 2D average pooling over an input signal composed of several
    input planes. It can also be used as a network plugin.

    Args:
        kernel_size (int or tuple(int)): the size of the window.
        stride (int or tuple(int), optional): the stride of the window.
            Defaults to None.
        padding (int or tuple(int)): implicit zero padding. Defaults to 0.
        **kwargs: Any further keyword arguments are accepted and ignored.
    """

    def __init__(self,
                 kernel_size: Union[int, Tuple[int]],
                 stride: Optional[Union[int, Tuple[int]]] = None,
                 padding: Union[int, Tuple[int]] = 0,
                 **kwargs) -> None:
        super().__init__()
        # Extra keyword arguments are deliberately not forwarded.
        self.model = nn.AvgPool2d(
            kernel_size=kernel_size, stride=stride, padding=padding)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the wrapped average pooling layer.

        Args:
            x (Tensor): Input feature map.

        Returns:
            Tensor: Output tensor after the average pooling layer.
        """
        pooled = self.model(x)
        return pooled
--- a/tests/test_models/test_common/test_backbones/test_clip_resnet.py
+++ b/tests/test_models/test_common/test_backbones/test_clip_resnet.py
@ -0,0 +1,66 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from unittest import TestCase
 import torch
 import torch.nn as nn
 from mmcv.cnn import build_conv_layer, build_norm_layer
 from mmocr.models.common.backbones import CLIPResNet
 from mmocr.models.common.backbones.clip_resnet import CLIPBottleneck
 class TestCLIPResNet(TestCase):
    """Output-shape check for a default-constructed CLIPResNet."""

    def test_forward(self):
        """A 32x32 RGB image must yield four feature maps whose shapes match
        the table below."""
        backbone = CLIPResNet()
        backbone.eval()
        outs = backbone(torch.randn(1, 3, 32, 32))

        expected_shapes = (
            [1, 256, 8, 8],
            [1, 512, 4, 4],
            [1, 1024, 2, 2],
            [1, 2048, 1, 1],
        )
        assert len(outs) == 4
        for out, shape in zip(outs, expected_shapes):
            assert out.shape == torch.Size(shape)
 class TestCLIPBottleneck(TestCase):
    """Output-shape check for a single CLIPBottleneck built with stride 2."""

    def test_forward(self):
        """A (1, 256, 8, 8) input must come out as (1, 512, 4, 4)."""
        stride, inplanes, planes = 2, 256, 128
        conv_cfg = None
        norm_cfg = {'type': 'BN', 'requires_grad': True}
        out_channels = planes * CLIPBottleneck.expansion

        # Shortcut branch: average pooling, then a 1x1 convolution and a
        # norm layer that produce ``out_channels`` channels.
        shortcut = nn.Sequential(
            nn.AvgPool2d(
                kernel_size=stride,
                stride=stride,
                ceil_mode=True,
                count_include_pad=False),
            build_conv_layer(
                conv_cfg,
                inplanes,
                out_channels,
                kernel_size=1,
                stride=1,
                bias=False),
            build_norm_layer(norm_cfg, out_channels)[1],
        )

        block = CLIPBottleneck(
            inplanes=inplanes,
            planes=planes,
            stride=stride,
            downsample=shortcut,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg)
        block.eval()

        output_feat = block(torch.randn(1, 256, 8, 8))
        assert output_feat.shape == torch.Size([1, 512, 4, 4])
--- a/tests/test_models/test_common/test_plugins/test_avgpool.py
+++ b/tests/test_models/test_common/test_plugins/test_avgpool.py
@ -0,0 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from unittest import TestCase
 import torch
 from mmocr.models.common.plugins import AvgPool2d
 class TestAvgPool2d(TestCase):
    """Output-shape check for the AvgPool2d plugin."""

    def setUp(self) -> None:
        # Random input with 3 channels and a 32x100 spatial size.
        self.img = torch.rand(1, 3, 32, 100)

    def test_avgpool2d(self):
        """Pooling with kernel_size 2 and stride 2 halves both spatial
        dimensions."""
        pool = AvgPool2d(kernel_size=2, stride=2)
        pooled = pool(self.img)
        self.assertEqual(pooled.shape, torch.Size([1, 3, 16, 50]))