refactor tps config (#135)

* refactor tps config * recover tps * add ckpt of tps
2025-06-03 21:54:47 +08:00 · 2021-05-12 14:14:24 +08:00 · 2021-05-12 14:14:24 +08:00 · df5493a79e
commit df5493a79e
parent 18c54afbdc
6 changed files with 156 additions and 55 deletions
--- a/configs/textrecog/crnn/README.md
+++ b/configs/textrecog/crnn/README.md
@ -28,10 +28,13 @@
 | IIIT5K  |     3000     | regular |
 |   SVT   |     647      | regular |
 |  IC13   |     1015     | regular |
 |  IC15   |     2077     |irregular|
 |  SVTP   |     645      |irregular|
 |  CT80   |     288      |irregular|
 ## Results and models
 |                         methods                          |        | Regular Text |      |     |      | Irregular Text |      |                                                                                    download                                                                                    |
 | :------------------------------------------------------: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
 |                         methods                          | IIIT5K |     SVT      | IC13 |     | IC15 |      SVTP      | CT80 |
-| [CRNN](/configs/textrecog/crnn/crnn_academic_dataset.py) |  80.5  |     81.5     | 86.5 |     |  -   |       -        |  -   | [model](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_academic-a723a1c5.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/crnn/20210326_111035.log.json) |
+| [CRNN](/configs/textrecog/crnn/crnn_academic_dataset.py) |  80.5  |     81.5     | 86.5 |     |  54.1   |       59.1        |  55.6   | [model](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_academic-a723a1c5.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/crnn/20210326_111035.log.json) |
--- a/configs/textrecog/crnn/crnn_academic_dataset.py
+++ b/configs/textrecog/crnn/crnn_academic_dataset.py
@ -95,13 +95,20 @@ train1 = dict(
    test_mode=False)
 test_prefix = 'data/mixture/'
 test_img_prefix1 = test_prefix + 'icdar_2013/'
 test_img_prefix2 = test_prefix + 'IIIT5K/'
 test_img_prefix3 = test_prefix + 'svt/'
-test_ann_file1 = test_prefix + 'icdar_2013/test_label_1015.txt'
+test_img_prefix1 = test_prefix + 'IIIT5K/'
-test_ann_file2 = test_prefix + 'IIIT5K/test_label.txt'
+test_img_prefix2 = test_prefix + 'svt/'
-test_ann_file3 = test_prefix + 'svt/test_label.txt'
+test_img_prefix3 = test_prefix + 'icdar_2013/'
 test_img_prefix4 = test_prefix + 'icdar_2015/'
 test_img_prefix5 = test_prefix + 'svtp/'
 test_img_prefix6 = test_prefix + 'ct80/'
 test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
 test_ann_file2 = test_prefix + 'svt/test_label.txt'
 test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
 test_ann_file4 = test_prefix + 'icdar_2015/test_label.txt'
 test_ann_file5 = test_prefix + 'svtp/test_label.txt'
 test_ann_file6 = test_prefix + 'ct80/test_label.txt'
 test1 = dict(
    type=dataset_type,
@ -126,12 +133,28 @@ test3 = {key: value for key, value in test1.items()}
 test3['img_prefix'] = test_img_prefix3
 test3['ann_file'] = test_ann_file3
 test4 = {key: value for key, value in test1.items()}
 test4['img_prefix'] = test_img_prefix4
 test4['ann_file'] = test_ann_file4
 test5 = {key: value for key, value in test1.items()}
 test5['img_prefix'] = test_img_prefix5
 test5['ann_file'] = test_ann_file5
 test6 = {key: value for key, value in test1.items()}
 test6['img_prefix'] = test_img_prefix6
 test6['ann_file'] = test_ann_file6
 data = dict(
    samples_per_gpu=64,
    workers_per_gpu=4,
    train=dict(type='ConcatDataset', datasets=[train1]),
-    val=dict(type='ConcatDataset', datasets=[test1, test2, test3]),
+    val=dict(
-    test=dict(type='ConcatDataset', datasets=[test1, test2, test3]))
+        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]),
    test=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]))
 evaluation = dict(interval=1, metric='acc')
--- a/configs/textrecog/tps/README.md
+++ b/configs/textrecog/tps/README.md
@ -1,9 +1,20 @@
-# Thin-Plate-Spline (TPS) transformation
+# CRNN with TPS based STN
 ## Introduction
 [ALGORITHM]
 ```bibtex
@article{shi2016end,
  title={An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition},
  author={Shi, Baoguang and Bai, Xiang and Yao, Cong},
  journal={IEEE transactions on pattern analysis and machine intelligence},
  year={2016}
 }
 ```
 [PREPROCESSOR]
 ```bibtex
@article{shi2016robust,
  title={Robust Scene Text Recognition with Automatic Rectification},
@ -13,14 +24,28 @@
 }
 ```
-## About using TPS in other models
+## Results and Models
- Simply change `cfg.model.preprocessor` from `None` to
+### Train Dataset
-```python
+
-dict(
+| trainset | instance_num | repeat_num | note  |
-  type='TPSPreprocessor',
+| :------: | :----------: | :--------: | :---: |
-  num_fiducial=20,
+|  Syn90k  |   8919273    |     1      | synth |
-  img_size=(32, 100),
+
-  rectified_img_size=(32, 100),
+### Test Dataset
-  num_img_channel=1
+
-)
+| testset | instance_num |  note   |
 | :-----: | :----------: | :-----: |
 | IIIT5K  |     3000     | regular |
 |   SVT   |     647      | regular |
 |  IC13   |     1015     | regular |
 |  IC15   |     2077     |irregular|
 |  SVTP   |     645      |irregular|
 |  CT80   |     288      |irregular|
 ## Results and models
 |                         methods                          |        | Regular Text |      |     |      | Irregular Text |      |                                                                                    download                                                                                    |
 | :------------------------------------------------------: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
 |                                                   | IIIT5K |     SVT      | IC13 |     | IC15 |      SVTP      | CT80 |
 | [CRNN-STN](/configs/textrecog/tps/crnn_tps_academic_dataset.py) |  80.8  |     81.3     | 85.0 |     |  59.6   |       68.1        |  53.8   | [model](https://download.openmmlab.com/mmocr/textrecog/tps/crnn_tps_academic_dataset_20210510-d221a905.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/tps/20210510_204353.log.json) |
--- a/configs/textrecog/tps/crnn_tps_academic_dataset.py
+++ b/configs/textrecog/tps/crnn_tps_academic_dataset.py
@ -26,7 +26,7 @@ model = dict(
        img_size=(32, 100),
        rectified_img_size=(32, 100),
        num_img_channel=1),
-    backbone=dict(type='VeryDeepVgg', leakyRelu=False, input_channels=1),
+    backbone=dict(type='VeryDeepVgg', leaky_relu=False, input_channels=1),
    encoder=None,
    decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
    loss=dict(type='CTCLoss'),
@ -68,9 +68,9 @@ test_pipeline = [
    dict(
        type='ResizeOCR',
        height=32,
-        min_width=4,
+        min_width=32,
-        max_width=None,
+        max_width=100,
-        keep_aspect_ratio=True),
+        keep_aspect_ratio=False),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
@ -100,13 +100,20 @@ train1 = dict(
    test_mode=False)
 test_prefix = 'data/mixture/'
 test_img_prefix1 = test_prefix + 'icdar_2013/'
 test_img_prefix2 = test_prefix + 'IIIT5K/'
 test_img_prefix3 = test_prefix + 'svt/'
-test_ann_file1 = test_prefix + 'icdar_2013/test_label_1015.txt'
+test_img_prefix1 = test_prefix + 'IIIT5K/'
-test_ann_file2 = test_prefix + 'IIIT5K/test_label.txt'
+test_img_prefix2 = test_prefix + 'svt/'
-test_ann_file3 = test_prefix + 'svt/test_label.txt'
+test_img_prefix3 = test_prefix + 'icdar_2013/'
 test_img_prefix4 = test_prefix + 'icdar_2015/'
 test_img_prefix5 = test_prefix + 'svtp/'
 test_img_prefix6 = test_prefix + 'ct80/'
 test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
 test_ann_file2 = test_prefix + 'svt/test_label.txt'
 test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
 test_ann_file4 = test_prefix + 'icdar_2015/test_label.txt'
 test_ann_file5 = test_prefix + 'svtp/test_label.txt'
 test_ann_file6 = test_prefix + 'ct80/test_label.txt'
 test1 = dict(
    type=dataset_type,
@ -131,12 +138,28 @@ test3 = {key: value for key, value in test1.items()}
 test3['img_prefix'] = test_img_prefix3
 test3['ann_file'] = test_ann_file3
 test4 = {key: value for key, value in test1.items()}
 test4['img_prefix'] = test_img_prefix4
 test4['ann_file'] = test_ann_file4
 test5 = {key: value for key, value in test1.items()}
 test5['img_prefix'] = test_img_prefix5
 test5['ann_file'] = test_ann_file5
 test6 = {key: value for key, value in test1.items()}
 test6['img_prefix'] = test_img_prefix6
 test6['ann_file'] = test_ann_file6
 data = dict(
    samples_per_gpu=64,
    workers_per_gpu=4,
    train=dict(type='ConcatDataset', datasets=[train1]),
-    val=dict(type='ConcatDataset', datasets=[test1, test2, test3]),
+    val=dict(
-    test=dict(type='ConcatDataset', datasets=[test1, test2, test3]))
+        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]),
    test=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]))
 evaluation = dict(interval=1, metric='acc')
--- a/mmocr/models/textrecog/preprocessor/tps_preprocessor.py
+++ b/mmocr/models/textrecog/preprocessor/tps_preprocessor.py
@ -23,26 +23,35 @@ from .base_preprocessor import BasePreprocessor
@PREPROCESSOR.register_module()
 class TPSPreprocessor(BasePreprocessor):
-    """Rectification Network of RARE, namely TPS based STN."""
+    """Rectification Network of RARE, namely TPS based STN in.
    <https://arxiv.org/pdf/1603.03915.pdf>`_.
    Args:
        num_fiducial (int): Number of fiducial points of TPS-STN.
        img_size (tuple(int, int)): Size (height, width) of the input image.
        rectified_img_size (tuple(int, int))::
            Size (height, width) of the rectified image.
        num_img_channel (int): Number of channels of the input image.
    Output:
        batch_rectified_img: Rectified image with size
            [batch_size x num_img_channel x rectified_img_height
            x rectified_img_width]
    """
    def __init__(self,
-                 num_fiducial,
+                 num_fiducial=20,
-                 img_size,
+                 img_size=(32, 100),
-                 rectified_img_size,
+                 rectified_img_size=(32, 100),
                 num_img_channel=1):
        """ Based on RARE TPS
        Args:
            num_fiducial (int): number of fiducial points of TPS-STN
            img_size (int, int): (height, width) of the input image
            rectified_img_size (int, int):
                (height, width) of the rectified image
            num_img_channel (int): the number of channels of the input image
        output:
            batch_rectified_img: rectified image
                [batch_size x num_img_channel x rectified_img_height
                x rectified_img_width]
        """
        super().__init__()
        assert isinstance(num_fiducial, int)
        assert num_fiducial > 0
        assert isinstance(img_size, tuple)
        assert isinstance(rectified_img_size, tuple)
        assert isinstance(num_img_channel, int)
        self.num_fiducial = num_fiducial
        self.img_size = img_size
        self.rectified_img_size = rectified_img_size
@ -71,13 +80,15 @@ class TPSPreprocessor(BasePreprocessor):
        return batch_rectified_img
    def init_weights(self):
        pass
 class LocalizationNetwork(nn.Module):
    """Localization Network of RARE, which predicts C' (K x 2) from input
-    (img_width x img_height)"""
+    (img_width x img_height)
    Args:
        num_fiducial (int): Number of fiducial points of TPS-STN.
        num_img_channel (int): Number of channels of the input image.
    """
    def __init__(self, num_fiducial, num_img_channel):
        super().__init__()
@ -128,7 +139,8 @@ class LocalizationNetwork(nn.Module):
        Args:
            batch_img (tensor): Batch Input Image
                [batch_size x num_img_channel x img_height x img_width]
-        output:
+
        Output:
            batch_C_prime : Predicted coordinates of fiducial points for
            input batch [batch_size x num_fiducial x 2]
        """
@ -141,8 +153,13 @@ class LocalizationNetwork(nn.Module):
 class GridGenerator(nn.Module):
-    """Grid Generator of RARE, which produces P_prime by multipling T with
+    """Grid Generator of RARE, which produces P_prime by multipling T with P.
-    P."""
+
    Args:
        num_fiducial (int): Number of fiducial points of TPS-STN.
        rectified_img_size (tuple(int, int)):
            Size (height, width) of the rectified image.
    """
    def __init__(self, num_fiducial, rectified_img_size):
        """Generate P_hat and inv_delta_C for later."""
--- a/tests/test_models/test_ocr_preprocessor.py
+++ b/tests/test_models/test_ocr_preprocessor.py
@ -1,3 +1,4 @@
 import pytest
 import torch
 from mmocr.models.textrecog.preprocessor import (BasePreprocessor,
@ -5,6 +6,15 @@ from mmocr.models.textrecog.preprocessor import (BasePreprocessor,
 def test_tps_preprocessor():
    with pytest.raises(AssertionError):
        TPSPreprocessor(num_fiducial=-1)
    with pytest.raises(AssertionError):
        TPSPreprocessor(img_size=32)
    with pytest.raises(AssertionError):
        TPSPreprocessor(rectified_img_size=100)
    with pytest.raises(AssertionError):
        TPSPreprocessor(num_img_channel='bgr')
    tps_preprocessor = TPSPreprocessor(
        num_fiducial=20,
        img_size=(32, 100),