mirror of https://github.com/open-mmlab/mmocr.git
parent c44b611a6c
commit e8d1bc37d3

docs/en/api.rst (115 lines changed)
@@ -1,22 +1,59 @@
mmocr.datasets
---------------------------------------------
.. automodule:: mmocr.datasets
    :members:

Dataset Types
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. automodule:: mmocr.datasets.ocr_dataset
    :members:

.. automodule:: mmocr.datasets.icdar_dataset
    :members:

.. automodule:: mmocr.datasets.recog_lmdb_dataset
    :members:

.. automodule:: mmocr.datasets.recog_text_dataset
    :members:

.. automodule:: mmocr.datasets.wildreceipt_dataset
    :members:

Transforms
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.datasets.transforms
    :members:


mmocr.engine
-------------
---------------------------------------------
Hooks
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.engine.hooks
    :members:


mmocr.evaluation
-------------
---------------------------------------------
Evaluator
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.evaluation.evaluator
    :members:

Functional
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.evaluation.functional
    :members:

.. automodule:: mmocr.evaluation.metircs
Metric
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.evaluation.metrics
    :members:

mmocr.utils
-------------
---------------------------------------------
Point utils
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.utils.point_utils

@@ -66,8 +103,9 @@ Others
.. automodule:: mmocr.utils.parsers
    :members:


mmocr.models
---------------
---------------------------------------------
Common
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.models.common.backbones

@@ -105,7 +143,7 @@ Text Detection Module Losses
.. automodule:: mmocr.models.textdet.module_losses
    :members:

Text Detection Preprocessors
Text Detection Data Preprocessors
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.models.textdet.data_preprocessors
    :members:

@@ -125,7 +163,7 @@ Text Recognition Backbones
.. automodule:: mmocr.models.textrecog.backbones
    :members:

Text Recognition Preprocessors
Text Recognition Data Preprocessors
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.models.textrecog.data_preprocessors
    :members:

@@ -156,80 +194,59 @@ Text Recognition Module Losses
    :members:

KIE Extractors
^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.models.kie.extractors
    :members:

KIE Heads
^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.models.kie.heads
    :members:

KIE Module Losses
^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.models.kie.module_losses
    :members:

mmocr.datasets
-----------------
.. automodule:: mmocr.datasets
    :members:

Dataset Types
^^^^^^^^^^^

.. automodule:: mmocr.datasets.ocr_dataset
    :members:

.. automodule:: mmocr.datasets.icdar_dataset
    :members:

.. automodule:: mmocr.datasets.recog_lmdb_dataset
    :members:

.. automodule:: mmocr.datasets.recog_text_dataset
    :members:

.. automodule:: mmocr.datasets.wildreceipt_dataset
    :members:

Transforms
^^^^^^^^^^^
.. automodule:: mmocr.datasets.transforms
    :members:

mmocr.structures
-----------------
---------------------------------------------

Text Detection Data Sample
^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.structures.textdet_data_sample
    :members:

Text Recognition Data Sample
^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.structures.textrecog_data_sample
    :members:

KIE Data Sample
^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.structures.kie_data_sample
    :members:

mmocr.visualization
-----------------

visualize
^^^^^^^^^^^
.. automodule:: mmocr.visualization.visualize
    :members:
mmocr.visualization
---------------------------------------------

Text Detection Visualizer
^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.visualization.textdet_visualizer
    :members:

Text Recognition Visualizer
^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.visualization.textrecog_visualizer
    :members:

Text Spotting Visualizer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.visualization.textspotting_visualizer
    :members:

KIE Visualizer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: mmocr.visualization.kie_visualizer
    :members:

@@ -39,9 +39,10 @@ release = __version__
# ones.
extensions = [
    'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode',
    'sphinx_markdown_tables', 'sphinx_copybutton', 'myst_parser'
    'sphinx_markdown_tables', 'sphinx_copybutton', 'myst_parser',
    'sphinx.ext.intersphinx', 'sphinx.ext.autodoc.typehints'
]

autodoc_typehints = 'description'
autodoc_mock_imports = ['mmcv._ext']

# Ignore >>> when copying code

@@ -129,7 +130,7 @@ intersphinx_mapping = {
    'numpy': ('https://numpy.org/doc/stable', None),
    'torch': ('https://pytorch.org/docs/stable/', None),
    'mmcv': ('https://mmcv.readthedocs.io/en/dev-2.x/', None),
    'mmengine': ('https://mmengine.readthedocs.io/en/main/', None),
    'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None),
    'mmdetection': ('https://mmdetection.readthedocs.io/en/dev-3.x/', None),
}

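As a rough sketch of what the added extensions do (the module below is hypothetical and not part of this commit): with `autodoc_typehints = 'description'`, Sphinx renders annotations in the parameter descriptions instead of the signature, and `sphinx.ext.intersphinx` resolves `numpy.ndarray` / `torch.Tensor` references against the mappings above.

```python
# Hypothetical module documented via an ``.. automodule::`` entry in api.rst.
import numpy as np
import torch


def to_tensor(array: np.ndarray, device: str = 'cpu') -> torch.Tensor:
    """Convert a NumPy array to a PyTorch tensor.

    Args:
        array: Input array; ``np.ndarray`` is cross-linked through intersphinx.
        device: Target device accepted by ``torch.device``, e.g. ``'cpu'`` or ``'cuda:0'``.
    """
    return torch.from_numpy(array).to(device)
```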
255
docs/en/demo.md
255
docs/en/demo.md
|
@@ -1,255 +0,0 @@
# Demo

We provide an easy-to-use API for demo and application purposes in the [ocr.py](https://github.com/open-mmlab/mmocr/blob/main/mmocr/utils/ocr.py) script.

The API can be called through the command line (CL) or from another Python script.
It exposes all the models in MMOCR as individual modules that can be called and chained together. [Tesseract](https://tesseract-ocr.github.io/) is integrated as a text detector and/or recognizer in the task pipeline.

______________________________________________________________________

## Example 1: Text Detection

<div align="center">
<img src="https://raw.githubusercontent.com/open-mmlab/mmocr/main/demo/resources/text_det_pred.jpg"/><br>
</div>
<br>

**Instruction:** Perform detection inference on an image with the TextSnake detection model, export the result in a json file (default) and save the visualization file.

- CL interface:

```shell
python mmocr/utils/ocr.py demo/demo_text_det.jpg --output demo/det_out.jpg --det TextSnake --recog None --export demo/
```

- Python interface:

```python
from mmocr.utils.ocr import MMOCR

# Load models into memory
ocr = MMOCR(det='TextSnake', recog=None)

# Inference
results = ocr.readtext('demo/demo_text_det.jpg', output='demo/det_out.jpg', export='demo/')
```

## Example 2: Text Recognition

<div align="center">
<img src="https://raw.githubusercontent.com/open-mmlab/mmocr/main/demo/resources/text_recog_pred.jpg"/><br>
</div>
<br>

**Instruction:** Perform batched recognition inference on a folder with hundreds of images using the CRNN_TPS recognition model and save the visualization results in another folder.
*Batch size is set to 10 to prevent out of memory CUDA runtime errors.*

- CL interface:

```shell
python mmocr/utils/ocr.py %INPUT_FOLDER_PATH% --det None --recog CRNN_TPS --batch-mode --single-batch-size 10 --output %OUTPUT_FOLDER_PATH%
```

- Python interface:

```python
from mmocr.utils.ocr import MMOCR

# Load models into memory
ocr = MMOCR(det=None, recog='CRNN_TPS')

# Inference
results = ocr.readtext(%INPUT_FOLDER_PATH%, output=%OUTPUT_FOLDER_PATH%, batch_mode=True, single_batch_size=10)
```

## Example 3: Text Detection + Recognition

<div align="center">
<img src="https://raw.githubusercontent.com/open-mmlab/mmocr/main/demo/resources/demo_ocr_pred.jpg"/><br>
</div>
<br>

**Instruction:** Perform OCR (det + recog) inference on the demo/demo_text_ocr.jpg image with the PANet_IC15 (default) detection model and the SAR (default) recognition model, print the result in the terminal and show the visualization.

- CL interface:

```shell
python mmocr/utils/ocr.py demo/demo_text_ocr.jpg --print-result --imshow
```

```{note}

When calling the script from the command line, the script assumes configs are saved in the `configs/` folder. Users can customize the directory by specifying the value of `config_dir`.

```

- Python interface:

```python
from mmocr.utils.ocr import MMOCR

# Load models into memory
ocr = MMOCR()

# Inference
results = ocr.readtext('demo/demo_text_ocr.jpg', print_result=True, imshow=True)
```

______________________________________________________________________

## Example 4: Text Detection + Recognition + Key Information Extraction

<div align="center">
<img src="https://raw.githubusercontent.com/open-mmlab/mmocr/main/demo/resources/demo_kie_pred.png"/><br>
</div>
<br>

**Instruction:** Perform end-to-end OCR (det + recog) inference first with the PS_CTW detection model and the SAR recognition model, then run KIE inference with the SDMGR model on the OCR result and show the visualization.

- CL interface:

```shell
python mmocr/utils/ocr.py demo/demo_kie.jpeg --det PS_CTW --recog SAR --kie SDMGR --print-result --imshow
```

```{note}

When calling the script from the command line, the script assumes configs are saved in the `configs/` folder. Users can customize the directory by specifying the value of `config_dir`.

```

- Python interface:

```python
from mmocr.utils.ocr import MMOCR

# Load models into memory
ocr = MMOCR(det='PS_CTW', recog='SAR', kie='SDMGR')

# Inference
results = ocr.readtext('demo/demo_kie.jpeg', print_result=True, imshow=True)
```

______________________________________________________________________

## API Arguments

The API has an extensive list of arguments that you can use. The following tables are for the Python interface.

**MMOCR():**

| Arguments | Type | Default | Description |
| --- | --- | --- | --- |
| `det` | see [models](#models) | PANet_IC15 | Text detection algorithm |
| `recog` | see [models](#models) | SAR | Text recognition algorithm |
| `kie` \[1\] | see [models](#models) | None | Key information extraction algorithm |
| `config_dir` | str | configs/ | Path to the config directory where all the config files are located |
| `det_config` | str | None | Path to the custom config file of the selected det model |
| `det_ckpt` | str | None | Path to the custom checkpoint file of the selected det model |
| `recog_config` | str | None | Path to the custom config file of the selected recog model |
| `recog_ckpt` | str | None | Path to the custom checkpoint file of the selected recog model |
| `kie_config` | str | None | Path to the custom config file of the selected kie model |
| `kie_ckpt` | str | None | Path to the custom checkpoint file of the selected kie model |
| `device` | str | None | Device used for inference, accepting all allowed strings by `torch.device`. E.g., 'cuda:0' or 'cpu'. |

\[1\]: `kie` is only effective when both text detection and recognition models are specified.

```{note}

Users can use default pretrained models by specifying `det` and/or `recog`, which is equivalent to specifying their corresponding `*_config` and `*_ckpt`. However, manually specifying `*_config` and `*_ckpt` will always override values set by `det` and/or `recog`. Similar rules also apply to `kie`, `kie_config` and `kie_ckpt`.

```

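A minimal sketch of the override rule described in the note above; the config and checkpoint paths below are placeholders, not files referenced by this document:

```python
from mmocr.utils.ocr import MMOCR

# det_config/det_ckpt take precedence over whatever the `det` shortcut
# (default: PANet_IC15) would otherwise load; `recog='SAR'` still falls back
# to the default pretrained SAR weights.
ocr = MMOCR(
    det_config='path/to/custom_det_config.py',  # placeholder path
    det_ckpt='path/to/custom_det_ckpt.pth',     # placeholder path
    recog='SAR',
)
results = ocr.readtext('demo/demo_text_ocr.jpg', print_result=True)
```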
### readtext()

| Arguments | Type | Default | Description |
| --- | --- | --- | --- |
| `img` | str/list/tuple/np.array | **required** | img, folder path, np array or list/tuple (with img paths or np arrays) |
| `output` | str | None | Output result visualization - img path or folder path |
| `batch_mode` | bool | False | Whether to use batch mode for inference \[1\] |
| `det_batch_size` | int | 0 | Batch size for text detection (0 for max size) |
| `recog_batch_size` | int | 0 | Batch size for text recognition (0 for max size) |
| `single_batch_size` | int | 0 | Batch size for only detection or recognition |
| `export` | str | None | Folder where the results of each image are exported |
| `export_format` | str | json | Format of the exported result file(s) |
| `details` | bool | False | Whether to include the text box coordinates and confidence values |
| `imshow` | bool | False | Whether to show the result visualization on screen |
| `print_result` | bool | False | Whether to print the result for each image |
| `merge` | bool | False | Whether to merge neighboring boxes \[2\] |
| `merge_xdist` | float | 20 | The maximum x-axis distance to merge boxes |

\[1\]: Make sure that the model is compatible with batch mode.

\[2\]: Only effective when the script is running in det + recog mode.

All arguments are the same for the CLI; just prefix each argument with two hyphens and replace underscores with hyphens.
(*Example:* `det_batch_size` becomes `--det-batch-size`)

For bool type arguments, putting the argument in the command stores it as true.
(*Example:* `python mmocr/utils/ocr.py demo/demo_text_det.jpg --batch-mode --print-result`
means that `batch_mode` and `print_result` are set to `True`)

______________________________________________________________________

|
||||
## Models
|
||||
|
||||
**Text detection:**
|
||||
|
||||
| Name | Reference | `batch_mode` inference support |
|
||||
| ------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------: |
|
||||
| DB_r18 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#real-time-scene-text-detection-with-differentiable-binarization) | No |
|
||||
| DB_r50 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#real-time-scene-text-detection-with-differentiable-binarization) | No |
|
||||
| DBPP_r50 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#dbnetpp) | No |
|
||||
| DRRG | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#drrg) | No |
|
||||
| FCE_IC15 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#fourier-contour-embedding-for-arbitrary-shaped-text-detection) | No |
|
||||
| FCE_CTW_DCNv2 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#fourier-contour-embedding-for-arbitrary-shaped-text-detection) | No |
|
||||
| MaskRCNN_CTW | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#mask-r-cnn) | No |
|
||||
| MaskRCNN_IC15 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#mask-r-cnn) | No |
|
||||
| MaskRCNN_IC17 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#mask-r-cnn) | No |
|
||||
| PANet_CTW | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#efficient-and-accurate-arbitrary-shaped-text-detection-with-pixel-aggregation-network) | Yes |
|
||||
| PANet_IC15 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#efficient-and-accurate-arbitrary-shaped-text-detection-with-pixel-aggregation-network) | Yes |
|
||||
| PS_CTW | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#psenet) | No |
|
||||
| PS_IC15 | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#psenet) | No |
|
||||
| Tesseract | [link](https://tesseract-ocr.github.io/) | No |
|
||||
| TextSnake | [link](https://mmocr.readthedocs.io/en/latest/textdet_models.html#textsnake) | Yes |
|
||||
|
||||
**Text recognition:**
|
||||
|
||||
| Name | Reference | `batch_mode` inference support |
|
||||
| ------------- | :-----------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------: |
|
||||
| ABINet | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#read-like-humans-autonomous-bidirectional-and-iterative-language-modeling-for-scene-text-recognition) | Yes |
|
||||
| CRNN | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#an-end-to-end-trainable-neural-network-for-image-based-sequence-recognition-and-its-application-to-scene-text-recognition) | No |
|
||||
| CRNN_TPS | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#crnn-with-tps-based-stn) | Yes |
|
||||
| MASTER | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#master) | Yes |
|
||||
| NRTR_1/16-1/8 | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#nrtr) | Yes |
|
||||
| NRTR_1/8-1/4 | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#nrtr) | Yes |
|
||||
| RobustScanner | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#robustscanner-dynamically-enhancing-positional-clues-for-robust-text-recognition) | Yes |
|
||||
| SAR | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#show-attend-and-read-a-simple-and-strong-baseline-for-irregular-text-recognition) | Yes |
|
||||
| SAR_CN \* | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#show-attend-and-read-a-simple-and-strong-baseline-for-irregular-text-recognition) | Yes |
|
||||
| SATRN | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#satrn) | Yes |
|
||||
| SATRN_sm | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#satrn) | Yes |
|
||||
| Tesseract | [link](https://tesseract-ocr.github.io/) | No |
|
||||
|
||||
```{warning}
|
||||
|
||||
SAR_CN is the only model that supports Chinese character recognition and it requires
|
||||
a Chinese dictionary. Please download the dictionary from [here](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#chinese-dataset) for a successful run.
|
||||
|
||||
```
|
||||
|
||||
**Key information extraction:**
|
||||
|
||||
| Name | Reference | `batch_mode` support |
|
||||
| ----- | :---------------------------------------------------------------------------------------------------------------------------------: | :------------------: |
|
||||
| SDMGR | [link](https://mmocr.readthedocs.io/en/latest/kie_models.html#spatial-dual-modality-graph-reasoning-for-key-information-extraction) | Yes |
|
||||
|
||||
## Additional info

- To perform det + recog inference (end2end ocr), both the `det` and `recog` arguments must be defined.
- To perform only detection, set the `recog` argument to `None` (see the sketch after this list).
- To perform only recognition, set the `det` argument to `None`.
- The `details` argument only works with end2end ocr.
- The `det_batch_size` and `recog_batch_size` arguments define the number of images you want to forward to the model at the same time. For maximum speed, set this to the highest number you can. The max batch size is limited by the model complexity and the GPU VRAM size.
- MMOCR calls Tesseract's API via [`tesserocr`](https://github.com/sirfz/tesserocr)
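A minimal sketch pulling these points together, using only model names and `readtext()` options shown earlier in this document; the image paths are placeholders:

```python
from mmocr.utils.ocr import MMOCR

# Detection only: recog=None (cf. Example 1).
det_only = MMOCR(det='TextSnake', recog=None)
det_results = det_only.readtext('demo/demo_text_det.jpg')

# Recognition only: det=None (cf. Example 2).
recog_only = MMOCR(det=None, recog='CRNN_TPS')
recog_results = recog_only.readtext('path/to/cropped_word.jpg')

# End-to-end OCR: both det and recog must be defined; details=True also
# returns the text box coordinates and confidence values.
ocr = MMOCR(det='PANet_IC15', recog='SAR')
e2e_results = ocr.readtext('demo/demo_text_ocr.jpg', details=True)
```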

If you have any suggestions for new features, feel free to open a thread or even a PR :)

@@ -28,7 +28,7 @@ You can switch between English and Chinese in the lower-left corner of the layou

   basic_concepts/overview.md
   basic_concepts/data_flow.md
   basic_concepts/dataset.md
   basic_concepts/datasets.md
   basic_concepts/structures.md
   basic_concepts/models.md
   basic_concepts/transforms.md

|
@@ -1,54 +0,0 @@
|
|||
# Key Information Extraction Models
|
||||
|
||||
## SDMGR
|
||||
|
||||
[Spatial Dual-Modality Graph Reasoning for Key Information Extraction](https://arxiv.org/abs/2103.14470)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Key information extraction from document images is of paramount importance in office automation. Conventional template matching based approaches fail to generalize well to document images of unseen templates, and are not robust against text recognition errors. In this paper, we propose an end-to-end Spatial Dual-Modality Graph Reasoning method (SDMG-R) to extract key information from unstructured document images. We model document images as dual-modality graphs, nodes of which encode both the visual and textual features of detected text regions, and edges of which represent the spatial relations between neighboring text regions. The key information extraction is solved by iteratively propagating messages along graph edges and reasoning the categories of graph nodes. In order to roundly evaluate our proposed method as well as boost the future research, we release a new dataset named WildReceipt, which is collected and annotated tailored for the evaluation of key information extraction from document images of unseen templates in the wild. It contains 25 key information categories, a total of about 69000 text boxes, and is about 2 times larger than the existing public datasets. Extensive experiments validate that all information including visual features, textual features and spatial relations can benefit key information extraction. It has been shown that SDMG-R can effectively extract key information from document images of unseen templates, and obtain new state-of-the-art results on the recent popular benchmark SROIE and our WildReceipt. Our code and dataset will be publicly released.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142580689-18edb4d7-f716-475c-b1c1-e2b934658cee.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### WildReceipt
|
||||
|
||||
| Method | Modality | Macro F1-Score | Download |
|
||||
| :----------------------------------------------------------------------------------: | :--------------: | :------------: | :------------------------------------------------------------------------------------: |
|
||||
| [sdmgr_unet16](https://github.com/open-mmlab/mmocr/tree/master/configs/kie/sdmgr/sdmgr_unet16_60e_wildreceipt.py) | Visual + Textual | 0.888 | [model](https://download.openmmlab.com/mmocr/kie/sdmgr/sdmgr_unet16_60e_wildreceipt_20210520-7489e6de.pth) \| [log](https://download.openmmlab.com/mmocr/kie/sdmgr/20210520_132236.log.json) |
|
||||
| [sdmgr_novisual](https://github.com/open-mmlab/mmocr/tree/master/configs/kie/sdmgr/sdmgr_novisual_60e_wildreceipt.py) | Textual | 0.870 | [model](https://download.openmmlab.com/mmocr/kie/sdmgr/sdmgr_novisual_60e_wildreceipt_20210517-a44850da.pth) \| [log](https://download.openmmlab.com/mmocr/kie/sdmgr/20210517_205829.log.json) |
|
||||
|
||||
```{note}
1. For `sdmgr_novisual`, images are not needed for training and testing, so a fake `img_prefix` can be used in configs. Likewise, a fake `file_name` can be used in annotation files.
```
|
||||
|
||||
#### WildReceiptOpenset
|
||||
|
||||
| Method | Modality | Edge F1-Score | Node Macro F1-Score | Node Micro F1-Score | Download |
|
||||
| :-------------------------------------------------------------------: | :------: | :-----------: | :-----------------: | :-----------------: | :----------------------------------------------------------------------: |
|
||||
| [sdmgr_novisual](https://github.com/open-mmlab/mmocr/tree/master/configs/kie/sdmgr/sdmgr_novisual_60e_wildreceipt_openset.py) | Textual | 0.786 | 0.926 | 0.935 | [model](https://download.openmmlab.com/mmocr/kie/sdmgr/sdmgr_novisual_60e_wildreceipt_openset_20210917-d236b3ea.pth) \| [log](https://download.openmmlab.com/mmocr/kie/sdmgr/20210917_050824.log.json) |
|
||||
|
||||
```{note}
1. In the case of openset, the number of node categories is unknown or unfixed, and more node categories can be added.
2. To show that our method can handle the openset problem, we modify the ground truth of `WildReceipt` to `WildReceiptOpenset`. The `nodes` are just classified into 4 classes: `background, key, value, others`, while `edge` labels are added for each box.
3. The model is used to predict whether two nodes are a pair connected by a valid edge.
4. You can learn more about the key differences between CloseSet and OpenSet annotations in our [tutorial](tutorials/kie_closeset_openset.md).
```
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@misc{sun2021spatial,
|
||||
title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction},
|
||||
author={Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang},
|
||||
year={2021},
|
||||
eprint={2103.14470},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.CV}
|
||||
}
|
||||
```
|
|
@@ -5,6 +5,5 @@ sed -e '$a\\n' -s ../../configs/kie/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/
sed -e '$a\\n' -s ../../configs/textdet/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# Text Detection Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >textdet_models.md
sed -e '$a\\n' -s ../../configs/textrecog/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# Text Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >textrecog_models.md

# replace special symbols in demo.md
cp ../../demo/README.md demo.md
sed -i 's/:heavy_check_mark:/Yes/g' demo.md && sed -i 's/:x:/No/g' demo.md
# replace special symbols in inference.md
sed -i 's/:heavy_check_mark:/Yes/g' user_guides/inference.md && sed -i 's/:x:/No/g' user_guides/inference.md

@@ -1,47 +0,0 @@
# Statistics

- Number of checkpoints: 31
- Number of configs: 28
- Number of papers: 17
  - ALGORITHM: 17

## [ Key Information Extraction Models](kie_models.md)

- Number of checkpoints: 3
- Number of configs: 3
- Number of papers: 1
  - \[ALGORITHM\] [Spatial Dual-Modality Graph Reasoning for Key Information Extraction](kie_models.html#sdmgr)

## [](ner_models.md)

- Number of checkpoints: 0
- Number of configs: 0
- Number of papers: 0

## [ Text Detection Models](textdet_models.md)

- Number of checkpoints: 15
- Number of configs: 14
- Number of papers: 8
  - \[ALGORITHM\] [Deep Relational Reasoning Graph Network for Arbitrary Shape Text Detection](textdet_models.html#drrg)
  - \[ALGORITHM\] [Efficient and Accurate Arbitrary-Shaped Text Detection With Pixel Aggregation Network](textdet_models.html#panet)
  - \[ALGORITHM\] [Fourier Contour Embedding for Arbitrary-Shaped Text Detection](textdet_models.html#fcenet)
  - \[ALGORITHM\] [Mask R-CNN](textdet_models.html#mask-r-cnn)
  - \[ALGORITHM\] [Real-Time Scene Text Detection With Differentiable Binarization and Adaptive Scale Fusion](textdet_models.html#dbnetpp)
  - \[ALGORITHM\] [Real-Time Scene Text Detection With Differentiable Binarization](textdet_models.html#dbnet)
  - \[ALGORITHM\] [Shape Robust Text Detection With Progressive Scale Expansion Network](textdet_models.html#psenet)
  - \[ALGORITHM\] [Textsnake: A Flexible Representation for Detecting Text of Arbitrary Shapes](textdet_models.html#textsnake)

## [ Text Recognition Models](textrecog_models.md)

- Number of checkpoints: 13
- Number of configs: 11
- Number of papers: 8
  - \[ALGORITHM\] [An End-to-End Trainable Neural Network for Image-Based Sequence Recognition and Its Application to Scene Text Recognition](textrecog_models.html#crnn)
  - \[ALGORITHM\] [Nrtr: A No-Recurrence Sequence-to-Sequence Model for Scene Text Recognition](textrecog_models.html#nrtr)
  - \[ALGORITHM\] [On Recognizing Texts of Arbitrary Shapes With 2d Self-Attention](textrecog_models.html#satrn)
  - \[ALGORITHM\] [Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition](textrecog_models.html#abinet)
  - \[ALGORITHM\] [Robust Scene Text Recognition With Automatic Rectification](textrecog_models.html#crnn-stn)
  - \[ALGORITHM\] [Robustscanner: Dynamically Enhancing Positional Clues for Robust Text Recognition](textrecog_models.html#robustscanner)
  - \[ALGORITHM\] [Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition](textrecog_models.html#sar)
  - \[ALGORITHM\] [{Master](textrecog_models.html#master)

@@ -1,315 +0,0 @@
|
|||
# Text Detection Models
|
||||
|
||||
## DBNetpp
|
||||
|
||||
[Real-Time Scene Text Detection with Differentiable Binarization and Adaptive Scale Fusion](https://arxiv.org/abs/2202.10304)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Recently, segmentation-based scene text detection methods have drawn extensive attention in the scene text detection field, because of their superiority in detecting the text instances of arbitrary shapes and extreme aspect ratios, profiting from the pixel-level descriptions. However, the vast majority of the existing segmentation-based approaches are limited to their complex post-processing algorithms and the scale robustness of their segmentation models, where the post-processing algorithms are not only isolated to the model optimization but also time-consuming and the scale robustness is usually strengthened by fusing multi-scale feature maps directly. In this paper, we propose a Differentiable Binarization (DB) module that integrates the binarization process, one of the most important steps in the post-processing procedure, into a segmentation network. Optimized along with the proposed DB module, the segmentation network can produce more accurate results, which enhances the accuracy of text detection with a simple pipeline. Furthermore, an efficient Adaptive Scale Fusion (ASF) module is proposed to improve the scale robustness by fusing features of different scales adaptively. By incorporating the proposed DB and ASF with the segmentation network, our proposed scene text detector consistently achieves state-of-the-art results, in terms of both detection accuracy and speed, on five standard benchmarks.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/45810070/166850828-f1e48c25-4a0f-429d-ae54-6997ed25c062.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### ICDAR2015
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :--------------------------------------: | :-------------------------------------------------: | :-------------: | :------------: | :------: | :-------: | :----: | :-------: | :---: | :-----------------------------------------: |
|
||||
| [DBNetpp_r50dcn](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/dbnetpp/dbnetpp_resnet50-dcnv2_fpnc_1200e_icdar2015.py) | [Synthtext](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/dbnetpp/dbnetpp_resnet50-dcnv2_fpnc_100k_synthtext.py) ([model](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnetpp_r50dcnv2_fpnc_100k_iter_synthtext-20220502-db297554.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnetpp_r50dcnv2_fpnc_100k_iter_synthtext-20220502-db297554.log.json)) | ICDAR2015 Train | ICDAR2015 Test | 1200 | 1024 | 0.822 | 0.901 | 0.860 | [model](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnetpp_r50dcnv2_fpnc_1200e_icdar2015-20220502-d7a76fff.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnetpp_r50dcnv2_fpnc_1200e_icdar2015-20220502-d7a76fff.log.json) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@article{liao2022real,
|
||||
title={Real-Time Scene Text Detection with Differentiable Binarization and Adaptive Scale Fusion},
|
||||
author={Liao, Minghui and Zou, Zhisheng and Wan, Zhaoyi and Yao, Cong and Bai, Xiang},
|
||||
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
|
||||
year={2022},
|
||||
publisher={IEEE}
|
||||
}
|
||||
```
|
||||
|
||||
## DBNet
|
||||
|
||||
[Real-time Scene Text Detection with Differentiable Binarization](https://arxiv.org/abs/1911.08947)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Recently, segmentation-based methods are quite popular in scene text detection, as the segmentation results can more accurately describe scene text of various shapes such as curve text. However, the post-processing of binarization is essential for segmentation-based detection, which converts probability maps produced by a segmentation method into bounding boxes/regions of text. In this paper, we propose a module named Differentiable Binarization (DB), which can perform the binarization process in a segmentation network. Optimized along with a DB module, a segmentation network can adaptively set the thresholds for binarization, which not only simplifies the post-processing but also enhances the performance of text detection. Based on a simple segmentation network, we validate the performance improvements of DB on five benchmark datasets, which consistently achieves state-of-the-art results, in terms of both detection accuracy and speed. In particular, with a light-weight backbone, the performance improvements by DB are significant so that we can look for an ideal tradeoff between detection accuracy and efficiency. Specifically, with a backbone of ResNet-18, our detector achieves an F-measure of 82.8, running at 62 FPS, on the MSRA-TD500 dataset.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142791306-0da6db2a-20a6-4a68-b228-64ff275f67b3.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### ICDAR2015
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :--------------------------------------: | :-------------------------------------------------: | :-------------: | :------------: | :------: | :-------: | :----: | :-------: | :---: | :-----------------------------------------: |
|
||||
| [DBNet_r18](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/dbnet/dbnet_resnet18_fpnc_1200e_icdar2015.py) | ImageNet | ICDAR2015 Train | ICDAR2015 Test | 1200 | 736 | 0.731 | 0.871 | 0.795 | [model](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r18_fpnc_sbn_1200e_icdar2015_20210329-ba3ab597.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r18_fpnc_sbn_1200e_icdar2015_20210329-ba3ab597.log.json) |
|
||||
| [DBNet_r50dcn](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/dbnet/dbnet_resnet50-dcnv2_fpnc_1200e_icdar2015.py) | [Synthtext](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r50dcnv2_fpnc_sbn_2e_synthtext_20210325-aa96e477.pth) | ICDAR2015 Train | ICDAR2015 Test | 1200 | 1024 | 0.814 | 0.868 | 0.840 | [model](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r50dcnv2_fpnc_sbn_1200e_icdar2015_20211025-9fe3b590.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r50dcnv2_fpnc_sbn_1200e_icdar2015_20211025-9fe3b590.log.json) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@article{Liao_Wan_Yao_Chen_Bai_2020,
|
||||
title={Real-Time Scene Text Detection with Differentiable Binarization},
|
||||
journal={Proceedings of the AAAI Conference on Artificial Intelligence},
|
||||
author={Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang},
|
||||
year={2020},
|
||||
pages={11474-11481}}
|
||||
```
|
||||
|
||||
## DRRG
|
||||
|
||||
[Deep relational reasoning graph network for arbitrary shape text detection](https://arxiv.org/abs/2003.07493)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Arbitrary shape text detection is a challenging task due to the high variety and complexity of scenes texts. In this paper, we propose a novel unified relational reasoning graph network for arbitrary shape text detection. In our method, an innovative local graph bridges a text proposal model via Convolutional Neural Network (CNN) and a deep relational reasoning network via Graph Convolutional Network (GCN), making our network end-to-end trainable. To be concrete, every text instance will be divided into a series of small rectangular components, and the geometry attributes (e.g., height, width, and orientation) of the small components will be estimated by our text proposal model. Given the geometry attributes, the local graph construction model can roughly establish linkages between different text components. For further reasoning and deducing the likelihood of linkages between the component and its neighbors, we adopt a graph-based network to perform deep relational reasoning on local graphs. Experiments on public available datasets demonstrate the state-of-the-art performance of our method.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142791777-f282300a-fb83-4b5a-a7d4-29f308949f11.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### CTW1500
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :------------------------------------------------: | :--------------: | :-----------: | :----------: | :------: | :-------: | :-----------: | :-----------: | :-----------: | :---------------------------------------------------: |
|
||||
| [DRRG](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/drrg/drrg_resnet50_fpn-unet_1200e_ctw1500.py) | ImageNet | CTW1500 Train | CTW1500 Test | 1200 | 640 | 0.822 (0.791) | 0.858 (0.862) | 0.840 (0.825) | [model](https://download.openmmlab.com/mmocr/textdet/drrg/drrg_r50_fpn_unet_1200e_ctw1500_20211022-fb30b001.pth) \\ [log](https://download.openmmlab.com/mmocr/textdet/drrg/20210511_234719.log) |
|
||||
|
||||
```{note}
|
||||
We've upgraded our IoU backend from `Polygon3` to `shapely`. There are some performance differences for some models due to the backends' different logics to handle invalid polygons (more info [here](https://github.com/open-mmlab/mmocr/issues/465)). **New evaluation result is presented in brackets** and new logs will be uploaded soon.
|
||||
```
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@article{zhang2020drrg,
|
||||
title={Deep relational reasoning graph network for arbitrary shape text detection},
|
||||
author={Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng},
|
||||
booktitle={CVPR},
|
||||
pages={9699-9708},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
|
||||
## FCENet
|
||||
|
||||
[Fourier Contour Embedding for Arbitrary-Shaped Text Detection](https://arxiv.org/abs/2104.10442)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
One of the main challenges for arbitrary-shaped text detection is to design a good text instance representation that allows networks to learn diverse text geometry variances. Most of existing methods model text instances in image spatial domain via masks or contour point sequences in the Cartesian or the polar coordinate system. However, the mask representation might lead to expensive post-processing, while the point sequence one may have limited capability to model texts with highly-curved shapes. To tackle these problems, we model text instances in the Fourier domain and propose one novel Fourier Contour Embedding (FCE) method to represent arbitrary shaped text contours as compact signatures. We further construct FCENet with a backbone, feature pyramid networks (FPN) and a simple post-processing with the Inverse Fourier Transformation (IFT) and Non-Maximum Suppression (NMS). Different from previous methods, FCENet first predicts compact Fourier signatures of text instances, and then reconstructs text contours via IFT and NMS during test. Extensive experiments demonstrate that FCE is accurate and robust to fit contours of scene texts even with highly-curved shapes, and also validate the effectiveness and the good generalization of FCENet for arbitrary-shaped text detection. Furthermore, experimental results show that our FCENet is superior to the state-of-the-art (SOTA) methods on CTW1500 and Total-Text, especially on challenging highly-curved text subset.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142791859-1b0ebde4-b151-4c25-ba1b-f354bd8ddc8c.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### CTW1500
|
||||
|
||||
| Method | Backbone | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :------------------------------------------------: | :--------------: | :--------------: | :-----------: | :----------: | :------: | :---------: | :----: | :-------: | :----: | :---------------------------------------------------: |
|
||||
| [FCENet](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/fcenet/fcenet_resnet50-dcnv2_fpn_1500e_ctw1500.py) | ResNet50 + DCNv2 | ImageNet | CTW1500 Train | CTW1500 Test | 1500 | (736, 1080) | 0.8468 | 0.8532 | 0.8500 | [model](https://download.openmmlab.com/mmocr/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500_20211022-e326d7ec.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/fcenet/20210511_181328.log.json) |
|
||||
|
||||
#### ICDAR2015
|
||||
|
||||
| Method | Backbone | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :------------------------------------------------------: | :------: | :--------------: | :----------: | :-------: | :------: | :----------: | :----: | :-------: | :----: | :--------------------------------------------------------: |
|
||||
| [FCENet](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/fcenet/fcenet_resnet50_fpn_1500e_icdar2015.py) | ResNet50 | ImageNet | IC15 Train | IC15 Test | 1500 | (2260, 2260) | 0.8243 | 0.8834 | 0.8528 | [model](https://download.openmmlab.com/mmocr/textdet/fcenet/fcenet_r50_fpn_1500e_icdar2015_20211022-daefb6ed.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/fcenet/20210601_222655.log.json) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@InProceedings{zhu2021fourier,
|
||||
title={Fourier Contour Embedding for Arbitrary-Shaped Text Detection},
|
||||
author={Yiqin Zhu and Jianyong Chen and Lingyu Liang and Zhanghui Kuang and Lianwen Jin and Wayne Zhang},
|
||||
year={2021},
|
||||
booktitle = {CVPR}
|
||||
}
|
||||
```
|
||||
|
||||
## Mask R-CNN
|
||||
|
||||
[Mask R-CNN](https://arxiv.org/abs/1703.06870)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without bells and whistles, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142795605-dfdd5f69-e9cd-4b69-9c6b-6d8bded18e89.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### CTW1500
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :---------------------------------------------------------: | :--------------: | :-----------: | :----------: | :------: | :-------: | :----: | :-------: | :----: | :------------------------------------------------------------: |
|
||||
| [MaskRCNN](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/maskrcnn/mask-rcnn_resnet50_fpn_160e_ctw1500.py) | ImageNet | CTW1500 Train | CTW1500 Test | 160 | 1600 | 0.7714 | 0.7272 | 0.7486 | [model](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500_20210219-96497a76.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500_20210219-96497a76.log.json) |
|
||||
|
||||
#### ICDAR2015
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :-------------------------------------------------------: | :--------------: | :-------------: | :------------: | :------: | :-------: | :----: | :-------: | :----: | :----------------------------------------------------------: |
|
||||
| [MaskRCNN](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/maskrcnn/mask-rcnn_resnet50_fpn_160e_icdar2015.py) | ImageNet | ICDAR2015 Train | ICDAR2015 Test | 160 | 1920 | 0.8045 | 0.8530 | 0.8280 | [model](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015_20210219-8eb340a3.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015_20210219-8eb340a3.log.json) |
|
||||
|
||||
#### ICDAR2017
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :--------------------------------------------------------: | :--------------: | :-------------: | :-----------: | :------: | :-------: | :----: | :-------: | :---: | :-----------------------------------------------------------: |
|
||||
| [MaskRCNN](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/maskrcnn/mask-rcnn_resnet50_fpn_160e_icdar2017.py) | ImageNet | ICDAR2017 Train | ICDAR2017 Val | 160 | 1600 | 0.754 | 0.827 | 0.789 | [model](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017_20210218-c6ec3ebb.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017_20210218-c6ec3ebb.log.json) |
|
||||
|
||||
```{note}
|
||||
We tuned parameters with the techniques in [Pyramid Mask Text Detector](https://arxiv.org/abs/1903.11800)
|
||||
```
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@INPROCEEDINGS{8237584,
|
||||
author={K. {He} and G. {Gkioxari} and P. {Dollár} and R. {Girshick}},
|
||||
booktitle={2017 IEEE International Conference on Computer Vision (ICCV)},
|
||||
title={Mask R-CNN},
|
||||
year={2017},
|
||||
pages={2980-2988},
|
||||
doi={10.1109/ICCV.2017.322}}
|
||||
```
|
||||
|
||||
## PANet
|
||||
|
||||
[Efficient and Accurate Arbitrary-Shaped Text Detection with Pixel Aggregation Network](https://arxiv.org/abs/1908.05900)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Scene text detection, an important step of scene text reading systems, has witnessed rapid development with convolutional neural networks. Nonetheless, two main challenges still exist and hamper its deployment to real-world applications. The first problem is the trade-off between speed and accuracy. The second one is to model the arbitrary-shaped text instance. Recently, some methods have been proposed to tackle arbitrary-shaped text detection, but they rarely take the speed of the entire pipeline into consideration, which may fall short in practical applications. In this paper, we propose an efficient and accurate arbitrary-shaped text detector, termed Pixel Aggregation Network (PAN), which is equipped with a low computational-cost segmentation head and a learnable post-processing. More specifically, the segmentation head is made up of Feature Pyramid Enhancement Module (FPEM) and Feature Fusion Module (FFM). FPEM is a cascadable U-shaped module, which can introduce multi-level information to guide the better segmentation. FFM can gather the features given by the FPEMs of different depths into a final feature for segmentation. The learnable post-processing is implemented by Pixel Aggregation (PA), which can precisely aggregate text pixels by predicted similarity vectors. Experiments on several standard benchmarks validate the superiority of the proposed PAN. It is worth noting that our method can achieve a competitive F-measure of 79.9% at 84.2 FPS on CTW1500.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142795741-0e1ea962-1596-47c2-8671-27bbe87d0df8.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### CTW1500
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :------------------------------------------------: | :--------------: | :-----------: | :----------: | :------: | :-------: | :-----------: | :-----------: | :-----------: | :---------------------------------------------------: |
|
||||
| [PANet](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/panet/panet_resnet18_fpem-ffm_600e_ctw1500.py) | ImageNet | CTW1500 Train | CTW1500 Test | 600 | 640 | 0.776 (0.717) | 0.838 (0.835) | 0.806 (0.801) | [model](https://download.openmmlab.com/mmocr/textdet/panet/panet_r18_fpem_ffm_sbn_600e_ctw1500_20210219-3b3a9aa3.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/panet/panet_r18_fpem_ffm_sbn_600e_ctw1500_20210219-3b3a9aa3.log.json) |
|
||||
|
||||
#### ICDAR2015
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :-----------------------------------------------: | :--------------: | :-------------: | :------------: | :------: | :-------: | :----------: | :----------: | :-----------: | :--------------------------------------------------: |
|
||||
| [PANet](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/panet/panet_resnet18_fpem-ffm_600e_icdar2015.py) | ImageNet | ICDAR2015 Train | ICDAR2015 Test | 600 | 736 | 0.734 (0.74) | 0.856 (0.86) | 0.791 (0.795) | [model](https://download.openmmlab.com/mmocr/textdet/panet/panet_r18_fpem_ffm_sbn_600e_icdar2015_20210219-42dbe46a.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/panet/panet_r18_fpem_ffm_sbn_600e_icdar2015_20210219-42dbe46a.log.json) |
|
||||
|
||||
```{note}
|
||||
We've upgraded our IoU backend from `Polygon3` to `shapely`. There are some performance differences for some models due to the backends' different logics to handle invalid polygons (more info [here](https://github.com/open-mmlab/mmocr/issues/465)). **New evaluation result is presented in brackets** and new logs will be uploaded soon.
|
||||
```
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@inproceedings{WangXSZWLYS19,
|
||||
author={Wenhai Wang and Enze Xie and Xiaoge Song and Yuhang Zang and Wenjia Wang and Tong Lu and Gang Yu and Chunhua Shen},
|
||||
title={Efficient and Accurate Arbitrary-Shaped Text Detection With Pixel Aggregation Network},
|
||||
booktitle={ICCV},
|
||||
pages={8439--8448},
|
||||
year={2019}
|
||||
}
|
||||
```
|
||||
|
||||
## PSENet
|
||||
|
||||
[Shape robust text detection with progressive scale expansion network](https://arxiv.org/abs/1903.12473)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Scene text detection has witnessed rapid progress especially with the recent development of convolutional neural networks. However, there still exists two challenges which prevent the algorithm into industry applications. On the one hand, most of the state-of-art algorithms require quadrangle bounding box which is in-accurate to locate the texts with arbitrary shape. On the other hand, two text instances which are close to each other may lead to a false detection which covers both instances. Traditionally, the segmentation-based approach can relieve the first problem but usually fail to solve the second challenge. To address these two challenges, in this paper, we propose a novel Progressive Scale Expansion Network (PSENet), which can precisely detect text instances with arbitrary shapes. More specifically, PSENet generates the different scale of kernels for each text instance, and gradually expands the minimal scale kernel to the text instance with the complete shape. Due to the fact that there are large geometrical margins among the minimal scale kernels, our method is effective to split the close text instances, making it easier to use segmentation-based methods to detect arbitrary-shaped text instances. Extensive experiments on CTW1500, Total-Text, ICDAR 2015 and ICDAR 2017 MLT validate the effectiveness of PSENet. Notably, on CTW1500, a dataset full of long curve texts, PSENet achieves a F-measure of 74.3% at 27 FPS, and our best F-measure (82.2%) outperforms state-of-art algorithms by 6.6%. The code will be released in the future.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142795864-9b455b10-8a19-45bb-aeaf-4b733f341afc.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### CTW1500
|
||||
|
||||
| Method | Backbone | Extra Data | Training set | Test set | ##epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :-----------------------------------------------: | :------: | :--------: | :-----------: | :----------: | :------: | :-------: | :-----------: | :-----------: | :-----------: | :--------------------------------------------------: |
|
||||
| [PSENet-4s](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/psenet/psenet_resnet50_fpnf_600e_ctw1500.py) | ResNet50 | - | CTW1500 Train | CTW1500 Test | 600 | 1280 | 0.728 (0.717) | 0.849 (0.852) | 0.784 (0.779) | [model](https://download.openmmlab.com/mmocr/textdet/psenet/psenet_r50_fpnf_600e_ctw1500_20210401-216fed50.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/psenet/20210401_215421.log.json) |
|
||||
|
||||
#### ICDAR2015
|
||||
|
||||
| Method | Backbone | Extra Data | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :----------------------------------------: | :------: | :---------------------------------------------: | :----------: | :-------: | :------: | :-------: | :----: | :-------: | :---: | :-------------------------------------------: |
|
||||
| [PSENet-4s](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/psenet/psenet_resnet50_fpnf_600e_icdar2015.py) | ResNet50 | - | IC15 Train | IC15 Test | 600 | 2240 | 0.766 | 0.840 | 0.806 | [model](https://download.openmmlab.com/mmocr/textdet/psenet/psenet_r50_fpnf_600e_icdar2015-c6131f0d.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/psenet/20210331_214145.log.json) |
|
||||
| [PSENet-4s](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/psenet/psenet_resnet50_fpnf_600e_icdar2015.py) | ResNet50 | pretrain on IC17 MLT [model](https://download.openmmlab.com/mmocr/textdet/psenet/psenet_r50_fpnf_600e_icdar2017_as_pretrain-3bd6056c.pth) | IC15 Train | IC15 Test | 600 | 2240 | 0.834 | 0.861 | 0.847 | [model](https://download.openmmlab.com/mmocr/textdet/psenet/psenet_r50_fpnf_600e_icdar2015_pretrain-eefd8fe6.pth) \| [log](<>) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@inproceedings{wang2019shape,
|
||||
title={Shape robust text detection with progressive scale expansion network},
|
||||
author={Wang, Wenhai and Xie, Enze and Li, Xiang and Hou, Wenbo and Lu, Tong and Yu, Gang and Shao, Shuai},
|
||||
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
|
||||
pages={9336--9345},
|
||||
year={2019}
|
||||
}
|
||||
```
|
||||
|
||||
## Textsnake
|
||||
|
||||
[TextSnake: A Flexible Representation for Detecting Text of Arbitrary Shapes](https://arxiv.org/abs/1807.01544)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Driven by deep neural networks and large scale datasets, scene text detection methods have progressed substantially over the past years, continuously refreshing the performance records on various standard benchmarks. However, limited by the representations (axis-aligned rectangles, rotated rectangles or quadrangles) adopted to describe text, existing methods may fall short when dealing with much more free-form text instances, such as curved text, which are actually very common in real-world scenarios. To tackle this problem, we propose a more flexible representation for scene text, termed as TextSnake, which is able to effectively represent text instances in horizontal, oriented and curved forms. In TextSnake, a text instance is described as a sequence of ordered, overlapping disks centered at symmetric axes, each of which is associated with potentially variable radius and orientation. Such geometry attributes are estimated via a Fully Convolutional Network (FCN) model. In experiments, the text detector based on TextSnake achieves state-of-the-art or comparable performance on Total-Text and SCUT-CTW1500, the two newly published benchmarks with special emphasis on curved text in natural images, as well as the widely-used datasets ICDAR 2015 and MSRA-TD500. Specifically, TextSnake outperforms the baseline on Total-Text by more than 40% in F-measure.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142795949-2525ead4-865b-4762-baaa-e977cfd6ac66.png"/>
|
||||
</div>
|
||||
|
||||
### Results and models
|
||||
|
||||
#### CTW1500
|
||||
|
||||
| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
|
||||
| :----------------------------------------------------------: | :--------------: | :-----------: | :----------: | :------: | :-------: | :----: | :-------: | :---: | :------------------------------------------------------------: |
|
||||
| [TextSnake](https://github.com/open-mmlab/mmocr/tree/master/configs/textdet/textsnake/textsnake_resnet50_fpn-unet_1200e_ctw1500.py) | ImageNet | CTW1500 Train | CTW1500 Test | 1200 | 736 | 0.795 | 0.840 | 0.817 | [model](https://download.openmmlab.com/mmocr/textdet/textsnake/textsnake_r50_fpn_unet_1200e_ctw1500-27f65b64.pth) \| [log](<>) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@inproceedings{long2018textsnake,
  title={TextSnake: A Flexible Representation for Detecting Text of Arbitrary Shapes},
  author={Long, Shangbang and Ruan, Jiaqiang and Zhang, Wenjie and He, Xin and Wu, Wenhao and Yao, Cong},
  booktitle={ECCV},
  pages={20--36},
  year={2018}
}
|
||||
```
|
|
@ -1,482 +0,0 @@
|
|||
# Text Recognition Models
|
||||
|
||||
## ABINet
|
||||
|
||||
[Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition](https://arxiv.org/abs/2103.06495)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Linguistic knowledge is of great benefit to scene text recognition. However, how to effectively model linguistic rules in end-to-end deep networks remains a research challenge. In this paper, we argue that the limited capacity of language models comes from: 1) implicitly language modeling; 2) unidirectional feature representation; and 3) language model with noise input. Correspondingly, we propose an autonomous, bidirectional and iterative ABINet for scene text recognition. Firstly, the autonomous suggests to block gradient flow between vision and language models to enforce explicitly language modeling. Secondly, a novel bidirectional cloze network (BCN) as the language model is proposed based on bidirectional feature representation. Thirdly, we propose an execution manner of iterative correction for language model which can effectively alleviate the impact of noise input. Additionally, based on the ensemble of iterative predictions, we propose a self-training method which can learn from unlabeled images effectively. Extensive experiments indicate that ABINet has superiority on low-quality images and achieves state-of-the-art results on several mainstream benchmarks. Besides, the ABINet trained with ensemble self-training shows promising improvement in realizing human-level recognition.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/145804331-9ae955dc-0d3b-41eb-a6b2-dc7c9f7c1bef.png"/>
|
||||
</div>
|
||||
|
||||
### Dataset
|
||||
|
||||
#### Train Dataset
|
||||
|
||||
| trainset | instance_num | repeat_num | note |
|
||||
| :-------: | :----------: | :--------: | :----------: |
|
||||
| Syn90k | 8919273 | 1 | synth |
|
||||
| SynthText | 7239272 | 1 | alphanumeric |
|
||||
|
||||
#### Test Dataset
|
||||
|
||||
| testset | instance_num | note |
|
||||
| :-----: | :----------: | :-------: |
|
||||
| IIIT5K | 3000 | regular |
|
||||
| SVT | 647 | regular |
|
||||
| IC13 | 1015 | regular |
|
||||
| IC15 | 2077 | irregular |
|
||||
| SVTP | 645 | irregular |
|
||||
| CT80 | 288 | irregular |
|
||||
|
||||
### Results and models
|
||||
|
||||
| methods | pretrained | | Regular Text | | | Irregular Text | | download |
|
||||
| :------------------------------------------------: | :----------------------------------------------------: | :----: | :----------: | :--: | :--: | :------------: | :--: | :--------------------------------------------------- |
|
||||
| | | IIIT5K | SVT | IC13 | IC15 | SVTP | CT80 | |
|
||||
| [ABINet-Vision](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/abinet/abinet-vision_6e_st-an_mj.py) | - | 94.7 | 91.7 | 93.6 | 83.0 | 85.1 | 86.5 | [model](https://download.openmmlab.com/mmocr/textrecog/abinet/abinet_vision_only_academic-e6b9ea89.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/abinet/20211201_195512.log) |
|
||||
| [ABINet](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/abinet/abinet_6e_st-an_mj.py) | [Pretrained](https://download.openmmlab.com/mmocr/textrecog/abinet/abinet_pretrain-1bed979b.pth) | 95.7 | 94.6 | 95.7 | 85.1 | 90.4 | 90.3 | [model](https://download.openmmlab.com/mmocr/textrecog/abinet/abinet_academic-f718abf6.pth) \| [log1](https://download.openmmlab.com/mmocr/textrecog/abinet/20211210_095832.log) \| [log2](https://download.openmmlab.com/mmocr/textrecog/abinet/20211213_131724.log) |
|
||||
|
||||
```{note}
|
||||
1. ABINet allows its encoder to run and be trained without decoder and fuser. Its encoder is designed to recognize texts as a stand-alone model and therefore can work as an independent text recognizer. We release it as ABINet-Vision.
|
||||
2. Facts about the pretrained model: MMOCR does not have a systematic pipeline to pretrain the language model (LM) yet, thus the weights of LM are converted from [the official pretrained model](https://github.com/FangShancheng/ABINet). The weights of ABINet-Vision are directly used as the vision model of ABINet.
|
||||
3. Due to some technical issues, the training process of ABINet was interrupted at the 13th epoch and we resumed it later. Both logs are released for full reference.
|
||||
4. The model architecture in the logs looks slightly different from the final released version, since it was refactored afterward. However, both architectures are essentially equivalent.
|
||||
```
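For a quick sanity check of the released checkpoints, a usage sketch along the following lines should work, assuming an MMOCR 1.x environment where `mmocr.apis.MMOCRInferencer` is available and accepts `'ABINet'` as a recognizer name; the image path is only a placeholder.

```python
# Hedged usage sketch, not part of this README: assumes `mmocr.apis.MMOCRInferencer`
# is available in the installed MMOCR version and that 'ABINet' resolves to the
# released checkpoint; the image path is a placeholder.
from mmocr.apis import MMOCRInferencer

recognizer = MMOCRInferencer(rec='ABINet')        # downloads the released weights
result = recognizer('demo/demo_text_recog.jpg')   # placeholder image path
print(result['predictions'])
```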
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@inproceedings{fang2021read,
  title={Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition},
  author={Fang, Shancheng and Xie, Hongtao and Wang, Yuxin and Mao, Zhendong and Zhang, Yongdong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2021}
}
|
||||
```
|
||||
|
||||
## CRNN
|
||||
|
||||
[An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition](https://arxiv.org/abs/1507.05717)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Image-based sequence recognition has been a long-standing research topic in computer vision. In this paper, we investigate the problem of scene text recognition, which is among the most important and challenging tasks in image-based sequence recognition. A novel neural network architecture, which integrates feature extraction, sequence modeling and transcription into a unified framework, is proposed. Compared with previous systems for scene text recognition, the proposed architecture possesses four distinctive properties: (1) It is end-to-end trainable, in contrast to most of the existing algorithms whose components are separately trained and tuned. (2) It naturally handles sequences in arbitrary lengths, involving no character segmentation or horizontal scale normalization. (3) It is not confined to any predefined lexicon and achieves remarkable performances in both lexicon-free and lexicon-based scene text recognition tasks. (4) It generates an effective yet much smaller model, which is more practical for real-world application scenarios. The experiments on standard benchmarks, including the IIIT-5K, Street View Text and ICDAR datasets, demonstrate the superiority of the proposed algorithm over the prior arts. Moreover, the proposed algorithm performs well in the task of image-based music score recognition, which evidently verifies the generality of it.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142797788-6b1cd78d-1dd6-4e02-be32-3dbd257c4992.png"/>
|
||||
</div>
|
||||
|
||||
### Dataset
|
||||
|
||||
#### Train Dataset
|
||||
|
||||
| trainset | instance_num | repeat_num | note |
|
||||
| :------: | :----------: | :--------: | :---: |
|
||||
| Syn90k | 8919273 | 1 | synth |
|
||||
|
||||
#### Test Dataset
|
||||
|
||||
| testset | instance_num | note |
|
||||
| :-----: | :----------: | :-------: |
|
||||
| IIIT5K | 3000 | regular |
|
||||
| SVT | 647 | regular |
|
||||
| IC13 | 1015 | regular |
|
||||
| IC15 | 2077 | irregular |
|
||||
| SVTP | 645 | irregular |
|
||||
| CT80 | 288 | irregular |
|
||||
|
||||
### Results and models
|
||||
|
||||
| methods | | Regular Text | | | | Irregular Text | | download |
|
||||
| :--------------------------------------------------------------------------: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :---------------------------------------------------------------------------: |
|
||||
| methods | IIIT5K | SVT | IC13 | | IC15 | SVTP | CT80 | |
|
||||
| [CRNN](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/crnn/crnn_mini-vgg_5e_mj.py) | 80.5 | 81.5 | 86.5 | | 54.1 | 59.1 | 55.6 | [model](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_academic-a723a1c5.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/crnn/20210326_111035.log.json) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@article{shi2016end,
|
||||
title={An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition},
|
||||
author={Shi, Baoguang and Bai, Xiang and Yao, Cong},
|
||||
journal={IEEE transactions on pattern analysis and machine intelligence},
|
||||
year={2016}
|
||||
}
|
||||
```
|
||||
|
||||
## MASTER
|
||||
|
||||
[MASTER: Multi-aspect non-local network for scene text recognition](https://arxiv.org/abs/1910.02562)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Attention-based scene text recognizers have gained huge success, which leverages a more compact intermediate representation to learn 1d- or 2d- attention by a RNN-based encoder-decoder architecture. However, such methods suffer from attention-drift problem because high similarity among encoded features leads to attention confusion under the RNN-based local attention mechanism. Moreover, RNN-based methods have low efficiency due to poor parallelization. To overcome these problems, we propose the MASTER, a self-attention based scene text recognizer that (1) not only encodes the input-output attention but also learns self-attention which encodes feature-feature and target-target relationships inside the encoder and decoder and (2) learns a more powerful and robust intermediate representation to spatial distortion, and (3) owns a great training efficiency because of high training parallelization and a high-speed inference because of an efficient memory-cache mechanism. Extensive experiments on various benchmarks demonstrate the superior performance of our MASTER on both regular and irregular scene text.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/65173622/164642001-037f81b7-37dd-4808-a6a9-09ff6f6a17ea.JPG">
|
||||
</div>
|
||||
|
||||
### Dataset
|
||||
|
||||
#### Train Dataset
|
||||
|
||||
| trainset | instance_num | repeat_num | source |
|
||||
| :-------: | :----------: | :--------: | :----: |
|
||||
| SynthText | 7266686 | 1 | synth |
|
||||
| SynthAdd | 1216889 | 1 | synth |
|
||||
| Syn90k | 8919273 | 1 | synth |
|
||||
|
||||
#### Test Dataset
|
||||
|
||||
| testset | instance_num | type |
|
||||
| :-----: | :----------: | :-------: |
|
||||
| IIIT5K | 3000 | regular |
|
||||
| SVT | 647 | regular |
|
||||
| IC13 | 1015 | regular |
|
||||
| IC15 | 2077 | irregular |
|
||||
| SVTP | 645 | irregular |
|
||||
| CT80 | 288 | irregular |
|
||||
|
||||
### Results and Models
|
||||
|
||||
| Methods | Backbone | | Regular Text | | | | Irregular Text | | download |
|
||||
| :------------------------------------------------------------------: | :-----------: | :----: | :----------: | :---: | :-: | :---: | :------------: | :---: | :-------------------------------------------------------------------: |
|
||||
| | | IIIT5K | SVT | IC13 | | IC15 | SVTP | CT80 | |
|
||||
| [MASTER](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/master/master_resnet31_12e_st_mj_sa.py) | R31-GCAModule | 94.63 | 90.42 | 94.98 | | 75.54 | 82.79 | 88.54 | [model](https://download.openmmlab.com/mmocr/textrecog/master/master_r31_12e_ST_MJ_SA-787edd36.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/master/master_r31_12e_ST_MJ_SA-787edd36.log.json) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@article{Lu2021MASTER,
|
||||
title={{MASTER}: Multi-Aspect Non-local Network for Scene Text Recognition},
|
||||
author={Ning Lu and Wenwen Yu and Xianbiao Qi and Yihao Chen and Ping Gong and Rong Xiao and Xiang Bai},
|
||||
journal={Pattern Recognition},
|
||||
year={2021}
|
||||
}
|
||||
```
|
||||
|
||||
## NRTR
|
||||
|
||||
[NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition](https://arxiv.org/abs/1806.00926)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Scene text recognition has attracted a great many researches due to its importance to various applications. Existing methods mainly adopt recurrence or convolution based networks. Though have obtained good performance, these methods still suffer from two limitations: slow training speed due to the internal recurrence of RNNs, and high complexity due to stacked convolutional layers for long-term feature extraction. This paper, for the first time, proposes a no-recurrence sequence-to-sequence text recognizer, named NRTR, that dispenses with recurrences and convolutions entirely. NRTR follows the encoder-decoder paradigm, where the encoder uses stacked self-attention to extract image features, and the decoder applies stacked self-attention to recognize texts based on encoder output. NRTR relies solely on self-attention mechanism thus could be trained with more parallelization and less complexity. Considering scene image has large variation in text and background, we further design a modality-transform block to effectively transform 2D input images to 1D sequences, combined with the encoder to extract more discriminative features. NRTR achieves state-of-the-art or highly competitive performance on both regular and irregular benchmarks, while requires only a small fraction of training time compared to the best model from the literature (at least 8 times faster).
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142797203-d9df6c35-868f-4848-8261-c286751fd342.png"/>
|
||||
</div>
|
||||
|
||||
### Dataset
|
||||
|
||||
#### Train Dataset
|
||||
|
||||
| trainset | instance_num | repeat_num | source |
|
||||
| :-------: | :----------: | :--------: | :----: |
|
||||
| SynthText | 7266686 | 1 | synth |
|
||||
| Syn90k | 8919273 | 1 | synth |
|
||||
|
||||
#### Test Dataset
|
||||
|
||||
| testset | instance_num | type |
|
||||
| :-----: | :----------: | :-------: |
|
||||
| IIIT5K | 3000 | regular |
|
||||
| SVT | 647 | regular |
|
||||
| IC13 | 1015 | regular |
|
||||
| IC15 | 2077 | irregular |
|
||||
| SVTP | 645 | irregular |
|
||||
| CT80 | 288 | irregular |
|
||||
|
||||
### Results and Models
|
||||
|
||||
| Methods | Backbone | | Regular Text | | | | Irregular Text | | download |
|
||||
| :------------------------------------------------------------------: | :----------: | :----: | :----------: | :---: | :-: | :---: | :------------: | :---: | :--------------------------------------------------------------------: |
|
||||
| | | IIIT5K | SVT | IC13 | | IC15 | SVTP | CT80 | |
|
||||
| [NRTR](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/nrtr/nrtr_resnet31-1by16-1by8_6e_st_mj.py) | R31-1/16-1/8 | 94.8 | 89.03 | 93.79 | | 74.19 | 80.31 | 87.15 | [model](https://download.openmmlab.com/mmocr/textrecog/nrtr/nrtr_r31_1by16_1by8_academic_20211124-f60cebf4.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/nrtr/20211124_002420.log.json) |
|
||||
| [NRTR](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/nrtr/nrtr_resnet31-1by8-1by4_6e_st_mj.py) | R31-1/8-1/4 | 95.5 | 90.01 | 94.38 | | 74.05 | 79.53 | 87.15 | [model](https://download.openmmlab.com/mmocr/textrecog/nrtr/nrtr_r31_1by8_1by4_academic_20211123-e1fdb322.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/nrtr/20211123_232151.log.json) |
|
||||
|
||||
```{note}
|
||||
|
||||
- For backbone `R31-1/16-1/8`:
  - The output consists of 92 classes, including 26 lowercase letters, 26 uppercase letters, 28 symbols, 10 digits, 1 unknown token and 1 end-of-sequence token.
  - The encoder-block number is 6.
  - `1/16-1/8` means the height of the feature map from the backbone is 1/16 of the input image height, and the width is 1/8 of the input width.
- For backbone `R31-1/8-1/4`:
  - The output consists of 92 classes, including 26 lowercase letters, 26 uppercase letters, 28 symbols, 10 digits, 1 unknown token and 1 end-of-sequence token.
  - The encoder-block number is 6.
  - `1/8-1/4` means the height of the feature map from the backbone is 1/8 of the input image height, and the width is 1/4 of the input width.
|
||||
```
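As a concrete illustration of the `1/16-1/8` and `1/8-1/4` notations, the following sketch computes the backbone feature-map size; the 32 × 160 input is an assumed example, not necessarily the exact training resolution.

```python
# A small sketch (not from the repository) showing what the downsampling
# notations above translate to; the 32 x 160 input size is an assumed example.
def feature_map_size(img_h, img_w, h_ratio, w_ratio):
    """Return the (height, width) of the backbone feature map."""
    return int(img_h * h_ratio), int(img_w * w_ratio)

print(feature_map_size(32, 160, 1 / 16, 1 / 8))  # R31-1/16-1/8 -> (2, 20)
print(feature_map_size(32, 160, 1 / 8, 1 / 4))   # R31-1/8-1/4  -> (4, 40)
```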
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@inproceedings{sheng2019nrtr,
|
||||
title={NRTR: A no-recurrence sequence-to-sequence model for scene text recognition},
|
||||
author={Sheng, Fenfen and Chen, Zhineng and Xu, Bo},
|
||||
booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
|
||||
pages={781--786},
|
||||
year={2019},
|
||||
organization={IEEE}
|
||||
}
|
||||
```
|
||||
|
||||
## RobustScanner
|
||||
|
||||
[RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition](https://arxiv.org/abs/2007.07542)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
The attention-based encoder-decoder framework has recently achieved impressive results for scene text recognition, and many variants have emerged with improvements in recognition quality. However, it performs poorly on contextless texts (e.g., random character sequences) which is unacceptable in most of real application scenarios. In this paper, we first deeply investigate the decoding process of the decoder. We empirically find that a representative character-level sequence decoder utilizes not only context information but also positional information. Contextual information, which the existing approaches heavily rely on, causes the problem of attention drift. To suppress such side-effect, we propose a novel position enhancement branch, and dynamically fuse its outputs with those of the decoder attention module for scene text recognition. Specifically, it contains a position aware module to enable the encoder to output feature vectors encoding their own spatial positions, and an attention module to estimate glimpses using the positional clue (i.e., the current decoding time step) only. The dynamic fusion is conducted for more robust feature via an element-wise gate mechanism. Theoretically, our proposed method, dubbed \\emph{RobustScanner}, decodes individual characters with dynamic ratio between context and positional clues, and utilizes more positional ones when the decoding sequences with scarce context, and thus is robust and practical. Empirically, it has achieved new state-of-the-art results on popular regular and irregular text recognition benchmarks while without much performance drop on contextless benchmarks, validating its robustness in both contextual and contextless application scenarios.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142798010-eee8795e-8cda-4a7f-a81d-ff9c94af58dc.png"/>
|
||||
</div>
|
||||
|
||||
### Dataset
|
||||
|
||||
#### Train Dataset
|
||||
|
||||
| trainset | instance_num | repeat_num | source |
|
||||
| :--------: | :----------: | :--------: | :-------------------------: |
|
||||
| icdar_2011 | 3567 | 20 | real |
|
||||
| icdar_2013 | 848 | 20 | real |
|
||||
| icdar2015 | 4468 | 20 | real |
|
||||
| coco_text | 42142 | 20 | real |
|
||||
| IIIT5K | 2000 | 20 | real |
|
||||
| SynthText | 2400000 | 1 | synth |
|
||||
| SynthAdd | 1216889 | 1 | synth, 1.6m in [\[1\]](##1) |
|
||||
| Syn90k | 2400000 | 1 | synth |
|
||||
|
||||
#### Test Dataset
|
||||
|
||||
| testset | instance_num | type |
|
||||
| :-----: | :----------: | :----------------------------: |
|
||||
| IIIT5K | 3000 | regular |
|
||||
| SVT | 647 | regular |
|
||||
| IC13 | 1015 | regular |
|
||||
| IC15 | 2077 | irregular |
|
||||
| SVTP | 645 | irregular, 639 in [\[1\]](##1) |
|
||||
| CT80 | 288 | irregular |
|
||||
|
||||
### Results and Models
|
||||
|
||||
| Methods | GPUs | | Regular Text | | | | Irregular Text | | download |
|
||||
| :------------------------------------------------------------------------: | :--: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :-------------------------------------------------------------------------: |
|
||||
| | | IIIT5K | SVT | IC13 | | IC15 | SVTP | CT80 | |
|
||||
| [RobustScanner](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/robust_scanner/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real.py) | 16 | 95.1 | 89.2 | 93.1 | | 77.8 | 80.3 | 90.3 | [model](https://download.openmmlab.com/mmocr/textrecog/robustscanner/robustscanner_r31_academic-5f05874f.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/robustscanner/20210401_170932.log.json) |
|
||||
|
||||
### References
|
||||
|
||||
<a id="1">\[1\]</a> Li, Hui and Wang, Peng and Shen, Chunhua and Zhang, Guyu. Show, attend and read: A simple and strong baseline for irregular text recognition. In AAAI 2019.
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@inproceedings{yue2020robustscanner,
|
||||
title={RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition},
|
||||
author={Yue, Xiaoyu and Kuang, Zhanghui and Lin, Chenhao and Sun, Hongbin and Zhang, Wayne},
|
||||
booktitle={European Conference on Computer Vision},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
|
||||
## SAR
|
||||
|
||||
[Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition](https://arxiv.org/abs/1811.00751)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Recognizing irregular text in natural scene images is challenging due to the large variance in text appearance, such as curvature, orientation and distortion. Most existing approaches rely heavily on sophisticated model designs and/or extra fine-grained annotations, which, to some extent, increase the difficulty in algorithm implementation and data collection. In this work, we propose an easy-to-implement strong baseline for irregular scene text recognition, using off-the-shelf neural network components and only word-level annotations. It is composed of a 31-layer ResNet, an LSTM-based encoder-decoder framework and a 2-dimensional attention module. Despite its simplicity, the proposed method is robust and achieves state-of-the-art performance on both regular and irregular scene text recognition benchmarks.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142798157-ac68907f-5a8a-473f-a29f-f0532b7fdba0.png"/>
|
||||
</div>
|
||||
|
||||
### Dataset
|
||||
|
||||
#### Train Dataset
|
||||
|
||||
| trainset | instance_num | repeat_num | source |
|
||||
| :--------: | :----------: | :--------: | :-------------------------: |
|
||||
| icdar_2011 | 3567 | 20 | real |
|
||||
| icdar_2013 | 848 | 20 | real |
|
||||
| icdar2015 | 4468 | 20 | real |
|
||||
| coco_text | 42142 | 20 | real |
|
||||
| IIIT5K | 2000 | 20 | real |
|
||||
| SynthText | 2400000 | 1 | synth |
|
||||
| SynthAdd | 1216889 | 1 | synth, 1.6m in [\[1\]](##1) |
|
||||
| Syn90k | 2400000 | 1 | synth |
|
||||
|
||||
#### Test Dataset
|
||||
|
||||
| testset | instance_num | type |
|
||||
| :-----: | :----------: | :----------------------------: |
|
||||
| IIIT5K | 3000 | regular |
|
||||
| SVT | 647 | regular |
|
||||
| IC13 | 1015 | regular |
|
||||
| IC15 | 2077 | irregular |
|
||||
| SVTP | 645 | irregular, 639 in [\[1\]](##1) |
|
||||
| CT80 | 288 | irregular |
|
||||
|
||||
### Results and Models
|
||||
|
||||
| Methods | Backbone | Decoder | | Regular Text | | | | Irregular Text | | download |
|
||||
| :----------------------------------------------------------: | :---------: | :------------------: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :------------------------------------------------------------: |
|
||||
| | | | IIIT5K | SVT | IC13 | | IC15 | SVTP | CT80 | |
|
||||
| [SAR](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/sar/sar_r31_parallel_decoder_academic.py) | R31-1/8-1/4 | ParallelSARDecoder | 95.0 | 89.6 | 93.7 | | 79.0 | 82.2 | 88.9 | [model](https://download.openmmlab.com/mmocr/textrecog/sar/sar_r31_parallel_decoder_academic-dba3a4a3.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/sar/20210327_154129.log.json) |
|
||||
| [SAR](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/sar/sar_r31_sequential_decoder_academic.py) | R31-1/8-1/4 | SequentialSARDecoder | 95.2 | 88.7 | 92.4 | | 78.2 | 81.9 | 89.6 | [model](https://download.openmmlab.com/mmocr/textrecog/sar/sar_r31_sequential_decoder_academic-d06c9a8e.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/sar/20210330_105728.log.json) |
|
||||
|
||||
### Chinese Dataset
|
||||
|
||||
### Results and Models
|
||||
|
||||
| Methods | Backbone | Decoder | | download |
|
||||
| :---------------------------------------------------------------------------------: | :---------: | :----------------: | :-: | :-----------------------------------------------------------------------------------: |
|
||||
| [SAR](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/sar/sar_r31_parallel_decoder_chinese.py) | R31-1/8-1/4 | ParallelSARDecoder | | [model](https://download.openmmlab.com/mmocr/textrecog/sar/sar_r31_parallel_decoder_chineseocr_20210507-b4be8214.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/sar/20210506_225557.log.json) \| [dict](https://download.openmmlab.com/mmocr/textrecog/sar/dict_printed_chinese_english_digits.txt) |
|
||||
|
||||
```{note}
|
||||
|
||||
- `R31-1/8-1/4` means the height of the feature map from the backbone is 1/8 of the input image height, and the width is 1/4 of the input width.
- We did not use beam search during decoding.
- We implemented two kinds of decoders, namely `ParallelSARDecoder` and `SequentialSARDecoder`.
  - `ParallelSARDecoder`: Parallel decoding during training with an `LSTM` layer. It is faster.
  - `SequentialSARDecoder`: Sequential decoding during training with `LSTMCell`. It is easier to understand.
- For the training dataset:
  - We did not construct distinct data groups (20 groups in [[1]](##1)) to train the model group-by-group, since it would make training too complicated.
  - Instead, we randomly selected `2.4m` patches from `Syn90k`, `2.4m` from `SynthText` and `1.2m` from `SynthAdd`, and grouped all data together. See [config](https://download.openmmlab.com/mmocr/textrecog/sar/sar_r31_academic.py) for details. A minimal sketch of this sampling strategy follows this note.
  - We used 48 GPUs with `total_batch_size = 64 * 48` in the experiment above to speed up training, while keeping the `initial lr = 1e-3` unchanged.
|
||||
```
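The following sketch illustrates the sampling strategy referenced in the note above; the index pools and sample sizes mirror the tables and note, but the code is only a simplified stand-in for the actual data preparation.

```python
# Illustrative only: dataset sizes come from the tables above, and the merged
# list stands in for the actual annotation handling in the data pipeline.
import random

syn90k_ids = range(8_919_273)      # full Syn90k
synthtext_ids = range(7_266_686)   # full SynthText
synthadd_ids = range(1_216_889)    # full SynthAdd

merged = (
    random.sample(syn90k_ids, 2_400_000)
    + random.sample(synthtext_ids, 2_400_000)
    + random.sample(synthadd_ids, 1_200_000)
)
random.shuffle(merged)  # all subsets grouped together, not split into 20 groups
```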
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@inproceedings{li2019show,
|
||||
title={Show, attend and read: A simple and strong baseline for irregular text recognition},
|
||||
author={Li, Hui and Wang, Peng and Shen, Chunhua and Zhang, Guyu},
|
||||
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
|
||||
volume={33},
|
||||
number={01},
|
||||
pages={8610--8617},
|
||||
year={2019}
|
||||
}
|
||||
```
|
||||
|
||||
## SATRN
|
||||
|
||||
[On Recognizing Texts of Arbitrary Shapes with 2D Self-Attention](https://arxiv.org/abs/1910.04396)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Scene text recognition (STR) is the task of recognizing character sequences in natural scenes. While there have been great advances in STR methods, current methods still fail to recognize texts in arbitrary shapes, such as heavily curved or rotated texts, which are abundant in daily life (e.g. restaurant signs, product labels, company logos, etc). This paper introduces a novel architecture to recognizing texts of arbitrary shapes, named Self-Attention Text Recognition Network (SATRN), which is inspired by the Transformer. SATRN utilizes the self-attention mechanism to describe two-dimensional (2D) spatial dependencies of characters in a scene text image. Exploiting the full-graph propagation of self-attention, SATRN can recognize texts with arbitrary arrangements and large inter-character spacing. As a result, SATRN outperforms existing STR models by a large margin of 5.7 pp on average in "irregular text" benchmarks. We provide empirical analyses that illustrate the inner mechanisms and the extent to which the model is applicable (e.g. rotated and multi-line text). We will open-source the code.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142798828-cc4ded5d-3fb8-478c-9f3e-74edbcf41982.png"/>
|
||||
</div>
|
||||
|
||||
### Dataset
|
||||
|
||||
#### Train Dataset
|
||||
|
||||
| trainset | instance_num | repeat_num | source |
|
||||
| :-------: | :----------: | :--------: | :----: |
|
||||
| SynthText | 7266686 | 1 | synth |
|
||||
| Syn90k | 8919273 | 1 | synth |
|
||||
|
||||
#### Test Dataset
|
||||
|
||||
| testset | instance_num | type |
|
||||
| :-----: | :----------: | :-------: |
|
||||
| IIIT5K | 3000 | regular |
|
||||
| SVT | 647 | regular |
|
||||
| IC13 | 1015 | regular |
|
||||
| IC15 | 2077 | irregular |
|
||||
| SVTP | 645 | irregular |
|
||||
| CT80 | 288 | irregular |
|
||||
|
||||
### Results and Models
|
||||
|
||||
| Methods | | Regular Text | | | | Irregular Text | | download |
|
||||
| :--------------------------------------------------------------------------: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :---------------------------------------------------------------------------: |
|
||||
| | IIIT5K | SVT | IC13 | | IC15 | SVTP | CT80 | |
|
||||
| [Satrn](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/satrn/satrn_shallow_5e_st_mj.py) | 95.1 | 92.0 | 95.8 | | 81.4 | 87.6 | 90.6 | [model](https://download.openmmlab.com/mmocr/textrecog/satrn/satrn_academic_20211009-cb8b1580.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/satrn/20210809_093244.log.json) |
|
||||
| [Satrn_small](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/satrn/satrn_shallow-small_5e_st_mj.py) | 94.7 | 91.3 | 95.4 | | 81.9 | 85.9 | 86.5 | [model](https://download.openmmlab.com/mmocr/textrecog/satrn/satrn_small_20211009-2cf13355.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/satrn/20210811_053047.log.json) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@article{junyeop2019recognizing,
  title={On Recognizing Texts of Arbitrary Shapes with 2D Self-Attention},
  author={Junyeop Lee and Sungrae Park and Jeonghun Baek and Seong Joon Oh and Seonghyeon Kim and Hwalsuk Lee},
  journal={arXiv preprint arXiv:1910.04396},
  year={2019}
}
|
||||
```
|
||||
|
||||
## CRNN-STN
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
### Abstract
|
||||
|
||||
Image-based sequence recognition has been a long-standing research topic in computer vision. In this paper, we investigate the problem of scene text recognition, which is among the most important and challenging tasks in image-based sequence recognition. A novel neural network architecture, which integrates feature extraction, sequence modeling and transcription into a unified framework, is proposed. Compared with previous systems for scene text recognition, the proposed architecture possesses four distinctive properties: (1) It is end-to-end trainable, in contrast to most of the existing algorithms whose components are separately trained and tuned. (2) It naturally handles sequences in arbitrary lengths, involving no character segmentation or horizontal scale normalization. (3) It is not confined to any predefined lexicon and achieves remarkable performances in both lexicon-free and lexicon-based scene text recognition tasks. (4) It generates an effective yet much smaller model, which is more practical for real-world application scenarios. The experiments on standard benchmarks, including the IIIT-5K, Street View Text and ICDAR datasets, demonstrate the superiority of the proposed algorithm over the prior arts. Moreover, the proposed algorithm performs well in the task of image-based music score recognition, which evidently verifies the generality of it.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/22607038/142797788-6b1cd78d-1dd6-4e02-be32-3dbd257c4992.png"/>
|
||||
</div>
|
||||
|
||||
```{note}
|
||||
We use STN from this paper as the preprocessor and CRNN as the recognition network.
|
||||
```
|
||||
|
||||
### Dataset
|
||||
|
||||
#### Train Dataset
|
||||
|
||||
| trainset | instance_num | repeat_num | note |
|
||||
| :------: | :----------: | :--------: | :---: |
|
||||
| Syn90k | 8919273 | 1 | synth |
|
||||
|
||||
#### Test Dataset
|
||||
|
||||
| testset | instance_num | note |
|
||||
| :-----: | :----------: | :-------: |
|
||||
| IIIT5K | 3000 | regular |
|
||||
| SVT | 647 | regular |
|
||||
| IC13 | 1015 | regular |
|
||||
| IC15 | 2077 | irregular |
|
||||
| SVTP | 645 | irregular |
|
||||
| CT80 | 288 | irregular |
|
||||
|
||||
### Results and models
|
||||
|
||||
| methods | | Regular Text | | | | Irregular Text | | download |
|
||||
| :--------------------------------------------------------------------------: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :---------------------------------------------------------------------------: |
|
||||
| | IIIT5K | SVT | IC13 | | IC15 | SVTP | CT80 | |
|
||||
| [CRNN-STN](https://github.com/open-mmlab/mmocr/tree/master/configs/textrecog/tps/crnn_tps_academic_dataset.py) | 80.8 | 81.3 | 85.0 | | 59.6 | 68.1 | 53.8 | [model](https://download.openmmlab.com/mmocr/textrecog/tps/crnn_tps_academic_dataset_20210510-d221a905.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/tps/20210510_204353.log.json) |
|
||||
|
||||
### Citation
|
||||
|
||||
```bibtex
|
||||
@article{shi2016robust,
  title={Robust Scene Text Recognition with Automatic Rectification},
  author={Shi, Baoguang and Wang, Xinggang and Lyu, Pengyuan and Yao, Cong and Bai, Xiang},
  year={2016}
}
|
||||
```
|
|
@ -1,22 +1,59 @@
|
|||
mmocr.datasets
|
||||
---------------------------------------------
|
||||
.. automodule:: mmocr.datasets
|
||||
:members:
|
||||
|
||||
Dataset Types
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. automodule:: mmocr.datasets.ocr_dataset
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.datasets.icdar_dataset
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.datasets.recog_lmdb_dataset
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.datasets.recog_text_dataset
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.datasets.wildreceipt_dataset
|
||||
:members:
|
||||
|
||||
Transforms
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.datasets.transforms
|
||||
:members:
|
||||
|
||||
|
||||
mmocr.engine
|
||||
-------------
|
||||
---------------------------------------------
|
||||
Hooks
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.engine.hooks
|
||||
:members:
|
||||
|
||||
|
||||
mmocr.evaluation
|
||||
-------------
|
||||
---------------------------------------------
|
||||
Evaluator
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.evaluation.evaluator
|
||||
:members:
|
||||
|
||||
Functional
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.evaluation.functional
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.evaluation.metircs
|
||||
Metric
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.evaluation.metrics
|
||||
:members:
|
||||
|
||||
mmocr.utils
|
||||
-------------
|
||||
---------------------------------------------
|
||||
Point utils
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.utils.point_utils
|
||||
|
@ -66,8 +103,9 @@ Others
|
|||
.. automodule:: mmocr.utils.parsers
|
||||
:members:
|
||||
|
||||
|
||||
mmocr.models
|
||||
---------------
|
||||
---------------------------------------------
|
||||
Common
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.models.common.backbones
|
||||
|
@ -105,7 +143,7 @@ Text Detection Module Losses
|
|||
.. automodule:: mmocr.models.textdet.module_losses
|
||||
:members:
|
||||
|
||||
Text Detection Preprocessors
|
||||
Text Detection Data Preprocessors
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.models.textdet.data_preprocessors
|
||||
:members:
|
||||
|
@ -125,7 +163,7 @@ Text Recognition Backbones
|
|||
.. automodule:: mmocr.models.textrecog.backbones
|
||||
:members:
|
||||
|
||||
Text Recognition Preprocessors
|
||||
Text Recognition Data Preprocessors
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.models.textrecog.data_preprocessors
|
||||
:members:
|
||||
|
@ -156,80 +194,59 @@ Text Recognition Module Losses
|
|||
:members:
|
||||
|
||||
KIE Extractors
|
||||
^^^^^^^^^^^^^^
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.models.kie.extractors
|
||||
:members:
|
||||
|
||||
KIE Heads
|
||||
^^^^^^^^^^^
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.models.kie.heads
|
||||
:members:
|
||||
|
||||
KIE Module Losses
|
||||
^^^^^^^^^^^
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.models.kie.module_losses
|
||||
:members:
|
||||
|
||||
mmocr.datasets
|
||||
-----------------
|
||||
.. automodule:: mmocr.datasets
|
||||
:members:
|
||||
|
||||
Dataset Types
|
||||
^^^^^^^^^^^
|
||||
|
||||
.. automodule:: mmocr.datasets.ocr_dataset
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.datasets.icdar_dataset
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.datasets.recog_lmdb_dataset
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.datasets.recog_text_dataset
|
||||
:members:
|
||||
|
||||
.. automodule:: mmocr.datasets.wildreceipt_dataset
|
||||
:members:
|
||||
|
||||
Transforms
|
||||
^^^^^^^^^^^
|
||||
.. automodule:: mmocr.datasets.transforms
|
||||
:members:
|
||||
|
||||
mmocr.structures
|
||||
-----------------
|
||||
---------------------------------------------
|
||||
|
||||
Text Detection Data Sample
|
||||
^^^^^^^^^^^
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.structures.textdet_data_sample
|
||||
:members:
|
||||
|
||||
Text Recognition Data Sample
|
||||
^^^^^^^^^^^
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.structures.textrecog_data_sample
|
||||
:members:
|
||||
|
||||
KIE Data Sample
|
||||
^^^^^^^^^^^
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.structures.kie_data_sample
|
||||
:members:
|
||||
|
||||
mmocr.visualization
|
||||
-----------------
|
||||
|
||||
visualize
|
||||
^^^^^^^^^^^
|
||||
.. automodule:: mmocr.visualization.visualize
|
||||
:members:
|
||||
mmocr.visualization
|
||||
---------------------------------------------
|
||||
|
||||
Text Detection Visualizer
|
||||
^^^^^^^^^^^
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.visualization.textdet_visualizer
|
||||
:members:
|
||||
|
||||
Text Recognition Visualizer
|
||||
^^^^^^^^^^^
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.visualization.textrecog_visualizer
|
||||
:members:
|
||||
|
||||
Text Spotting Visualizer
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.visualization.textspotting_visualizer
|
||||
:members:
|
||||
|
||||
KIE Visualizer
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. automodule:: mmocr.visualization.kie_visualizer
|
||||
:members:
|
||||
|
|
|
@ -39,8 +39,10 @@ release = __version__
|
|||
# ones.
|
||||
extensions = [
|
||||
'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode',
|
||||
'sphinx_markdown_tables', 'sphinx_copybutton', 'myst_parser'
|
||||
'sphinx_markdown_tables', 'sphinx_copybutton', 'myst_parser',
|
||||
'sphinx.ext.intersphinx', 'sphinx.ext.autodoc.typehints'
|
||||
]
|
||||
autodoc_typehints = 'description'
|
||||
|
||||
autodoc_mock_imports = ['mmcv._ext']
|
||||
|
||||
|
@ -130,7 +132,7 @@ intersphinx_mapping = {
|
|||
'numpy': ('https://numpy.org/doc/stable', None),
|
||||
'torch': ('https://pytorch.org/docs/stable/', None),
|
||||
'mmcv': ('https://mmcv.readthedocs.io/zh_CN/dev-2.x/', None),
|
||||
'mmengine': ('https://mmengine.readthedocs.io/zh_CN/main/', None),
|
||||
'mmengine': ('https://mmengine.readthedocs.io/zh_CN/latest/', None),
|
||||
'mmdetection': ('https://mmdetection.readthedocs.io/zh_CN/dev-3.x/', None),
|
||||
}
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@
|
|||
|
||||
basic_concepts/overview.md
|
||||
basic_concepts/data_flow.md
|
||||
basic_concepts/dataset.md
|
||||
basic_concepts/datasets.md
|
||||
basic_concepts/structures.md
|
||||
basic_concepts/models.md
|
||||
basic_concepts/transforms.md
|
||||
|
|
|
@ -4,8 +4,6 @@
|
|||
sed -e '$a\\n' -s ../../configs/kie/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# 关键信息提取模型' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >kie_models.md
|
||||
sed -e '$a\\n' -s ../../configs/textdet/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# 文本检测模型' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >textdet_models.md
|
||||
sed -e '$a\\n' -s ../../configs/textrecog/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# 文本识别模型' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >textrecog_models.md
|
||||
sed -e '$a\\n' -s ../../configs/ner/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# 命名实体识别模型' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >ner_models.md
|
||||
|
||||
# replace special symbols in demo.md
|
||||
cp ../../demo/README_zh-CN.md demo.md
|
||||
sed -i 's/:heavy_check_mark:/Yes/g' demo.md && sed -i 's/:x:/No/g' demo.md
|
||||
# replace special symbols in inference.md
|
||||
sed -i 's/:heavy_check_mark:/Yes/g' user_guides/inference.md && sed -i 's/:x:/No/g' user_guides/inference.md
|
||||
|
|
|
@ -1,904 +0,0 @@
|
|||
# Changelog
|
||||
|
||||
## 0.6.0 (05/05/2022)
|
||||
|
||||
### Highlights
|
||||
|
||||
1. A new recognition algorithm [MASTER](https://arxiv.org/abs/1910.02562) has been added into MMOCR, which was the championship solution for the "ICDAR 2021 Competition on Scientific Table Image Recognition to Latex"! The model pre-trained on SynthText and MJSynth is available for testing! Credit to @JiaquanYe
|
||||
2. [DBNet++](https://arxiv.org/abs/2202.10304) has been released now! A new Adaptive Scale Fusion module has been equipped for feature enhancement. Benefiting from this, the new model achieved 2% better h-mean score than its predecessor on the ICDAR2015 dataset.
|
||||
3. Three more dataset converters are added: LSVT, RCTW and HierText. Check the dataset zoo ([Det](https://mmocr.readthedocs.io/en/latest/datasets/det.html#) & [Recog](https://mmocr.readthedocs.io/en/latest/datasets/recog.html) ) to explore further information.
|
||||
4. To enhance the data storage efficiency, MMOCR now supports loading both images and labels from .lmdb format annotations for the text recognition task. To enable such a feature, the new lmdb_converter.py is ready for use to pack your cropped images and labels into an lmdb file. For a detailed tutorial, please refer to the following sections and the [doc](https://mmocr.readthedocs.io/en/latest/tools.html#convert-text-recognition-dataset-to-lmdb-format).
|
||||
5. Testing models on multiple datasets is a widely used evaluation strategy. MMOCR now supports automatically reporting mean scores when there is more than one dataset to evaluate, which enables a more convenient comparison between checkpoints. [Doc](https://mmocr.readthedocs.io/en/latest/tutorials/dataset_types.html#getting-mean-evaluation-scores)
|
||||
6. Evaluation is more flexible and customizable now. For text detection tasks, you can set the score threshold range where the best results might come out. ([Doc](https://mmocr.readthedocs.io/en/latest/tutorials/dataset_types.html#evaluation)) If too many results are flooding your text recognition train log, you can trim it by specifying a subset of metrics in evaluation config. Check out the [Evaluation](https://mmocr.readthedocs.io/en/latest/tutorials/dataset_types.html#ocrdataset) section for details.
|
||||
7. MMOCR provides a script to convert the .json labels obtained by the popular annotation toolkit **Labelme** to MMOCR-supported data format. @Y-M-Y contributed a log analysis tool that helps users gain a better understanding of the entire training process. Read [tutorial docs](https://mmocr.readthedocs.io/en/latest/tools.html) to get started.
|
||||
|
||||
### Lmdb Dataset
|
||||
|
||||
Reading images or labels from files can be slow when data are excessive, e.g. on a scale of millions. Besides, in academia, most of the scene text recognition datasets are stored in lmdb format, including images and labels. To get closer to the mainstream practice and enhance the data storage efficiency, MMOCR now officially supports loading images and labels from lmdb datasets via a new pipeline [LoadImageFromLMDB](https://github.com/open-mmlab/mmocr/blob/878383b9de8d0e598f31fbb844ffcb0c305deb8b/mmocr/datasets/pipelines/loading.py#L140).
|
||||
This section is intended to serve as a quick walkthrough for you to master this update and apply it to facilitate your research.
|
||||
|
||||
#### Specifications
|
||||
|
||||
To better align with the academic community, MMOCR now requires the following specifications for lmdb datasets:
|
||||
|
||||
- The parameter describing the data volume of the dataset is `num-samples` instead of `total_number` (deprecated).
|
||||
- Images and labels are stored with keys in the form of `image-000000001` and `label-000000001`, respectively. (A minimal reading sketch follows this list.)
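A minimal sketch of reading a dataset that follows these specifications with the `lmdb` package is shown below; the dataset path is a placeholder and this is not MMOCR's internal loader.

```python
# A minimal sketch (not MMOCR's internal loader), assuming the `lmdb` package;
# the dataset path is a placeholder.
import lmdb

env = lmdb.open('path/to/lmdb_dataset', readonly=True, lock=False)
with env.begin(write=False) as txn:
    num_samples = int(txn.get(b'num-samples').decode())
    for i in range(1, num_samples + 1):
        img_bytes = txn.get(f'image-{i:09d}'.encode())       # encoded image buffer
        label = txn.get(f'label-{i:09d}'.encode()).decode()  # text label
```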
|
||||
|
||||
#### Usage
|
||||
|
||||
1. Use existing academic lmdb datasets if they meet the specifications, or use the tool provided by MMOCR to pack images & annotations into an lmdb dataset.
|
||||
|
||||
- Previously, MMOCR had a function `txt2lmdb` (deprecated) that only supported converting labels to lmdb format. However, it is quite different from academic lmdb datasets, which usually contain both images and labels. Now MMOCR provides a new utility [lmdb_converter](https://github.com/open-mmlab/mmocr/blob/main/tools/data/utils/lmdb_converter.py) to convert recognition datasets with both images and labels to lmdb format.
|
||||
|
||||
- Say that your recognition data in MMOCR's format are organized as follows. (See an example in [ocr_toy_dataset](https://github.com/open-mmlab/mmocr/tree/main/tests/data/ocr_toy_dataset)).
|
||||
|
||||
```text
|
||||
# Directory structure
|
||||
|
||||
├──img_path
|
||||
| |—— img1.jpg
|
||||
| |—— img2.jpg
|
||||
| |—— ...
|
||||
|——label.txt (or label.jsonl)
|
||||
|
||||
# Annotation format
|
||||
|
||||
label.txt: img1.jpg HELLO
|
||||
img2.jpg WORLD
|
||||
...
|
||||
|
||||
label.jsonl: {'filename':'img1.jpg', 'text':'HELLO'}
|
||||
{'filename':'img2.jpg', 'text':'WORLD'}
|
||||
...
|
||||
```
|
||||
|
||||
- Then pack these files up:
|
||||
|
||||
```bash
|
||||
python tools/data/utils/lmdb_converter.py {PATH_TO_LABEL} {OUTPUT_PATH} --i {PATH_TO_IMAGES}
|
||||
```
|
||||
|
||||
- Check out [tools.md](https://github.com/open-mmlab/mmocr/blob/main/docs/en/tools.md) for more details.
|
||||
|
||||
2. The second step is to modify the configuration files. For example, to train CRNN on MJ and ST datasets:
|
||||
|
||||
- Set parser as `LineJsonParser` and `file_format` as 'lmdb' in [dataset config](https://github.com/open-mmlab/mmocr/blob/main/configs/_base_/recog_datasets/ST_MJ_train.py#L9)
|
||||
|
||||
```python
|
||||
# configs/_base_/recog_datasets/ST_MJ_train.py
|
||||
train1 = dict(
|
||||
type='OCRDataset',
|
||||
img_prefix=train_img_prefix1,
|
||||
ann_file=train_ann_file1,
|
||||
loader=dict(
|
||||
type='AnnFileLoader',
|
||||
repeat=1,
|
||||
file_format='lmdb',
|
||||
parser=dict(
|
||||
type='LineJsonParser',
|
||||
keys=['filename', 'text'],
|
||||
)),
|
||||
pipeline=None,
|
||||
test_mode=False)
|
||||
```
|
||||
|
||||
- Use `LoadImageFromLMDB` in [pipeline](https://github.com/open-mmlab/mmocr/blob/main/configs/_base_/recog_pipelines/crnn_pipeline.py#L4):
|
||||
|
||||
```python
|
||||
# configs/_base_/recog_pipelines/crnn_pipeline.py
|
||||
train_pipeline = [
|
||||
dict(type='LoadImageFromLMDB', color_type='grayscale'),
|
||||
...
|
||||
```
|
||||
|
||||
3. You are good to go! Start training and MMOCR will load data from your lmdb dataset.
|
||||
|
||||
### New Features & Enhancements
|
||||
|
||||
- Add analyze_logs in tools and its description in docs by @Y-M-Y in https://github.com/open-mmlab/mmocr/pull/899
- Add LSVT Data Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/896
- Add RCTW dataset converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/914
- Support computing mean scores in UniformConcatDataset by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/981
- Support loading images and labels from lmdb file by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/982
- Add recog2lmdb and new toy dataset files by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/979
- Add labelme converter for textdet and textrecog by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/972
- Update CircleCI configs by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/918
- Update Git Action by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/930
- More customizable fields in dataloaders by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/933
- Skip CIs when docs are modified by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/941
- Rename Github tests, fix ignored paths by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/946
- Support latest MMCV by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/959
- Support dynamic threshold range in eval_hmean by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/962
- Update the version requirement of mmdet in docker by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/966
- Replace `opencv-python-headless` with `opencv-python` by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/970
- Update Dataset Configs by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/980
- Add SynthText dataset config by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/983
- Automatically report mean scores when applicable by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/995
- Add DBNet++ by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/973
- Add MASTER by @JiaquanYe in https://github.com/open-mmlab/mmocr/pull/807
- Allow choosing metrics to report in text recognition tasks by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/989
- Add HierText converter by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/948
- Fix lint_only in CircleCI by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/998

### Bug Fixes

- Fix CircleCI Main Branch Accidentally Run PR Stage Test by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/927
- Fix a deprecation warning about mmdet.datasets.pipelines.formating by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/944
- Fix a Bug in ResNet plugin by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/967
- Revert a wrong setting in db_r18 cfg by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/978
- Fix TotalText Anno version issue by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/945
- Update installation step of `albumentations` by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/984
- Fix ImgAug transform by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/949
- Fix GPG key error in CI and docker by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/988
- Update label.lmdb by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/991
- Correct meta key by @garvan2021 in https://github.com/open-mmlab/mmocr/pull/926
- Use new image by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/976
- Fix Data Converter Issues by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/955

### Docs

- Update CONTRIBUTING.md by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/905
- Fix the misleading description in test.py by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/908
- Update recog.md for lmdb Generation by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/934
- Add MMCV by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/954
- Add wechat QR code to CN readme by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/960
- Update CONTRIBUTING.md by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/947
- Use QR codes from MMCV by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/971
- Renew dataset_types.md by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/997

### New Contributors

- @Y-M-Y made their first contribution in https://github.com/open-mmlab/mmocr/pull/899

**Full Changelog**: https://github.com/open-mmlab/mmocr/compare/v0.5.0...v0.6.0

## 0.5.0 (31/03/2022)

### Highlights

1. MMOCR now supports SPACE recognition! (What a prominent feature!) Users only need to convert the recognition annotations that contain spaces from a plain `.txt` file to the JSON line format `.jsonl`, and then revise a few configurations to enable the `LineJsonParser` (a minimal config sketch follows this list). For more information, please read our step-by-step [tutorial](https://mmocr.readthedocs.io/en/latest/tutorials/blank_recog.html).
2. [Tesseract](https://github.com/tesseract-ocr/tesseract) is now available in MMOCR! While MMOCR is more flexible to support various downstream tasks, users might sometimes not be satisfied with DL models and would like to turn to effective legacy solutions. Therefore, we offer this option in `mmocr.utils.ocr` by wrapping Tesseract as a detector and/or recognizer. Users can easily create an MMOCR object by `MMOCR(det='Tesseract', recog='Tesseract')`. Credit to @garvan2021
3. We release data converters for **16** widely used OCR datasets, including multiple scenarios such as document, handwritten, and scene text. Now it is more convenient to generate annotation files for these datasets. Check the dataset zoo ([Det](https://mmocr.readthedocs.io/en/latest/datasets/det.html#) & [Recog](https://mmocr.readthedocs.io/en/latest/datasets/recog.html)) to explore further information.
4. Special thanks to @EighteenSprings @BeyondYourself @yangrisheng, who had actively participated in documentation translation!
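
As promised in highlight 1, here is a minimal, hedged sketch of what the space-aware setup could look like: one JSON-line annotation whose `text` field contains a space, plus the loader part of an `OCRDataset` config switched to `LineJsonParser`. The paths and surrounding fields are placeholders; the linked tutorial remains the authoritative reference.

```python
# ann.jsonl -- one JSON object per line; "text" may contain spaces, e.g.
# {"filename": "img_1.jpg", "text": "OPEN MMLAB"}

train = dict(
    type='OCRDataset',
    img_prefix='data/my_recog_data/imgs',     # placeholder path
    ann_file='data/my_recog_data/ann.jsonl',  # placeholder path
    loader=dict(
        type='AnnFileLoader',
        repeat=1,
        file_format='txt',                    # a .jsonl file is still plain text
        parser=dict(
            type='LineJsonParser',
            keys=['filename', 'text'])),      # parse each JSON line into these keys
    pipeline=None,                            # plug in your training pipeline here
    test_mode=False)
```
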

### Migration Guide - ResNet

Some refactoring processes are still going on. For text recognition models, we unified the [`ResNet-like` architectures](https://github.com/open-mmlab/mmocr/blob/72f945457324e700f0d14796dd10a51535c01a57/mmocr/models/textrecog/backbones/resnet.py) which are used as backbones. By introducing stage-wise and block-wise plugins, the refactored ResNet is highly flexible to support existing models, like ResNet31 and ResNet45, and other future designs of ResNet variants.

#### Plugin

- `Plugin` is a module category inherited from MMCV's implementation of `PLUGIN_LAYERS`, which can be inserted between each stage of ResNet or into a basicblock. You can find a simple implementation of plugin at [mmocr/models/textrecog/plugins/common.py](https://github.com/open-mmlab/mmocr/blob/72f945457324e700f0d14796dd10a51535c01a57/mmocr/models/textrecog/plugins/common.py), or click the button below.

<details close>
<summary>Plugin Example</summary>

```python
@PLUGIN_LAYERS.register_module()
class Maxpool2d(nn.Module):
    """A wrapper around nn.MaxPool2d().

    Args:
        kernel_size (int or tuple(int)): Kernel size for max pooling layer
        stride (int or tuple(int)): Stride for max pooling layer
        padding (int or tuple(int)): Padding for pooling layer
    """

    def __init__(self, kernel_size, stride, padding=0, **kwargs):
        super(Maxpool2d, self).__init__()
        self.model = nn.MaxPool2d(kernel_size, stride, padding)

    def forward(self, x):
        """
        Args:
            x (Tensor): Input feature map

        Returns:
            Tensor: The tensor after Maxpooling layer.
        """
        return self.model(x)
```

</details>

#### Stage-wise Plugins

- ResNet is composed of stages, and each stage is composed of blocks. E.g., ResNet18 is composed of 4 stages, and each stage is composed of basicblocks. For each stage, we provide two ports to insert stage-wise plugins by giving `plugins` parameters in ResNet.

```text
[port1: before stage] ---> [stage] ---> [port2: after stage]
```

- E.g., take a ResNet with four stages as an example. Suppose we want to insert an additional convolution layer before each stage, and another convolution layer after stages 1, 2 and 4. Then you can define the special ResNet18 like this:

```python
resnet18_special = ResNet(
    # for simplicity, some required
    # parameters are omitted
    plugins=[
        dict(
            cfg=dict(
                type='ConvModule',
                kernel_size=3,
                stride=1,
                padding=1,
                norm_cfg=dict(type='BN'),
                act_cfg=dict(type='ReLU')),
            stages=(True, True, True, True),
            position='before_stage'),
        dict(
            cfg=dict(
                type='ConvModule',
                kernel_size=3,
                stride=1,
                padding=1,
                norm_cfg=dict(type='BN'),
                act_cfg=dict(type='ReLU')),
            stages=(True, True, False, True),
            position='after_stage')
    ])
```

- You can also insert more than one plugin in each port and those plugins will be executed in order. Let's take ResNet in [MASTER](https://arxiv.org/abs/1910.02562) as an example:

<details close>
<summary>Multiple Plugins Example</summary>

- ResNet in MASTER is based on ResNet31. After each stage, a module named `GCAModule` will be used. The `GCAModule` is inserted before the stage-wise convolution layer in ResNet31. In conclusion, there will be two plugins at the `after_stage` port at the same time.

```python
resnet_master = ResNet(
    # for simplicity, some required
    # parameters are omitted
    plugins=[
        dict(
            cfg=dict(type='Maxpool2d', kernel_size=2, stride=(2, 2)),
            stages=(True, True, False, False),
            position='before_stage'),
        dict(
            cfg=dict(type='Maxpool2d', kernel_size=(2, 1), stride=(2, 1)),
            stages=(False, False, True, False),
            position='before_stage'),
        dict(
            cfg=dict(type='GCAModule', kernel_size=3, stride=1, padding=1),
            stages=[True, True, True, True],
            position='after_stage'),
        dict(
            cfg=dict(
                type='ConvModule',
                kernel_size=3,
                stride=1,
                padding=1,
                norm_cfg=dict(type='BN'),
                act_cfg=dict(type='ReLU')),
            stages=(True, True, True, True),
            position='after_stage')
    ])
```

</details>

- In each plugin, we will pass two parameters (`in_channels`, `out_channels`) to support operations that need the information of current channels.
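
For illustration only, a stage-wise plugin that actually consumes these two arguments might look like the following sketch (a hypothetical module, not shipped with MMOCR): it simply remaps the feature map from `in_channels` to `out_channels` with a 1x1 convolution.

```python
import torch.nn as nn
from mmcv.cnn import PLUGIN_LAYERS


@PLUGIN_LAYERS.register_module()
class Conv1x1Projection(nn.Module):
    """Hypothetical stage-wise plugin relying on the channel information
    (`in_channels`, `out_channels`) passed in by the refactored ResNet."""

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.proj = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        # x: (N, in_channels, H, W) -> (N, out_channels, H, W)
        return self.proj(x)
```
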

#### Block-wise Plugin (Experimental)

- We also refactored the `BasicBlock` used in ResNet. Now it can be customized with block-wise plugins. Check [here](https://github.com/open-mmlab/mmocr/blob/72f945457324e700f0d14796dd10a51535c01a57/mmocr/models/textrecog/layers/conv_layer.py) for more details.

- BasicBlock is composed of two convolution layers in the main branch and a shortcut branch. We provide four ports to insert plugins.

```text
[port1: before_conv1] ---> [conv1] --->
[port2: after_conv1] ---> [conv2] --->
[port3: after_conv2] ---> +(shortcut) ---> [port4: after_shortcut]
```

- In each plugin, we will pass a parameter `in_channels` to support operations that need the information of current channels.

- E.g. Build a ResNet with customized BasicBlock with an additional convolution layer before conv1:

<details close>
<summary>Block-wise Plugin Example</summary>

```python
resnet_31 = ResNet(
    in_channels=3,
    stem_channels=[64, 128],
    block_cfgs=dict(type='BasicBlock'),
    arch_layers=[1, 2, 5, 3],
    arch_channels=[256, 256, 512, 512],
    strides=[1, 1, 1, 1],
    plugins=[
        dict(
            cfg=dict(type='Maxpool2d',
                     kernel_size=2,
                     stride=(2, 2)),
            stages=(True, True, False, False),
            position='before_stage'),
        dict(
            cfg=dict(type='Maxpool2d',
                     kernel_size=(2, 1),
                     stride=(2, 1)),
            stages=(False, False, True, False),
            position='before_stage'),
        dict(
            cfg=dict(
                type='ConvModule',
                kernel_size=3,
                stride=1,
                padding=1,
                norm_cfg=dict(type='BN'),
                act_cfg=dict(type='ReLU')),
            stages=(True, True, True, True),
            position='after_stage')
    ])
```

</details>

#### Full Examples

<details close>
<summary>ResNet without plugins</summary>

- ResNet45 is used in ASTER and ABINet without any plugins.

```python
resnet45_aster = ResNet(
    in_channels=3,
    stem_channels=[64, 128],
    block_cfgs=dict(type='BasicBlock', use_conv1x1='True'),
    arch_layers=[3, 4, 6, 6, 3],
    arch_channels=[32, 64, 128, 256, 512],
    strides=[(2, 2), (2, 2), (2, 1), (2, 1), (2, 1)])

resnet45_abi = ResNet(
    in_channels=3,
    stem_channels=32,
    block_cfgs=dict(type='BasicBlock', use_conv1x1='True'),
    arch_layers=[3, 4, 6, 6, 3],
    arch_channels=[32, 64, 128, 256, 512],
    strides=[2, 1, 2, 1, 1])
```

</details>
<details close>
<summary>ResNet with plugins</summary>

- ResNet31 is a typical architecture that uses stage-wise plugins. Before the first three stages, a max-pooling layer is used. After each stage, a convolution layer with BN and ReLU is used.

```python
resnet_31 = ResNet(
    in_channels=3,
    stem_channels=[64, 128],
    block_cfgs=dict(type='BasicBlock'),
    arch_layers=[1, 2, 5, 3],
    arch_channels=[256, 256, 512, 512],
    strides=[1, 1, 1, 1],
    plugins=[
        dict(
            cfg=dict(type='Maxpool2d',
                     kernel_size=2,
                     stride=(2, 2)),
            stages=(True, True, False, False),
            position='before_stage'),
        dict(
            cfg=dict(type='Maxpool2d',
                     kernel_size=(2, 1),
                     stride=(2, 1)),
            stages=(False, False, True, False),
            position='before_stage'),
        dict(
            cfg=dict(
                type='ConvModule',
                kernel_size=3,
                stride=1,
                padding=1,
                norm_cfg=dict(type='BN'),
                act_cfg=dict(type='ReLU')),
            stages=(True, True, True, True),
            position='after_stage')
    ])
```

</details>

### Migration Guide - Dataset Annotation Loader

The annotation loaders, `LmdbLoader` and `HardDiskLoader`, are unified into `AnnFileLoader` for a more consistent design and wider support on different file formats and storage backends. `AnnFileLoader` can load the annotations from `disk` (default), `http` and `petrel` backend, and parse the annotation in `txt` or `lmdb` format. `LmdbLoader` and `HardDiskLoader` are deprecated, and users are recommended to modify their configs to use the new `AnnFileLoader`. Users can migrate their legacy loader `HardDiskLoader` referring to the following example:

```python
# Legacy config
train = dict(
    type='OCRDataset',
    ...
    loader=dict(
        type='HardDiskLoader',
        ...))

# Suggested config
train = dict(
    type='OCRDataset',
    ...
    loader=dict(
        type='AnnFileLoader',
        file_storage_backend='disk',
        file_format='txt',
        ...))
```

Similarly, using `AnnFileLoader` with `file_format='lmdb'` instead of `LmdbLoader` is strongly recommended.
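
For instance, a legacy `LmdbLoader` section could be migrated along the same lines as the example above (a sketch; the omitted fields depend on your dataset):

```python
# Legacy config
train = dict(
    type='OCRDataset',
    ...
    loader=dict(
        type='LmdbLoader',
        ...))

# Suggested config
train = dict(
    type='OCRDataset',
    ...
    loader=dict(
        type='AnnFileLoader',
        file_storage_backend='disk',
        file_format='lmdb',
        ...))
```
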

### New Features & Enhancements

- Update mmcv install by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/775
- Upgrade isort by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/771
- Automatically infer device for inference if not specified by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/781
- Add open-mmlab precommit hooks by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/787
- Add windows CI by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/790
- Add CurvedSyntext150k Converter by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/719
- Add FUNSD Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/808
- Support loading annotation file with petrel/http backend by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/793
- Support different seeds on different ranks by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/820
- Support json in recognition converter by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/844
- Add args and docs for multi-machine training/testing by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/849
- Add warning info for LineStrParser by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/850
- Deploy openmmlab-bot by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/876
- Add Tesserocr Inference by @garvan2021 in https://github.com/open-mmlab/mmocr/pull/814
- Add LV Dataset Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/871
- Add SROIE Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/810
- Add NAF Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/815
- Add DeText Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/818
- Add IMGUR Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/825
- Add ILST Converter by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/833
- Add KAIST Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/835
- Add IC11 (Born-digital Images) Data Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/857
- Add IC13 (Focused Scene Text) Data Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/861
- Add BID Converter by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/862
- Add Vintext Converter by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/864
- Add MTWI Data Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/867
- Add COCO Text v2 Data Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/872
- Add ReCTS Data Converter by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/892
- Refactor ResNets by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/809

### Bug Fixes

- Bump mmdet version to 2.20.0 in Dockerfile by @GPhilo in https://github.com/open-mmlab/mmocr/pull/763
- Update mmdet version limit by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/773
- Minimum version requirement of albumentations by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/769
- Disable worker in the dataloader of gpu unit test by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/780
- Standardize the type of torch.device in ocr.py by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/800
- Use RECOGNIZER instead of DETECTORS by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/685
- Add num_classes to configs of ABINet by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/805
- Support loading space character from dict file by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/854
- Description in tools/data/utils/txt2lmdb.py by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/870
- ignore_index in SARLoss by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/869
- Fix a bug that may cause inplace operation error by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/884
- Use hyphen instead of underscores in script args by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/890

### Docs

- Add deprecation message for deploy tools by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/801
- Reorganizing OpenMMLab projects in readme by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/806
- Add demo/README_zh.md by @EighteenSprings in https://github.com/open-mmlab/mmocr/pull/802
- Add detailed version requirement table by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/778
- Correct misleading section title in training.md by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/819
- Update README_zh-CN document URL by @BeyondYourself in https://github.com/open-mmlab/mmocr/pull/823
- translate testing.md. by @yangrisheng in https://github.com/open-mmlab/mmocr/pull/822
- Fix confused description for load-from and resume-from by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/842
- Add documents getting_started in docs/zh by @BeyondYourself in https://github.com/open-mmlab/mmocr/pull/841
- Add the model serving translation document by @BeyondYourself in https://github.com/open-mmlab/mmocr/pull/845
- Update docs about installation on Windows by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/852
- Update tutorial notebook by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/853
- Update Instructions for New Data Converters by @xinke-wang in https://github.com/open-mmlab/mmocr/pull/900
- Brief installation instruction in README by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/897
- update doc for ILST, VinText, BID by @Mountchicken in https://github.com/open-mmlab/mmocr/pull/902
- Fix typos in readme by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/903
- Recog dataset doc by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/893
- Reorganize the directory structure section in det.md by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/894

### New Contributors

- @GPhilo made their first contribution in https://github.com/open-mmlab/mmocr/pull/763
- @xinke-wang made their first contribution in https://github.com/open-mmlab/mmocr/pull/801
- @EighteenSprings made their first contribution in https://github.com/open-mmlab/mmocr/pull/802
- @BeyondYourself made their first contribution in https://github.com/open-mmlab/mmocr/pull/823
- @yangrisheng made their first contribution in https://github.com/open-mmlab/mmocr/pull/822
- @Mountchicken made their first contribution in https://github.com/open-mmlab/mmocr/pull/844
- @garvan2021 made their first contribution in https://github.com/open-mmlab/mmocr/pull/814

**Full Changelog**: https://github.com/open-mmlab/mmocr/compare/v0.4.1...v0.5.0

## v0.4.1 (27/01/2022)

### Highlights

1. Visualizing edge weights in OpenSet KIE is now supported! https://github.com/open-mmlab/mmocr/pull/677
2. Some configurations have been optimized to significantly speed up the training and testing processes! Don't worry - you can still tune these parameters in case these modifications do not work. https://github.com/open-mmlab/mmocr/pull/757
3. Now you can use CPU to train/debug your model! https://github.com/open-mmlab/mmocr/pull/752
4. We have fixed a severe bug that prevented users from calling `mmocr.apis.test` with our pre-built wheels. https://github.com/open-mmlab/mmocr/pull/667

### New Features & Enhancements

- Show edge score for openset kie by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/677
- Download flake8 from github as pre-commit hooks by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/695
- Deprecate the support for 'python setup.py test' by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/722
- Disable multi-processing feature of cv2 to speed up data loading by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/721
- Extend ctw1500 converter to support text fields by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/729
- Extend totaltext converter to support text fields by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/728
- Speed up training by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/739
- Add setup multi-processing both in train and test.py by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/757
- Support CPU training/testing by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/752
- Support specify gpu for testing and training with gpu-id instead of gpu-ids and gpus by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/756
- Remove unnecessary custom_import from test.py by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/758

### Bug Fixes

- Fix satrn onnxruntime test by @AllentDan in https://github.com/open-mmlab/mmocr/pull/679
- Support both ConcatDataset and UniformConcatDataset by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/675
- Fix bugs of show_results in single_gpu_test by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/667
- Fix a bug for sar decoder when bi-rnn is used by @MhLiao in https://github.com/open-mmlab/mmocr/pull/690
- Fix opencv version to avoid some bugs by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/694
- Fix py39 ci error by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/707
- Update visualize.py by @TommyZihao in https://github.com/open-mmlab/mmocr/pull/715
- Fix link of config by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/726
- Use yaml.safe_load instead of load by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/753
- Add necessary keys to test_pipelines to enable test-time visualization by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/754

### Docs

- Fix recog.md by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/674
- Add config tutorial by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/683
- Add MMSelfSup/MMRazor/MMDeploy in readme by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/692
- Add recog & det model summary by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/693
- Update docs link by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/710
- add pull request template.md by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/711
- Add website links to readme by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/731
- update readme according to standard by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/742

### New Contributors

- @MhLiao made their first contribution in https://github.com/open-mmlab/mmocr/pull/690
- @TommyZihao made their first contribution in https://github.com/open-mmlab/mmocr/pull/715

**Full Changelog**: https://github.com/open-mmlab/mmocr/compare/v0.4.0...v0.4.1

## v0.4.0 (15/12/2021)

### Highlights

1. We release a new text recognition model - [ABINet](https://arxiv.org/pdf/2103.06495.pdf) (CVPR 2021, Oral). With its dedicated model design and useful data augmentation transforms, ABINet can achieve the best performance on irregular text recognition tasks. [Check it out!](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#read-like-humans-autonomous-bidirectional-and-iterative-language-modeling-for-scene-text-recognition)
2. We are also working hard to fulfill the requests from our community. [OpenSet KIE](https://mmocr.readthedocs.io/en/latest/kie_models.html#wildreceiptopenset) is one of the achievements, which extends the application of SDMGR from text node classification to node-pair relation extraction. We also provide a demo script to convert WildReceipt to the open set domain, though it cannot take full advantage of the OpenSet format. For more information, please read our [tutorial](https://mmocr.readthedocs.io/en/latest/tutorials/kie_closeset_openset.html).
3. APIs of models can be exposed through TorchServe. [Docs](https://mmocr.readthedocs.io/en/latest/model_serving.html)

### Breaking Changes & Migration Guide

#### Postprocessor

Some refactoring processes are still going on. For all text detection models, we unified their `decode` implementations into a new module category, `POSTPROCESSOR`, which is responsible for decoding different raw outputs into boundary instances. In all text detection configs, the `text_repr_type` argument in `bbox_head` is deprecated and will be removed in a future release.

**Migration Guide**: Find a similar line from detection model's config:

```
text_repr_type=xxx,
```

And replace it with

```
postprocessor=dict(type='{MODEL_NAME}Postprocessor', text_repr_type=xxx)),
```

Take a snippet of PANet's config as an example. Before the change, its config for `bbox_head` looks like:

```
    bbox_head=dict(
        type='PANHead',
        text_repr_type='poly',
        in_channels=[128, 128, 128, 128],
        out_channels=6,
        module_loss=dict(type='PANModuleLoss')),
```

Afterwards:

```
    bbox_head=dict(
        type='PANHead',
        in_channels=[128, 128, 128, 128],
        out_channels=6,
        module_loss=dict(type='PANModuleLoss'),
        postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
```

There are other postprocessors and each takes different arguments. Interested users can find their interfaces or implementations in `mmocr/models/textdet/postprocess` or through our [api docs](https://mmocr.readthedocs.io/en/latest/api.html#textdet-postprocess).

#### New Config Structure

We reorganized the `configs/` directory by extracting reusable sections into `configs/_base_`. Now the directory tree of `configs/_base_` is organized as follows:

```
_base_
├── det_datasets
├── det_models
├── det_pipelines
├── recog_datasets
├── recog_models
├── recog_pipelines
└── schedules
```

Most model configs now make full use of base configs, which makes the overall structure clearer and facilitates fair
comparison across models. Despite the seemingly significant hierarchical difference, **these changes would not break the backward compatibility** as the names of model configs remain the same.
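
For example, a model config can now be assembled almost entirely from these base files. The file names below are illustrative placeholders rather than paths copied from the repository:

```python
# configs/textdet/dbnet/dbnet_r18_fpnc_1200e_icdar2015.py (illustrative)
_base_ = [
    '../../_base_/schedules/schedule_1200e.py',      # placeholder schedule
    '../../_base_/det_models/dbnet_r18_fpnc.py',     # placeholder model
    '../../_base_/det_datasets/icdar2015.py',        # placeholder dataset
    '../../_base_/det_pipelines/dbnet_pipeline.py',  # placeholder pipeline
]
# Only model- or experiment-specific overrides need to live in this file.
```
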

### New Features

- Support openset kie by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/498
- Add converter for the Open Images v5 text annotations by Krylov et al. by @baudm in https://github.com/open-mmlab/mmocr/pull/497
- Support Chinese for kie show result by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/464
- Add TorchServe support for text detection and recognition by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/522
- Save filename in text detection test results by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/570
- Add codespell pre-commit hook and fix typos by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/520
- Avoid duplicate placeholder docs in CN by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/582
- Save results to json file for kie. by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/589
- Add SAR_CN to ocr.py by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/579
- mim extension for windows by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/641
- Support multiple pipelines for different datasets by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/657
- ABINet Framework by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/651

### Refactoring

- Refactor textrecog config structure by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/617
- Refactor text detection config by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/626
- refactor transformer modules by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/618
- refactor textdet postprocess by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/640

### Docs

- C++ example section by @apiaccess21 in https://github.com/open-mmlab/mmocr/pull/593
- install.md Chinese section by @A465539338 in https://github.com/open-mmlab/mmocr/pull/364
- Add Chinese Translation of deployment.md. by @fatfishZhao in https://github.com/open-mmlab/mmocr/pull/506
- Fix a model link and add the metafile for SATRN by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/473
- Improve docs style by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/474
- Enhancement & sync Chinese docs by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/492
- TorchServe docs by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/539
- Update docs menu by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/564
- Docs for KIE CloseSet & OpenSet by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/573
- Fix broken links by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/576
- Docstring for text recognition models by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/562
- Add MMFlow & MIM by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/597
- Add MMFewShot by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/621
- Update model readme by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/604
- Add input size check to model_inference by @mpena-vina in https://github.com/open-mmlab/mmocr/pull/633
- Docstring for textdet models by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/561
- Add MMHuman3D in readme by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/644
- Use shared menu from theme instead by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/655
- Refactor docs structure by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/662
- Docs fix by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/664

### Enhancements

- Use bounding box around polygon instead of within polygon by @alexander-soare in https://github.com/open-mmlab/mmocr/pull/469
- Add CITATION.cff by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/476
- Add py3.9 CI by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/475
- update model-index.yml by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/484
- Use container in CI by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/502
- CircleCI Setup by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/611
- Remove unnecessary custom_import from train.py by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/603
- Change the upper version of mmcv to 1.5.0 by @zhouzaida in https://github.com/open-mmlab/mmocr/pull/628
- Update CircleCI by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/631
- Pass custom_hooks to MMCV by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/609
- Skip CI when some specific files were changed by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/642
- Add markdown linter in pre-commit hook by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/643
- Use shape from loaded image by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/652
- Cancel previous runs that are not completed by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/666

### Bug Fixes

- Modify algorithm "sar" weights path in metafile by @ShoupingShan in https://github.com/open-mmlab/mmocr/pull/581
- Fix Cuda CI by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/472
- Fix image export in test.py for KIE models by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/486
- Allow invalid polygons in intersection and union by default by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/471
- Update checkpoints' links for SATRN by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/518
- Fix converting to onnx bug because of changing key from img_shape to resize_shape by @Harold-lkk in https://github.com/open-mmlab/mmocr/pull/523
- Fix PyTorch 1.6 incompatible checkpoints by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/540
- Fix paper field in metafiles by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/550
- Unify recognition task names in metafiles by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/548
- Fix py3.9 CI by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/563
- Always map location to cpu when loading checkpoint by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/567
- Fix wrong model builder in recog_test_imgs by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/574
- Improve dbnet r50 by fixing img std by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/578
- Fix resource warning: unclosed file by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/577
- Fix bug that same start_point for different texts in draw_texts_by_pil by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/587
- Keep original texts for kie by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/588
- Fix random seed by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/600
- Fix DBNet_r50 config by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/625
- Change SBC case to DBC case by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/632
- Fix kie demo by @innerlee in https://github.com/open-mmlab/mmocr/pull/610
- fix type check by @cuhk-hbsun in https://github.com/open-mmlab/mmocr/pull/650
- Remove depreciated image validator in totaltext converter by @gaotongxiao in https://github.com/open-mmlab/mmocr/pull/661
- Fix change locals() dict by @Fei-Wang in https://github.com/open-mmlab/mmocr/pull/663
- fix #614: textsnake targets by @HolyCrap96 in https://github.com/open-mmlab/mmocr/pull/660

### New Contributors

- @alexander-soare made their first contribution in https://github.com/open-mmlab/mmocr/pull/469
- @A465539338 made their first contribution in https://github.com/open-mmlab/mmocr/pull/364
- @fatfishZhao made their first contribution in https://github.com/open-mmlab/mmocr/pull/506
- @baudm made their first contribution in https://github.com/open-mmlab/mmocr/pull/497
- @ShoupingShan made their first contribution in https://github.com/open-mmlab/mmocr/pull/581
- @apiaccess21 made their first contribution in https://github.com/open-mmlab/mmocr/pull/593
- @zhouzaida made their first contribution in https://github.com/open-mmlab/mmocr/pull/628
- @mpena-vina made their first contribution in https://github.com/open-mmlab/mmocr/pull/633
- @Fei-Wang made their first contribution in https://github.com/open-mmlab/mmocr/pull/663

**Full Changelog**: https://github.com/open-mmlab/mmocr/compare/v0.3.0...0.4.0

## v0.3.0 (25/8/2021)

### Highlights

1. We add a new text recognition model -- SATRN! Its pretrained checkpoint achieves the best performance over other provided text recognition models. A lighter version of SATRN is also released which can obtain ~98% of the performance of the original model with only 45 MB in size. ([@2793145003](https://github.com/2793145003)) [#405](https://github.com/open-mmlab/mmocr/pull/405)
2. Improve the demo script, `ocr.py`, which supports applying end-to-end text detection, text recognition and key information extraction models on images with easy-to-use commands. Users can find its full documentation in the demo section. ([@samayala22](https://github.com/samayala22), [@manjrekarom](https://github.com/manjrekarom)) [#371](https://github.com/open-mmlab/mmocr/pull/371), [#386](https://github.com/open-mmlab/mmocr/pull/386), [#400](https://github.com/open-mmlab/mmocr/pull/400), [#374](https://github.com/open-mmlab/mmocr/pull/374), [#428](https://github.com/open-mmlab/mmocr/pull/428)
3. Our documentation is reorganized into a clearer structure. More useful contents are on the way! [#409](https://github.com/open-mmlab/mmocr/pull/409), [#454](https://github.com/open-mmlab/mmocr/pull/454)
4. The requirement of `Polygon3` is removed since this project is no longer maintained or distributed. We unified all its references to equivalent substitutions in `shapely` instead. [#448](https://github.com/open-mmlab/mmocr/pull/448)

### Breaking Changes & Migration Guide

1. Upgrade version requirement of MMDetection to 2.14.0 to avoid bugs [#382](https://github.com/open-mmlab/mmocr/pull/382)
2. MMOCR now has its own model and layer registries inherited from MMDetection's or MMCV's counterparts. ([#436](https://github.com/open-mmlab/mmocr/pull/436)) The modified hierarchical structure of the model registries is now organized as follows.

```text
mmcv.MODELS -> mmdet.BACKBONES -> BACKBONES
mmcv.MODELS -> mmdet.NECKS -> NECKS
mmcv.MODELS -> mmdet.ROI_EXTRACTORS -> ROI_EXTRACTORS
mmcv.MODELS -> mmdet.HEADS -> HEADS
mmcv.MODELS -> mmdet.LOSSES -> LOSSES
mmcv.MODELS -> mmdet.DETECTORS -> DETECTORS
mmcv.ACTIVATION_LAYERS -> ACTIVATION_LAYERS
mmcv.UPSAMPLE_LAYERS -> UPSAMPLE_LAYERS
```

To migrate your old implementation to our new backend, you need to change the import path of any registries and their corresponding builder functions (including `build_detectors`) from `mmdet.models.builder` to `mmocr.models.builder`. If you have referred to any model or layer of MMDetection or MMCV in your model config, you need to add `mmdet.` or `mmcv.` prefix to its name to inform the model builder of the right namespace to work on.
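
A minimal sketch of what this migration could look like, assuming a custom detector registered in your own code and a config that borrows an MMDetection module (the specific model fields are illustrative, not copied from a real config):

```python
# Before v0.3.0 the registries and builders came from MMDetection:
#   from mmdet.models.builder import DETECTORS, build_detector
# From v0.3.0 on they should be imported from MMOCR instead:
from mmocr.models.builder import DETECTORS, build_detector

# And any config entry that refers to a model or layer of MMDetection or MMCV
# now needs an explicit scope prefix, e.g. (illustrative snippet):
model = dict(
    type='DBNet',
    backbone=dict(type='mmdet.ResNet', depth=18),  # 'mmdet.' prefix added
    ...)
```
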

Interested users may check out [MMCV's tutorial on Registry](https://mmcv.readthedocs.io/en/latest/understand_mmcv/registry.html) for in-depth explanations on its mechanism.

### New Features

- Automatically replace SyncBN with BN for inference [#420](https://github.com/open-mmlab/mmocr/pull/420), [#453](https://github.com/open-mmlab/mmocr/pull/453)
- Support batch inference for CRNN and SegOCR [#407](https://github.com/open-mmlab/mmocr/pull/407)
- Support exporting documentation in pdf or epub format [#406](https://github.com/open-mmlab/mmocr/pull/406)
- Support `persistent_workers` option in data loader [#459](https://github.com/open-mmlab/mmocr/pull/459)

### Bug Fixes

- Remove depreciated key in kie_test_imgs.py [#381](https://github.com/open-mmlab/mmocr/pull/381)
- Fix dimension mismatch in batch testing/inference of DBNet [#383](https://github.com/open-mmlab/mmocr/pull/383)
- Fix the problem of dice loss which stays at 1 with an empty target given [#408](https://github.com/open-mmlab/mmocr/pull/408)
- Fix a wrong link in ocr.py ([@naarkhoo](https://github.com/naarkhoo)) [#417](https://github.com/open-mmlab/mmocr/pull/417)
- Fix undesired assignment to "pretrained" in test.py [#418](https://github.com/open-mmlab/mmocr/pull/418)
- Fix a problem in polygon generation of DBNet [#421](https://github.com/open-mmlab/mmocr/pull/421), [#443](https://github.com/open-mmlab/mmocr/pull/443)
- Skip invalid annotations in totaltext_converter [#438](https://github.com/open-mmlab/mmocr/pull/438)
- Add zero division handler in poly utils, remove Polygon3 [#448](https://github.com/open-mmlab/mmocr/pull/448)

### Improvements

- Replace lanms-proper with lanms-neo to support installation on Windows (with special thanks to [@gen-ko](https://github.com/gen-ko) who has re-distributed this package!)
- Support MIM [#394](https://github.com/open-mmlab/mmocr/pull/394)
- Add tests for PyTorch 1.9 in CI [#401](https://github.com/open-mmlab/mmocr/pull/401)
- Enables fullscreen layout in readthedocs [#413](https://github.com/open-mmlab/mmocr/pull/413)
- General documentation enhancement [#395](https://github.com/open-mmlab/mmocr/pull/395)
- Update version checker [#427](https://github.com/open-mmlab/mmocr/pull/427)
- Add copyright info [#439](https://github.com/open-mmlab/mmocr/pull/439)
- Update citation information [#440](https://github.com/open-mmlab/mmocr/pull/440)

### Contributors

We thank [@2793145003](https://github.com/2793145003), [@samayala22](https://github.com/samayala22), [@manjrekarom](https://github.com/manjrekarom), [@naarkhoo](https://github.com/naarkhoo), [@gen-ko](https://github.com/gen-ko), [@duanjiaqi](https://github.com/duanjiaqi), [@gaotongxiao](https://github.com/gaotongxiao), [@cuhk-hbsun](https://github.com/cuhk-hbsun), [@innerlee](https://github.com/innerlee), [@wdsd641417025](https://github.com/wdsd641417025) for their contribution to this release!

## v0.2.1 (20/7/2021)

### Highlights

1. Upgrade to use MMCV-full **>= 1.3.8** and MMDetection **>= 2.13.0** for latest features
2. Add ONNX and TensorRT export tool, supporting the deployment of DBNet, PSENet, PANet and CRNN (experimental) [#278](https://github.com/open-mmlab/mmocr/pull/278), [#291](https://github.com/open-mmlab/mmocr/pull/291), [#300](https://github.com/open-mmlab/mmocr/pull/300), [#328](https://github.com/open-mmlab/mmocr/pull/328)
3. Unified parameter initialization method which uses init_cfg in config files [#365](https://github.com/open-mmlab/mmocr/pull/365)
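
To illustrate highlight 3, a module can now declare its initialization directly in the config instead of overriding `init_weights`. The snippet below follows MMCV's `init_cfg` convention and is an illustrative assumption, not a line copied from an MMOCR config:

```python
# A hypothetical head config using the unified init_cfg mechanism
bbox_head=dict(
    type='PANHead',
    in_channels=[128, 128, 128, 128],
    out_channels=6,
    # initialize every Conv2d layer from N(0, 0.01); other layers keep defaults
    init_cfg=dict(type='Normal', mean=0, std=0.01, layer='Conv2d'))
```
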

### New Features

- Support TextOCR dataset [#293](https://github.com/open-mmlab/mmocr/pull/293)
- Support Total-Text dataset [#266](https://github.com/open-mmlab/mmocr/pull/266), [#273](https://github.com/open-mmlab/mmocr/pull/273), [#357](https://github.com/open-mmlab/mmocr/pull/357)
- Support grouping text detection box into lines [#290](https://github.com/open-mmlab/mmocr/pull/290), [#304](https://github.com/open-mmlab/mmocr/pull/304)
- Add benchmark_processing script that benchmarks data loading process [#261](https://github.com/open-mmlab/mmocr/pull/261)
- Add SynthText preprocessor for text recognition models [#351](https://github.com/open-mmlab/mmocr/pull/351), [#361](https://github.com/open-mmlab/mmocr/pull/361)
- Support batch inference during testing [#310](https://github.com/open-mmlab/mmocr/pull/310)
- Add user-friendly OCR inference script [#366](https://github.com/open-mmlab/mmocr/pull/366)

### Bug Fixes

- Fix improper class ignorance in SDMGR Loss [#221](https://github.com/open-mmlab/mmocr/pull/221)
- Fix potential numerical zero division error in DRRG [#224](https://github.com/open-mmlab/mmocr/pull/224)
- Fix installing requirements with pip and mim [#242](https://github.com/open-mmlab/mmocr/pull/242)
- Fix dynamic input error of DBNet [#269](https://github.com/open-mmlab/mmocr/pull/269)
- Fix space parsing error in LineStrParser [#285](https://github.com/open-mmlab/mmocr/pull/285)
- Fix textsnake decode error [#264](https://github.com/open-mmlab/mmocr/pull/264)
- Correct isort setup [#288](https://github.com/open-mmlab/mmocr/pull/288)
- Fix a bug in SDMGR config [#316](https://github.com/open-mmlab/mmocr/pull/316)
- Fix kie_test_img for KIE nonvisual [#319](https://github.com/open-mmlab/mmocr/pull/319)
- Fix metafiles [#342](https://github.com/open-mmlab/mmocr/pull/342)
- Fix different device problem in FCENet [#334](https://github.com/open-mmlab/mmocr/pull/334)
- Ignore improper tailing empty characters in annotation files [#358](https://github.com/open-mmlab/mmocr/pull/358)
- Docs fixes [#247](https://github.com/open-mmlab/mmocr/pull/247), [#255](https://github.com/open-mmlab/mmocr/pull/255), [#265](https://github.com/open-mmlab/mmocr/pull/265), [#267](https://github.com/open-mmlab/mmocr/pull/267), [#268](https://github.com/open-mmlab/mmocr/pull/268), [#270](https://github.com/open-mmlab/mmocr/pull/270), [#276](https://github.com/open-mmlab/mmocr/pull/276), [#287](https://github.com/open-mmlab/mmocr/pull/287), [#330](https://github.com/open-mmlab/mmocr/pull/330), [#355](https://github.com/open-mmlab/mmocr/pull/355), [#367](https://github.com/open-mmlab/mmocr/pull/367)
- Fix NRTR config [#356](https://github.com/open-mmlab/mmocr/pull/356), [#370](https://github.com/open-mmlab/mmocr/pull/370)

### Improvements

- Add backend for resizeocr [#244](https://github.com/open-mmlab/mmocr/pull/244)
- Skip image processing pipelines in SDMGR novisual [#260](https://github.com/open-mmlab/mmocr/pull/260)
- Speedup DBNet [#263](https://github.com/open-mmlab/mmocr/pull/263)
- Update mmcv installation method in workflow [#323](https://github.com/open-mmlab/mmocr/pull/323)
- Add part of Chinese documentations [#353](https://github.com/open-mmlab/mmocr/pull/353), [#362](https://github.com/open-mmlab/mmocr/pull/362)
- Add support for ConcatDataset with two workflows [#348](https://github.com/open-mmlab/mmocr/pull/348)
- Add list_from_file and list_to_file utils [#226](https://github.com/open-mmlab/mmocr/pull/226)
- Speed up sort_vertex [#239](https://github.com/open-mmlab/mmocr/pull/239)
- Support distributed evaluation of KIE [#234](https://github.com/open-mmlab/mmocr/pull/234)
- Add pretrained FCENet on IC15 [#258](https://github.com/open-mmlab/mmocr/pull/258)
- Support CPU for OCR demo [#227](https://github.com/open-mmlab/mmocr/pull/227)
- Avoid extra image pre-processing steps [#375](https://github.com/open-mmlab/mmocr/pull/375)

## v0.2.0 (18/5/2021)

### Highlights

1. Add the NER approach Bert-softmax (NAACL'2019)
2. Add the text detection method DRRG (CVPR'2020)
3. Add the text detection method FCENet (CVPR'2021)
4. Increase ease of use by adding an end-to-end text detection and recognition demo, as well as a Colab online demo.
5. Simplify the installation.

### New Features

- Add Bert-softmax for NER task [#148](https://github.com/open-mmlab/mmocr/pull/148)
- Add DRRG [#189](https://github.com/open-mmlab/mmocr/pull/189)
- Add FCENet [#133](https://github.com/open-mmlab/mmocr/pull/133)
- Add end-to-end demo [#105](https://github.com/open-mmlab/mmocr/pull/105)
- Support batch inference [#86](https://github.com/open-mmlab/mmocr/pull/86) [#87](https://github.com/open-mmlab/mmocr/pull/87) [#178](https://github.com/open-mmlab/mmocr/pull/178)
- Add TPS preprocessor for text recognition [#117](https://github.com/open-mmlab/mmocr/pull/117) [#135](https://github.com/open-mmlab/mmocr/pull/135)
- Add demo documentation [#151](https://github.com/open-mmlab/mmocr/pull/151) [#166](https://github.com/open-mmlab/mmocr/pull/166) [#168](https://github.com/open-mmlab/mmocr/pull/168) [#170](https://github.com/open-mmlab/mmocr/pull/170) [#171](https://github.com/open-mmlab/mmocr/pull/171)
- Add checkpoint for Chinese recognition [#156](https://github.com/open-mmlab/mmocr/pull/156)
- Add metafile [#175](https://github.com/open-mmlab/mmocr/pull/175) [#176](https://github.com/open-mmlab/mmocr/pull/176) [#177](https://github.com/open-mmlab/mmocr/pull/177) [#182](https://github.com/open-mmlab/mmocr/pull/182) [#183](https://github.com/open-mmlab/mmocr/pull/183)
- Add support for numpy array inference [#74](https://github.com/open-mmlab/mmocr/pull/74)

### Bug Fixes

- Fix the duplicated point bug due to transform for textsnake [#130](https://github.com/open-mmlab/mmocr/pull/130)
- Fix CTC loss NaN [#159](https://github.com/open-mmlab/mmocr/pull/159)
- Fix error raised if result is empty in demo [#144](https://github.com/open-mmlab/mmocr/pull/141)
- Fix results missing if one image has a large number of boxes [#98](https://github.com/open-mmlab/mmocr/pull/98)
- Fix package missing in dockerfile [#109](https://github.com/open-mmlab/mmocr/pull/109)

### Improvements

- Simplify installation procedure via removing compiling [#188](https://github.com/open-mmlab/mmocr/pull/188)
- Speed up PANet post-processing so that it can detect dense texts [#188](https://github.com/open-mmlab/mmocr/pull/188)
- Add zh-CN README [#70](https://github.com/open-mmlab/mmocr/pull/70) [#95](https://github.com/open-mmlab/mmocr/pull/95)
- Support Windows [#89](https://github.com/open-mmlab/mmocr/pull/89)
- Add Colab [#147](https://github.com/open-mmlab/mmocr/pull/147) [#199](https://github.com/open-mmlab/mmocr/pull/199)
- Add 1-step installation using conda environment [#193](https://github.com/open-mmlab/mmocr/pull/193) [#194](https://github.com/open-mmlab/mmocr/pull/194) [#195](https://github.com/open-mmlab/mmocr/pull/195)

## v0.1.0 (7/4/2021)

### Highlights

- MMOCR is released.

### Main Features

- Support text detection, text recognition and the corresponding downstream tasks such as key information extraction.
- For text detection, support both single-step (`PSENet`, `PANet`, `DBNet`, `TextSnake`) and two-step (`MaskRCNN`) methods.
- For text recognition, support CTC-loss based method `CRNN`; Encoder-decoder (with attention) based methods `SAR`, `Robustscanner`; Segmentation based method `SegOCR`; Transformer based method `NRTR`.
- For key information extraction, support GCN based method `SDMG-R`.
- Provide checkpoints and log files for all of the methods above.