mirror of https://github.com/open-mmlab/mmocr.git
commit d1cc0dad44
@@ -0,0 +1,3 @@
[run]
omit =
    */__init__.py
@@ -0,0 +1,76 @@
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at chenkaidev@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
@@ -0,0 +1 @@
We appreciate all contributions to improve MMOCR. Please refer to [CONTRIBUTING.md](/docs/contributing.md) in MMCV for more details about the contributing guidelines.
@@ -0,0 +1,68 @@
name: build

on:
  push:
    branches:
      - master

  pull_request:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - name: Install pre-commit hook
        run: |
          pip install pre-commit
          pre-commit install
      - name: Linting
        run: pre-commit run --all-files
      - name: Check docstring coverage
        run: |
          pip install interrogate
          interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 50 mmocr

  build_cpu:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.7]
        torch: [1.5.0, 1.6.0, 1.7.0]
        include:
          - torch: 1.5.0
            torchvision: 0.6.0
          - torch: 1.6.0
            torchvision: 0.7.0
          - torch: 1.7.0
            torchvision: 0.8.1
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip
        run: pip install pip --upgrade
      - name: Install Pillow
        run: pip install Pillow==6.2.2
        if: ${{matrix.torchvision == '0.4.1'}}
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
      - name: Install MMCV
        run: pip install mmcv-full==1.3.0 -f https://download.openmmlab.com/mmcv/dist/cpu/torch${{matrix.torch}}/index.html
      - name: Install MMDet
        run: pip install git+https://github.com/open-mmlab/mmdetection/
      - name: Install other dependencies
        run: pip install -r requirements.txt
      - name: Build and install
        run: rm -rf .eggs && pip install -e .
      - name: Run unittests and generate coverage report
        run: |
          coverage run --branch --source mmocr -m pytest tests/
          coverage xml
          coverage report -m
@@ -0,0 +1,20 @@
name: deploy

on: push

jobs:
  build-n-publish:
    runs-on: ubuntu-latest
    if: startsWith(github.event.ref, 'refs/tags')
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v1
        with:
          python-version: 3.7
      - name: Build MMOCR
        run: python setup.py sdist
      - name: Publish distribution to PyPI
        run: |
          pip install twine
          twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}
@@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.ipynb

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# cython generated cpp
!data/dict
data/*
.vscode
.idea

# custom
*.pkl
*.pkl.json
*.log.json
work_dirs/
exps/
*~
show_dir/

# Pytorch
*.pth

# demo
!tests/data
tests/results

#temp files
.DS_Store

checkpoints

htmlcov
*.swp
log.txt
workspace.code-workspace
results
@@ -0,0 +1,36 @@
exclude: ^tests/data/
repos:
  - repo: https://gitlab.com/pycqa/flake8
    rev: 3.8.1
    hooks:
      - id: flake8
  - repo: https://github.com/asottile/seed-isort-config
    rev: v2.2.0
    hooks:
      - id: seed-isort-config
  - repo: https://github.com/timothycrosley/isort
    rev: 4.3.21
    hooks:
      - id: isort
  - repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.30.0
    hooks:
      - id: yapf
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.1.0
    hooks:
      - id: trailing-whitespace
      - id: check-yaml
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
      - id: double-quote-string-fixer
      - id: check-merge-conflict
      - id: fix-encoding-pragma
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: https://github.com/myint/docformatter
    rev: v1.3.1
    hooks:
      - id: docformatter
        args: ["--in-place", "--wrap-descriptions", "79"]
@@ -0,0 +1,621 @@
[MASTER]

# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=

# Specify a score threshold to be exceeded before program exits with error.
fail-under=10.0

# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS,configs

# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=

# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=

# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use.
jobs=1

# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100

# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=

# Pickle collected data for later comparisons.
persistent=yes

# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes

# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no


[MESSAGES CONTROL]

# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
confidence=

# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=print-statement,
        parameter-unpacking,
        unpacking-in-except,
        old-raise-syntax,
        backtick,
        long-suffix,
        old-ne-operator,
        old-octal-literal,
        import-star-module-level,
        non-ascii-bytes-literal,
        raw-checker-failed,
        bad-inline-option,
        locally-disabled,
        file-ignored,
        suppressed-message,
        useless-suppression,
        deprecated-pragma,
        use-symbolic-message-instead,
        apply-builtin,
        basestring-builtin,
        buffer-builtin,
        cmp-builtin,
        coerce-builtin,
        execfile-builtin,
        file-builtin,
        long-builtin,
        raw_input-builtin,
        reduce-builtin,
        standarderror-builtin,
        unicode-builtin,
        xrange-builtin,
        coerce-method,
        delslice-method,
        getslice-method,
        setslice-method,
        no-absolute-import,
        old-division,
        dict-iter-method,
        dict-view-method,
        next-method-called,
        metaclass-assignment,
        indexing-exception,
        raising-string,
        reload-builtin,
        oct-method,
        hex-method,
        nonzero-method,
        cmp-method,
        input-builtin,
        round-builtin,
        intern-builtin,
        unichr-builtin,
        map-builtin-not-iterating,
        zip-builtin-not-iterating,
        range-builtin-not-iterating,
        filter-builtin-not-iterating,
        using-cmp-argument,
        eq-without-hash,
        div-method,
        idiv-method,
        rdiv-method,
        exception-message-attribute,
        invalid-str-codec,
        sys-max-int,
        bad-python3-import,
        deprecated-string-function,
        deprecated-str-translate-call,
        deprecated-itertools-function,
        deprecated-types-field,
        next-method-defined,
        dict-items-not-iterating,
        dict-keys-not-iterating,
        dict-values-not-iterating,
        deprecated-operator-function,
        deprecated-urllib-function,
        xreadlines-attribute,
        deprecated-sys-function,
        exception-escape,
        comprehension-escape,
        no-member,
        invalid-name,
        too-many-branches,
        wrong-import-order,
        too-many-arguments,
        missing-function-docstring,
        missing-module-docstring,
        too-many-locals,
        too-few-public-methods,
        abstract-method,
        broad-except,
        too-many-nested-blocks,
        too-many-instance-attributes,
        missing-class-docstring,
        duplicate-code,
        not-callable,
        protected-access,
        dangerous-default-value,
        no-name-in-module,
        logging-fstring-interpolation,
        super-init-not-called,
        redefined-builtin,
        attribute-defined-outside-init,
        arguments-differ,
        cyclic-import,
        bad-super-call,
        too-many-statements

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member


[REPORTS]

# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
# which contain the number of messages in each category, as well as 'statement'
# which is the total number of statements analyzed. This score is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
#msg-template=

# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text

# Tells whether to display a full report or only the messages.
reports=no

# Activate the evaluation score.
score=yes


[REFACTORING]

# Maximum number of nested blocks for function / method body
max-nested-blocks=5

# Complete name of functions that never returns. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit


[TYPECHECK]

# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=

# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes

# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes

# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes

# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local

# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=

# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes

# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1

# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1

# List of decorators that change the signature of a decorated function.
signature-mutators=


[SPELLING]

# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4

# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=

# List of comma separated words that should not be checked.
spelling-ignore-words=

# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=

# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no


[LOGGING]

# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old

# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging


[VARIABLES]

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=

# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes

# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
          _cb

# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_

# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_

# Tells whether we should check for unused import in __init__ files.
init-import=no

# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io


[FORMAT]

# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$

# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4

# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '

# Maximum number of characters on a single line.
max-line-length=100

# Maximum number of lines in a module.
max-module-lines=1000

# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no

# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no


[STRING]

# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no

# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no


[SIMILARITIES]

# Ignore comments when computing similarities.
ignore-comments=yes

# Ignore docstrings when computing similarities.
ignore-docstrings=yes

# Ignore imports when computing similarities.
ignore-imports=no

# Minimum lines number of a similarity.
min-similarity-lines=4


[MISCELLANEOUS]

# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
      XXX,
      TODO

# Regular expression of note tags to take in consideration.
#notes-rgx=


[BASIC]

# Naming style matching correct argument names.
argument-naming-style=snake_case

# Regular expression matching correct argument names. Overrides argument-
# naming-style.
#argument-rgx=

# Naming style matching correct attribute names.
attr-naming-style=snake_case

# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=

# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
          bar,
          baz,
          toto,
          tutu,
          tata

# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=

# Naming style matching correct class attribute names.
class-attribute-naming-style=any

# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=

# Naming style matching correct class names.
class-naming-style=PascalCase

# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=

# Naming style matching correct constant names.
const-naming-style=UPPER_CASE

# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=

# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1

# Naming style matching correct function names.
function-naming-style=snake_case

# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=

# Good variable names which should always be accepted, separated by a comma.
good-names=i,
           j,
           k,
           ex,
           Run,
           _,
           x,
           y,
           w,
           h,
           a,
           b

# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=

# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no

# Naming style matching correct inline iteration names.
inlinevar-naming-style=any

# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=

# Naming style matching correct method names.
method-naming-style=snake_case

# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=

# Naming style matching correct module names.
module-naming-style=snake_case

# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=

# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=

# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_

# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty

# Naming style matching correct variable names.
variable-naming-style=snake_case

# Regular expression matching correct variable names. Overrides variable-
# naming-style.
#variable-rgx=


[DESIGN]

# Maximum number of arguments for function / method.
max-args=5

# Maximum number of attributes for a class (see R0902).
max-attributes=7

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5

# Maximum number of branch for function / method body.
max-branches=12

# Maximum number of locals for function / method body.
max-locals=15

# Maximum number of parents for a class (see R0901).
max-parents=7

# Maximum number of public methods for a class (see R0904).
max-public-methods=20

# Maximum number of return / yield for function / method body.
max-returns=6

# Maximum number of statements in function / method body.
max-statements=50

# Minimum number of public methods for a class (see R0903).
min-public-methods=2


[IMPORTS]

# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=

# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no

# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no

# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix

# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled).
ext-import-graph=

# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled).
import-graph=

# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled).
int-import-graph=

# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=

# Force import order to recognize a module as part of a third party library.
known-third-party=enchant

# Couples of modules and preferred modules, separated by a comma.
preferred-modules=


[CLASSES]

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
                      __new__,
                      setUp,
                      __post_init__

# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
                  _fields,
                  _replace,
                  _source,
                  _make

# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls

# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls


[EXCEPTIONS]

# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
                       Exception
@@ -0,0 +1,7 @@
version: 2

python:
  version: 3.7
  install:
    - requirements: requirements/docs.txt
    - requirements: requirements/readthedocs.txt
@@ -0,0 +1,203 @@
Copyright (c) MMOCR Authors. All rights reserved.

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2021 MMOCR Authors. All rights reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
@@ -1,17 +1,17 @@
 <div align="center">
-  <img src="resources/mmocr-logo.jpg" width="500px"/>
+  <img src="resources/mmocr-logo.png" width="500px"/>
 </div>

 ## Introduction

-[](https://github.com/open-mmlab/mmediting/actions)
-[](https://mmediting.readthedocs.io/en/latest/?badge=latest)
-[](https://codecov.io/gh/open-mmlab/mmediting)
-[](https://github.com/open-mmlab/mmediting/blob/master/LICENSE)
+[](https://github.com/open-mmlab/mmocr/actions)
+[](https://mmocr.readthedocs.io/en/latest/?badge=latest)
+[](https://codecov.io/gh/open-mmlab/mmocr)
+[](https://github.com/open-mmlab/mmocr/blob/master/LICENSE)

 MMOCR is an open-source toolbox based on PyTorch and mmdetection for text detection, text recognition, and the corresponding downstream tasks including key information extraction. It is part of the open-mmlab project developed by [Multimedia Laboratory, CUHK](http://mmlab.ie.cuhk.edu.hk/).

-The master branch works with **PyTorch 1.5**.
+The master branch works with **PyTorch 1.5+**.

 Documentation: https://mmocr.readthedocs.io/en/latest/.

@@ -31,7 +31,7 @@ Documentation: https://mmocr.readthedocs.io/en/latest/.

 - **Modular Design**

-  The modular design of MMOCR enables users to define their own optimizers, data preprocessors, and model components such as backbones, necks and heads as well as losses. Please refer to [GETTING_STARTED.md](docs/GETTING_STARTED.md) for how to construct a customized model.
+  The modular design of MMOCR enables users to define their own optimizers, data preprocessors, and model components such as backbones, necks and heads as well as losses. Please refer to [getting_started.md](docs/getting_started.md) for how to construct a customized model.

 - **Numerous Utilities**

@@ -43,24 +43,24 @@ This project is released under the [Apache 2.0 license](LICENSE).

 ## Changelog

-v1.0 was released on 31/03/2021.
+v1.0 was released on 07/04/2021.


 ## Benchmark and Model Zoo

-Please refer to [MODEL_ZOO.md](MODEL_ZOO.md) for more details.
+Please refer to [modelzoo.md](modelzoo.md) for more details.

 ## Installation

-Please refer to [INSTALL.md](docs/INSTALL.md) for installation.
+Please refer to [install.md](docs/install.md) for installation.

 ## Get Started

-Please see [GETTING_STARTED.md](docs/GETTING_STARTED.md) for the basic usage of MMOCR.
+Please see [getting_started.md](docs/getting_started.md) for the basic usage of MMOCR.

 ## Contributing

-We appreciate all contributions to improve MMOCR. Please refer to [CONTRIBUTING.md](docs/CONTRIBUTING.md) for the contributing guidelines.
+We appreciate all contributions to improve MMOCR. Please refer to [contributing.md](docs/contributing.md) for the contributing guidelines.

 ## Acknowledgement
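The "Modular Design" bullet above refers to the registry mechanism MMOCR inherits from mmdetection. A minimal sketch of registering a custom component (not part of this commit; `MyBackbone` is a made-up illustrative class, and the sketch assumes mmdet is installed):

import torch.nn as nn
from mmdet.models.builder import BACKBONES


@BACKBONES.register_module()
class MyBackbone(nn.Module):
    """Toy backbone: a single conv stem returning one feature level."""

    def __init__(self, out_channels=256):
        super().__init__()
        self.stem = nn.Conv2d(3, out_channels, 3, padding=1)

    def forward(self, x):
        # detectors expect a tuple of feature maps, one per output level
        return (self.stem(x), )

# Once registered, the component can be selected from a config dict, e.g.:
# model = dict(backbone=dict(type='MyBackbone', out_channels=256), ...)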
@@ -0,0 +1,14 @@
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=5,
    hooks=[
        dict(type='TextLoggerHook')

    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
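A base runtime config like the one above is typically not used on its own; downstream configs pull it in through mmcv's `_base_` inheritance and override individual fields. A minimal sketch (not part of this commit; the file paths are hypothetical):

# hypothetical derived config, e.g. configs/textdet/some_model.py
_base_ = ['../_base_/default_runtime.py']

# fields inherited from the base file can be overridden in place
log_config = dict(interval=10, hooks=[dict(type='TextLoggerHook')])
load_from = 'checkpoints/latest.pth'  # hypothetical checkpoint path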
@@ -0,0 +1,97 @@
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

train_cfg = None
test_cfg = None

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='ScaleAspectJitter',
        img_scale=[(3000, 640)],
        ratio_range=(0.7, 1.3),
        aspect_ratio_range=(0.9, 1.1),
        multiscale_mode='value',
        keep_ratio=False),
    # shrink_ratio is from big to small. The 1st must be 1.0
    dict(type='PANetTargets', shrink_ratio=(1.0, 0.7)),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='RandomRotateTextDet'),
    dict(
        type='RandomCropInstances',
        target_size=(640, 640),
        instance_key='gt_kernels'),
    dict(type='Pad', size_divisor=32),
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels', 'gt_mask'],
        visualize=dict(flag=False, boundary_key='gt_kernels')),
    dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(3000, 640),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(3000, 640), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

dataset_type = 'TextDetDataset'
img_prefix = 'tests/data/toy_dataset/imgs'
train_anno_file = 'tests/data/toy_dataset/instances_test.txt'
train1 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file,
    loader=dict(
        type='HardDiskLoader',
        repeat=4,
        parser=dict(
            type='LineJsonParser',
            keys=['file_name', 'height', 'width', 'annotations'])),
    pipeline=train_pipeline,
    test_mode=False)

data_root = 'tests/data/toy_dataset'
train2 = dict(
    type='IcdarDataset',
    ann_file=data_root + '/instances_test.json',
    img_prefix=data_root + '/imgs',
    pipeline=train_pipeline)

test_anno_file = 'tests/data/toy_dataset/instances_test.txt'
test = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=test_anno_file,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineJsonParser',
            keys=['file_name', 'height', 'width', 'annotations'])),
    pipeline=test_pipeline,
    test_mode=True)

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(type='ConcatDataset', datasets=[train1, train2]),
    val=dict(type='ConcatDataset', datasets=[test]),
    test=dict(type='ConcatDataset', datasets=[test]))

evaluation = dict(interval=1, metric='hmean-iou')
@@ -0,0 +1,126 @@
# model settings
model = dict(
    type='OCRMaskRCNN',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[4],
            ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=1,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=1,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),

    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1,
                gpu_assign_thr=50),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_across_levels=False,
            nms_pre=2000,
            nms_post=1000,
            max_num=1000,
            nms_thr=0.7,
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='OHEMSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_across_levels=False,
            nms_pre=1000,
            nms_post=1000,
            max_num=1000,
            nms_thr=0.7,
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
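A minimal sketch of how such a model config is typically loaded and instantiated (not part of this commit; assumes mmocr is installed, which registers `OCRMaskRCNN`, and the config path is hypothetical):

from mmcv import Config
from mmdet.models import build_detector

# hypothetical path to the config file shown above
cfg = Config.fromfile('configs/_base_/ocr_mask_rcnn_r50_fpn_ohem.py')

# train_cfg/test_cfg are embedded in cfg.model above, so no extra args needed
model = build_detector(cfg.model)
print(type(model).__name__)  # 'OCRMaskRCNN'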

@ -0,0 +1,126 @@

# model settings
model = dict(
    type='OCRMaskRCNN',
    text_repr_type='poly',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[4],
            ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sample_num=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sample_num=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_across_levels=False,
            nms_pre=2000,
            nms_post=1000,
            max_num=1000,
            nms_thr=0.7,
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1,
                gpu_assign_thr=50),
            sampler=dict(
                type='OHEMSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_across_levels=False,
            nms_pre=1000,
            nms_post=1000,
            max_num=1000,
            nms_thr=0.7,
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
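
This file defines a complete `model` dict with `train_cfg`/`test_cfg` nested inside it (mmdet 2.x style), so it can be instantiated on its own. A minimal sketch, assuming the usual mmcv/mmdet builder API and that importing `mmocr.models` registers `OCRMaskRCNN`; the file path is also an assumption:

```python
# a minimal sketch, not part of the repo; import paths and file location
# are assumptions
from mmcv import Config
from mmdet.models import build_detector

import mmocr.models  # noqa: F401  (assumed to register OCRMaskRCNN)

cfg = Config.fromfile(
    'configs/_base_/models/ocr_mask_rcnn_r50_fpn_ohem_poly.py')
# train_cfg and test_cfg live inside cfg.model here, so no extra args needed
model = build_detector(cfg.model)
print(type(model).__name__)  # expected: OCRMaskRCNN
```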

@ -0,0 +1,96 @@

img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

gt_label_convertor = dict(
    type='SegConvertor', dict_type='DICT36', with_unknown=True, lower=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomPaddingOCR',
        max_ratio=[0.15, 0.2, 0.15, 0.2],
        box_type='char_quads'),
    dict(type='OpencvToPil'),
    dict(
        type='RandomRotateImageBox',
        min_angle=-17,
        max_angle=17,
        box_type='char_quads'),
    dict(type='PilToOpencv'),
    dict(
        type='ResizeOCR',
        height=64,
        min_width=64,
        max_width=512,
        keep_aspect_ratio=True),
    dict(
        type='OCRSegTargets',
        label_convertor=gt_label_convertor,
        box_type='char_quads'),
    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
    dict(type='ToTensorOCR'),
    dict(type='FancyPCA'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels'],
        visualize=dict(flag=False, boundary_key=None),
        call_super=False),
    dict(
        type='Collect',
        keys=['img', 'gt_kernels'],
        meta_keys=['filename', 'ori_shape', 'img_shape'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=64,
        min_width=64,
        max_width=None,
        keep_aspect_ratio=True),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(type='CustomFormatBundle', call_super=False),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=['filename', 'ori_shape', 'img_shape'])
]

prefix = 'tests/data/ocr_char_ann_toy_dataset/'
train = dict(
    type='OCRSegDataset',
    img_prefix=prefix + 'imgs',
    ann_file=prefix + 'instances_train.txt',
    loader=dict(
        type='HardDiskLoader',
        repeat=100,
        parser=dict(
            type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
    pipeline=train_pipeline,
    test_mode=True)

test = dict(
    type='OCRDataset',
    img_prefix=prefix + 'imgs',
    ann_file=prefix + 'instances_test.txt',
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

data = dict(
    samples_per_gpu=8,
    workers_per_gpu=1,
    train=dict(type='ConcatDataset', datasets=[train]),
    val=dict(type='ConcatDataset', datasets=[test]),
    test=dict(type='ConcatDataset', datasets=[test]))

evaluation = dict(interval=1, metric='acc')

@ -0,0 +1,99 @@

img_norm_cfg = dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=32,
        min_width=32,
        max_width=160,
        keep_aspect_ratio=True),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            dict(
                type='ResizeOCR',
                height=32,
                min_width=32,
                max_width=160,
                keep_aspect_ratio=True),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio'
                ]),
        ])
]

dataset_type = 'OCRDataset'
img_prefix = 'tests/data/ocr_toy_dataset/imgs'
train_anno_file1 = 'tests/data/ocr_toy_dataset/label.txt'
train1 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=100,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train_anno_file2 = 'tests/data/ocr_toy_dataset/label.lmdb'
train2 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file2,
    loader=dict(
        type='LmdbLoader',
        repeat=100,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

test_anno_file1 = 'tests/data/ocr_toy_dataset/label.lmdb'
test = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=test_anno_file1,
    loader=dict(
        type='LmdbLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

data = dict(
    samples_per_gpu=16,
    workers_per_gpu=2,
    train=dict(type='ConcatDataset', datasets=[train1, train2]),
    val=dict(type='ConcatDataset', datasets=[test]),
    test=dict(type='ConcatDataset', datasets=[test]))

evaluation = dict(interval=1, metric='acc')

@ -0,0 +1,11 @@

label_convertor = dict(
    type='CTCConvertor', dict_type='DICT90', with_unknown=False)

model = dict(
    type='CRNNNet',
    preprocessor=None,
    backbone=dict(type='VeryDeepVgg', leakyRelu=False),
    encoder=None,
    decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
    loss=dict(type='CTCLoss', flatten=False),
    label_convertor=label_convertor)

@ -0,0 +1,11 @@

label_convertor = dict(
    type='AttnConvertor', dict_type='DICT36', with_unknown=True, lower=True)

model = dict(
    type='NRTR',
    backbone=dict(type='NRTRModalityTransform'),
    encoder=dict(type='TFEncoder'),
    decoder=dict(type='TFDecoder'),
    loss=dict(type='TFLoss'),
    label_convertor=label_convertor,
    max_seq_len=40)

@ -0,0 +1,24 @@

label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=True)

hybrid_decoder = dict(type='SequenceAttentionDecoder')

position_decoder = dict(type='PositionAttentionDecoder')

model = dict(
    type='RobustScanner',
    backbone=dict(type='ResNet31OCR'),
    encoder=dict(
        type='ChannelReductionEncoder',
        in_channels=512,
        out_channels=128,
    ),
    decoder=dict(
        type='RobustScannerDecoder',
        dim_input=512,
        dim_model=128,
        hybrid_decoder=hybrid_decoder,
        position_decoder=position_decoder),
    loss=dict(type='SARLoss'),
    label_convertor=label_convertor,
    max_seq_len=30)

@ -0,0 +1,24 @@

label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=True)

model = dict(
    type='SARNet',
    backbone=dict(type='ResNet31OCR'),
    encoder=dict(
        type='SAREncoder',
        enc_bi_rnn=False,
        enc_do_rnn=0.1,
        enc_gru=False,
    ),
    decoder=dict(
        type='ParallelSARDecoder',
        enc_bi_rnn=False,
        dec_bi_rnn=False,
        dec_do_rnn=0,
        dec_gru=False,
        pred_dropout=0.1,
        d_k=512,
        pred_concat=True),
    loss=dict(type='SARLoss'),
    label_convertor=label_convertor,
    max_seq_len=30)

@ -0,0 +1,11 @@

label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=False)

model = dict(
    type='TransformerNet',
    backbone=dict(type='ResNet31OCR'),
    encoder=dict(type='TFEncoder'),
    decoder=dict(type='TFDecoder'),
    loss=dict(type='TFLoss'),
    label_convertor=label_convertor,
    max_seq_len=40)

@ -0,0 +1,14 @@

checkpoint_config = dict(interval=10)
# yapf:disable
log_config = dict(
    interval=5,
    hooks=[
        dict(type='TextLoggerHook')
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

@ -0,0 +1,6 @@

# optimizer
optimizer = dict(type='SGD', lr=0.007, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-7, by_epoch=True)
total_epochs = 1200

@ -0,0 +1,11 @@

# optimizer
optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[80, 128])
total_epochs = 160

@ -0,0 +1,11 @@

# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8, 11])
total_epochs = 12

@ -0,0 +1,4 @@

_base_ = './schedule_1x.py'
# learning policy
lr_config = dict(step=[16, 19])
total_epochs = 20
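
This four-line file relies on config inheritance: everything not set here comes from `schedule_1x.py`, and dict-valued keys such as `lr_config` are merged recursively rather than replaced wholesale. A minimal sketch of the resolved result, assuming mmcv's `Config` loader and the usual `configs/_base_/schedules/` location:

```python
# a minimal sketch, not part of the repo; the file path is an assumption
from mmcv import Config

cfg = Config.fromfile('configs/_base_/schedules/schedule_20e.py')
print(cfg.optimizer)         # SGD with lr=0.02, inherited from schedule_1x.py
print(cfg.lr_config.step)    # [16, 19], overridden in this file
print(cfg.lr_config.policy)  # 'step', kept from the parent via dict merge
print(cfg.total_epochs)      # 20
```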

@ -0,0 +1,6 @@

# optimizer
optimizer = dict(type='SGD', lr=0.007, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=True)
total_epochs = 2

@ -0,0 +1,4 @@

_base_ = './schedule_1x.py'
# learning policy
lr_config = dict(step=[16, 22])
total_epochs = 24

@ -0,0 +1,6 @@

# optimizer
optimizer = dict(type='Adadelta', lr=1.0)
optimizer_config = dict(grad_clip=dict(max_norm=0.5))
# learning policy
lr_config = dict(policy='step', step=[8, 10, 12])
total_epochs = 16

@ -0,0 +1,6 @@

# optimizer
optimizer = dict(type='Adadelta', lr=1.0)
optimizer_config = dict(grad_clip=dict(max_norm=0.5))
# learning policy
lr_config = dict(policy='step', step=[4, 6, 7])
total_epochs = 8

@ -0,0 +1,6 @@

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='poly', power=0.9)
total_epochs = 1

@ -0,0 +1,6 @@

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='poly', power=0.9)
total_epochs = 600

@ -0,0 +1,6 @@

# optimizer
optimizer = dict(type='SGD', lr=1e-3, momentum=0.99, weight_decay=5e-4)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[200, 400])
total_epochs = 600

@ -0,0 +1,25 @@

# Spatial Dual-Modality Graph Reasoning for Key Information Extraction

## Introduction

[ALGORITHM]

```bibtex
@misc{sun2021spatial,
    title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction},
    author={Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang},
    year={2021},
    eprint={2103.14470},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}
```

## Results and models

### WildReceipt

| Method | Modality | Macro F1-Score | Download |
| :---: | :---: | :---: | :---: |
| [sdmgr_unet16](/configs/kie/sdmgr/sdmgr_unet16_60e_wildreceipt.py) | Visual + Textual | 0.876 | [model](https://download.openmmlab.com/mmocr/kie/sdmgr/sdmgr_unet16_60e_wildreceipt_20210405-16a47642.pth) \| [log](https://download.openmmlab.com/mmocr/kie/sdmgr/20210405_104508.log.json) |
| [sdmgr_novisual](/configs/kie/sdmgr/sdmgr_novisual_60e_wildreceipt.py) | Textual | 0.864 | [model](https://download.openmmlab.com/mmocr/kie/sdmgr/sdmgr_novisual_60e_wildreceipt_20210405-07bc26ad.pth) \| [log](https://download.openmmlab.com/mmocr/kie/sdmgr/20210405_141138.log.json) |

@ -0,0 +1,93 @@

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
max_scale, min_scale = 1024, 512

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(max_scale, min_scale), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='KIEFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'relations', 'texts', 'gt_bboxes', 'gt_labels'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(max_scale, min_scale), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='KIEFormatBundle'),
    dict(type='Collect', keys=['img', 'relations', 'texts', 'gt_bboxes'])
]

dataset_type = 'KIEDataset'
data_root = 'data/wildreceipt'

loader = dict(
    type='HardDiskLoader',
    repeat=1,
    parser=dict(
        type='LineJsonParser',
        keys=['file_name', 'height', 'width', 'annotations']))

train = dict(
    type=dataset_type,
    ann_file=f'{data_root}/train.txt',
    pipeline=train_pipeline,
    img_prefix=data_root,
    loader=loader,
    dict_file=f'{data_root}/dict.txt',
    test_mode=False)
test = dict(
    type=dataset_type,
    ann_file=f'{data_root}/test.txt',
    pipeline=test_pipeline,
    img_prefix=data_root,
    loader=loader,
    dict_file=f'{data_root}/dict.txt',
    test_mode=True)

data = dict(
    samples_per_gpu=4, workers_per_gpu=0, train=train, val=test, test=test)

evaluation = dict(
    interval=1,
    metric='macro_f1',
    metric_options=dict(
        macro_f1=dict(
            ignores=[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25])))

model = dict(
    type='SDMGR',
    backbone=dict(type='UNet', base_channels=16),
    bbox_head=dict(
        type='SDMGRHead', visual_dim=16, num_chars=92, num_classes=26),
    visual_modality=False,
    train_cfg=None,
    test_cfg=None,
    class_list=f'{data_root}/class_list.txt')

optimizer = dict(type='Adam', weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1,
    warmup_ratio=1,
    step=[40, 50])
total_epochs = 60

checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

find_unused_parameters = True

@ -0,0 +1,93 @@

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
max_scale, min_scale = 1024, 512

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(max_scale, min_scale), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='KIEFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'relations', 'texts', 'gt_bboxes', 'gt_labels'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(max_scale, min_scale), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='KIEFormatBundle'),
    dict(type='Collect', keys=['img', 'relations', 'texts', 'gt_bboxes'])
]

dataset_type = 'KIEDataset'
data_root = 'data/wildreceipt'

loader = dict(
    type='HardDiskLoader',
    repeat=1,
    parser=dict(
        type='LineJsonParser',
        keys=['file_name', 'height', 'width', 'annotations']))

train = dict(
    type=dataset_type,
    ann_file=f'{data_root}/train.txt',
    pipeline=train_pipeline,
    img_prefix=data_root,
    loader=loader,
    dict_file=f'{data_root}/dict.txt',
    test_mode=False)
test = dict(
    type=dataset_type,
    ann_file=f'{data_root}/test.txt',
    pipeline=test_pipeline,
    img_prefix=data_root,
    loader=loader,
    dict_file=f'{data_root}/dict.txt',
    test_mode=True)

data = dict(
    samples_per_gpu=4, workers_per_gpu=0, train=train, val=test, test=test)

evaluation = dict(
    interval=1,
    metric='macro_f1',
    metric_options=dict(
        macro_f1=dict(
            ignores=[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25])))

model = dict(
    type='SDMGR',
    backbone=dict(type='UNet', base_channels=16),
    bbox_head=dict(
        type='SDMGRHead', visual_dim=16, num_chars=92, num_classes=26),
    visual_modality=True,
    train_cfg=None,
    test_cfg=None,
    class_list=f'{data_root}/class_list.txt')

optimizer = dict(type='Adam', weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1,
    warmup_ratio=1,
    step=[40, 50])
total_epochs = 60

checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

find_unused_parameters = True

@ -0,0 +1,28 @@

# Real-time Scene Text Detection with Differentiable Binarization

## Introduction

[ALGORITHM]

```bibtex
@article{Liao_Wan_Yao_Chen_Bai_2020,
    title={Real-Time Scene Text Detection with Differentiable Binarization},
    journal={Proceedings of the AAAI Conference on Artificial Intelligence},
    author={Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang},
    year={2020},
    pages={11474-11481}}
```

## Results and models

### ICDAR2015

| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [DBNet](/configs/textdet/dbnet/dbnet_r18_fpnc_1200e_icdar2015.py) | ImageNet | ICDAR2015 Train | ICDAR2015 Test | 1200 | 736 | 0.731 | 0.871 | 0.795 | [model](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r18_fpnc_sbn_1200e_icdar2015_20210329-ba3ab597.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r18_fpnc_sbn_1200e_icdar2015_20210329-ba3ab597.log.json) |
| [DBNet](/configs/textdet/dbnet/dbnet_r50dcnv2_fpnc_1200e_icdar2015.py) | [Synthtext](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r50dcnv2_fpnc_sbn_2e_synthtext_20210325-aa96e477.pth) | ICDAR2015 Train | ICDAR2015 Test | 1200 | 1024 | 0.796 | 0.866 | 0.830 | [model](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r50dcnv2_fpnc_sbn_1200e_icdar2015_20210325-91cef9af.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_r50dcnv2_fpnc_sbn_1200e_icdar2015_20210325-91cef9af.pth.log.json) |
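
Hmean in these tables is the harmonic mean of precision and recall (i.e. the F-score); a quick sanity check against the first row:

```python
# sanity check on the table above: Hmean = 2PR / (P + R)
recall, precision = 0.731, 0.871
hmean = 2 * precision * recall / (precision + recall)
print(round(hmean, 3))  # 0.795, matching the DBNet r18 row
```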

@ -0,0 +1,96 @@

_base_ = [
    '../../_base_/schedules/schedule_1200e.py', '../../_base_/runtime_10e.py'
]
model = dict(
    type='DBNet',
    pretrained='torchvision://resnet18',
    backbone=dict(
        type='ResNet',
        depth=18,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        style='caffe'),
    neck=dict(
        type='FPNC', in_channels=[64, 128, 256, 512], lateral_channels=256),
    bbox_head=dict(
        type='DBHead',
        text_repr_type='quad',
        in_channels=256,
        loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True)),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/icdar2015/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# To visualize the images, uncomment the following line instead.
# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    # image augmentation
    dict(
        type='ImgAug',
        args=[['Fliplr', 0.5],
              dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
    # random crop
    dict(type='EastRandomCrop', target_size=(640, 640)),
    dict(type='DBNetTargets', shrink_ratio=0.4),
    dict(type='Pad', size_divisor=32),
    # To visualize the images and ground truths, set flag=True below.
    dict(
        type='CustomFormatBundle',
        keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
        visualize=dict(flag=False, boundary_key='gt_shrink')),
    dict(
        type='Collect',
        keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 736),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(2944, 736), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        # For debugging on the first k images, uncomment:
        # select_first_k=200,
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        # select_first_k=100,
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        # select_first_k=100,
        pipeline=test_pipeline))
evaluation = dict(interval=100, metric='hmean-iou')

@ -0,0 +1,105 @@

_base_ = [
    '../../_base_/schedules/schedule_1200e.py', '../../_base_/runtime_10e.py'
]
load_from = 'checkpoints/textdet/dbnet/res50dcnv2_synthtext.pth'

model = dict(
    type='DBNet',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        style='caffe',
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        stage_with_dcn=(False, True, True, True)),
    neck=dict(
        type='FPNC', in_channels=[256, 512, 1024, 2048], lateral_channels=256),
    bbox_head=dict(
        type='DBHead',
        text_repr_type='quad',
        in_channels=256,
        loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True)),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/icdar2015/'
# img_norm_cfg = dict(
#     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# The normalization values below follow the official DBNet code.
img_norm_cfg = dict(
    mean=[122.67891434, 116.66876762, 104.00698793],
    std=[255, 255, 255],
    to_rgb=False)
# To visualize the images, uncomment the following line instead.
# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    # image augmentation
    dict(
        type='ImgAug',
        args=[['Fliplr', 0.5],
              dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
    # random crop
    dict(type='EastRandomCrop', target_size=(640, 640)),
    dict(type='DBNetTargets', shrink_ratio=0.4),
    dict(type='Pad', size_divisor=32),
    # To visualize the images and ground truths, set flag=True below.
    dict(
        type='CustomFormatBundle',
        keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
        visualize=dict(flag=False, boundary_key='gt_shrink')),
    dict(
        type='Collect',
        keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(4068, 1024),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(4068, 1024), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        # For debugging on the first k images, uncomment:
        # select_first_k=200,
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        # select_first_k=100,
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        # select_first_k=100,
        pipeline=test_pipeline))
evaluation = dict(interval=100, metric='hmean-iou')

@ -0,0 +1,35 @@

# Mask R-CNN

## Introduction

[ALGORITHM]

```bibtex
@article{pmtd,
    author={Jingchao Liu and Xuebo Liu and Jie Sheng and Ding Liang and Xin Li and Qingjie Liu},
    title={Pyramid Mask Text Detector},
    journal={CoRR},
    volume={abs/1903.11800},
    year={2019}
}
```

## Results and models

### CTW1500

| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [MaskRCNN](/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500.py) | ImageNet | CTW1500 Train | CTW1500 Test | 160 | 1600 | 0.753 | 0.712 | 0.732 | [model](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500_20210219-96497a76.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500_20210219-96497a76.log.json) |

### ICDAR2015

| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [MaskRCNN](/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015.py) | ImageNet | ICDAR2015 Train | ICDAR2015 Test | 160 | 1920 | 0.783 | 0.872 | 0.825 | [model](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015_20210219-8eb340a3.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015_20210219-8eb340a3.log.json) |

### ICDAR2017

| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [MaskRCNN](/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017.py) | ImageNet | ICDAR2017 Train | ICDAR2017 Val | 160 | 1600 | 0.754 | 0.827 | 0.789 | [model](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017_20210218-c6ec3ebb.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017_20210218-c6ec3ebb.log.json) |
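
A short inference sketch for the checkpoints above. It assumes these detection configs work with mmdet's standard high-level API (the import path and the demo image path are assumptions, not confirmed by this repo):

```python
# a minimal sketch, not part of the repo; assumes mmdet's high-level API
# applies to these Mask R-CNN text-detection configs
from mmdet.apis import inference_detector, init_detector

config = 'configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015.py'
checkpoint = ('https://download.openmmlab.com/mmocr/textdet/maskrcnn/'
              'mask_rcnn_r50_fpn_160e_icdar2015_20210219-8eb340a3.pth')
model = init_detector(config, checkpoint, device='cpu')
result = inference_detector(model, 'demo/demo_text_det.jpg')  # hypothetical image path
```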

@ -0,0 +1,67 @@

_base_ = [
    '../../_base_/models/ocr_mask_rcnn_r50_fpn_ohem_poly.py',
    '../../_base_/schedules/schedule_160e.py', '../../_base_/runtime_10e.py'
]

dataset_type = 'IcdarDataset'
data_root = 'data/ctw1500/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(
        type='ScaleAspectJitter',
        img_scale=None,
        keep_ratio=False,
        resize_type='indep_sample_in_range',
        scale_range=(640, 2560)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='RandomCropInstances',
        target_size=(640, 640),
        mask_type='union_all',
        instance_key='gt_masks'),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        # resize the long side to 1600
        img_scale=(1600, 1600),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            # no flip
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        # select_first_k=1,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        # select_first_k=1,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline))
evaluation = dict(interval=10, metric='hmean-iou')

@ -0,0 +1,66 @@

_base_ = [
    '../../_base_/models/ocr_mask_rcnn_r50_fpn_ohem.py',
    '../../_base_/schedules/schedule_160e.py', '../../_base_/runtime_10e.py'
]
dataset_type = 'IcdarDataset'
data_root = 'data/icdar2015/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(
        type='ScaleAspectJitter',
        img_scale=None,
        keep_ratio=False,
        resize_type='indep_sample_in_range',
        scale_range=(640, 2560)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='RandomCropInstances',
        target_size=(640, 640),
        mask_type='union_all',
        instance_key='gt_masks'),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        # resize the long side to 1920
        img_scale=(1920, 1920),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            # no flip
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        # select_first_k=1,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        # select_first_k=1,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline))
evaluation = dict(interval=10, metric='hmean-iou')

@ -0,0 +1,67 @@

_base_ = [
    '../../_base_/models/ocr_mask_rcnn_r50_fpn_ohem.py',
    '../../_base_/schedules/schedule_160e.py', '../../_base_/runtime_10e.py'
]

dataset_type = 'IcdarDataset'
data_root = 'data/icdar2017/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(
        type='ScaleAspectJitter',
        img_scale=None,
        keep_ratio=False,
        resize_type='indep_sample_in_range',
        scale_range=(640, 2560)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='RandomCropInstances',
        target_size=(640, 640),
        mask_type='union_all',
        instance_key='gt_masks'),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        # resize the long side to 1600
        img_scale=(1600, 1600),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            # no flip
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        # select_first_k=1,
        ann_file=data_root + '/instances_val.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        # select_first_k=1,
        ann_file=data_root + '/instances_val.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline))
evaluation = dict(interval=10, metric='hmean-iou')

@ -0,0 +1,35 @@

# Efficient and Accurate Arbitrary-Shaped Text Detection with Pixel Aggregation Network

## Introduction

[ALGORITHM]

```bibtex
@inproceedings{WangXSZWLYS19,
    author={Wenhai Wang and Enze Xie and Xiaoge Song and Yuhang Zang and Wenjia Wang and Tong Lu and Gang Yu and Chunhua Shen},
    title={Efficient and Accurate Arbitrary-Shaped Text Detection With Pixel Aggregation Network},
    booktitle={ICCV},
    pages={8439--8448},
    year={2019}
}
```

## Results and models

### CTW1500

| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [PANet](/configs/textdet/panet/panet_r18_fpem_ffm_600e_ctw1500.py) | ImageNet | CTW1500 Train | CTW1500 Test | 600 | 640 | 0.790 | 0.838 | 0.813 | [model](https://download.openmmlab.com/mmocr/textdet/panet/panet_r18_fpem_ffm_sbn_600e_ctw1500_20210219-3b3a9aa3.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/panet/panet_r18_fpem_ffm_sbn_600e_ctw1500_20210219-3b3a9aa3.log.json) |

### ICDAR2015

| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [PANet](/configs/textdet/panet/panet_r18_fpem_ffm_600e_icdar2015.py) | ImageNet | ICDAR2015 Train | ICDAR2015 Test | 600 | 736 | 0.734 | 0.856 | 0.791 | [model](https://download.openmmlab.com/mmocr/textdet/panet/panet_r18_fpem_ffm_sbn_600e_icdar2015_20210219-42dbe46a.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/panet/panet_r18_fpem_ffm_sbn_600e_icdar2015_20210219-42dbe46a.log.json) |

### ICDAR2017

| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [PANet](/configs/textdet/panet/panet_r50_fpem_ffm_600e_icdar2017.py) | ImageNet | ICDAR2017 Train | ICDAR2017 Val | 600 | 800 | 0.604 | 0.812 | 0.693 | [model](https://download.openmmlab.com/mmocr/textdet/panet/panet_r50_fpem_ffm_sbn_600e_icdar2017_20210219-b4877a4f.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/panet/panet_r50_fpem_ffm_sbn_600e_icdar2017_20210219-b4877a4f.log.json) |
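
The PANet configs below inherit their optimizer and epoch count from `schedule_adam_600e.py`. A minimal sketch of checking the merged result, assuming mmcv's `Config` loader (the config path is taken from the CTW1500 row above):

```python
# a minimal sketch, not part of the repo
from mmcv import Config

cfg = Config.fromfile('configs/textdet/panet/panet_r18_fpem_ffm_600e_ctw1500.py')
print(cfg.optimizer)             # Adam with lr=1e-3, from schedule_adam_600e.py
print(cfg.total_epochs)          # 600, matching the #epochs column above
print(cfg.model.bbox_head.type)  # 'PANHead'
```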

@ -0,0 +1,104 @@

_base_ = [
    '../../_base_/schedules/schedule_adam_600e.py',
    '../../_base_/runtime_10e.py'
]
model = dict(
    type='PANet',
    pretrained='torchvision://resnet18',
    backbone=dict(
        type='ResNet',
        depth=18,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        norm_eval=True,
        style='caffe'),
    neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
    bbox_head=dict(
        type='PANHead',
        text_repr_type='poly',
        in_channels=[128, 128, 128, 128],
        out_channels=6,
        loss=dict(type='PANLoss')),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/ctw1500/'

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# To visualize the images, uncomment the following line instead.
# img_norm_cfg = dict(
#     mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='ScaleAspectJitter',
        img_scale=[(3000, 640)],
        ratio_range=(0.7, 1.3),
        aspect_ratio_range=(0.9, 1.1),
        multiscale_mode='value',
        keep_ratio=False),
    # shrink_ratio values go from largest to smallest; the first must be 1.0
    dict(type='PANetTargets', shrink_ratio=(1.0, 0.7)),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='RandomRotateTextDet'),
    dict(
        type='RandomCropInstances',
        target_size=(640, 640),
        instance_key='gt_kernels'),
    dict(type='Pad', size_divisor=32),
    # To visualize the images and ground truths, set flag=True below.
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels', 'gt_mask'],
        visualize=dict(flag=False, boundary_key='gt_kernels')),
    dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(3000, 640),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(3000, 640), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        # For debugging on the first k images, uncomment:
        # select_first_k=200,
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        # select_first_k=100,
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        # select_first_k=100,
        pipeline=test_pipeline))
evaluation = dict(interval=10, metric='hmean-iou')

@ -0,0 +1,102 @@

_base_ = [
    '../../_base_/schedules/schedule_adam_600e.py',
    '../../_base_/runtime_10e.py'
]
model = dict(
    type='PANet',
    pretrained='torchvision://resnet18',
    backbone=dict(
        type='ResNet',
        depth=18,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        norm_eval=True,
        style='caffe'),
    neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
    bbox_head=dict(
        type='PANHead',
        text_repr_type='quad',
        in_channels=[128, 128, 128, 128],
        out_channels=6,
        loss=dict(type='PANLoss')),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/icdar2015/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# To visualize the images, uncomment the following line instead.
# img_norm_cfg = dict(
#     mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='ScaleAspectJitter',
        img_scale=[(3000, 736)],
        ratio_range=(0.7, 1.3),
        aspect_ratio_range=(0.9, 1.1),
        multiscale_mode='value',
        keep_ratio=False),
    dict(type='PANetTargets', shrink_ratio=(1.0, 0.5), max_shrink=20),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='RandomRotateTextDet'),
    dict(
        type='RandomCropInstances',
        target_size=(736, 736),
        instance_key='gt_kernels'),
    dict(type='Pad', size_divisor=32),
    # To visualize the images and ground truths, set flag=True below.
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels', 'gt_mask'],
        visualize=dict(flag=False, boundary_key='gt_kernels')),
    dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 736),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(1333, 736), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        # For debugging on the first k images, uncomment:
        # select_first_k=200,
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        # select_first_k=100,
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        # select_first_k=100,
        pipeline=test_pipeline))
evaluation = dict(interval=10, metric='hmean-iou')

@ -0,0 +1,93 @@

_base_ = [
    '../../_base_/schedules/schedule_adam_600e.py',
    '../../_base_/runtime_10e.py'
]
model = dict(
    type='PANet',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='caffe'),
    neck=dict(type='FPEM_FFM', in_channels=[256, 512, 1024, 2048]),
    bbox_head=dict(
        type='PANHead',
        in_channels=[128, 128, 128, 128],
        out_channels=6,
        loss=dict(type='PANLoss', speedup_bbox_thr=32)),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/icdar2017/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='ScaleAspectJitter',
        img_scale=[(3000, 800)],
        ratio_range=(0.7, 1.3),
        aspect_ratio_range=(0.9, 1.1),
        multiscale_mode='value',
        keep_ratio=False),
    dict(type='PANetTargets', shrink_ratio=(1.0, 0.5)),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='RandomRotateTextDet'),
    dict(
        type='RandomCropInstances',
        target_size=(800, 800),
        instance_key='gt_kernels'),
    dict(type='Pad', size_divisor=32),
    # To visualize the images and ground truths, set flag=True below.
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels', 'gt_mask'],
        visualize=dict(flag=False, boundary_key='gt_kernels')),
    dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_val.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_val.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline))
evaluation = dict(interval=10, metric='hmean-iou')

@ -0,0 +1,29 @@

# PSENet

## Introduction

[ALGORITHM]

```bibtex
@article{li2018shape,
    title={Shape robust text detection with progressive scale expansion network},
    author={Li, Xiang and Wang, Wenhai and Hou, Wenbo and Liu, Ruo-Ze and Lu, Tong and Yang, Jian},
    journal={arXiv preprint arXiv:1806.02559},
    year={2018}
}
```

## Results and models

### CTW1500

| Method | Backbone | Extra Data | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [PSENet-4s](/configs/textdet/psenet/psenet_r50_fpnf_600e_ctw1500.py) | ResNet50 | - | CTW1500 Train | CTW1500 Test | 600 | 1280 | 0.728 | 0.849 | 0.784 | [model](https://download.openmmlab.com/mmocr/textdet/psenet/psenet_r50_fpnf_600e_ctw1500_20210401-216fed50.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/psenet/20210401_215421.log.json) |

### ICDAR2015

| Method | Backbone | Extra Data | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [PSENet-4s](/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2015.py) | ResNet50 | - | IC15 Train | IC15 Test | 600 | 2240 | 0.784 | 0.831 | 0.807 | [model](https://download.openmmlab.com/mmocr/textdet/psenet/psenet_r50_fpnf_600e_icdar2015-c6131f0d.pth) \| [log](https://download.openmmlab.com/mmocr/textdet/psenet/20210331_214145.log.json) |
| [PSENet-4s](/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2015.py) | ResNet50 | pretrained on IC17 MLT ([model](https://download.openmmlab.com/mmocr/textdet/psenet/psenet_r50_fpnf_600e_icdar2017_as_pretrain-3bd6056c.pth)) | IC15 Train | IC15 Test | 600 | 2240 | 0.834 | 0.861 | 0.847 | [model](https://download.openmmlab.com/mmocr/textdet/psenet/psenet_r50_fpnf_600e_icdar2015_pretrain-eefd8fe6.pth) \| [log]() |
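
The second ICDAR2015 row fine-tunes from an IC17 MLT checkpoint. A minimal sketch of wiring that up through `load_from`, assuming mmcv's `Config` loader; the training launch itself is omitted:

```python
# a minimal sketch, not part of the repo; it only sets the init checkpoint
from mmcv import Config

cfg = Config.fromfile('configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2015.py')
# initialize from the IC17 MLT pretrain checkpoint listed in the table above
cfg.load_from = ('https://download.openmmlab.com/mmocr/textdet/psenet/'
                 'psenet_r50_fpnf_600e_icdar2017_as_pretrain-3bd6056c.pth')
```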
|
|
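
The 7 output channels of `PSEHead` in the configs below correspond to text kernels shrunk at progressively smaller scales; at inference time, instances are seeded from the smallest kernel and grown scale by scale. The following is a minimal, hypothetical sketch of that expansion idea — not MMOCR's actual post-processing code — assuming `kernels` is a list of binary maps ordered from the most shrunk kernel to the full text region:

```python
from collections import deque

import numpy as np
from scipy.ndimage import label


def progressive_scale_expansion(kernels):
    """Grow text instances from the smallest kernel outwards (sketch)."""
    # Seed one instance id per connected component of the smallest kernel.
    labels, _ = label(kernels[0])
    h, w = labels.shape
    for kernel in kernels[1:]:
        # BFS from every labeled pixel into the next (larger) kernel;
        # conflicting pixels are resolved first-come-first-served.
        queue = deque(zip(*np.nonzero(labels)))
        while queue:
            y, x = queue.popleft()
            for dy, dx in ((0, 1), (0, -1), (1, 0), (-1, 0)):
                ny, nx = y + dy, x + dx
                if (0 <= ny < h and 0 <= nx < w and kernel[ny, nx]
                        and labels[ny, nx] == 0):
                    labels[ny, nx] = labels[y, x]
                    queue.append((ny, nx))
    return labels
```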
@ -0,0 +1,108 @@
_base_ = ['../../_base_/default_runtime.py']

# optimizer
optimizer = dict(type='Adam', lr=1e-4)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[200, 400])
total_epochs = 600

model = dict(
    type='PSENet',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        norm_eval=True,
        style='caffe'),
    neck=dict(
        type='FPNF',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        fusion_type='concat'),
    bbox_head=dict(
        type='PSEHead',
        text_repr_type='poly',
        in_channels=[256],
        out_channels=7,
        loss=dict(type='PSELoss')),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/ctw1500/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='ScaleAspectJitter',
        img_scale=[(3000, 736)],
        ratio_range=(0.5, 3),
        aspect_ratio_range=(1, 1),
        multiscale_mode='value',
        long_size_bound=1280,
        short_size_bound=640,
        resize_type='long_short_bound',
        keep_ratio=False),
    dict(type='PSENetTargets'),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='RandomRotateTextDet'),
    dict(
        type='RandomCropInstances',
        target_size=(640, 640),
        instance_key='gt_kernels'),
    dict(type='Pad', size_divisor=32),
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels', 'gt_mask'],
        visualize=dict(flag=False, boundary_key='gt_kernels')),
    dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1280, 1280),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(1280, 1280), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline))

evaluation = dict(interval=10, metric='hmean-iou')
@ -0,0 +1,108 @@
_base_ = ['../../_base_/runtime_10e.py']

# optimizer
optimizer = dict(type='Adam', lr=1e-4)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[200, 400])
total_epochs = 600

model = dict(
    type='PSENet',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        norm_eval=True,
        style='caffe'),
    neck=dict(
        type='FPNF',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        fusion_type='concat'),
    bbox_head=dict(
        type='PSEHead',
        text_repr_type='quad',
        in_channels=[256],
        out_channels=7,
        loss=dict(type='PSELoss')),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/icdar2015/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='ScaleAspectJitter',
        img_scale=[(3000, 736)],  # unused
        ratio_range=(0.5, 3),
        aspect_ratio_range=(1, 1),
        multiscale_mode='value',
        long_size_bound=1280,
        short_size_bound=640,
        resize_type='long_short_bound',
        keep_ratio=False),
    dict(type='PSENetTargets'),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='RandomRotateTextDet'),
    dict(
        type='RandomCropInstances',
        target_size=(640, 640),
        instance_key='gt_kernels'),
    dict(type='Pad', size_divisor=32),
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels', 'gt_mask'],
        visualize=dict(flag=False, boundary_key='gt_kernels')),
    dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2240, 2200),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(2240, 2200), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

data = dict(
    samples_per_gpu=8,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline))

evaluation = dict(interval=10, metric='hmean-iou')
@ -0,0 +1,103 @@
_base_ = [
    '../../_base_/schedules/schedule_sgd_600e.py',
    '../../_base_/runtime_10e.py'
]
model = dict(
    type='PSENet',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='caffe'),
    neck=dict(
        type='FPNF',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        fusion_type='concat'),
    bbox_head=dict(
        type='PSEHead',
        text_repr_type='quad',
        in_channels=[256],
        out_channels=7,
        loss=dict(type='PSELoss')),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/icdar2017/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='ScaleAspectJitter',
        img_scale=[(3000, 736)],
        ratio_range=(0.5, 3),
        aspect_ratio_range=(1, 1),
        multiscale_mode='value',
        long_size_bound=1280,
        short_size_bound=640,
        resize_type='long_short_bound',
        keep_ratio=False),
    dict(type='PSENetTargets'),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='RandomRotateTextDet'),
    dict(
        type='RandomCropInstances',
        target_size=(640, 640),
        instance_key='gt_kernels'),
    dict(type='Pad', size_divisor=32),
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels', 'gt_mask'],
        visualize=dict(flag=False, boundary_key='gt_kernels')),
    dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2240, 2200),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(2240, 2200), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_val.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_val.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline))

evaluation = dict(interval=10, metric='hmean-iou')
@ -0,0 +1,23 @@
# Textsnake

## Introduction

[ALGORITHM]

```bibtex
@inproceedings{long2018textsnake,
  title={TextSnake: A Flexible Representation for Detecting Text of Arbitrary Shapes},
  author={Long, Shangbang and Ruan, Jiaqiang and Zhang, Wenjie and He, Xin and Wu, Wenhao and Yao, Cong},
  booktitle={ECCV},
  pages={20-36},
  year={2018}
}
```

## Results and models

### CTW1500

| Method | Pretrained Model | Training set | Test set | #epochs | Test size | Recall | Precision | Hmean | Download |
| :----: | :--------------: | :----------: | :------: | :-----: | :-------: | :----: | :-------: | :---: | :------: |
| [TextSnake](/configs/textdet/textsnake/textsnake_r50_fpn_unet_600e_ctw1500.py) | ImageNet | CTW1500 Train | CTW1500 Test | 1200 | 736 | 0.795 | 0.840 | 0.817 | [model](https://download.openmmlab.com/mmocr/textdet/textsnake/textsnake_r50_fpn_unet_1200e_ctw1500-27f65b64.pth) \| [log]() |
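
TextSnake models a text instance as a sequence of overlapping disks along its center line; the `gt_radius_map`, `gt_sin_map` and `gt_cos_map` targets in the config below encode each disk's radius and orientation per pixel. As a hedged illustration (a hypothetical helper, not MMOCR code), one disk can be read back from those maps like this:

```python
import numpy as np


def disk_at(y, x, radius_map, sin_map, cos_map):
    """Recover the TextSnake disk centred at pixel (y, x) (sketch).

    Returns the centre, the radius, and the local orientation angle in
    radians, taken from per-pixel geometry maps shaped like the
    training targets collected in the pipeline below.
    """
    r = float(radius_map[y, x])
    theta = float(np.arctan2(sin_map[y, x], cos_map[y, x]))
    return (y, x), r, theta
```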
@ -0,0 +1,113 @@
_base_ = [
    '../../_base_/schedules/schedule_1200e.py',
    '../../_base_/default_runtime.py'
]
model = dict(
    type='TextSnake',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='caffe'),
    neck=dict(
        type='FPN_UNET', in_channels=[256, 512, 1024, 2048], out_channels=32),
    bbox_head=dict(
        type='TextSnakeHead',
        in_channels=32,
        text_repr_type='poly',
        loss=dict(type='TextSnakeLoss')),
    train_cfg=None,
    test_cfg=None)

dataset_type = 'IcdarDataset'
data_root = 'data/ctw1500/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='RandomCropPolyInstances',
        instance_key='gt_masks',
        crop_ratio=0.65,
        min_side_ratio=0.3),
    dict(
        type='RandomRotatePolyInstances',
        rotate_ratio=0.5,
        max_angle=20,
        pad_with_fixed_color=False),
    dict(
        type='ScaleAspectJitter',
        img_scale=[(3000, 736)],  # unused
        ratio_range=(0.7, 1.3),
        aspect_ratio_range=(0.9, 1.1),
        multiscale_mode='value',
        long_size_bound=800,
        short_size_bound=480,
        resize_type='long_short_bound',
        keep_ratio=False),
    dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='TextSnakeTargets'),
    dict(type='Pad', size_divisor=32),
    dict(
        type='CustomFormatBundle',
        keys=[
            'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
            'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
        ],
        visualize=dict(flag=False, boundary_key='gt_text_mask')),
    dict(
        type='Collect',
        keys=[
            'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
            'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
        ])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 736),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(1333, 736), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_training.json',
        img_prefix=data_root + '/imgs',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + '/instances_test.json',
        img_prefix=data_root + '/imgs',
        pipeline=test_pipeline))

evaluation = dict(interval=10, metric='hmean-iou')
@ -0,0 +1,37 @@
# An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition

## Introduction

[ALGORITHM]

```bibtex
@article{shi2016end,
  title={An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition},
  author={Shi, Baoguang and Bai, Xiang and Yao, Cong},
  journal={IEEE transactions on pattern analysis and machine intelligence},
  year={2016}
}
```

## Dataset

### Train Dataset

| trainset | instance_num | repeat_num | note  |
| :------: | :----------: | :--------: | :---: |
| Syn90k   | 8919273      | 1          | synth |

### Test Dataset

| testset | instance_num | note    |
| :-----: | :----------: | :-----: |
| IIIT5K  | 3000         | regular |
| SVT     | 647          | regular |
| IC13    | 1015         | regular |

## Results and Models

| methods | IIIT5K | SVT  | IC13 | IC15 | SVTP | CT80 | download |
| :-----: | :----: | :--: | :--: | :--: | :--: | :--: | :------: |
| CRNN    | 80.5   | 81.5 | 86.5 | -    | -    | -    | [model](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_academic-a723a1c5.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/crnn/20210326_111035.log.json) |

IIIT5K, SVT and IC13 are regular-text benchmarks; IC15, SVTP and CT80 are irregular.
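
The academic config below wires together the model, the pipelines and the LMDB-backed datasets. As a hedged sketch of how such a file is typically consumed (the config path is an assumption based on the usual repository layout, not verified against this exact revision):

```python
from mmcv import Config

# Load the config below; any _base_ files are merged automatically by mmcv.
cfg = Config.fromfile('configs/textrecog/crnn/crnn_academic_dataset.py')

print(cfg.model.type)            # 'CRNNNet'
print(cfg.data.samples_per_gpu)  # 64
print(cfg.total_epochs)          # 5
```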
@ -0,0 +1,138 @@
_base_ = []
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=1,
    hooks=[
        dict(type='TextLoggerHook')
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

# model
label_convertor = dict(
    type='CTCConvertor', dict_type='DICT36', with_unknown=False, lower=True)

model = dict(
    type='CRNNNet',
    preprocessor=None,
    backbone=dict(type='VeryDeepVgg', leakyRelu=False, input_channels=1),
    encoder=None,
    decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
    loss=dict(type='CTCLoss'),
    label_convertor=label_convertor,
    pretrained=None)

train_cfg = None
test_cfg = None

# optimizer
optimizer = dict(type='Adadelta', lr=1.0)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[])
total_epochs = 5

# data
img_norm_cfg = dict(mean=[0.5], std=[0.5])

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=32,
        min_width=100,
        max_width=100,
        keep_aspect_ratio=False),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=32,
        min_width=4,
        max_width=None,
        keep_aspect_ratio=True),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=['filename', 'ori_shape', 'img_shape', 'valid_ratio']),
]

dataset_type = 'OCRDataset'

train_img_prefix = 'data/mixture/Syn90k/mnt/ramdisk/max/90kDICT32px'
train_ann_file = 'data/mixture/Syn90k/label.lmdb'

train1 = dict(
    type=dataset_type,
    img_prefix=train_img_prefix,
    ann_file=train_ann_file,
    loader=dict(
        type='LmdbLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

test_prefix = 'data/mixture/'
test_img_prefix1 = test_prefix + 'icdar_2013/'
test_img_prefix2 = test_prefix + 'IIIT5K/'
test_img_prefix3 = test_prefix + 'svt/'

test_ann_file1 = test_prefix + 'icdar_2013/test_label_1015.txt'
test_ann_file2 = test_prefix + 'IIIT5K/test_label.txt'
test_ann_file3 = test_prefix + 'svt/test_label.txt'

test1 = dict(
    type=dataset_type,
    img_prefix=test_img_prefix1,
    ann_file=test_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

test2 = {key: value for key, value in test1.items()}
test2['img_prefix'] = test_img_prefix2
test2['ann_file'] = test_ann_file2

test3 = {key: value for key, value in test1.items()}
test3['img_prefix'] = test_img_prefix3
test3['ann_file'] = test_ann_file3

data = dict(
    samples_per_gpu=64,
    workers_per_gpu=4,
    train=dict(type='ConcatDataset', datasets=[train1]),
    val=dict(type='ConcatDataset', datasets=[test1, test2, test3]),
    test=dict(type='ConcatDataset', datasets=[test1, test2, test3]))

evaluation = dict(interval=1, metric='acc')

cudnn_benchmark = True
@ -0,0 +1,6 @@
_base_ = [
    '../../_base_/schedules/schedule_adadelta_8e.py',
    '../../_base_/default_runtime.py',
    '../../_base_/recog_datasets/toy_dataset.py',
    '../../_base_/recog_models/crnn.py'
]
@ -0,0 +1,61 @@
# NRTR

## Introduction

[ALGORITHM]

```bibtex
@inproceedings{sheng2019nrtr,
  title={NRTR: A no-recurrence sequence-to-sequence model for scene text recognition},
  author={Sheng, Fenfen and Chen, Zhineng and Xu, Bo},
  booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
  pages={781--786},
  year={2019},
  organization={IEEE}
}
```

[BACKBONE]

```bibtex
@inproceedings{li2019show,
  title={Show, attend and read: A simple and strong baseline for irregular text recognition},
  author={Li, Hui and Wang, Peng and Shen, Chunhua and Zhang, Guyu},
  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={33},
  number={01},
  pages={8610--8617},
  year={2019}
}
```

## Dataset

### Train Dataset

| trainset  | instance_num | repeat_num | source |
| :-------: | :----------: | :--------: | :----: |
| SynthText | 7266686      | 1          | synth  |
| Syn90k    | 8919273      | 1          | synth  |

### Test Dataset

| testset | instance_num | type      |
| :-----: | :----------: | :-------: |
| IIIT5K  | 3000         | regular   |
| SVT     | 647          | regular   |
| IC13    | 1015         | regular   |
| IC15    | 2077         | irregular |
| SVTP    | 645          | irregular |
| CT80    | 288          | irregular |

## Results and Models

| Methods | Backbone | IIIT5K | SVT  | IC13 | IC15 | SVTP | CT80 | download |
| :-----: | :------: | :----: | :--: | :--: | :--: | :--: | :--: | :------: |
| [NRTR](/configs/textrecog/nrtr/nrtr_r31_academic.py) | R31-1/16-1/8 | 93.9 | 90.0 | 93.5 | 74.5 | 78.5 | 86.5 | [model](https://download.openmmlab.com/mmocr/textrecog/nrtr/nrtr_r31_academic_20210406-954db95e.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/nrtr/20210406_010150.log.json) |

IIIT5K, SVT and IC13 are regular-text benchmarks; IC15, SVTP and CT80 are irregular.

**Notes:**

- `R31-1/16-1/8` means the backbone's output feature map has 1/16 the height and 1/8 the width of the input image; the sketch below works through the arithmetic.
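
A minimal sketch of that downsampling arithmetic, assuming the 32-pixel-high, up-to-160-pixel-wide crops produced by `ResizeOCR` in the academic config (illustrative only):

```python
def nrtr_feature_shape(height, width):
    """Output feature size for an R31-1/16-1/8 backbone (sketch)."""
    return height // 16, width // 8


# A 32x160 crop yields a 2x20 feature map, i.e. 20 feature columns
# feed the transformer encoder per image.
print(nrtr_feature_shape(32, 160))  # (2, 20)
```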
@ -0,0 +1,112 @@
_base_ = [
    '../../_base_/default_runtime.py',
    '../../_base_/recog_models/nrtr.py',
]

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 6

img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
    dict(
        type='ResizeOCR',
        height=32,
        min_width=32,
        max_width=100,
        keep_aspect_ratio=False),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            dict(
                type='ResizeOCR',
                height=32,
                min_width=32,
                max_width=100,
                keep_aspect_ratio=False),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio'
                ]),
        ])
]

dataset_type = 'OCRDataset'
img_prefix = 'tests/data/ocr_toy_dataset/imgs'
train_anno_file1 = 'tests/data/ocr_toy_dataset/label.txt'
train1 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=100,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train_anno_file2 = 'tests/data/ocr_toy_dataset/label.lmdb'
train2 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file2,
    loader=dict(
        type='LmdbLoader',
        repeat=100,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

test_anno_file1 = 'tests/data/ocr_toy_dataset/label.lmdb'
test = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=test_anno_file1,
    loader=dict(
        type='LmdbLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

data = dict(
    samples_per_gpu=16,
    workers_per_gpu=2,
    train=dict(type='ConcatDataset', datasets=[train1, train2]),
    val=dict(type='ConcatDataset', datasets=[test]),
    test=dict(type='ConcatDataset', datasets=[test]))

evaluation = dict(interval=1, metric='acc')
@ -0,0 +1,163 @@
_base_ = [
    '../../_base_/default_runtime.py', '../../_base_/recog_models/nrtr.py'
]

label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=True)

model = dict(
    type='NRTR',
    backbone=dict(
        type='ResNet31OCR',
        layers=[1, 2, 5, 3],
        channels=[32, 64, 128, 256, 512, 512],
        stage4_pool_cfg=dict(kernel_size=(2, 1), stride=(2, 1)),
        last_stage_pool=True),
    encoder=dict(type='TFEncoder'),
    decoder=dict(type='TFDecoder'),
    loss=dict(type='TFLoss'),
    label_convertor=label_convertor,
    max_seq_len=40)

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 6

img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=32,
        min_width=32,
        max_width=160,
        keep_aspect_ratio=True,
        width_downsample_ratio=0.25),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            dict(
                type='ResizeOCR',
                height=32,
                min_width=32,
                max_width=160,
                keep_aspect_ratio=True,
                width_downsample_ratio=0.25),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio'
                ]),
        ])
]

dataset_type = 'OCRDataset'

train_prefix = 'data/mixture/'

train_img_prefix1 = train_prefix + \
    'SynthText/synthtext/SynthText_patch_horizontal'
train_img_prefix2 = train_prefix + 'Syn90k/mnt/ramdisk/max/90kDICT32px'

# No trailing comma here: it would turn the path into a 1-tuple.
train_ann_file1 = train_prefix + 'SynthText/label.lmdb'
train_ann_file2 = train_prefix + 'Syn90k/label.lmdb'

train1 = dict(
    type=dataset_type,
    img_prefix=train_img_prefix1,
    ann_file=train_ann_file1,
    loader=dict(
        type='LmdbLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train2 = {key: value for key, value in train1.items()}
train2['img_prefix'] = train_img_prefix2
train2['ann_file'] = train_ann_file2

test_prefix = 'data/mixture/'
test_img_prefix1 = test_prefix + 'IIIT5K/'
test_img_prefix2 = test_prefix + 'svt/'
test_img_prefix3 = test_prefix + 'icdar_2013/'
test_img_prefix4 = test_prefix + 'icdar_2015/'
test_img_prefix5 = test_prefix + 'svtp/'
test_img_prefix6 = test_prefix + 'ct80/'

test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
test_ann_file2 = test_prefix + 'svt/test_label.txt'
test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
test_ann_file4 = test_prefix + 'icdar_2015/test_label.txt'
test_ann_file5 = test_prefix + 'svtp/test_label.txt'
test_ann_file6 = test_prefix + 'ct80/test_label.txt'

test1 = dict(
    type=dataset_type,
    img_prefix=test_img_prefix1,
    ann_file=test_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

test2 = {key: value for key, value in test1.items()}
test2['img_prefix'] = test_img_prefix2
test2['ann_file'] = test_ann_file2

test3 = {key: value for key, value in test1.items()}
test3['img_prefix'] = test_img_prefix3
test3['ann_file'] = test_ann_file3

test4 = {key: value for key, value in test1.items()}
test4['img_prefix'] = test_img_prefix4
test4['ann_file'] = test_ann_file4

test5 = {key: value for key, value in test1.items()}
test5['img_prefix'] = test_img_prefix5
test5['ann_file'] = test_ann_file5

test6 = {key: value for key, value in test1.items()}
test6['img_prefix'] = test_img_prefix6
test6['ann_file'] = test_ann_file6

data = dict(
    samples_per_gpu=128,
    workers_per_gpu=4,
    train=dict(type='ConcatDataset', datasets=[train1, train2]),
    val=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]),
    test=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]))

evaluation = dict(interval=1, metric='acc')
@ -0,0 +1,51 @@
# RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition

## Introduction

[ALGORITHM]

```bibtex
@inproceedings{yue2020robustscanner,
  title={RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition},
  author={Yue, Xiaoyu and Kuang, Zhanghui and Lin, Chenhao and Sun, Hongbin and Zhang, Wayne},
  booktitle={European Conference on Computer Vision},
  year={2020}
}
```

## Dataset

### Train Dataset

| trainset   | instance_num | repeat_num | source                   |
| :--------: | :----------: | :--------: | :----------------------: |
| icdar_2011 | 3567         | 20         | real                     |
| icdar_2013 | 848          | 20         | real                     |
| icdar2015  | 4468         | 20         | real                     |
| coco_text  | 42142        | 20         | real                     |
| IIIT5K     | 2000         | 20         | real                     |
| SynthText  | 2400000      | 1          | synth                    |
| SynthAdd   | 1216889      | 1          | synth, 1.6m in [[1]](#1) |
| Syn90k     | 2400000      | 1          | synth                    |

### Test Dataset

| testset | instance_num | type                        |
| :-----: | :----------: | :-------------------------: |
| IIIT5K  | 3000         | regular                     |
| SVT     | 647          | regular                     |
| IC13    | 1015         | regular                     |
| IC15    | 2077         | irregular                   |
| SVTP    | 645          | irregular, 639 in [[1]](#1) |
| CT80    | 288          | irregular                   |

## Results and Models

| Methods | GPUs | IIIT5K | SVT  | IC13 | IC15 | SVTP | CT80 | download |
| :-----: | :--: | :----: | :--: | :--: | :--: | :--: | :--: | :------: |
| [RobustScanner](/configs/textrecog/robust_scanner/robustscanner_r31_academic.py) | 16 | 95.1 | 89.2 | 93.1 | 77.8 | 80.3 | 90.3 | [model](https://download.openmmlab.com/mmocr/textrecog/robustscanner/robustscanner_r31_academic-5f05874f.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/robustscanner/20210401_170932.log.json) |

IIIT5K, SVT and IC13 are regular-text benchmarks; IC15, SVTP and CT80 are irregular. The per-dataset repeat counts that shape the training mixture are illustrated after this section.

## References

<a id="1">[1]</a> Li, Hui and Wang, Peng and Shen, Chunhua and Zhang, Guyu. Show, attend and read: A simple and strong baseline for irregular text recognition. In AAAI 2019.
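
The `repeat_num` column above determines how often each source is replayed per epoch; the academic config below implements it via each loader's `repeat` field. A hedged back-of-the-envelope sketch of the resulting mixture, with numbers taken straight from the Train Dataset table:

```python
# (instance_num, repeat_num) per training source, from the table above.
train_sources = {
    'icdar_2011': (3567, 20),
    'icdar_2013': (848, 20),
    'icdar2015': (4468, 20),
    'coco_text': (42142, 20),
    'IIIT5K': (2000, 20),
    'SynthText': (2400000, 1),
    'SynthAdd': (1216889, 1),
    'Syn90k': (2400000, 1),
}

effective = {name: n * r for name, (n, r) in train_sources.items()}
total = sum(effective.values())
for name, count in effective.items():
    print(f'{name:>10}: {count:>8} ({count / total:.1%})')
# The 20x repeats bring the real data to roughly 15% of the mixture,
# despite it being vastly outnumbered in raw instance counts.
```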
@ -0,0 +1,12 @@
_base_ = [
    '../../_base_/default_runtime.py',
    '../../_base_/recog_models/robust_scanner.py',
    '../../_base_/recog_datasets/toy_dataset.py'
]

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 6
@ -0,0 +1,197 @@
_base_ = [
    '../../_base_/default_runtime.py',
    '../../_base_/recog_models/robust_scanner.py'
]

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 5

img_norm_cfg = dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=48,
        min_width=48,
        max_width=160,
        keep_aspect_ratio=True,
        width_downsample_ratio=0.25),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            dict(
                type='ResizeOCR',
                height=48,
                min_width=48,
                max_width=160,
                keep_aspect_ratio=True,
                width_downsample_ratio=0.25),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio'
                ]),
        ])
]

dataset_type = 'OCRDataset'

train_prefix = 'data/mixture/'

train_img_prefix1 = train_prefix + 'icdar_2011'
train_img_prefix2 = train_prefix + 'icdar_2013'
train_img_prefix3 = train_prefix + 'icdar_2015'
train_img_prefix4 = train_prefix + 'coco_text'
train_img_prefix5 = train_prefix + 'III5K'
train_img_prefix6 = train_prefix + 'SynthText_Add'
train_img_prefix7 = train_prefix + 'SynthText'
train_img_prefix8 = train_prefix + 'Syn90k'

# No trailing commas here: they would turn each path into a 1-tuple.
train_ann_file1 = train_prefix + 'icdar_2011/train_label.txt'
train_ann_file2 = train_prefix + 'icdar_2013/train_label.txt'
train_ann_file3 = train_prefix + 'icdar_2015/train_label.txt'
train_ann_file4 = train_prefix + 'coco_text/train_label.txt'
train_ann_file5 = train_prefix + 'III5K/train_label.txt'
train_ann_file6 = train_prefix + 'SynthText_Add/label.txt'
train_ann_file7 = train_prefix + 'SynthText/shuffle_labels.txt'
train_ann_file8 = train_prefix + 'Syn90k/shuffle_labels.txt'

train1 = dict(
    type=dataset_type,
    img_prefix=train_img_prefix1,
    ann_file=train_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=20,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train2 = {key: value for key, value in train1.items()}
train2['img_prefix'] = train_img_prefix2
train2['ann_file'] = train_ann_file2

train3 = {key: value for key, value in train1.items()}
train3['img_prefix'] = train_img_prefix3
train3['ann_file'] = train_ann_file3

train4 = {key: value for key, value in train1.items()}
train4['img_prefix'] = train_img_prefix4
train4['ann_file'] = train_ann_file4

train5 = {key: value for key, value in train1.items()}
train5['img_prefix'] = train_img_prefix5
train5['ann_file'] = train_ann_file5

train6 = dict(
    type=dataset_type,
    img_prefix=train_img_prefix6,
    ann_file=train_ann_file6,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train7 = {key: value for key, value in train6.items()}
train7['img_prefix'] = train_img_prefix7
train7['ann_file'] = train_ann_file7

train8 = {key: value for key, value in train6.items()}
train8['img_prefix'] = train_img_prefix8
train8['ann_file'] = train_ann_file8

test_prefix = 'data/mixture/'
test_img_prefix1 = test_prefix + 'IIIT5K/'
test_img_prefix2 = test_prefix + 'svt/'
test_img_prefix3 = test_prefix + 'icdar_2013/'
test_img_prefix4 = test_prefix + 'icdar_2015/'
test_img_prefix5 = test_prefix + 'svtp/'
test_img_prefix6 = test_prefix + 'ct80/'

test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
test_ann_file2 = test_prefix + 'svt/test_label.txt'
test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
test_ann_file4 = test_prefix + 'icdar_2015/test_label.txt'
test_ann_file5 = test_prefix + 'svtp/test_label.txt'
test_ann_file6 = test_prefix + 'ct80/test_label.txt'

test1 = dict(
    type=dataset_type,
    img_prefix=test_img_prefix1,
    ann_file=test_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

test2 = {key: value for key, value in test1.items()}
test2['img_prefix'] = test_img_prefix2
test2['ann_file'] = test_ann_file2

test3 = {key: value for key, value in test1.items()}
test3['img_prefix'] = test_img_prefix3
test3['ann_file'] = test_ann_file3

test4 = {key: value for key, value in test1.items()}
test4['img_prefix'] = test_img_prefix4
test4['ann_file'] = test_ann_file4

test5 = {key: value for key, value in test1.items()}
test5['img_prefix'] = test_img_prefix5
test5['ann_file'] = test_ann_file5

test6 = {key: value for key, value in test1.items()}
test6['img_prefix'] = test_img_prefix6
test6['ann_file'] = test_ann_file6

data = dict(
    samples_per_gpu=64,
    workers_per_gpu=2,
    train=dict(
        type='ConcatDataset',
        datasets=[
            train1, train2, train3, train4, train5, train6, train7, train8
        ]),
    val=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]),
    test=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]))

evaluation = dict(interval=1, metric='acc')
@ -0,0 +1,67 @@
# Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition

## Introduction

[ALGORITHM]

```bibtex
@inproceedings{li2019show,
  title={Show, attend and read: A simple and strong baseline for irregular text recognition},
  author={Li, Hui and Wang, Peng and Shen, Chunhua and Zhang, Guyu},
  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={33},
  number={01},
  pages={8610--8617},
  year={2019}
}
```

## Dataset

### Train Dataset

| trainset   | instance_num | repeat_num | source                   |
| :--------: | :----------: | :--------: | :----------------------: |
| icdar_2011 | 3567         | 20         | real                     |
| icdar_2013 | 848          | 20         | real                     |
| icdar2015  | 4468         | 20         | real                     |
| coco_text  | 42142        | 20         | real                     |
| IIIT5K     | 2000         | 20         | real                     |
| SynthText  | 2400000      | 1          | synth                    |
| SynthAdd   | 1216889      | 1          | synth, 1.6m in [[1]](#1) |
| Syn90k     | 2400000      | 1          | synth                    |

### Test Dataset

| testset | instance_num | type                        |
| :-----: | :----------: | :-------------------------: |
| IIIT5K  | 3000         | regular                     |
| SVT     | 647          | regular                     |
| IC13    | 1015         | regular                     |
| IC15    | 2077         | irregular                   |
| SVTP    | 645          | irregular, 639 in [[1]](#1) |
| CT80    | 288          | irregular                   |

## Results and Models

| Methods | Backbone | Decoder | IIIT5K | SVT  | IC13 | IC15 | SVTP | CT80 | download |
| :-----: | :------: | :-----: | :----: | :--: | :--: | :--: | :--: | :--: | :------: |
| [SAR](/configs/textrecog/sar/sar_r31_parallel_decoder_academic.py) | R31-1/8-1/4 | ParallelSARDecoder | 95.0 | 89.6 | 93.7 | 79.0 | 82.2 | 88.9 | [model](https://download.openmmlab.com/mmocr/textrecog/sar/sar_r31_parallel_decoder_academic-dba3a4a3.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/sar/20210327_154129.log.json) |
| [SAR](/configs/textrecog/sar/sar_r31_sequential_decoder_academic.py) | R31-1/8-1/4 | SequentialSARDecoder | 95.2 | 88.7 | 92.4 | 78.2 | 81.9 | 89.6 | [model](https://download.openmmlab.com/mmocr/textrecog/sar/sar_r31_sequential_decoder_academic-d06c9a8e.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/sar/20210330_105728.log.json) |

IIIT5K, SVT and IC13 are regular-text benchmarks; IC15, SVTP and CT80 are irregular.

**Notes:**

- `R31-1/8-1/4` means the backbone's output feature map has 1/8 the height and 1/4 the width of the input image.
- We did not use beam search during decoding.
- We implemented two kinds of decoder, namely `ParallelSARDecoder` and `SequentialSARDecoder`.
  - `ParallelSARDecoder`: parallel decoding during training with an `LSTM` layer. It is faster.
  - `SequentialSARDecoder`: sequential decoding during training with `LSTMCell`. It is easier to understand.
- For the train dataset:
  - We did not construct the 20 distinct data groups of [[1]](#1) and train the model group by group, since that would make training overly complicated.
  - Instead, we randomly selected `2.4m` patches from `Syn90k`, `2.4m` from `SynthText` and `1.2m` from `SynthAdd`, and grouped all data together. See [config](https://download.openmmlab.com/mmocr/textrecog/sar/sar_r31_academic.py) for details.
- We used 48 GPUs with `total_batch_size = 64 * 48` in the experiment above to speed up training, while keeping `initial lr = 1e-3` unchanged; the sketch below spells out that arithmetic.

## References

<a id="1">[1]</a> Li, Hui and Wang, Peng and Shen, Chunhua and Zhang, Guyu. Show, attend and read: A simple and strong baseline for irregular text recognition. In AAAI 2019.
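
A minimal sketch of the multi-GPU batch arithmetic referenced in the last note, using the `samples_per_gpu=64` value from the academic configs below (the linear-scaling comparison is purely illustrative, not a claim about what was tuned here):

```python
samples_per_gpu = 64   # from the SAR academic configs below
num_gpus = 48          # from the note above

total_batch_size = samples_per_gpu * num_gpus
print(total_batch_size)  # 3072

# The experiment kept lr = 1e-3 regardless of GPU count; a linear
# scaling rule would instead multiply a single-GPU lr by num_gpus.
base_lr = 1e-3
linear_scaled_lr = base_lr * num_gpus  # 4.8e-2, NOT what was used here
```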
@ -0,0 +1,219 @@
_base_ = ['../../_base_/default_runtime.py']

label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=True)

model = dict(
    type='SARNet',
    backbone=dict(type='ResNet31OCR'),
    encoder=dict(
        type='SAREncoder',
        enc_bi_rnn=False,
        enc_do_rnn=0.1,
        enc_gru=False,
    ),
    decoder=dict(
        type='ParallelSARDecoder',
        enc_bi_rnn=False,
        dec_bi_rnn=False,
        dec_do_rnn=0,
        dec_gru=False,
        pred_dropout=0.1,
        d_k=512,
        pred_concat=True),
    loss=dict(type='SARLoss'),
    label_convertor=label_convertor,
    max_seq_len=30)

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 5

img_norm_cfg = dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=48,
        min_width=48,
        max_width=160,
        keep_aspect_ratio=True,
        width_downsample_ratio=0.25),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            dict(
                type='ResizeOCR',
                height=48,
                min_width=48,
                max_width=160,
                keep_aspect_ratio=True,
                width_downsample_ratio=0.25),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio'
                ]),
        ])
]

dataset_type = 'OCRDataset'

train_prefix = 'data/mixture/'

train_img_prefix1 = train_prefix + 'icdar_2011'
train_img_prefix2 = train_prefix + 'icdar_2013'
train_img_prefix3 = train_prefix + 'icdar_2015'
train_img_prefix4 = train_prefix + 'coco_text'
train_img_prefix5 = train_prefix + 'III5K'
train_img_prefix6 = train_prefix + 'SynthText_Add'
train_img_prefix7 = train_prefix + 'SynthText'
train_img_prefix8 = train_prefix + 'Syn90k'

# No trailing commas here: they would turn each path into a 1-tuple.
train_ann_file1 = train_prefix + 'icdar_2011/train_label.txt'
train_ann_file2 = train_prefix + 'icdar_2013/train_label.txt'
train_ann_file3 = train_prefix + 'icdar_2015/train_label.txt'
train_ann_file4 = train_prefix + 'coco_text/train_label.txt'
train_ann_file5 = train_prefix + 'III5K/train_label.txt'
train_ann_file6 = train_prefix + 'SynthText_Add/label.txt'
train_ann_file7 = train_prefix + 'SynthText/shuffle_labels.txt'
train_ann_file8 = train_prefix + 'Syn90k/shuffle_labels.txt'

train1 = dict(
    type=dataset_type,
    img_prefix=train_img_prefix1,
    ann_file=train_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=20,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train2 = {key: value for key, value in train1.items()}
train2['img_prefix'] = train_img_prefix2
train2['ann_file'] = train_ann_file2

train3 = {key: value for key, value in train1.items()}
train3['img_prefix'] = train_img_prefix3
train3['ann_file'] = train_ann_file3

train4 = {key: value for key, value in train1.items()}
train4['img_prefix'] = train_img_prefix4
train4['ann_file'] = train_ann_file4

train5 = {key: value for key, value in train1.items()}
train5['img_prefix'] = train_img_prefix5
train5['ann_file'] = train_ann_file5

train6 = dict(
    type=dataset_type,
    img_prefix=train_img_prefix6,
    ann_file=train_ann_file6,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train7 = {key: value for key, value in train6.items()}
train7['img_prefix'] = train_img_prefix7
train7['ann_file'] = train_ann_file7

train8 = {key: value for key, value in train6.items()}
train8['img_prefix'] = train_img_prefix8
train8['ann_file'] = train_ann_file8

test_prefix = 'data/mixture/'
test_img_prefix1 = test_prefix + 'IIIT5K/'
test_img_prefix2 = test_prefix + 'svt/'
test_img_prefix3 = test_prefix + 'icdar_2013/'
test_img_prefix4 = test_prefix + 'icdar_2015/'
test_img_prefix5 = test_prefix + 'svtp/'
test_img_prefix6 = test_prefix + 'ct80/'

test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
test_ann_file2 = test_prefix + 'svt/test_label.txt'
test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
test_ann_file4 = test_prefix + 'icdar_2015/test_label.txt'
test_ann_file5 = test_prefix + 'svtp/test_label.txt'
test_ann_file6 = test_prefix + 'ct80/test_label.txt'

test1 = dict(
    type=dataset_type,
    img_prefix=test_img_prefix1,
    ann_file=test_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

test2 = {key: value for key, value in test1.items()}
test2['img_prefix'] = test_img_prefix2
test2['ann_file'] = test_ann_file2

test3 = {key: value for key, value in test1.items()}
test3['img_prefix'] = test_img_prefix3
test3['ann_file'] = test_ann_file3

test4 = {key: value for key, value in test1.items()}
test4['img_prefix'] = test_img_prefix4
test4['ann_file'] = test_ann_file4

test5 = {key: value for key, value in test1.items()}
test5['img_prefix'] = test_img_prefix5
test5['ann_file'] = test_ann_file5

test6 = {key: value for key, value in test1.items()}
test6['img_prefix'] = test_img_prefix6
test6['ann_file'] = test_ann_file6

data = dict(
    samples_per_gpu=64,
    workers_per_gpu=2,
    train=dict(
        type='ConcatDataset',
        datasets=[
            train1, train2, train3, train4, train5, train6, train7, train8
        ]),
    val=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]),
    test=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]))

evaluation = dict(interval=1, metric='acc')
@ -0,0 +1,110 @@
_base_ = [
    '../../_base_/default_runtime.py', '../../_base_/recog_models/sar.py'
]

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 5

img_norm_cfg = dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=48,
        min_width=48,
        max_width=160,
        keep_aspect_ratio=True),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            dict(
                type='ResizeOCR',
                height=48,
                min_width=48,
                max_width=160,
                keep_aspect_ratio=True),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio'
                ]),
        ])
]

dataset_type = 'OCRDataset'
img_prefix = 'tests/data/ocr_toy_dataset/imgs'
train_anno_file1 = 'tests/data/ocr_toy_dataset/label.txt'
train1 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=100,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train_anno_file2 = 'tests/data/ocr_toy_dataset/label.lmdb'
train2 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file2,
    loader=dict(
        type='LmdbLoader',
        repeat=100,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

test_anno_file1 = 'tests/data/ocr_toy_dataset/label.lmdb'
test = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=test_anno_file1,
    loader=dict(
        type='LmdbLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

data = dict(
    samples_per_gpu=16,
    workers_per_gpu=2,
    train=dict(type='ConcatDataset', datasets=[train1, train2]),
    val=dict(type='ConcatDataset', datasets=[test]),
    test=dict(type='ConcatDataset', datasets=[test]))

evaluation = dict(interval=1, metric='acc')
@ -0,0 +1,219 @@
_base_ = ['../../_base_/default_runtime.py']

label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=True)

model = dict(
    type='SARNet',
    backbone=dict(type='ResNet31OCR'),
    encoder=dict(
        type='SAREncoder',
        enc_bi_rnn=False,
        enc_do_rnn=0.1,
        enc_gru=False,
    ),
    decoder=dict(
        type='SequentialSARDecoder',
        enc_bi_rnn=False,
        dec_bi_rnn=False,
        dec_do_rnn=0,
        dec_gru=False,
        pred_dropout=0.1,
        d_k=512,
        pred_concat=True),
    loss=dict(type='SARLoss'),
    label_convertor=label_convertor,
    max_seq_len=30)

# optimizer
optimizer = dict(type='Adam', lr=1e-3)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 5

img_norm_cfg = dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=48,
        min_width=48,
        max_width=160,
        keep_aspect_ratio=True,
        width_downsample_ratio=0.25),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            dict(
                type='ResizeOCR',
                height=48,
                min_width=48,
                max_width=160,
                keep_aspect_ratio=True,
                width_downsample_ratio=0.25),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio'
                ]),
        ])
]

dataset_type = 'OCRDataset'

train_prefix = 'data/mixture/'

train_img_prefix1 = train_prefix + 'icdar_2011'
train_img_prefix2 = train_prefix + 'icdar_2013'
train_img_prefix3 = train_prefix + 'icdar_2015'
train_img_prefix4 = train_prefix + 'coco_text'
train_img_prefix5 = train_prefix + 'III5K'
train_img_prefix6 = train_prefix + 'SynthText_Add'
train_img_prefix7 = train_prefix + 'SynthText'
train_img_prefix8 = train_prefix + 'Syn90k'

train_ann_file1 = train_prefix + 'icdar_2011/train_label.txt'
train_ann_file2 = train_prefix + 'icdar_2013/train_label.txt'
train_ann_file3 = train_prefix + 'icdar_2015/train_label.txt'
train_ann_file4 = train_prefix + 'coco_text/train_label.txt'
train_ann_file5 = train_prefix + 'III5K/train_label.txt'
train_ann_file6 = train_prefix + 'SynthText_Add/label.txt'
train_ann_file7 = train_prefix + 'SynthText/shuffle_labels.txt'
train_ann_file8 = train_prefix + 'Syn90k/shuffle_labels.txt'

train1 = dict(
    type=dataset_type,
    img_prefix=train_img_prefix1,
    ann_file=train_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=20,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

# train2..train5 reuse train1's settings via a shallow copy and only
# override the image prefix and annotation file.
train2 = {key: value for key, value in train1.items()}
train2['img_prefix'] = train_img_prefix2
train2['ann_file'] = train_ann_file2

train3 = {key: value for key, value in train1.items()}
train3['img_prefix'] = train_img_prefix3
train3['ann_file'] = train_ann_file3

train4 = {key: value for key, value in train1.items()}
train4['img_prefix'] = train_img_prefix4
train4['ann_file'] = train_ann_file4

train5 = {key: value for key, value in train1.items()}
train5['img_prefix'] = train_img_prefix5
train5['ann_file'] = train_ann_file5

train6 = dict(
    type=dataset_type,
    img_prefix=train_img_prefix6,
    ann_file=train_ann_file6,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)

train7 = {key: value for key, value in train6.items()}
train7['img_prefix'] = train_img_prefix7
train7['ann_file'] = train_ann_file7

train8 = {key: value for key, value in train6.items()}
train8['img_prefix'] = train_img_prefix8
train8['ann_file'] = train_ann_file8

test_prefix = 'data/mixture/'
test_img_prefix1 = test_prefix + 'IIIT5K/'
test_img_prefix2 = test_prefix + 'svt/'
test_img_prefix3 = test_prefix + 'icdar_2013/'
test_img_prefix4 = test_prefix + 'icdar_2015/'
test_img_prefix5 = test_prefix + 'svtp/'
test_img_prefix6 = test_prefix + 'ct80/'

test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
test_ann_file2 = test_prefix + 'svt/test_label.txt'
test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
test_ann_file4 = test_prefix + 'icdar_2015/test_label.txt'
test_ann_file5 = test_prefix + 'svtp/test_label.txt'
test_ann_file6 = test_prefix + 'ct80/test_label.txt'

test1 = dict(
    type=dataset_type,
    img_prefix=test_img_prefix1,
    ann_file=test_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

test2 = {key: value for key, value in test1.items()}
test2['img_prefix'] = test_img_prefix2
test2['ann_file'] = test_ann_file2

test3 = {key: value for key, value in test1.items()}
test3['img_prefix'] = test_img_prefix3
test3['ann_file'] = test_ann_file3

test4 = {key: value for key, value in test1.items()}
test4['img_prefix'] = test_img_prefix4
test4['ann_file'] = test_ann_file4

test5 = {key: value for key, value in test1.items()}
test5['img_prefix'] = test_img_prefix5
test5['ann_file'] = test_ann_file5

test6 = {key: value for key, value in test1.items()}
test6['img_prefix'] = test_img_prefix6
test6['ann_file'] = test_ann_file6

data = dict(
    samples_per_gpu=64,
    workers_per_gpu=2,
    train=dict(
        type='ConcatDataset',
        datasets=[
            train1, train2, train3, train4, train5, train6, train7, train8
        ]),
    val=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]),
    test=dict(
        type='ConcatDataset',
        datasets=[test1, test2, test3, test4, test5, test6]))

evaluation = dict(interval=1, metric='acc')
@ -0,0 +1,43 @@
# SegOCR Simple Baseline.

## Introduction

[ALGORITHM]

```bibtex
@unpublished{key,
  title={SegOCR Simple Baseline.},
  author={},
  note={Unpublished Manuscript},
  year={2021}
}
```

## Dataset

### Train Dataset

| trainset  | instance_num | repeat_num | source |
| :-------: | :----------: | :--------: | :----: |
| SynthText |   7266686    |     1      | synth  |

### Test Dataset

| testset | instance_num |   type    |
| :-----: | :----------: | :-------: |
| IIIT5K  |     3000     |  regular  |
|   SVT   |     647      |  regular  |
|  IC13   |     1015     |  regular  |
|  CT80   |     288      | irregular |

## Results and Models

IIIT5K, SVT and IC13 are regular-text benchmarks, while CT80 is irregular.

| Backbone |  Neck  | Head | IIIT5K | SVT  | IC13 | CT80 | download |
| :------: | :----: | :--: | :----: | :--: | :--: | :--: | :------: |
| R31-1/16 | FPNOCR |  1x  |  90.9  | 81.8 | 90.7 | 80.9 | [model](https://download.openmmlab.com/mmocr/textrecog/seg/seg_r31_1by16_fpnocr_academic-72235b11.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/seg/20210325_112835.log.json) |

**Notes:**

- `R31-1/16` means the size (both height and width) of the feature map from the backbone is 1/16 of the input image.
- `1x` means the size (both height and width) of the feature map from the head is the same as the input image.
@ -0,0 +1,160 @@
_base_ = ['../../_base_/default_runtime.py']

# optimizer
optimizer = dict(type='Adam', lr=1e-4)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 5

label_convertor = dict(
    type='SegConvertor', dict_type='DICT36', with_unknown=True, lower=True)

model = dict(
    type='SegRecognizer',
    backbone=dict(
        type='ResNet31OCR',
        layers=[1, 2, 5, 3],
        channels=[32, 64, 128, 256, 512, 512],
        out_indices=[0, 1, 2, 3],
        stage4_pool_cfg=dict(kernel_size=2, stride=2),
        last_stage_pool=True),
    neck=dict(
        type='FPNOCR', in_channels=[128, 256, 512, 512], out_channels=256),
    head=dict(
        type='SegHead',
        in_channels=256,
        upsample_param=dict(scale_factor=2.0, mode='nearest')),
    loss=dict(
        type='SegLoss', seg_downsample_ratio=1.0, seg_with_loss_weight=True),
    label_convertor=label_convertor)

find_unused_parameters = True

img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

gt_label_convertor = dict(
    type='SegConvertor', dict_type='DICT36', with_unknown=True, lower=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomPaddingOCR',
        max_ratio=[0.15, 0.2, 0.15, 0.2],
        box_type='char_quads'),
    dict(type='OpencvToPil'),
    dict(
        type='RandomRotateImageBox',
        min_angle=-17,
        max_angle=17,
        box_type='char_quads'),
    dict(type='PilToOpencv'),
    dict(
        type='ResizeOCR',
        height=64,
        min_width=64,
        max_width=512,
        keep_aspect_ratio=True),
    dict(
        type='OCRSegTargets',
        label_convertor=gt_label_convertor,
        box_type='char_quads'),
    dict(type='RandomRotateTextDet', rotate_ratio=0.5, max_angle=15),
    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
    dict(type='ToTensorOCR'),
    dict(type='FancyPCA'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='CustomFormatBundle',
        keys=['gt_kernels'],
        visualize=dict(flag=False, boundary_key=None),
        call_super=False),
    dict(
        type='Collect',
        keys=['img', 'gt_kernels'],
        meta_keys=['filename', 'ori_shape', 'img_shape'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeOCR',
        height=64,
        min_width=64,
        max_width=None,
        keep_aspect_ratio=True),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(type='CustomFormatBundle', call_super=False),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=['filename', 'ori_shape', 'img_shape'])
]

train_img_root = 'data/mixture/'

train_img_prefix = train_img_root + 'SynthText'

train_ann_file = train_img_root + 'SynthText/instances_train.txt'

train = dict(
    type='OCRSegDataset',
    img_prefix=train_img_prefix,
    ann_file=train_ann_file,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
    pipeline=train_pipeline,
    test_mode=False)

dataset_type = 'OCRDataset'
test_prefix = 'data/mixture/'

test_img_prefix1 = test_prefix + 'IIIT5K/'
test_img_prefix2 = test_prefix + 'svt/'
test_img_prefix3 = test_prefix + 'icdar_2013/'
test_img_prefix4 = test_prefix + 'ct80/'

test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
test_ann_file2 = test_prefix + 'svt/test_label.txt'
test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
test_ann_file4 = test_prefix + 'ct80/test_label.txt'

test1 = dict(
    type=dataset_type,
    img_prefix=test_img_prefix1,
    ann_file=test_ann_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=test_pipeline,
    test_mode=True)

test2 = {key: value for key, value in test1.items()}
test2['img_prefix'] = test_img_prefix2
test2['ann_file'] = test_ann_file2

test3 = {key: value for key, value in test1.items()}
test3['img_prefix'] = test_img_prefix3
test3['ann_file'] = test_ann_file3

test4 = {key: value for key, value in test1.items()}
test4['img_prefix'] = test_img_prefix4
test4['ann_file'] = test_ann_file4

data = dict(
    samples_per_gpu=16,
    workers_per_gpu=2,
    train=dict(type='ConcatDataset', datasets=[train]),
    val=dict(type='ConcatDataset', datasets=[test1, test2, test3, test4]),
    test=dict(type='ConcatDataset', datasets=[test1, test2, test3, test4]))

evaluation = dict(interval=1, metric='acc')
@ -0,0 +1,35 @@
_base_ = [
    '../../_base_/default_runtime.py',
    '../../_base_/recog_datasets/seg_toy_dataset.py'
]

# optimizer
optimizer = dict(type='Adam', lr=1e-4)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[3, 4])
total_epochs = 5

label_convertor = dict(
    type='SegConvertor', dict_type='DICT36', with_unknown=True, lower=True)

model = dict(
    type='SegRecognizer',
    backbone=dict(
        type='ResNet31OCR',
        layers=[1, 2, 5, 3],
        channels=[32, 64, 128, 256, 512, 512],
        out_indices=[0, 1, 2, 3],
        stage4_pool_cfg=dict(kernel_size=2, stride=2),
        last_stage_pool=True),
    neck=dict(
        type='FPNOCR', in_channels=[128, 256, 512, 512], out_channels=256),
    head=dict(
        type='SegHead',
        in_channels=256,
        upsample_param=dict(scale_factor=2.0, mode='nearest')),
    loss=dict(
        type='SegLoss', seg_downsample_ratio=1.0, seg_with_loss_weight=False),
    label_convertor=label_convertor)

find_unused_parameters = True
Binary file not shown.
After Width: | Height: | Size: 90 KiB |
Binary file not shown.
After Width: | Height: | Size: 44 KiB |
|
@ -0,0 +1,44 @@
from argparse import ArgumentParser

import mmcv

from mmdet.apis import init_detector
from mmocr.apis.inference import model_inference
from mmocr.datasets import build_dataset  # noqa: F401
from mmocr.models import build_detector  # noqa: F401


def main():
    parser = ArgumentParser()
    parser.add_argument('img', help='Image file.')
    parser.add_argument('config', help='Config file.')
    parser.add_argument('checkpoint', help='Checkpoint file.')
    parser.add_argument('save_path', help='Path to save visualized image.')
    parser.add_argument(
        '--device', default='cuda:0', help='Device used for inference.')
    parser.add_argument(
        '--imshow',
        action='store_true',
        help='Whether show image with OpenCV.')
    args = parser.parse_args()

    # build the model from a config file and a checkpoint file
    model = init_detector(args.config, args.checkpoint, device=args.device)
    if model.cfg.data.test['type'] == 'ConcatDataset':
        model.cfg.data.test.pipeline = model.cfg.data.test['datasets'][
            0].pipeline

    # test a single image
    result = model_inference(model, args.img)
    print(f'result: {result}')

    # show the results
    img = model.show_result(args.img, result, out_file=None, show=False)

    mmcv.imwrite(img, args.save_path)
    if args.imshow:
        mmcv.imshow(img, 'predicted results')


if __name__ == '__main__':
    main()
@ -0,0 +1,52 @@
import argparse

import cv2
import torch

from mmdet.apis import init_detector
from mmocr.apis import model_inference
from mmocr.datasets import build_dataset  # noqa: F401
from mmocr.models import build_detector  # noqa: F401


def parse_args():
    parser = argparse.ArgumentParser(description='MMOCR webcam demo.')
    parser.add_argument('config', help='Test config file path.')
    parser.add_argument('checkpoint', help='Checkpoint file.')
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='CPU/CUDA device option.')
    parser.add_argument(
        '--camera-id', type=int, default=0, help='Camera device id.')
    parser.add_argument(
        '--score-thr', type=float, default=0.5, help='Bbox score threshold.')
    args = parser.parse_args()
    return args


def main():
    args = parse_args()

    device = torch.device(args.device)

    model = init_detector(args.config, args.checkpoint, device=device)
    if model.cfg.data.test['type'] == 'ConcatDataset':
        model.cfg.data.test.pipeline = model.cfg.data.test['datasets'][
            0].pipeline

    camera = cv2.VideoCapture(args.camera_id)

    print('Press "Esc", "q" or "Q" to exit.')
    while True:
        ret_val, img = camera.read()
        # stop if the camera frame could not be read
        if not ret_val:
            break
        result = model_inference(model, img)

        ch = cv2.waitKey(1)
        if ch == 27 or ch == ord('q') or ch == ord('Q'):
            break

        model.show_result(
            img, result, score_thr=args.score_thr, wait_time=1, show=True)


if __name__ == '__main__':
    main()
@ -0,0 +1,28 @@
ARG PYTORCH="1.5"
ARG CUDA="10.1"
ARG CUDNN="7"

FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"

RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN conda clean --all
RUN pip install mmcv-full==1.2.6+torch1.5.0+cu101 -f https://download.openmmlab.com/mmcv/dist/index.html

RUN git clone https://github.com/open-mmlab/mmdetection.git /mmdet
WORKDIR /mmdet
RUN git checkout -b v2.9.0 v2.9.0
RUN pip install -r requirements.txt
RUN pip install .

RUN git clone https://github.com/open-mmlab/mmocr.git /mmocr
WORKDIR /mmocr
ENV FORCE_CUDA="1"
RUN pip install -r requirements.txt
RUN pip install --no-cache-dir -e .
@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@ -0,0 +1,15 @@
API Reference
=============

mmocr.apis
-------------
.. automodule:: mmocr.apis
    :members:

mmocr.core
-------------

evaluation
^^^^^^^^^^
.. automodule:: mmocr.core.evaluation
    :members:
@ -0,0 +1 @@
## Changelog
@ -0,0 +1,93 @@
|
|||
<a id="markdown-contributor-covenant-code-of-conduct" name="contributor-covenant-code-of-conduct"></a>
|
||||
# Contributor Covenant Code of Conduct
|
||||
<!-- TOC -->
|
||||
|
||||
- [Contributor Covenant Code of Conduct](#contributor-covenant-code-of-conduct)
|
||||
- [Our Pledge](#our-pledge)
|
||||
- [Our Standards](#our-standards)
|
||||
- [Our Responsibilities](#our-responsibilities)
|
||||
- [Scope](#scope)
|
||||
- [Enforcement](#enforcement)
|
||||
- [Attribution](#attribution)
|
||||
|
||||
<!-- /TOC -->
|
||||
<a id="markdown-our-pledge" name="our-pledge"></a>
|
||||
## Our Pledge
|
||||
|
||||
In the interest of fostering an open and welcoming environment, we as
|
||||
contributors and maintainers pledge to making participation in our project and
|
||||
our community a harassment-free experience for everyone, regardless of age, body
|
||||
size, disability, ethnicity, sex characteristics, gender identity and expression,
|
||||
level of experience, education, socio-economic status, nationality, personal
|
||||
appearance, race, religion, or sexual identity and orientation.
|
||||
|
||||
<a id="markdown-our-standards" name="our-standards"></a>
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to creating a positive environment
|
||||
include:
|
||||
|
||||
* Using welcoming and inclusive language
|
||||
* Being respectful of differing viewpoints and experiences
|
||||
* Gracefully accepting constructive criticism
|
||||
* Focusing on what is best for the community
|
||||
* Showing empathy towards other community members
|
||||
|
||||
Examples of unacceptable behavior by participants include:
|
||||
|
||||
* The use of sexualized language or imagery and unwelcome sexual attention or
|
||||
advances
|
||||
* Trolling, insulting/derogatory comments, and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information, such as a physical or electronic
|
||||
address, without explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
|
||||
<a id="markdown-our-responsibilities" name="our-responsibilities"></a>
|
||||
## Our Responsibilities
|
||||
|
||||
Project maintainers are responsible for clarifying the standards of acceptable
|
||||
behavior and are expected to take appropriate and fair corrective action in
|
||||
response to any instances of unacceptable behavior.
|
||||
|
||||
Project maintainers have the right and responsibility to remove, edit, or
|
||||
reject comments, commits, code, wiki edits, issues, and other contributions
|
||||
that are not aligned to this Code of Conduct, or to ban temporarily or
|
||||
permanently any contributor for other behaviors that they deem inappropriate,
|
||||
threatening, offensive, or harmful.
|
||||
|
||||
<a id="markdown-scope" name="scope"></a>
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies both within project spaces and in public spaces
|
||||
when an individual is representing the project or its community. Examples of
|
||||
representing a project or community include using an official project e-mail
|
||||
address, posting via an official social media account, or acting as an appointed
|
||||
representative at an online or offline event. Representation of a project may be
|
||||
further defined and clarified by project maintainers.
|
||||
|
||||
<a id="markdown-enforcement" name="enforcement"></a>
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported by contacting the project team at chenkaidev@gmail.com. All
|
||||
complaints will be reviewed and investigated and will result in a response that
|
||||
is deemed necessary and appropriate to the circumstances. The project team is
|
||||
obligated to maintain confidentiality with regard to the reporter of an incident.
|
||||
Further details of specific enforcement policies may be posted separately.
|
||||
|
||||
Project maintainers who do not follow or enforce the Code of Conduct in good
|
||||
faith may face temporary or permanent repercussions as determined by other
|
||||
members of the project's leadership.
|
||||
|
||||
<a id="markdown-attribution" name="attribution"></a>
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
||||
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
|
||||
|
||||
[homepage]: https://www.contributor-covenant.org
|
||||
|
||||
For answers to common questions about this code of conduct, see
|
||||
https://www.contributor-covenant.org/faq
|
|
@ -0,0 +1,83 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

import os
import subprocess
import sys

sys.path.insert(0, os.path.abspath('..'))

# -- Project information -----------------------------------------------------

project = 'MMOCR'
copyright = '2020-2030, OpenMMLab'
author = 'OpenMMLab'

# The full version, including alpha/beta/rc tags
release = '0.1.0'

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'recommonmark',
    'sphinx_markdown_tables',
]

autodoc_mock_imports = ['torch', 'torchvision', 'mmcv', 'mmocr.version']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}

# The master toctree document.
master_doc = 'index'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []


def builder_inited_handler(app):
    subprocess.run(['./merge_docs.sh'])
    subprocess.run(['./stats.py'])


def setup(app):
    app.connect('builder-inited', builder_inited_handler)
@ -0,0 +1,187 @@
<a id="markdown-contributing-to-mmocr" name="contributing-to-mmocr"></a>
# Contributing to mmocr

All kinds of contributions are welcome, including but not limited to the following.

- Fixes (typos, bugs)
- New features and components
- Enhancements such as function speedups
<!-- TOC -->

- [Contributing to mmocr](#contributing-to-mmocr)
  - [Workflow](#workflow)
    - [Step 1: Create a Fork](#step-1-create-a-fork)
    - [Step 2: Develop a new feature](#step-2-develop-a-new-feature)
      - [Step 2.1: Keep your fork up to date](#step-21-keep-your-fork-up-to-date)
      - [<span id = "step2.2">Step 2.2: Create a feature branch</span>](#step-22-create-a-feature-branch)
        - [Create an issue on github](#create-an-issue-on-github)
        - [Create branch](#create-branch)
      - [Step 2.3: Develop and test <your_new_feature>](#step-23-develop-and-test-your_new_feature)
      - [Step 2.4: Prepare to Pull Request](#step-24-prepare-to-pull-request)
        - [Merge official repo updates to your fork](#merge-official-repo-updates-to-your-fork)
        - [Push <your_new_feature> branch to your remote forked repo](#push-your_new_feature-branch-to-your-remote-forked-repo)
      - [Step 2.5: Create a Pull Request](#step-25-create-a-pull-request)
      - [Step 2.6: Review code](#step-26-review-code)
      - [Step 2.7: Revise <your_new_feature> (optional)](#step-27-revise-your_new_feature--optional)
      - [Step 2.8: Delete <your_new_feature> branch if your PR is accepted.](#step-28-delete-your_new_feature-branch-if-your-pr-is-accepted)
  - [Code style](#code-style)
    - [Python](#python)
    - [C++ and CUDA](#c-and-cuda)

<!-- /TOC -->
<a id="markdown-workflow" name="workflow"></a>
## Workflow

This document describes the fork & merge request workflow that should be used when contributing to **MMOCR**.

The official public [repository](https://github.com/open-mmlab/mmocr) holds only two branches with an infinite lifetime:
+ master
+ develop

The *master* branch is the main branch where the source code of **HEAD** always reflects a *production-ready state*.

The *develop* branch is the branch where the source code of **HEAD** always reflects a state with the latest development changes for the next release.

Feature branches are used to develop new features for the upcoming or a distant future release.

![](res/git-workflow-master-develop.png)

All new contributors to **MMOCR** should follow these steps:

<a id="markdown-step-1-create-a-fork" name="step-1-create-a-fork"></a>
### Step 1: Create a Fork

1. Fork the repo on GitHub or GitLab to your personal account. Click the `Fork` button on the [project page](https://github.com/open-mmlab/mmocr).

2. Clone your new forked repo to your computer.
```
git clone https://github.com/<your name>/mmocr.git
```
3. Add the official repo as an upstream:
```
git remote add upstream https://github.com/open-mmlab/mmocr.git
```

<a id="markdown-step-2-develop-a-new-feature" name="step-2-develop-a-new-feature"></a>
### Step 2: Develop a new feature

<a id="markdown-step-21-keep-your-fork-up-to-date" name="step-21-keep-your-fork-up-to-date"></a>
#### Step 2.1: Keep your fork up to date

Whenever you want to update your fork with the latest upstream changes, you need to fetch the upstream repo's branches and latest commits to bring them into your repository:

```
# Fetch from upstream remote
git fetch upstream

# Update your master branch
git checkout master
git rebase upstream/master
git push origin master

# Update your develop branch
git checkout develop
git rebase upstream/develop
git push origin develop
```

<a id="markdown-span-id--step22step-22-create-a-feature-branchspan" name="span-id--step22step-22-create-a-feature-branchspan"></a>
#### <span id = "step2.2">Step 2.2: Create a feature branch</span>
<a id="markdown-create-an-issue-on-githubhttpsgithubcomopen-mmlabmmocr" name="create-an-issue-on-githubhttpsgithubcomopen-mmlabmmocr"></a>
##### Create an issue on [github](https://github.com/open-mmlab/mmocr)
- The title of the issue should be one of the following formats: `[Feature]: xxx`, `[Fix]: xxx`, `[Enhance]: xxx`, `[Refactor]: xxx`.
- More details can be written in comments.

<a id="markdown-create-branch" name="create-branch"></a>
##### Create branch
```
git checkout -b feature/iss_<index> develop
# index is the issue number above
```
By now, your fork has three branches as follows:

![](res/branch.png)

<a id="markdown-step-23-develop-and-test-your_new_feature" name="step-23-develop-and-test-your_new_feature"></a>
#### Step 2.3: Develop and test <your_new_feature>

Develop your new feature and test it to make sure it works well.

Please run
```
pre-commit run --all-files
pytest tests
```
and fix all failures before every git commit.
```
git commit -m "fix #<issue_index>: <commit_message>"
```
**Note:**
- <issue_index> is the [issue](#step2.2) number.

<a id="markdown-step-24-prepare-to-pull-request" name="step-24-prepare-to-pull-request"></a>
#### Step 2.4: Prepare to Pull Request
- Make sure to link your pull request to the related issue. Please refer to the [instruction](https://docs.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue)


<a id="markdown-merge-official-repo-updates-to-your-fork" name="merge-official-repo-updates-to-your-fork"></a>
##### Merge official repo updates to your fork

```
# fetch from upstream remote, i.e., the official repo
git fetch upstream

# update the develop branch of your fork
git checkout develop
git rebase upstream/develop
git push origin develop

# update the <your_new_feature> branch
git checkout <your_new_feature>
git rebase develop
# resolve conflicts if any, then test
```

<a id="markdown-push-your_new_feature-branch-to-your-remote-forked-repo" name="push-your_new_feature-branch-to-your-remote-forked-repo"></a>
##### Push <your_new_feature> branch to your remote forked repo
```
git checkout <your_new_feature>
git push origin <your_new_feature>
```
<a id="markdown-step-25-create-a-pull-request" name="step-25-create-a-pull-request"></a>
#### Step 2.5: Create a Pull Request

Go to the page for your fork on GitHub, select your new feature branch, and click the pull request button to integrate your feature branch into the upstream remote’s develop branch.

<a id="markdown-step-26-review-code" name="step-26-review-code"></a>
#### Step 2.6: Review code


<a id="markdown-step-27-revise-your_new_feature--optional" name="step-27-revise-your_new_feature--optional"></a>
#### Step 2.7: Revise <your_new_feature> (optional)
If your PR is not accepted, please follow Steps 2.1, 2.3, 2.4 and 2.5 until it is accepted.

<a id="markdown-step-28-delete-your_new_feature-branch-if-your-pr-is-accepted" name="step-28-delete-your_new_feature-branch-if-your-pr-is-accepted"></a>
#### Step 2.8: Delete <your_new_feature> branch if your PR is accepted.
```
git branch -d <your_new_feature>
git push origin :<your_new_feature>
```

<a id="markdown-code-style" name="code-style"></a>
## Code style

<a id="markdown-python" name="python"></a>
### Python
We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style.

We use the following tools for linting and formatting:
- [flake8](http://flake8.pycqa.org/en/latest/): linter
- [yapf](https://github.com/google/yapf): formatter
- [isort](https://github.com/timothycrosley/isort): sort imports

>Before you create a PR, make sure that your code lints and is formatted by yapf.

<a id="markdown-c-and-cuda" name="c-and-cuda"></a>
### C++ and CUDA
We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
@ -0,0 +1,208 @@
<a id="markdown-datasets-preparation" name="datasets-preparation"></a>
# Datasets Preparation
This page lists the datasets which are commonly used in text detection, text recognition and key information extraction, and their download links.
<!-- TOC -->

- [Datasets Preparation](#datasets-preparation)
  - [Text Detection](#text-detection)
  - [Text Recognition](#text-recognition)
  - [Key Information Extraction](#key-information-extraction)

<!-- /TOC -->
<a id="markdown-text-detection" name="text-detection"></a>
## Text Detection
**The structure of the text detection dataset directory is organized as follows.**
```
├── ctw1500
│   ├── imgs
│   ├── instances_test.json
│   └── instances_training.json
├── icdar2015
│   ├── imgs
│   ├── instances_test.json
│   └── instances_training.json
├── icdar2017
│   ├── imgs
│   ├── instances_training.json
│   └── instances_val.json
├── synthtext
│   ├── imgs
│   └── instances_training.lmdb
```
|  Dataset  | Images | Annotation Files (training) | Annotation Files (validation) | Annotation Files (testing) | Note |
|:---------:|:------:|:---------------------------:|:-----------------------------:|:--------------------------:|:----:|
| CTW1500 | [homepage](https://github.com/Yuliang-Liu/Curve-Text-Detector) | [instances_training.json](https://download.openmmlab.com/mmocr/data/ctw1500/instances_training.json) | - | [instances_test.json](https://download.openmmlab.com/mmocr/data/ctw1500/instances_test.json) | |
| ICDAR2015 | [homepage](https://rrc.cvc.uab.es/?ch=4&com=downloads) | [instances_training.json](https://download.openmmlab.com/mmocr/data/icdar2015/instances_training.json) | - | [instances_test.json](https://download.openmmlab.com/mmocr/data/icdar2015/instances_test.json) | |
| ICDAR2017 | [homepage](https://rrc.cvc.uab.es/?ch=8&com=downloads) \| [renamed_imgs](https://download.openmmlab.com/mmocr/data/icdar2017/renamed_imgs.tar) | [instances_training.json](https://download.openmmlab.com/mmocr/data/icdar2017/instances_training.json) | [instances_val.json](https://openmmlab) | [instances_test.json](https://download.openmmlab.com/mmocr/data/icdar2017/instances_test.json) | |
| Synthtext | [homepage](https://www.robots.ox.ac.uk/~vgg/data/scenetext/) | [instances_training.lmdb](https://download.openmmlab.com/mmocr/data/synthtext/instances_training.lmdb) | - | | |

- For `icdar2015`:
  - Step1: Download `ch4_training_images.zip` and `ch4_test_images.zip` from [homepage](https://rrc.cvc.uab.es/?ch=4&com=downloads)
  - Step2: Download [instances_training.json](https://download.openmmlab.com/mmocr/data/icdar2015/instances_training.json) and [instances_test.json](https://download.openmmlab.com/mmocr/data/icdar2015/instances_test.json)
  - Step3:
  ```bash
  mkdir icdar2015 && cd icdar2015
  mv /path/to/instances_training.json .
  mv /path/to/instances_test.json .

  mkdir imgs && cd imgs
  ln -s /path/to/ch4_training_images training
  ln -s /path/to/ch4_test_images test
  ```
- For `icdar2017`:
  - To avoid the rotation effect when loading `jpg` images with OpenCV, we provide re-saved images in `png` format in [renamed_images](https://download.openmmlab.com/mmocr/data/icdar2017/renamed_imgs.tar). You can copy these images to `imgs`. (See the short sketch below for the underlying issue.)
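
The sketch below illustrates the loading behavior this note most likely refers to (an assumption on our part: that the rotation comes from EXIF orientation handling in OpenCV):

```python
import cv2

# cv2.imread applies the EXIF orientation stored in a jpg by default, so a
# photo can come out rotated relative to its annotations; OR'ing in
# IMREAD_IGNORE_ORIENTATION skips that step. ('img_1.jpg' is a placeholder.)
img_rotated = cv2.imread('img_1.jpg')
img_raw = cv2.imread('img_1.jpg',
                     cv2.IMREAD_IGNORE_ORIENTATION | cv2.IMREAD_COLOR)
```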
<a id="markdown-text-recognition" name="text-recognition"></a>
|
||||
## Text Recognition
|
||||
**The structure of the text recognition dataset directory is organized as follows.**
|
||||
|
||||
```
|
||||
├── mixture
|
||||
│ ├── coco_text
|
||||
│ │ ├── train_label.txt
|
||||
│ │ ├── train_words
|
||||
│ ├── icdar_2011
|
||||
│ │ ├── training_label.txt
|
||||
│ │ ├── Challenge1_Training_Task3_Images_GT
|
||||
│ ├── icdar_2013
|
||||
│ │ ├── train_label.txt
|
||||
│ │ ├── test_label_1015.txt
|
||||
│ │ ├── test_label_1095.txt
|
||||
│ │ ├── Challenge2_Training_Task3_Images_GT
|
||||
│ │ ├── Challenge2_Test_Task3_Images
|
||||
│ ├── icdar_2015
|
||||
│ │ ├── train_label.txt
|
||||
│ │ ├── test_label.txt
|
||||
│ │ ├── ch4_training_word_images_gt
|
||||
│ │ ├── ch4_test_word_images_gt
|
||||
│ ├── III5K
|
||||
│ │ ├── train_label.txt
|
||||
│ │ ├── test_label.txt
|
||||
│ │ ├── train
|
||||
│ │ ├── test
|
||||
│ ├── ct80
|
||||
│ │ ├── test_label.txt
|
||||
│ │ ├── image
|
||||
│ ├── svt
|
||||
│ │ ├── test_label.txt
|
||||
│ │ ├── image
|
||||
│ ├── svtp
|
||||
│ │ ├── test_label.txt
|
||||
│ │ ├── image
|
||||
│ ├── Synth90k
|
||||
│ │ ├── shuffle_labels.txt
|
||||
│ │ ├── label.lmdb
|
||||
│ │ ├── mnt
|
||||
│ ├── SynthText
|
||||
│ │ ├── shuffle_labels.txt
|
||||
│ │ ├── instances_train.txt
|
||||
│ │ ├── label.lmdb
|
||||
│ │ ├── synthtext
|
||||
│ ├── SynthAdd
|
||||
│ │ ├── label.txt
|
||||
│ │ ├── SynthText_Add
|
||||
|
||||
```
|
||||
| Dataset | | images | annotation file | annotation file | Note |
|
||||
|:----------:|:-:|:---------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------:|:----:|
|
||||
|| | |training | test | |
|
||||
| coco_text ||[homepage](https://rrc.cvc.uab.es/?ch=5&com=downloads) |[train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/coco_text/train_label.txt) |- | |
|
||||
| icdar_2011 ||[homepage](http://www.cvc.uab.es/icdar2011competition/?com=downloads) |[train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2015/train_label.txt) |- | |
|
||||
| icdar_2013 | | [homepage](https://rrc.cvc.uab.es/?ch=2&com=downloads) | [train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2013/train_label.txt) | [test_label_1015.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2013/test_label_1015.txt) | |
|
||||
| icdar_2015 | | [homepage](https://rrc.cvc.uab.es/?ch=4&com=downloads) | [train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2015/train_label.txt) | [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2015/test_label.txt) | |
|
||||
| IIIT5K | | [homepage](http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/IIIT5K.html) | [train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/train_label.txt) | [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/test_label.txt) | |
|
||||
| ct80 | | - |-|[test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/ct80/test_label.txt)||
|
||||
| svt | | [homepage](http://www.iapr-tc11.org/mediawiki/index.php/The_Street_View_Text_Dataset) | - | [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/svt/test_label.txt) | |
|
||||
| svtp | | - | - | [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/svtp/test_label.txt) | |
|
||||
| Synth90k | | [homepage](https://www.robots.ox.ac.uk/~vgg/data/text/) | [shuffle_labels.txt](https://download.openmmlab.com/mmocr/data/mixture/Synth90k/shuffle_labels.txt) \| [label.lmdb](https://download.openmmlab.com/mmocr/data/mixture/Synth90k/label.lmdb) | - | |
|
||||
| SynthText | | [homepage](https://www.robots.ox.ac.uk/~vgg/data/scenetext/) | [shuffle_labels.txt](https://download.openmmlab.com/mmocr/data/mixture/SynthText/shuffle_labels.txt) \| [instances_train.txt](https://download.openmmlab.com/mmocr/data/mixture/SynthText/instances_train.txt) \| [label.lmdb](https://download.openmmlab.com/mmocr/data/mixture/SynthText/label.lmdb) | - | |
|
||||
| SynthAdd | | [SynthText_Add.zip](https://pan.baidu.com/s/1uV0LtoNmcxbO-0YA7Ch4dg) (code:627x) | [label.txt](https://download.openmmlab.com/mmocr/data/mixture/SynthAdd/label.txt)|- | |
|
||||
|
||||
- For `icdar_2013`:
|
||||
- Step1: Download `Challenge2_Test_Task3_Images.zip` and `Challenge2_Training_Task3_Images_GT.zip` from [homepage](https://rrc.cvc.uab.es/?ch=2&com=downloads)
|
||||
- Step2: Download [test_label_1015.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2013/test_label_1015.txt) and [train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2013/train_label.txt)
|
||||
- For `icdar_2015`:
|
||||
- Step1: Download `ch4_training_word_images_gt.zip` and `ch4_test_word_images_gt.zip` from [homepage](https://rrc.cvc.uab.es/?ch=4&com=downloads)
|
||||
- Step2: Download [train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2015/train_label.txt) and [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/icdar_2015/test_label.txt)
|
||||
- For `IIIT5K`:
|
||||
- Step1: Download `IIIT5K-Word_V3.0.tar.gz` from [homepage](http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/IIIT5K.html)
|
||||
- Step2: Download [train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/train_label.txt) and [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/test_label.txt)
|
||||
- For `svt`:
|
||||
- Step1: Download `svt.zip` form [homepage](http://www.iapr-tc11.org/mediawiki/index.php/The_Street_View_Text_Dataset)
|
||||
- Step2: Download [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/svt/test_label.txt)
|
||||
- For `ct80`:
|
||||
- Step1: Download [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/ct80/test_label.txt)
|
||||
- For `svtp`:
|
||||
- Step1: Download [test_label.txt](https://download.openmmlab.com/mmocr/data/mixture/svtp/test_label.txt)
|
||||
- For `coco_text`:
|
||||
- Step1: Download from [homepage](https://rrc.cvc.uab.es/?ch=5&com=downloads)
|
||||
- Step2: Download [train_label.txt](https://download.openmmlab.com/mmocr/data/mixture/coco_text/train_label.txt)
|
||||
|
||||
- For `Syn90k`:
|
||||
- Step1: Download `mjsynth.tar.gz` from [homepage](https://www.robots.ox.ac.uk/~vgg/data/text/)
|
||||
- Step2: Download [shuffle_labels.txt](https://download.openmmlab.com/mmocr/data/mixture/Synth90k/shuffle_labels.txt)
|
||||
- Step3:
|
||||
```bash
|
||||
mkdir Syn90k && cd Syn90k
|
||||
|
||||
mv /path/to/mjsynth.tar.gz .
|
||||
|
||||
tar -xzf mjsynth.tar.gz
|
||||
|
||||
mv /path/to/shuffle_labels.txt .
|
||||
|
||||
# create soft link
|
||||
cd /path/to/mmocr/data/mixture
|
||||
|
||||
ln -s /path/to/Syn90k Syn90k
|
||||
```
|
||||
- For `SynthText`:
|
||||
- Step1: Download `SynthText.zip` from [homepage](https://www.robots.ox.ac.uk/~vgg/data/scenetext/)
|
||||
- Step2: Download [shuffle_labels.txt](https://download.openmmlab.com/mmocr/data/mixture/SynthText/shuffle_labels.txt)
|
||||
- Step3: Download [instances_train.txt](https://download.openmmlab.com/mmocr/data/mixture/SynthText/instances_train.txt)
|
||||
- Step4:
|
||||
```bash
|
||||
unzip SynthText.zip
|
||||
|
||||
cd SynthText
|
||||
|
||||
mv /path/to/shuffle_labels.txt .
|
||||
|
||||
# create soft link
|
||||
cd /path/to/mmocr/data/mixture
|
||||
|
||||
ln -s /path/to/SynthText SynthText
|
||||
```
|
||||
- For `SynthAdd`:
|
||||
- Step1: Download `SynthText_Add.zip` from [SynthAdd](https://pan.baidu.com/s/1uV0LtoNmcxbO-0YA7Ch4dg) (code:627x))
|
||||
- Step2: Download [label.txt](https://download.openmmlab.com/mmocr/data/mixture/SynthAdd/label.txt)
|
||||
- Step3:
|
||||
```bash
|
||||
mkdir SynthAdd && cd SynthAdd
|
||||
|
||||
mv /path/to/SynthText_Add.zip .
|
||||
|
||||
unzip SynthText_Add.zip
|
||||
|
||||
mv /path/to/label.txt .
|
||||
|
||||
# create soft link
|
||||
cd /path/to/mmocr/data/mixture
|
||||
|
||||
ln -s /path/to/SynthAdd SynthAdd
|
||||
```
|
||||
|
||||
<a id="markdown-key-information-extraction" name="key-information-extraction"></a>
|
||||
## Key Information Extraction
|
||||
**The structure of the key information extraction dataset directory is organized as follows.**
|
||||
```
|
||||
└── wildreceipt
|
||||
├── anno_files
|
||||
├── class_list.txt
|
||||
├── dict.txt
|
||||
├── image_files
|
||||
├── test.txt
|
||||
└── train.txt
|
||||
```
|
||||
- Download [wildreceipt.tar](https://download.openmmlab.com/mmocr/data/wildreceipt.tar)
|
|
@ -0,0 +1,369 @@
<a id="markdown-getting-started" name="getting-started"></a>
# Getting Started

This page provides basic tutorials on the usage of MMOCR.
For the installation instructions, please see [install.md](install.md).
<!-- TOC -->

- [Getting Started](#getting-started)
  - [Inference with Pretrained Models](#inference-with-pretrained-models)
    - [Test a Single Image](#test-a-single-image)
    - [Test Multiple Images](#test-multiple-images)
    - [Test a Dataset](#test-a-dataset)
      - [Test with Single/Multiple GPUs](#test-with-singlemultiple-gpus)
        - [Optional Arguments](#optional-arguments)
      - [Test with Slurm](#test-with-slurm)
        - [Optional Arguments](#optional-arguments-1)
  - [Train a Model](#train-a-model)
    - [Train with Single/Multiple GPUs](#train-with-singlemultiple-gpus)
      - [Train with Toy Dataset.](#train-with-toy-dataset)
    - [Train with Slurm](#train-with-slurm)
    - [Launch Multiple Jobs on a Single Machine](#launch-multiple-jobs-on-a-single-machine)
  - [Useful Tools](#useful-tools)
    - [Publish a Model](#publish-a-model)
  - [Customized Settings](#customized-settings)
    - [Flexible Dataset](#flexible-dataset)
      - [Encoder-Decoder-Based Text Recognition Task](#encoder-decoder-based-text-recognition-task)
        - [Optional Arguments:](#optional-arguments-2)
      - [Segmentation-Based Text Recognition Task](#segmentation-based-text-recognition-task)
      - [Text Detection Task](#text-detection-task)
    - [COCO-like Dataset](#coco-like-dataset)

<!-- /TOC -->

<a id="markdown-inference-with-pretrained-models" name="inference-with-pretrained-models"></a>
## Inference with Pretrained Models

We provide testing scripts to evaluate a full dataset, as well as some task-specific image demos.

<a id="markdown-test-a-single-image" name="test-a-single-image"></a>
### Test a Single Image

You can use the following command to test a single image with one GPU.

```shell
python demo/image_demo.py ${TEST_IMG} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${SAVE_PATH} [--imshow] [--device ${GPU_ID}]
```

If `--imshow` is specified, the demo will also show the image with OpenCV. For example:

```shell
python demo/image_demo.py demo/demo_text_det.jpg configs/xxx.py xxx.pth demo/demo_text_det_pred.jpg
```

The predicted result will be saved as `demo/demo_text_det_pred.jpg`.
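
If you prefer calling the API from Python instead of the CLI, the demo boils down to two calls. This is a minimal sketch using the same functions as `demo/image_demo.py`; the config and checkpoint paths are placeholders:

```python
from mmdet.apis import init_detector

from mmocr.apis.inference import model_inference

# build the model from a config file and a checkpoint file, then run
# inference on a single image (all paths below are placeholders)
model = init_detector('configs/xxx.py', 'xxx.pth', device='cuda:0')
result = model_inference(model, 'demo/demo_text_det.jpg')
print(result)
```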
<a id="markdown-test-multiple-images" name="test-multiple-images"></a>
|
||||
### Test Multiple Images
|
||||
|
||||
```shell
|
||||
# for text detection
|
||||
sh tools/test_imgs.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${IMG_ROOT_PATH} ${IMG_LIST} ${RESULTS_DIR}
|
||||
|
||||
# for text recognition
|
||||
sh tools/ocr_test_imgs.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${IMG_ROOT_PATH} ${IMG_LIST} ${RESULTS_DIR}
|
||||
```
|
||||
It will save both the prediction results and visualized images to `${RESULTS_DIR}`
|
||||
|
||||
<a id="markdown-test-a-dataset" name="test-a-dataset"></a>
|
||||
### Test a Dataset
|
||||
|
||||
MMOCR implements **distributed** testing with `MMDistributedDataParallel`. (Please refer to [datasets.md](datasets.md) to prepare your datasets)
|
||||
|
||||
<a id="markdown-test-with-singlemultiple-gpus" name="test-with-singlemultiple-gpus"></a>
|
||||
#### Test with Single/Multiple GPUs
|
||||
|
||||
You can use the following command to test a dataset with single/multiple GPUs.
|
||||
|
||||
```shell
|
||||
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--eval ${EVAL_METRIC}]
|
||||
```
|
||||
For example,
|
||||
|
||||
```shell
|
||||
./tools/dist_test.sh configs/example_config.py work_dirs/example_exp/example_model_20200202.pth 1 --eval hmean-iou
|
||||
```
|
||||
<a id="markdown-optional-arguments" name="optional-arguments"></a>
|
||||
##### Optional Arguments
|
||||
|
||||
- `--eval`: Specify the evaluation metric. For text detection, the metric should be either 'hmean-ic13' or 'hmean-iou'. For text recognition, the metric should be 'acc'.
|
||||
|
||||
<a id="markdown-test-with-slurm" name="test-with-slurm"></a>
|
||||
#### Test with Slurm
|
||||
|
||||
If you run MMOCR on a cluster managed with [Slurm](https://slurm.schedmd.com/), you can use the script `slurm_test.sh`.
|
||||
|
||||
```shell
|
||||
[GPUS=${GPUS}] ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--eval ${EVAL_METRIC}]
|
||||
```
|
||||
Here is an example of using 8 GPUs to test an example model on the 'dev' partition with job name 'test_job'.
|
||||
|
||||
```shell
|
||||
GPUS=8 ./tools/slurm_test.sh dev test_job configs/example_config.py work_dirs/example_exp/example_model_20200202.pth --eval hmean-iou
|
||||
```
|
||||
|
||||
You can check [slurm_test.sh](https://github.com/open-mmlab/mmocr/blob/master/tools/slurm_test.sh) for full arguments and environment variables.
|
||||
|
||||
|
||||
<a id="markdown-optional-arguments-1" name="optional-arguments-1"></a>
|
||||
##### Optional Arguments
|
||||
|
||||
- `--eval`: Specify the evaluation metric. For text detection, the metric should be either 'hmean-ic13' or 'hmean-iou'. For text recognition, the metric should be 'acc'.
|
||||
|
||||
|
||||
<a id="markdown-train-a-model" name="train-a-model"></a>
|
||||
## Train a Model
|
||||
|
||||
MMOCR implements **distributed** training with `MMDistributedDataParallel`. (Please refer to [datasets.md](datasets.md) to prepare your datasets)
|
||||
|
||||
All outputs (log files and checkpoints) will be saved to a working directory specified by `work_dir` in the config file.
|
||||
|
||||
By default, we evaluate the model on the validation set after several iterations. You can change the evaluation interval by adding the interval argument in the training config as follows:
|
||||
```python
|
||||
evaluation = dict(interval=1, by_epoch=True) # This evaluates the model per epoch.
|
||||
```
|
||||
|
||||
|
||||
<a id="markdown-train-with-singlemultiple-gpus" name="train-with-singlemultiple-gpus"></a>
|
||||
### Train with Single/Multiple GPUs
|
||||
|
||||
```shell
|
||||
./tools/dist_train.sh ${CONFIG_FILE} ${WORK_DIR} ${GPU_NUM} [optional arguments]
|
||||
```
|
||||
|
||||
Optional Arguments:
|
||||
|
||||
- `--no-validate` (**not suggested**): By default, the codebase will perform evaluation at every k-th iteration during training. To disable this behavior, use `--no-validate`.
|
||||
|
||||
<a id="markdown-train-with-toy-dataset" name="train-with-toy-dataset"></a>
|
||||
#### Train with Toy Dataset.
|
||||
We provide a toy dataset under `tests/data`, and you can train a toy model directly, before the academic dataset is prepared.
|
||||
|
||||
For example, train a text recognition task with `seg` method and toy dataset,
|
||||
```
|
||||
./tools/dist_train.sh configs/textrecog/seg/seg_r31_1by16_fpnocr_toy_dataset.py work_dirs/seg 1
|
||||
```
|
||||
|
||||
And train a text recognition task with `sar` method and toy dataset,
|
||||
```
|
||||
./tools/dist_train.sh configs/textrecog/sar/sar_r31_parallel_decoder_toy_dataset.py work_dirs/sar 1
|
||||
```
|
||||
|
||||
<a id="markdown-train-with-slurm" name="train-with-slurm"></a>
|
||||
### Train with Slurm
|
||||
|
||||
If you run MMOCR on a cluster managed with [Slurm](https://slurm.schedmd.com/), you can use the script `slurm_train.sh`.
|
||||
|
||||
```shell
|
||||
[GPUS=${GPUS}] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR}
|
||||
```
|
||||
|
||||
Here is an example of using 8 GPUs to train a text detection model on the dev partition.
|
||||
|
||||
```shell
|
||||
GPUS=8 ./tools/slurm_train.sh dev psenet-ic15 configs/textdet/psenet/psenet_r50_fpnf_sbn_1x_icdar2015.py /nfs/xxxx/psenet-ic15
|
||||
```
|
||||
|
||||
You can check [slurm_train.sh](https://github.com/open-mmlab/mmocr/blob/master/tools/slurm_train.sh) for full arguments and environment variables.
|
||||
|
||||
<a id="markdown-launch-multiple-jobs-on-a-single-machine" name="launch-multiple-jobs-on-a-single-machine"></a>
|
||||
### Launch Multiple Jobs on a Single Machine
|
||||
|
||||
If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
|
||||
you need to specify different ports (29500 by default) for each job to avoid communication conflicts.
|
||||
|
||||
If you use `dist_train.sh` to launch training jobs, you can set the ports in the command shell.
|
||||
|
||||
```shell
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
|
||||
CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
|
||||
```
|
||||
|
||||
If you launch training jobs with Slurm, you need to modify the config files to set different communication ports.
|
||||
|
||||
In `config1.py`,
|
||||
```python
|
||||
dist_params = dict(backend='nccl', port=29500)
|
||||
```
|
||||
|
||||
In `config2.py`,
|
||||
```python
|
||||
dist_params = dict(backend='nccl', port=29501)
|
||||
```
|
||||
|
||||
Then you can launch two jobs with `config1.py` ang `config2.py`.
|
||||
|
||||
```shell
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR}
|
||||
CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR}
|
||||
```
|
||||
|
||||
|
||||
<a id="markdown-useful-tools" name="useful-tools"></a>
|
||||
## Useful Tools
|
||||
|
||||
We provide numerous useful tools under `mmocr/tools` directory.
|
||||
|
||||
<a id="markdown-publish-a-model" name="publish-a-model"></a>
|
||||
### Publish a Model
|
||||
|
||||
Before you upload a model to AWS, you may want to
|
||||
(1) convert the model weights to CPU tensors, (2) delete the optimizer states and
|
||||
(3) compute the hash of the checkpoint file and append the hash id to the filename.
|
||||
|
||||
```shell
|
||||
python tools/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
|
||||
```
|
||||
|
||||
E.g.,
|
||||
|
||||
```shell
|
||||
python tools/publish_model.py work_dirs/psenet/latest.pth psenet_r50_fpnf_sbn_1x_20190801.pth
|
||||
```
|
||||
|
||||
The final output filename will be `psenet_r50_fpnf_sbn_1x_20190801-{hash id}.pth`.
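
As a rough sketch of what such a publishing step does (the helper below is illustrative, not the actual `tools/publish_model.py`):

```python
import subprocess

import torch


def publish_model(in_file, out_file):
    # (1) Load onto CPU so all weights become CPU tensors.
    checkpoint = torch.load(in_file, map_location='cpu')
    # (2) Optimizer states are only needed to resume training, not for inference.
    checkpoint.pop('optimizer', None)
    torch.save(checkpoint, out_file)
    # (3) Append the first 8 hex chars of the file hash to the filename.
    sha = subprocess.check_output(['sha256sum', out_file]).decode()
    final_file = out_file[:-len('.pth')] + f'-{sha[:8]}.pth'
    subprocess.check_call(['mv', out_file, final_file])
```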

<a id="markdown-customized-settings" name="customized-settings"></a>

## Customized Settings

<a id="markdown-flexible-dataset" name="flexible-dataset"></a>

### Flexible Dataset

To support the tasks of `text detection`, `text recognition` and `key information extraction`, we have designed a new type of dataset consisting of a `loader` and a `parser` to load and parse different types of annotation files.

- **loader**: Loads the annotation file. There are two types of loader, `HardDiskLoader` and `LmdbLoader`.
  - `HardDiskLoader`: Loads a `txt`-format annotation file from hard disk into memory.
  - `LmdbLoader`: Loads an `lmdb`-format annotation file with the lmdb backend, which is very useful for **extremely large** annotation files: it avoids out-of-memory problems when ten or more GPUs are used, since each GPU starts multiple processes that would otherwise each load the whole annotation file into memory.
- **parser**: Parses the annotation file line by line and returns a `dict`. There are two types of parser, `LineStrParser` and `LineJsonParser`; a minimal sketch of their behavior follows this list.
  - `LineStrParser`: Parses one line of the annotation file as a string and splits it into several parts by a `separator`. It can be used for tasks with simple annotation files, such as text recognition, where each line of the annotation file contains only the `filename` and `label` attributes.
  - `LineJsonParser`: Parses one line of the annotation file as a JSON string and converts it to a `dict` with `json.loads`. It can be used for tasks with complex annotation files, such as text detection, where each line of the annotation file contains multiple attributes (e.g. `filename`, `height`, `width`, `box`, `segmentation`, `iscrowd`, `category_id`, etc.).
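
The sketch below illustrates the two parsing strategies on single annotation lines taken from the examples in this section; it mirrors the described behavior rather than the actual parser classes.

```python
import json

# LineStrParser: split "<filename> <label>" by a separator and map parts to keys.
line = '1223731.jpg GRAND'
keys, keys_idx, separator = ['filename', 'text'], [0, 1], ' '
parts = line.split(separator)
print({key: parts[idx] for key, idx in zip(keys, keys_idx)})
# {'filename': '1223731.jpg', 'text': 'GRAND'}

# LineJsonParser: treat the whole line as a JSON string.
line = '{"file_name": "test/img_10.jpg", "height": 720, "width": 1280}'
print(json.loads(line))
# {'file_name': 'test/img_10.jpg', 'height': 720, 'width': 1280}
```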

Here we show some examples of using different combinations of `loader` and `parser`.
<a id="markdown-encoder-decoder-based-text-recognition-task" name="encoder-decoder-based-text-recognition-task"></a>
|
||||
#### Encoder-Decoder-Based Text Recognition Task
|
||||
```python
|
||||
dataset_type = 'OCRDataset'
|
||||
img_prefix = 'tests/data/ocr_toy_dataset/imgs'
|
||||
train_anno_file = 'tests/data/ocr_toy_dataset/label.txt'
|
||||
train = dict(
|
||||
type=dataset_type,
|
||||
img_prefix=img_prefix,
|
||||
ann_file=train_anno_file,
|
||||
loader=dict(
|
||||
type='HardDiskLoader',
|
||||
repeat=10,
|
||||
parser=dict(
|
||||
type='LineStrParser',
|
||||
keys=['filename', 'text'],
|
||||
keys_idx=[0, 1],
|
||||
separator=' ')),
|
||||
pipeline=train_pipeline,
|
||||
test_mode=False)
|
||||
```
|
||||
You can check the content of the annotation file in `tests/data/ocr_toy_dataset/label.txt`.
|
||||
The combination of `HardDiskLoader` and `LineStrParser` will return a dict for each file by calling `__getitem__`: `{'filename': '1223731.jpg', 'text': 'GRAND'}`.
|
||||
|
||||
<a id="markdown-optional-arguments" name="optional-arguments"></a>
|
||||
##### Optional Arguments:
|
||||
|
||||
- `repeat`: The number of repeated lines in the annotation files. For example, if there are `10` lines in the annotation file, setting `repeat=10` will generate a corresponding annotation file with size `100`.
|
||||
|
||||
If the annotation file is extreme large, you can convert it from txt format to lmdb format with the following command:
|
||||
```python
|
||||
python tools/data_converter/txt2lmdb.py -i ann_file.txt -o ann_file.lmdb
|
||||
```
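
As a rough sketch, such a conversion writes each annotation line into an lmdb key-value store (assuming the `lmdb` Python package; the actual converter may differ in details such as key naming):

```python
import lmdb


def txt2lmdb(txt_file, lmdb_file):
    env = lmdb.open(lmdb_file, map_size=1 << 30)  # 1 GB map size; adjust as needed
    with env.begin(write=True) as txn, open(txt_file) as f:
        lines = [line.strip() for line in f if line.strip()]
        # One entry per annotation line, keyed by its line index.
        for i, line in enumerate(lines):
            txn.put(str(i).encode(), line.encode())
        txn.put(b'total_number', str(len(lines)).encode())
    env.close()
```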

After that, you can use `LmdbLoader` in the dataset config as below.

```python
img_prefix = 'tests/data/ocr_toy_dataset/imgs'
train_anno_file = 'tests/data/ocr_toy_dataset/label.lmdb'
train = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file,
    loader=dict(
        type='LmdbLoader',
        repeat=10,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=train_pipeline,
    test_mode=False)
```
<a id="markdown-segmentation-based-text-recognition-task" name="segmentation-based-text-recognition-task"></a>
|
||||
#### Segmentation-Based Text Recognition Task
|
||||
```python
|
||||
prefix = 'tests/data/ocr_char_ann_toy_dataset/'
|
||||
train = dict(
|
||||
type='OCRSegDataset',
|
||||
img_prefix=prefix + 'imgs',
|
||||
ann_file=prefix + 'instances_train.txt',
|
||||
loader=dict(
|
||||
type='HardDiskLoader',
|
||||
repeat=10,
|
||||
parser=dict(
|
||||
type='LineJsonParser',
|
||||
keys=['file_name', 'annotations', 'text'])),
|
||||
pipeline=train_pipeline,
|
||||
test_mode=True)
|
||||
```
|
||||
You can check the content of the annotation file in `tests/data/ocr_char_ann_toy_dataset/instances_train.txt`.
|
||||
The combination of `HardDiskLoader` and `LineJsonParser` will return a dict for each file by calling `__getitem__` each time:
|
||||
```python
|
||||
{"file_name": "resort_88_101_1.png", "annotations": [{"char_text": "F", "char_box": [11.0, 0.0, 22.0, 0.0, 12.0, 12.0, 0.0, 12.0]}, {"char_text": "r", "char_box": [23.0, 2.0, 31.0, 1.0, 24.0, 11.0, 16.0, 11.0]}, {"char_text": "o", "char_box": [33.0, 2.0, 43.0, 2.0, 36.0, 12.0, 25.0, 12.0]}, {"char_text": "m", "char_box": [46.0, 2.0, 61.0, 2.0, 53.0, 12.0, 39.0, 12.0]}, {"char_text": ":", "char_box": [61.0, 2.0, 69.0, 2.0, 63.0, 12.0, 55.0, 12.0]}], "text": "From:"}
|
||||
```
|
||||
|
||||
<a id="markdown-text-detection-task" name="text-detection-task"></a>
|
||||
#### Text Detection Task
|
||||
```python
|
||||
dataset_type = 'TextDetDataset'
|
||||
img_prefix = 'tests/data/toy_dataset/imgs'
|
||||
test_anno_file = 'tests/data/toy_dataset/instances_test.txt'
|
||||
test = dict(
|
||||
type=dataset_type,
|
||||
img_prefix=img_prefix,
|
||||
ann_file=test_anno_file,
|
||||
loader=dict(
|
||||
type='HardDiskLoader',
|
||||
repeat=4,
|
||||
parser=dict(
|
||||
type='LineJsonParser',
|
||||
keys=['file_name', 'height', 'width', 'annotations'])),
|
||||
pipeline=test_pipeline,
|
||||
test_mode=True)
|
||||
```
|
||||
The results are generated in the same way as the segmentation-based text recognition task above.
|
||||
You can check the content of the annotation file in `tests/data/toy_dataset/instances_test.txt`.
|
||||
The combination of `HardDiskLoader` and `LineJsonParser` will return a dict for each file by calling `__getitem__`:
|
||||
```python
|
||||
{"file_name": "test/img_10.jpg", "height": 720, "width": 1280, "annotations": [{"iscrowd": 1, "category_id": 1, "bbox": [260.0, 138.0, 24.0, 20.0], "segmentation": [[261, 138, 284, 140, 279, 158, 260, 158]]}, {"iscrowd": 0, "category_id": 1, "bbox": [288.0, 138.0, 129.0, 23.0], "segmentation": [[288, 138, 417, 140, 416, 161, 290, 157]]}, {"iscrowd": 0, "category_id": 1, "bbox": [743.0, 145.0, 37.0, 18.0], "segmentation": [[743, 145, 779, 146, 780, 163, 746, 163]]}, {"iscrowd": 0, "category_id": 1, "bbox": [783.0, 129.0, 50.0, 26.0], "segmentation": [[783, 129, 831, 132, 833, 155, 785, 153]]}, {"iscrowd": 1, "category_id": 1, "bbox": [831.0, 133.0, 43.0, 23.0], "segmentation": [[831, 133, 870, 135, 874, 156, 835, 155]]}, {"iscrowd": 1, "category_id": 1, "bbox": [159.0, 204.0, 72.0, 15.0], "segmentation": [[159, 205, 230, 204, 231, 218, 159, 219]]}, {"iscrowd": 1, "category_id": 1, "bbox": [785.0, 158.0, 75.0, 21.0], "segmentation": [[785, 158, 856, 158, 860, 178, 787, 179]]}, {"iscrowd": 1, "category_id": 1, "bbox": [1011.0, 157.0, 68.0, 16.0], "segmentation": [[1011, 157, 1079, 160, 1076, 173, 1011, 170]]}]}
|
||||
```
|
||||
|
||||
|
||||
<a id="markdown-coco-like-dataset" name="coco-like-dataset"></a>
|
||||
### COCO-like Dataset
|
||||
For text detection, you can also use an annotation file in a COCO format that is defined in [mmdet](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py):
|
||||
```python
|
||||
dataset_type = 'IcdarDataset'
|
||||
prefix = 'tests/data/toy_dataset/'
|
||||
test=dict(
|
||||
type=dataset_type,
|
||||
ann_file=prefix + 'instances_test.json',
|
||||
img_prefix=prefix + 'imgs',
|
||||
pipeline=test_pipeline)
|
||||
```
|
||||
You can check the content of the annotation file in `tests/data/toy_dataset/instances_test.json`
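
For orientation, a COCO-style annotation file has the standard top-level structure sketched below; the field values are illustrative, taken from the text detection example above.

```python
coco_ann = {
    'images': [{'id': 0, 'file_name': 'test/img_10.jpg',
                'height': 720, 'width': 1280}],
    'annotations': [{'id': 0, 'image_id': 0, 'category_id': 1,
                     'bbox': [260.0, 138.0, 24.0, 20.0],
                     'segmentation': [[261, 138, 284, 140, 279, 158, 260, 158]],
                     'iscrowd': 1}],
    'categories': [{'id': 1, 'name': 'text'}],
}
```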

- The icdar2015/2017 annotations have to be converted into the COCO format using `tools/data_converter/icdar_converter.py`:

```shell
python tools/data_converter/icdar_converter.py ${src_root_path} -o ${out_path} -d ${data_type} --split-list training validation test
```

- The ctw1500 annotations have to be converted into the COCO format using `tools/data_converter/ctw1500_converter.py`:

```shell
python tools/data_converter/ctw1500_converter.py ${src_root_path} -o ${out_path} --split-list training test
```

@ -0,0 +1,38 @@
Welcome to MMOCR's documentation!
=======================================

.. toctree::
   :maxdepth: 2
   :caption: Get Started

   install.md
   getting_started.md
   technical_details.md
   contributing.md

.. toctree::
   :maxdepth: 2
   :caption: Model Zoo

   modelzoo.md
   textdet_models.md
   textrecog_models.md
   kie_models.md

.. toctree::
   :maxdepth: 2
   :caption: Notes

   changelog.md
   faq.md

.. toctree::
   :caption: API Reference

   api.rst

Indices and tables
==================

* :ref:`genindex`
* :ref:`search`

@ -0,0 +1,249 @@
<a id="markdown-installation" name="installation"></a>
|
||||
# Installation
|
||||
<!-- TOC -->
|
||||
|
||||
- [Installation](#installation)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Step-by-Step Installation Instructions](#step-by-step-installation-instructions)
|
||||
- [Full Set-up Script](#full-set-up-script)
|
||||
- [Another option: Docker Image](#another-option-docker-image)
|
||||
- [Prepare Datasets](#prepare-datasets)
|
||||
|
||||
<!-- /TOC -->
|
||||
<a id="markdown-prerequisites" name="prerequisites"></a>
|
||||
## Prerequisites
|
||||
|
||||
- Linux (Windows is not officially supported)
|
||||
- Python 3.7
|
||||
- PyTorch 1.5 or higher
|
||||
- torchvision 0.6.0
|
||||
- CUDA 10.1
|
||||
- NCCL 2
|
||||
- GCC 5.4.0 or higher
|
||||
- [mmcv](https://github.com/open-mmlab/mmcv) 1.2.6
|
||||
|
||||
We have tested the following versions of OS and softwares:
|
||||
|
||||
- OS: Ubuntu 16.04
|
||||
- CUDA: 10.1
|
||||
- GCC(G++): 5.4.0
|
||||
- mmcv 1.2.6
|
||||
- PyTorch 1.5
|
||||
- torchvision 0.6.0
|
||||
|
||||
MMOCR depends on Pytorch and mmdetection v2.9.0.
|
||||
|
||||
<a id="markdown-step-by-step-installation-instructions" name="step-by-step-installation-instructions"></a>
|
||||
## Step-by-Step Installation Instructions
|
||||
|
||||
a. Create a conda virtual environment and activate it.
|
||||
|
||||
```shell
|
||||
conda create -n open-mmlab python=3.7 -y
|
||||
conda activate open-mmlab
|
||||
```
|
||||
|
||||
b. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/), e.g.,
|
||||
|
||||
```shell
|
||||
conda install pytorch==1.5.0 torchvision==0.6.0 cudatoolkit=10.1 -c pytorch
|
||||
```
|
||||
Note: Make sure that your compilation CUDA version and runtime CUDA version match.
|
||||
You can check the supported CUDA version for precompiled packages on the [PyTorch website](https://pytorch.org/).
|
||||
|
||||
`E.g. 1` If you have CUDA 10.1 installed under `/usr/local/cuda` and would like to install
|
||||
PyTorch 1.5, you need to install the prebuilt PyTorch with CUDA 10.1.
|
||||
|
||||
```python
|
||||
conda install pytorch cudatoolkit=10.1 torchvision -c pytorch
|
||||
```
|
||||
|
||||
`E.g. 2` If you have CUDA 9.2 installed under `/usr/local/cuda` and would like to install
|
||||
PyTorch 1.3.1., you need to install the prebuilt PyTorch with CUDA 9.2.
|
||||
|
||||
```python
|
||||
conda install pytorch=1.3.1 cudatoolkit=9.2 torchvision=0.4.2 -c pytorch
|
||||
```
|
||||
|
||||
If you build PyTorch from source instead of installing the prebuilt package,
|
||||
you can use more CUDA versions such as 9.0.
|
||||
|
||||
c. Create a folder called `code` and clone the mmcv repository into it.
|
||||
|
||||
```shell
|
||||
mkdir code
|
||||
cd code
|
||||
git clone https://github.com/open-mmlab/mmcv.git
|
||||
cd mmcv
|
||||
git checkout -b v1.2.6 v1.2.6
|
||||
pip install -r requirements.txt
|
||||
MMCV_WITH_OPS=1 pip install -v -e .
|
||||
```
|
||||
|
||||
d. Clone the mmdetection repository into it. The mmdetection repo is separate from the mmcv repo in `code`.
|
||||
|
||||
```shell
|
||||
cd ..
|
||||
git clone https://github.com/open-mmlab/mmdetection.git
|
||||
cd mmdetection
|
||||
git checkout -b v2.9.0 v2.9.0
|
||||
pip install -r requirements.txt
|
||||
pip install -v -e .
|
||||
export PYTHONPATH=$(pwd):$PYTHONPATH
|
||||
```
|
||||
|
||||
Note that we have tested mmdetection v2.9.0 only. Other versions might be incompatible.
|
||||
|
||||
e. Clone the mmocr repository into it. The mmdetection repo is separate from the mmcv and mmdetection repo in `code`.
|
||||
|
||||
```shell
|
||||
cd ..
|
||||
git clone https://github.com/open-mmlab/mmocr.git
|
||||
cd mmocr
|
||||
```
|
||||
|
||||
f. Install build requirements and then install MMOCR.
|
||||
|
||||
```shell
|
||||
pip install -r requirements.txt
|
||||
pip install -v -e . # or "python setup.py build_ext --inplace"
|
||||
export PYTHONPATH=$(pwd):$PYTHONPATH
|
||||
```
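
To sanity-check the installation, a quick import test like the following can help (assuming the steps above succeeded):

```python
# Verify that MMOCR and its key dependencies are importable.
import mmcv
import mmdet
import mmocr

print(mmcv.__version__)   # expect 1.2.6
print(mmdet.__version__)  # expect 2.9.0
print(mmocr.__version__)
```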

<a id="markdown-full-set-up-script" name="full-set-up-script"></a>
## Full Set-up Script

Here is the full script for setting up MMOCR with conda.

```shell
conda create -n open-mmlab python=3.7 -y
conda activate open-mmlab

# install pytorch 1.5.0 and torchvision 0.6.0 prebuilt with CUDA 10.1
conda install pytorch==1.5.0 torchvision==0.6.0 cudatoolkit=10.1 -c pytorch

# install mmcv
mkdir code
cd code
git clone https://github.com/open-mmlab/mmcv.git
cd mmcv  # code/mmcv
git checkout -b v1.2.6 v1.2.6
pip install -r requirements.txt
MMCV_WITH_OPS=1 pip install -v -e .

# install mmdetection
cd ..  # exit to code
git clone https://github.com/open-mmlab/mmdetection.git
cd mmdetection  # code/mmdetection
git checkout -b v2.9.0 v2.9.0
pip install -r requirements.txt
pip install -v -e .
export PYTHONPATH=$(pwd):$PYTHONPATH

# install mmocr
cd ..
git clone https://github.com/open-mmlab/mmocr.git
cd mmocr  # code/mmocr

pip install -r requirements.txt
pip install -v -e .  # or "python setup.py build_ext --inplace"
export PYTHONPATH=$(pwd):$PYTHONPATH
```

<a id="markdown-another-option-docker-image" name="another-option-docker-image"></a>
## Another option: Docker Image

We provide a [Dockerfile](https://github.com/open-mmlab/mmocr/blob/master/docker/Dockerfile) to build an image.

```shell
# build an image with PyTorch 1.5, CUDA 10.1
docker build -t mmocr docker/
```

Run it with

```shell
docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmocr/data mmocr
```

<a id="markdown-prepare-datasets" name="prepare-datasets"></a>
## Prepare Datasets

It is recommended to symlink the dataset root to `mmocr/data`. Please refer to [datasets.md](datasets.md) to prepare your datasets.
If your folder structure is different, you may need to change the corresponding paths in config files.

The `mmocr` folder is organized as follows:
```
mmocr
├── configs
│   ├── _base_
│   ├── kie
│   ├── textdet
│   └── textrecog
├── demo
│   ├── demo_text_det.jpg
│   ├── demo_text_recog.jpg
│   ├── image_demo.py
│   └── webcam_demo.py
├── docs
│   ├── api.rst
│   ├── changelog.md
│   ├── code_of_conduct.md
│   ├── conf.py
│   ├── contributing.md
│   ├── datasets.md
│   ├── getting_started.md
│   ├── index.rst
│   ├── install.md
│   ├── make.bat
│   ├── Makefile
│   ├── merge_docs.sh
│   ├── requirements.txt
│   ├── res
│   ├── stats.py
│   └── technical_details.md
├── LICENSE
├── mmocr
│   ├── apis
│   ├── core
│   ├── datasets
│   ├── __init__.py
│   ├── models
│   ├── utils
│   └── version.py
├── README.md
├── requirements
│   ├── build.txt
│   ├── docs.txt
│   ├── optional.txt
│   ├── readthedocs.txt
│   ├── runtime.txt
│   └── tests.txt
├── requirements.txt
├── resources
│   ├── illustration.jpg
│   └── mmocr-logo.png
├── setup.cfg
├── setup.py
├── tests
│   ├── data
│   ├── test_dataset
│   ├── test_metrics
│   ├── test_models
│   ├── test_tools
│   └── test_utils
└── tools
    ├── data
    ├── dist_test.sh
    ├── dist_train.sh
    ├── ocr_test_imgs.py
    ├── ocr_test_imgs.sh
    ├── publish_model.py
    ├── slurm_test.sh
    ├── slurm_train.sh
    ├── test_imgs.py
    ├── test_imgs.sh
    ├── test.py
    └── train.py
```

@ -0,0 +1,36 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd

@ -0,0 +1,10 @@
#!/usr/bin/env bash

# Append a trailing blank line to every model README so files concatenate cleanly.
sed -i '$a\\n' ../configs/kie/*/*.md
sed -i '$a\\n' ../configs/textdet/*/*.md
sed -i '$a\\n' ../configs/textrecog/*/*.md

# gather models: concatenate the per-config READMEs, rewrite .md### anchors to
# .html#, demote each heading one level, prepend a top-level title, and rewrite
# root-relative links to GitHub URLs.
cat ../configs/kie/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# Kie Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >kie_models.md
cat ../configs/textdet/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# Text Detection Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >textdet_models.md
cat ../configs/textrecog/*/*.md | sed "s/md###t/html#t/g" | sed "s/#/#&/" | sed '1i\# Text Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmocr/tree/master/=g' >textrecog_models.md

@ -0,0 +1,4 @@
recommonmark
sphinx
sphinx_markdown_tables
sphinx_rtd_theme

@ -0,0 +1,94 @@
#!/usr/bin/env python
import functools as func
import glob
import re
from os.path import basename, splitext

import numpy as np
import titlecase


def anchor(name):
    return re.sub(r'-+', '-', re.sub(r'[^a-zA-Z0-9]', '-',
                                     name.strip().lower())).strip('-')


# Count algorithms

files = sorted(glob.glob('*_models.md'))
# files = sorted(glob.glob('docs/*_models.md'))

stats = []

for f in files:
    with open(f, 'r') as content_file:
        content = content_file.read()

    # title
    title = content.split('\n')[0].replace('#', '')

    # count papers
    papers = set((papertype, titlecase.titlecase(paper.lower().strip()))
                 for (papertype, paper) in re.findall(
                     r'\n\s*\[([A-Z]+?)\]\s*\n.*?\btitle\s*=\s*{(.*?)}',
                     content, re.DOTALL))
    # paper links
    revcontent = '\n'.join(list(reversed(content.splitlines())))
    paperlinks = {}
    for _, p in papers:
        print(p)
        q = p.replace('\\', '\\\\').replace('?', '\\?')
        paperlinks[p] = ' '.join(
            (f'[⇨]({splitext(basename(f))[0]}.html#{anchor(paperlink)})'
             for paperlink in re.findall(
                 rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
                 revcontent, re.DOTALL | re.IGNORECASE)))
        print('    ', paperlinks[p])
    paperlist = '\n'.join(
        sorted(f'    - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
    # count configs
    configs = set(x.lower().strip()
                  for x in re.findall(r'https.*configs/.*\.py', content))

    # count ckpts; note: the original filter checked for 'mmaction' (a
    # leftover from the repo this script was adapted from), which matches
    # no mmocr checkpoint URL; 'mmocr' is the intended substring.
    ckpts = set(x.lower().strip()
                for x in re.findall(r'https://download.*\.pth', content)
                if 'mmocr' in x)

    statsmsg = f"""
## [{title}]({f})

* Number of checkpoints: {len(ckpts)}
* Number of configs: {len(configs)}
* Number of papers: {len(papers)}
{paperlist}

"""

    stats.append((papers, configs, ckpts, statsmsg))

allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _, _ in stats])
allconfigs = func.reduce(lambda a, b: a.union(b), [c for _, c, _, _ in stats])
allckpts = func.reduce(lambda a, b: a.union(b), [c for _, _, c, _ in stats])
msglist = '\n'.join(x for _, _, _, x in stats)

papertypes, papercounts = np.unique([t for t, _ in allpapers],
                                    return_counts=True)
countstr = '\n'.join(
    [f'   - {t}: {c}' for t, c in zip(papertypes, papercounts)])

modelzoo = f"""
# Overview

* Number of checkpoints: {len(allckpts)}
* Number of configs: {len(allconfigs)}
* Number of papers: {len(allpapers)}
{countstr}

For supported datasets, see [datasets overview](datasets.md).

{msglist}
"""

with open('modelzoo.md', 'w') as f:
    f.write(modelzoo)

@ -0,0 +1,3 @@
from .version import __version__, short_version

__all__ = ['__version__', 'short_version']

@ -0,0 +1,3 @@
from .inference import model_inference

__all__ = ['model_inference']

@ -0,0 +1,43 @@
import torch
from mmcv.ops import RoIPool
from mmcv.parallel import collate, scatter

from mmdet.datasets.pipelines import Compose


def model_inference(model, img):
    """Inference image(s) with the detector.

    Args:
        model (nn.Module): The loaded detector.
        img (str): Image file path.

    Returns:
        result (dict): Detection results.
    """
    assert isinstance(img, str)

    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    data = dict(img_info=dict(filename=img), img_prefix=None)
    # build the data pipeline
    test_pipeline = Compose(cfg.data.test.pipeline)
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    # process img_metas
    data['img_metas'] = data['img_metas'][0].data

    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        for m in model.modules():
            assert not isinstance(
                m, RoIPool
            ), 'CPU inference with RoIPool is not supported currently.'

    # forward the model
    with torch.no_grad():
        result = model(return_loss=False, rescale=True, **data)[0]
    return result
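

# Usage sketch (illustrative, not part of this module): build a detector with
# mmdet's `init_detector`, then run single-image inference. The config and
# checkpoint paths are placeholders.
#
#   from mmdet.apis import init_detector
#   from mmocr.apis import model_inference
#
#   model = init_detector('config.py', 'checkpoint.pth', device='cuda:0')
#   result = model_inference(model, 'demo/demo_text_det.jpg')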

@ -0,0 +1,3 @@
from .evaluation import *  # noqa: F401, F403
from .mask import *  # noqa: F401, F403
from .visualize import *  # noqa: F401, F403

@ -0,0 +1,10 @@
from .hmean import eval_hmean
from .hmean_ic13 import eval_hmean_ic13
from .hmean_iou import eval_hmean_iou
from .kie_metric import compute_f1_score
from .ocr_metric import eval_ocr_metric

__all__ = [
    'eval_hmean_ic13', 'eval_hmean_iou', 'eval_ocr_metric', 'eval_hmean',
    'compute_f1_score'
]

@ -0,0 +1,149 @@
from operator import itemgetter

import mmcv
from mmcv.utils import print_log

import mmocr.utils as utils
from mmocr.core.evaluation import hmean_ic13, hmean_iou
from mmocr.core.evaluation.utils import (filter_2dlist_result,
                                         select_top_boundary)
from mmocr.core.mask import extract_boundary


def output_ranklist(img_results, img_infos, out_file):
    """Output the worst results for debugging.

    Args:
        img_results (list[dict]): Image result list.
        img_infos (list[dict]): Image information list.
        out_file (str): The output file path.

    Returns:
        sorted_results (list[dict]): Image results sorted by hmean.
    """
    assert utils.is_type_list(img_results, dict)
    assert utils.is_type_list(img_infos, dict)
    assert isinstance(out_file, str)
    assert out_file.endswith('json')

    sorted_results = []
    for inx, result in enumerate(img_results):
        name = img_infos[inx]['file_name']
        img_result = result
        img_result['file_name'] = name
        sorted_results.append(img_result)
    sorted_results = sorted(
        sorted_results, key=itemgetter('hmean'), reverse=False)

    mmcv.dump(sorted_results, file=out_file)

    return sorted_results


def get_gt_masks(ann_infos):
    """Get ground truth masks and ignored masks.

    Args:
        ann_infos (list[dict]): Each dict contains annotation
            infos of one image, containing following keys:
            masks, masks_ignore.
    Returns:
        gt_masks (list[list[list[int]]]): Ground truth masks.
        gt_masks_ignore (list[list[list[int]]]): Ignored masks.
    """
    assert utils.is_type_list(ann_infos, dict)

    gt_masks = []
    gt_masks_ignore = []
    for ann_info in ann_infos:
        masks = ann_info['masks']
        mask_gt = []
        for mask in masks:
            assert len(mask[0]) >= 8 and len(mask[0]) % 2 == 0
            mask_gt.append(mask[0])
        gt_masks.append(mask_gt)

        masks_ignore = ann_info['masks_ignore']
        mask_gt_ignore = []
        for mask_ignore in masks_ignore:
            assert len(mask_ignore[0]) >= 8 and len(mask_ignore[0]) % 2 == 0
            mask_gt_ignore.append(mask_ignore[0])
        gt_masks_ignore.append(mask_gt_ignore)

    return gt_masks, gt_masks_ignore


def eval_hmean(results,
               img_infos,
               ann_infos,
               metrics={'hmean-iou'},
               score_thr=0.3,
               rank_list=None,
               logger=None,
               **kwargs):
    """Evaluation in hmean metric.

    Args:
        results (list[dict]): Each dict corresponds to one image,
            containing the following keys: boundary_result
        img_infos (list[dict]): Each dict corresponds to one image,
            containing the following keys: filename, height, width
        ann_infos (list[dict]): Each dict corresponds to one image,
            containing the following keys: masks, masks_ignore
        score_thr (float): Score threshold of prediction map.
        metrics (set{str}): Hmean metric set, should be one or all of
            {'hmean-iou', 'hmean-ic13'}
    Returns:
        dict[str: float]
    """
    assert utils.is_type_list(results, dict)
    assert utils.is_type_list(img_infos, dict)
    assert utils.is_type_list(ann_infos, dict)
    assert len(results) == len(img_infos) == len(ann_infos)
    assert isinstance(metrics, set)

    gts, gts_ignore = get_gt_masks(ann_infos)

    preds = []
    pred_scores = []
    for result in results:
        _, texts, scores = extract_boundary(result)
        if len(texts) > 0:
            assert utils.valid_boundary(texts[0], False)
        valid_texts, valid_text_scores = filter_2dlist_result(
            texts, scores, score_thr)
        preds.append(valid_texts)
        pred_scores.append(valid_text_scores)

    eval_results = {}
    for metric in metrics:
        msg = f'Evaluating {metric}...'
        if logger is None:
            msg = '\n' + msg
        print_log(msg, logger=logger)
        best_result = dict(hmean=-1)
        # Sweep the boundary score threshold from 0.3 to 0.9 and keep the
        # best hmean among all thresholds.
        for thr_idx in range(3, 10):
            thr = thr_idx * 0.1
            top_preds = select_top_boundary(preds, pred_scores, thr)
            if metric == 'hmean-iou':
                result, img_result = hmean_iou.eval_hmean_iou(
                    top_preds, gts, gts_ignore)
            elif metric == 'hmean-ic13':
                result, img_result = hmean_ic13.eval_hmean_ic13(
                    top_preds, gts, gts_ignore)
            else:
                raise NotImplementedError
            if rank_list is not None:
                output_ranklist(img_result, img_infos, rank_list)

            print_log(
                'thr {0:.1f}, recall: {1[recall]:.3f}, '
                'precision: {1[precision]:.3f}, '
                'hmean: {1[hmean]:.3f}'.format(thr, result),
                logger=logger)
            if result['hmean'] > best_result['hmean']:
                best_result = result
        eval_results[metric + ':recall'] = best_result['recall']
        eval_results[metric + ':precision'] = best_result['precision']
        eval_results[metric + ':hmean'] = best_result['hmean']
    return eval_results

@ -0,0 +1,216 @@
import numpy as np

import mmocr.utils as utils
from . import utils as eval_utils


def compute_recall_precision(gt_polys, pred_polys):
    """Compute the recall and the precision matrices between gt and predicted
    polygons.

    Args:
        gt_polys (list[Polygon]): List of gt polygons.
        pred_polys (list[Polygon]): List of predicted polygons.

    Returns:
        recall (ndarray): Recall matrix of size gt_num x det_num.
        precision (ndarray): Precision matrix of size gt_num x det_num.
    """
    assert isinstance(gt_polys, list)
    assert isinstance(pred_polys, list)

    gt_num = len(gt_polys)
    det_num = len(pred_polys)
    sz = [gt_num, det_num]

    recall = np.zeros(sz)
    precision = np.zeros(sz)
    # compute area recall and precision for each (gt, det) pair
    # in one img
    for gt_id in range(gt_num):
        for pred_id in range(det_num):
            gt = gt_polys[gt_id]
            det = pred_polys[pred_id]

            inter_area, _ = eval_utils.poly_intersection(det, gt)
            gt_area = gt.area()
            det_area = det.area()
            if gt_area != 0:
                recall[gt_id, pred_id] = inter_area / gt_area
            if det_area != 0:
                precision[gt_id, pred_id] = inter_area / det_area

    return recall, precision


def eval_hmean_ic13(det_boxes,
                    gt_boxes,
                    gt_ignored_boxes,
                    precision_thr=0.4,
                    recall_thr=0.8,
                    center_dist_thr=1.0,
                    one2one_score=1.,
                    one2many_score=0.8,
                    many2one_score=1.):
    """Evaluate hmean of text detection using the icdar2013 standard.

    Args:
        det_boxes (list[list[list[float]]]): List of arrays of shape (n, 2k).
            Each element is the det_boxes for one img. k>=4.
        gt_boxes (list[list[list[float]]]): List of arrays of shape (m, 2k).
            Each element is the gt_boxes for one img. k>=4.
        gt_ignored_boxes (list[list[list[float]]]): List of arrays of
            (l, 2k). Each element is the ignored gt_boxes for one img. k>=4.
        precision_thr (float): Precision threshold of the iou of one
            (gt_box, det_box) pair.
        recall_thr (float): Recall threshold of the iou of one
            (gt_box, det_box) pair.
        center_dist_thr (float): Distance threshold of one (gt_box, det_box)
            center point pair.
        one2one_score (float): Reward when one gt matches one det_box.
        one2many_score (float): Reward when one gt matches many det_boxes.
        many2one_score (float): Reward when many gts match one det_box.

    Returns:
        hmean (tuple[dict]): Tuple of dicts which encodes the hmean for
            the dataset and all images.
    """
    assert utils.is_3dlist(det_boxes)
    assert utils.is_3dlist(gt_boxes)
    assert utils.is_3dlist(gt_ignored_boxes)

    assert 0 <= precision_thr <= 1
    assert 0 <= recall_thr <= 1
    assert center_dist_thr > 0
    assert 0 <= one2one_score <= 1
    assert 0 <= one2many_score <= 1
    assert 0 <= many2one_score <= 1

    img_num = len(det_boxes)
    assert img_num == len(gt_boxes)
    assert img_num == len(gt_ignored_boxes)

    dataset_gt_num = 0
    dataset_pred_num = 0
    dataset_hit_recall = 0.0
    dataset_hit_prec = 0.0

    img_results = []

    for i in range(img_num):
        gt = gt_boxes[i]
        gt_ignored = gt_ignored_boxes[i]
        pred = det_boxes[i]

        gt_num = len(gt)
        ignored_num = len(gt_ignored)
        pred_num = len(pred)

        accum_recall = 0.
        accum_precision = 0.

        gt_points = gt + gt_ignored
        gt_polys = [eval_utils.points2polygon(p) for p in gt_points]
        gt_ignored_index = [gt_num + i for i in range(len(gt_ignored))]
        gt_num = len(gt_polys)

        pred_polys, pred_points, pred_ignored_index = eval_utils.ignore_pred(
            pred, gt_ignored_index, gt_polys, precision_thr)

        if pred_num > 0 and gt_num > 0:

            gt_hit = np.zeros(gt_num, np.int8).tolist()
            pred_hit = np.zeros(pred_num, np.int8).tolist()

            # compute area recall and precision for each (gt, pred) pair
            # in one img.
            recall_mat, precision_mat = compute_recall_precision(
                gt_polys, pred_polys)

            # match one gt to one pred box.
            for gt_id in range(gt_num):
                for pred_id in range(pred_num):
                    if (gt_hit[gt_id] != 0 or pred_hit[pred_id] != 0
                            or gt_id in gt_ignored_index
                            or pred_id in pred_ignored_index):
                        continue
                    match = eval_utils.one2one_match_ic13(
                        gt_id, pred_id, recall_mat, precision_mat, recall_thr,
                        precision_thr)

                    if match:
                        gt_point = np.array(gt_points[gt_id])
                        det_point = np.array(pred_points[pred_id])

                        norm_dist = eval_utils.box_center_distance(
                            det_point, gt_point)
                        norm_dist /= eval_utils.box_diag(
                            det_point) + eval_utils.box_diag(gt_point)
                        norm_dist *= 2.0

                        if norm_dist < center_dist_thr:
                            gt_hit[gt_id] = 1
                            pred_hit[pred_id] = 1
                            accum_recall += one2one_score
                            accum_precision += one2one_score

            # match one gt to many det boxes.
            for gt_id in range(gt_num):
                if gt_id in gt_ignored_index:
                    continue
                match, match_det_set = eval_utils.one2many_match_ic13(
                    gt_id, recall_mat, precision_mat, recall_thr,
                    precision_thr, gt_hit, pred_hit, pred_ignored_index)

                if match:
                    gt_hit[gt_id] = 1
                    accum_recall += one2many_score
                    accum_precision += one2many_score * len(match_det_set)
                    for pred_id in match_det_set:
                        pred_hit[pred_id] = 1

            # match many gt to one det box. One pair of (det,gt) are matched
            # successfully if their recall, precision, normalized distance
            # meet some thresholds.
            for pred_id in range(pred_num):
                if pred_id in pred_ignored_index:
                    continue

                match, match_gt_set = eval_utils.many2one_match_ic13(
                    pred_id, recall_mat, precision_mat, recall_thr,
                    precision_thr, gt_hit, pred_hit, gt_ignored_index)

                if match:
                    pred_hit[pred_id] = 1
                    accum_recall += many2one_score * len(match_gt_set)
                    accum_precision += many2one_score
                    for gt_id in match_gt_set:
                        gt_hit[gt_id] = 1

        gt_care_number = gt_num - ignored_num
        pred_care_number = pred_num - len(pred_ignored_index)

        r, p, h = eval_utils.compute_hmean(accum_recall, accum_precision,
                                           gt_care_number, pred_care_number)

        img_results.append({'recall': r, 'precision': p, 'hmean': h})

        dataset_gt_num += gt_care_number
        dataset_pred_num += pred_care_number
        dataset_hit_recall += accum_recall
        dataset_hit_prec += accum_precision

    total_r, total_p, total_h = eval_utils.compute_hmean(
        dataset_hit_recall, dataset_hit_prec, dataset_gt_num, dataset_pred_num)

    dataset_results = {
        'num_gts': dataset_gt_num,
        'num_dets': dataset_pred_num,
        'num_recall': dataset_hit_recall,
        'num_precision': dataset_hit_prec,
        'recall': total_r,
        'precision': total_p,
        'hmean': total_h
    }

    return dataset_results, img_results

@ -0,0 +1,116 @@
import numpy as np

import mmocr.utils as utils
from . import utils as eval_utils


def eval_hmean_iou(pred_boxes,
                   gt_boxes,
                   gt_ignored_boxes,
                   iou_thr=0.5,
                   precision_thr=0.5):
    """Evaluate hmean of text detection using the IOU standard.

    Args:
        pred_boxes (list[list[list[float]]]): Text boxes for an img list. Each
            box has 2k (>=8) values.
        gt_boxes (list[list[list[float]]]): Ground truth text boxes for an img
            list. Each box has 2k (>=8) values.
        gt_ignored_boxes (list[list[list[float]]]): Ignored ground truth text
            boxes for an img list. Each box has 2k (>=8) values.
        iou_thr (float): Iou threshold when one (gt_box, det_box) pair is
            matched.
        precision_thr (float): Precision threshold when one (gt_box, det_box)
            pair is matched.

    Returns:
        hmean (tuple[dict]): Tuple of dicts indicates the hmean for the dataset
            and all images.
    """
    assert utils.is_3dlist(pred_boxes)
    assert utils.is_3dlist(gt_boxes)
    assert utils.is_3dlist(gt_ignored_boxes)
    assert 0 <= iou_thr <= 1
    assert 0 <= precision_thr <= 1

    img_num = len(pred_boxes)
    assert img_num == len(gt_boxes)
    assert img_num == len(gt_ignored_boxes)

    dataset_gt_num = 0
    dataset_pred_num = 0
    dataset_hit_num = 0

    img_results = []

    for i in range(img_num):
        gt = gt_boxes[i]
        gt_ignored = gt_ignored_boxes[i]
        pred = pred_boxes[i]

        gt_num = len(gt)
        gt_ignored_num = len(gt_ignored)
        pred_num = len(pred)

        hit_num = 0

        # get gt polygons.
        gt_all = gt + gt_ignored
        gt_polys = [eval_utils.points2polygon(p) for p in gt_all]
        gt_ignored_index = [gt_num + i for i in range(len(gt_ignored))]
        gt_num = len(gt_polys)
        pred_polys, _, pred_ignored_index = eval_utils.ignore_pred(
            pred, gt_ignored_index, gt_polys, precision_thr)

        # match.
        if gt_num > 0 and pred_num > 0:
            sz = [gt_num, pred_num]
            iou_mat = np.zeros(sz)

            gt_hit = np.zeros(gt_num, np.int8)
            pred_hit = np.zeros(pred_num, np.int8)

            for gt_id in range(gt_num):
                for pred_id in range(pred_num):
                    gt_pol = gt_polys[gt_id]
                    det_pol = pred_polys[pred_id]

                    iou_mat[gt_id,
                            pred_id] = eval_utils.poly_iou(det_pol, gt_pol)

            for gt_id in range(gt_num):
                for pred_id in range(pred_num):
                    if (gt_hit[gt_id] != 0 or pred_hit[pred_id] != 0
                            or gt_id in gt_ignored_index
                            or pred_id in pred_ignored_index):
                        continue
                    if iou_mat[gt_id, pred_id] > iou_thr:
                        gt_hit[gt_id] = 1
                        pred_hit[pred_id] = 1
                        hit_num += 1

        gt_care_number = gt_num - gt_ignored_num
        pred_care_number = pred_num - len(pred_ignored_index)

        r, p, h = eval_utils.compute_hmean(hit_num, hit_num, gt_care_number,
                                           pred_care_number)

        img_results.append({'recall': r, 'precision': p, 'hmean': h})

        dataset_hit_num += hit_num
        dataset_gt_num += gt_care_number
        dataset_pred_num += pred_care_number

    dataset_r, dataset_p, dataset_h = eval_utils.compute_hmean(
        dataset_hit_num, dataset_hit_num, dataset_gt_num, dataset_pred_num)

    dataset_results = {
        'num_gts': dataset_gt_num,
        'num_dets': dataset_pred_num,
        'num_match': dataset_hit_num,
        'recall': dataset_r,
        'precision': dataset_p,
        'hmean': dataset_h
    }

    return dataset_results, img_results
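

# Usage sketch (illustrative): one image, one predicted box that matches the
# single gt box exactly, and no ignored boxes.
#
#   pred = [[[0, 0, 10, 0, 10, 10, 0, 10]]]
#   gt = [[[0, 0, 10, 0, 10, 10, 0, 10]]]
#   dataset_results, img_results = eval_hmean_iou(pred, gt, [[]])
#   # img_results[0] -> {'recall': 1.0, 'precision': 1.0, 'hmean': 1.0}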

@ -0,0 +1,27 @@
import torch


def compute_f1_score(preds, gts, ignores=[]):
    """Compute the F1-score of prediction.

    Args:
        preds (Tensor): The predicted probability NxC map
            with N and C being the sample number and class
            number respectively.
        gts (Tensor): The ground truth vector of size N.
        ignores (list): The index set of classes that are ignored when
            reporting results.
            Note: all samples take part in the computation.

    Returns:
        The numpy list of f1-scores of valid classes.
    """
    C = preds.size(1)
    classes = torch.LongTensor(sorted(set(range(C)) - set(ignores)))
    # Confusion matrix via bincount: rows are gt classes, columns are
    # predicted classes.
    hist = torch.bincount(
        gts * C + preds.argmax(1), minlength=C**2).view(C, C).float()
    diag = torch.diag(hist)
    recalls = diag / hist.sum(1).clamp(min=1)
    precisions = diag / hist.sum(0).clamp(min=1)
    f1 = 2 * recalls * precisions / (recalls + precisions).clamp(min=1e-8)
    return f1[classes].cpu().numpy()
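

# Usage sketch (illustrative): 3 samples, 2 classes; returns per-class F1.
#
#   preds = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]])
#   gts = torch.tensor([0, 1, 1])
#   f1_per_class = compute_f1_score(preds, gts)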