Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into fix_vqa

pull/5174/head
WenmuZhou 2022-01-05 11:08:12 +00:00
commit 0d7ee96807
73 changed files with 23284 additions and 5510 deletions

.gitignore

@ -29,3 +29,5 @@ paddleocr.egg-info/
/deploy/android_demo/app/PaddleLite/
/deploy/android_demo/app/.cxx/
/deploy/android_demo/app/cache/
test_tipc/web/models/
test_tipc/web/node_modules/


@ -61,7 +61,7 @@ from combobox import ComboBox
from libs.constants import *
from libs.utils import *
from libs.settings import Settings
from libs.shape import Shape, DEFAULT_LINE_COLOR, DEFAULT_FILL_COLOR
from libs.shape import Shape, DEFAULT_LINE_COLOR, DEFAULT_FILL_COLOR,DEFAULT_LOCK_COLOR
from libs.stringBundle import StringBundle
from libs.canvas import Canvas
from libs.zoomWidget import ZoomWidget
@ -126,7 +126,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.labelHist = []
self.lastOpenDir = None
self.result_dic = []
self.result_dic_locked = []
self.changeFileFolder = False
self.haveAutoReced = False
self.labelFile = None
@ -395,6 +395,7 @@ class MainWindow(QMainWindow, WindowMixin):
delete = action(getStr('delBox'), self.deleteSelectedShape,
'backspace', 'delete', getStr('delBoxDetail'), enabled=False)
copy = action(getStr('dupBox'), self.copySelectedShape,
'Ctrl+C', 'copy', getStr('dupBoxDetail'),
enabled=False)
@ -405,6 +406,7 @@ class MainWindow(QMainWindow, WindowMixin):
showAll = action(getStr('showBox'), partial(self.togglePolygons, True),
'Ctrl+A', 'hide', getStr('showAllBoxDetail'),
enabled=False)
help = action(getStr('tutorial'), self.showTutorialDialog, None, 'help', getStr('tutorialDetail'))
showInfo = action(getStr('info'), self.showInfoDialog, None, 'help', getStr('info'))
@ -476,6 +478,10 @@ class MainWindow(QMainWindow, WindowMixin):
undo = action(getStr("undo"), self.undoShapeEdit,
'Ctrl+Z', "undo", getStr("undo"), enabled=False)
lock = action(getStr("lockBox"), self.lockSelectedShape,
None, "lock", getStr("lockBoxDetail"),
enabled=False)
self.editButton.setDefaultAction(edit)
self.newButton.setDefaultAction(create)
@ -538,13 +544,13 @@ class MainWindow(QMainWindow, WindowMixin):
fitWindow=fitWindow, fitWidth=fitWidth,
zoomActions=zoomActions, saveLabel=saveLabel,
undo=undo, undoLastPoint=undoLastPoint,open_dataset_dir=open_dataset_dir,
rotateLeft=rotateLeft,rotateRight=rotateRight,
rotateLeft=rotateLeft,rotateRight=rotateRight,lock=lock,
fileMenuActions=(
opendir, open_dataset_dir, saveLabel, resetAll, quit),
beginner=(), advanced=(),
editMenu=(createpoly, edit, copy, delete,singleRere,None, undo, undoLastPoint,
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption),
beginnerContext=(create, edit, copy, delete, singleRere, rotateLeft, rotateRight,),
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption,lock),
beginnerContext=(create, edit, copy, delete, singleRere, rotateLeft, rotateRight,lock),
advancedContext=(createMode, editMode, edit, copy,
delete, shapeLineColor, shapeFillColor),
onLoadActive=(
@ -998,6 +1004,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.actions.delete.setEnabled(n_selected)
self.actions.copy.setEnabled(n_selected)
self.actions.edit.setEnabled(n_selected == 1)
self.actions.lock.setEnabled(n_selected)
def addLabel(self, shape):
shape.paintLabel = self.displayLabelOption.isChecked()
@ -1041,7 +1048,7 @@ class MainWindow(QMainWindow, WindowMixin):
def loadLabels(self, shapes):
s = []
for label, points, line_color, fill_color, difficult in shapes:
shape = Shape(label=label)
shape = Shape(label=label,line_color=line_color)
for x, y in points:
# Ensure the labels are within the bounds of the image. If not, fix them.
@ -1051,6 +1058,7 @@ class MainWindow(QMainWindow, WindowMixin):
shape.addPoint(QPointF(x, y))
shape.difficult = difficult
#shape.locked = False
shape.close()
s.append(shape)
@ -1063,10 +1071,12 @@ class MainWindow(QMainWindow, WindowMixin):
# shape.fill_color = QColor(*fill_color)
# else:
# shape.fill_color = generateColorByText(label)
self.addLabel(shape)
self.updateComboBox()
self.canvas.loadShapes(s)
def singleLabel(self, shape):
if shape is None:
@ -1106,10 +1116,9 @@ class MainWindow(QMainWindow, WindowMixin):
difficult=s.difficult) # bool
shapes = [] if mode == 'Auto' else \
[format_shape(shape) for shape in self.canvas.shapes]
[format_shape(shape) for shape in self.canvas.shapes if shape.line_color != DEFAULT_LOCK_COLOR]
# Can add different annotation formats here
for box in self.result_dic:
for box in self.result_dic :
trans_dic = {"label": box[1][0], "points": box[0], 'difficult': False}
if trans_dic["label"] == "" and mode == 'Auto':
continue
@ -1120,7 +1129,6 @@ class MainWindow(QMainWindow, WindowMixin):
for box in shapes:
trans_dic.append({"transcription": box['label'], "points": box['points'], 'difficult': box['difficult']})
self.PPlabel[annotationFilePath] = trans_dic
if mode == 'Auto':
self.Cachelabel[annotationFilePath] = trans_dic
@ -1313,6 +1321,7 @@ class MainWindow(QMainWindow, WindowMixin):
# unicodeFilePath = os.path.abspath(unicodeFilePath)
# Tzutalin 20160906 : Add file list and dock to move faster
# Highlight the file item
if unicodeFilePath and self.fileListWidget.count() > 0:
if unicodeFilePath in self.mImgList:
index = self.mImgList.index(unicodeFilePath)
@ -1322,6 +1331,7 @@ class MainWindow(QMainWindow, WindowMixin):
###
self.iconlist.clear()
self.additems5(None)
for i in range(5):
item_tooltip = self.iconlist.item(i).toolTip()
# print(i,"---",item_tooltip)
@ -1340,7 +1350,6 @@ class MainWindow(QMainWindow, WindowMixin):
if unicodeFilePath and os.path.exists(unicodeFilePath):
self.canvas.verified = False
cvimg = cv2.imdecode(np.fromfile(unicodeFilePath, dtype=np.uint8), 1)
height, width, depth = cvimg.shape
cvimg = cv2.cvtColor(cvimg, cv2.COLOR_BGR2RGB)
@ -1361,16 +1370,19 @@ class MainWindow(QMainWindow, WindowMixin):
else:
self.dirty = False
self.actions.save.setEnabled(True)
if len(self.canvas.lockedShapes) != 0:
self.actions.save.setEnabled(True)
self.setDirty()
self.canvas.setEnabled(True)
self.adjustScale(initial=True)
self.paintCanvas()
self.addRecentFile(self.filePath)
self.toggleActions(True)
self.showBoundingBoxFromPPlabel(filePath)
self.setWindowTitle(__appname__ + ' ' + filePath)
# Default : select last item if there is at least one item
if self.labelList.count():
self.labelList.setCurrentItem(self.labelList.item(self.labelList.count() - 1))
@ -1380,15 +1392,23 @@ class MainWindow(QMainWindow, WindowMixin):
return True
return False
def showBoundingBoxFromPPlabel(self, filePath):
width, height = self.image.width(), self.image.height()
imgidx = self.getImglabelidx(filePath)
if imgidx not in self.PPlabel.keys():
return
shapes = []
for box in self.PPlabel[imgidx]:
shapes.append((box['transcription'], box['points'], None, None, box['difficult']))
shapes =[]
# box['ratio'] of the shapes saved in lockedShapes stores the four corner
# coordinates of each shape as ratios of the image width and height
for box in self.canvas.lockedShapes:
if self.canvas.isInTheSameImage:
shapes.append((box['transcription'], [[s[0]*width,s[1]*height]for s in box['ratio']],
DEFAULT_LOCK_COLOR, None, box['difficult']))
else:
shapes.append(('锁定框:待检测', [[s[0]*width,s[1]*height]for s in box['ratio']],
DEFAULT_LOCK_COLOR, None, box['difficult']))
if imgidx in self.PPlabel.keys():
for box in self.PPlabel[imgidx]:
shapes.append((box['transcription'], box['points'], None, None, box['difficult']))
self.loadLabels(shapes)
self.canvas.verified = False
@ -1646,9 +1666,37 @@ class MainWindow(QMainWindow, WindowMixin):
else:
return fullFilePath
return ''
def saveLockedShapes(self):
self.canvas.lockedShapes = []
self.canvas.selectedShapes = []
for s in self.canvas.shapes:
if s.line_color == DEFAULT_LOCK_COLOR:
self.canvas.selectedShapes.append(s)
self.lockSelectedShape()
for s in self.canvas.shapes:
if s.line_color == DEFAULT_LOCK_COLOR:
self.canvas.selectedShapes.remove(s)
self.canvas.shapes.remove(s)
def _saveFile(self, annotationFilePath, mode='Manual'):
if len(self.canvas.lockedShapes) != 0:
self.saveLockedShapes()
if mode == 'Manual':
self.result_dic_locked = []
img = cv2.imread(self.filePath)
width, height = self.image.width(), self.image.height()
for shape in self.canvas.lockedShapes:
box = [[int(p[0]*width), int(p[1]*height)] for p in shape['ratio']]
assert len(box) == 4
result = [(shape['transcription'],1)]
result.insert(0, box)
self.result_dic_locked.append(result)
self.result_dic += self.result_dic_locked
self.result_dic_locked = []
if annotationFilePath and self.saveLabels(annotationFilePath, mode=mode):
self.setClean()
self.statusBar().showMessage('Saved to %s' % annotationFilePath)
@ -1663,13 +1711,13 @@ class MainWindow(QMainWindow, WindowMixin):
self.savePPlabel(mode='Auto')
self.fileListWidget.insertItem(int(currIndex), item)
self.openNextImg()
if not self.canvas.isInTheSameImage:
self.openNextImg()
self.actions.saveRec.setEnabled(True)
self.actions.saveLabel.setEnabled(True)
elif mode == 'Auto':
if annotationFilePath and self.saveLabels(annotationFilePath, mode=mode):
self.setClean()
self.statusBar().showMessage('Saved to %s' % annotationFilePath)
self.statusBar().show()
@ -1733,7 +1781,9 @@ class MainWindow(QMainWindow, WindowMixin):
if discardChanges == QMessageBox.No:
return True
elif discardChanges == QMessageBox.Yes:
self.canvas.isInTheSameImage = True
self.saveFile()
self.canvas.isInTheSameImage = False
return True
else:
return False
@ -1872,6 +1922,7 @@ class MainWindow(QMainWindow, WindowMixin):
# org_box = [dic['points'] for dic in self.PPlabel[self.getImglabelidx(self.filePath)]]
if self.canvas.shapes:
self.result_dic = []
self.result_dic_locked = [] # result_dic_locked stores the ocr result of self.canvas.lockedShapes
rec_flag = 0
for shape in self.canvas.shapes:
box = [[int(p.x()), int(p.y())] for p in shape.points]
@ -1883,21 +1934,32 @@ class MainWindow(QMainWindow, WindowMixin):
return
result = self.ocr.ocr(img_crop, cls=True, det=False)
if result[0][0] != '':
result.insert(0, box)
print('result in reRec is ', result)
self.result_dic.append(result)
if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0]
result.insert(0, box)
self.result_dic_locked.append(result)
else:
result.insert(0, box)
self.result_dic.append(result)
else:
print('Can not recognise the box')
self.result_dic.append([box,(self.noLabelText,0)])
if self.noLabelText == shape.label or result[1][0] == shape.label:
print('label no change')
else:
rec_flag += 1
if len(self.result_dic) > 0 and rec_flag > 0:
if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0]
self.result_dic_locked.append([box,(self.noLabelText,0)])
else:
self.result_dic.append([box,(self.noLabelText,0)])
try:
if self.noLabelText == shape.label or result[1][0] == shape.label:
print('label no change')
else:
rec_flag += 1
except IndexError as e:
print('Can not recognise the box')
if (len(self.result_dic) > 0 and rec_flag > 0)or self.canvas.lockedShapes:
self.canvas.isInTheSameImage = True
self.saveFile(mode='Auto')
self.loadFile(self.filePath)
self.canvas.isInTheSameImage = False
self.setDirty()
elif len(self.result_dic) == len(self.canvas.shapes) and rec_flag == 0:
QMessageBox.information(self, "Information", "The recognition result remains unchanged!")
@ -2107,6 +2169,44 @@ class MainWindow(QMainWindow, WindowMixin):
self.labelList.clearSelection()
self._noSelectionSlot = False
self.canvas.loadShapes(shapes, replace=replace)
print("loadShapes")#1
def lockSelectedShape(self):
"""lock the selsected shapes.
Add self.selectedShapes to lock self.canvas.lockedShapes,
which holds the ratio of the four coordinates of the locked shapes
to the width and height of the image
"""
width, height = self.image.width(), self.image.height()
def format_shape(s):
return dict(label=s.label, # str
line_color=s.line_color.getRgb(),
fill_color=s.fill_color.getRgb(),
ratio=[[int(p.x())/width, int(p.y())/height] for p in s.points], # QPointF
# add chris
difficult=s.difficult) # bool
#lock
if len(self.canvas.lockedShapes) == 0:
for s in self.canvas.selectedShapes:
s.line_color = DEFAULT_LOCK_COLOR
s.locked = True
shapes = [format_shape(shape) for shape in self.canvas.selectedShapes]
trans_dic = []
for box in shapes:
trans_dic.append({"transcription": box['label'], "ratio": box['ratio'], 'difficult': box['difficult']})
self.canvas.lockedShapes = trans_dic
self.actions.save.setEnabled(True)
#unlock
else:
for s in self.canvas.shapes:
s.line_color = DEFAULT_LINE_COLOR
self.canvas.lockedShapes = []
self.result_dic_locked = []
self.setDirty()
self.actions.save.setEnabled(True)
def inverted(color):
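The lock feature introduced in this diff stores each locked box not as pixel coordinates but as ratios of the image width and height (`lockSelectedShape`), then re-projects those ratios back to pixels when the image is reloaded (`showBoundingBoxFromPPlabel`, `_saveFile`). A minimal standalone sketch of that round-trip, using hypothetical helper names that are not part of the patch:

```python
# Sketch of the ratio round-trip used by the lock feature (illustrative helpers only).
def points_to_ratio(points, width, height):
    # pixel coordinates -> fractions of the image size
    return [[x / width, y / height] for x, y in points]

def ratio_to_points(ratio, width, height):
    # fractions -> pixel coordinates when the image is loaded again
    return [[rx * width, ry * height] for rx, ry in ratio]

box = [[10, 20], [110, 20], [110, 60], [10, 60]]       # one rectangle, in pixels
ratio = points_to_ratio(box, width=200, height=100)     # what canvas.lockedShapes would store
restored = ratio_to_points(ratio, width=200, height=100)
print(restored)  # ~[[10.0, 20.0], [110.0, 20.0], [110.0, 60.0], [10.0, 60.0]] (up to float rounding)
```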


@ -78,14 +78,14 @@ PPOCRLabel # run
```bash
cd PaddleOCR/PPOCRLabel
python3 setup.py bdist_wheel
pip3 install dist/PPOCRLabel-1.0.0-py2.py3-none-any.whl
pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl
```
#### 1.2.3 Run PPOCRLabel by Python Script
```bash
cd ./PPOCRLabel # Switch to the PPOCRLabel directory
python PPOCRLabel.py --lang ch
python PPOCRLabel.py
```


@ -78,7 +78,7 @@ PPOCRLabel --lang ch # 启动
```bash
cd PaddleOCR/PPOCRLabel
python3 setup.py bdist_wheel
pip3 install dist/PPOCRLabel-1.0.0-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
```
#### 1.2.3 通过Python脚本运行PPOCRLabel


@ -87,6 +87,10 @@ class Canvas(QWidget):
#initialisation for panning
self.pan_initial_pos = QPoint()
#lockedshapes related
self.lockedShapes = []
self.isInTheSameImage = False
def setDrawingColor(self, qColor):
self.drawingLineColor = qColor
self.drawingRectColor = qColor

File diff suppressed because it is too large.

@ -30,6 +30,7 @@ DEFAULT_SELECT_LINE_COLOR = QColor(255, 255, 255)
DEFAULT_SELECT_FILL_COLOR = QColor(0, 128, 255, 155)
DEFAULT_VERTEX_FILL_COLOR = QColor(0, 255, 0, 255)
DEFAULT_HVERTEX_FILL_COLOR = QColor(255, 0, 0)
DEFAULT_LOCK_COLOR = QColor(255, 0, 255)
MIN_Y_LABEL = 10
@ -57,7 +58,7 @@ class Shape(object):
self.selected = False
self.difficult = difficult
self.paintLabel = paintLabel
self.locked = False
self._highlightIndex = None
self._highlightMode = self.NEAR_VERTEX
self._highlightSettings = {


@ -60,7 +60,7 @@ class StringBundle:
def __createLookupFallbackList(self, localeStr):
resultPaths = []
basePath = "\strings" if os.name == 'nt' else ":/strings"
basePath = "\strings" if os.name == 'nt' else "/strings"
resultPaths.append(basePath)
if localeStr is not None:
# Don't follow standard BCP47. Simple fallback

(Binary image file added: 2.9 KiB)

@ -104,4 +104,6 @@ singleRe=Re-recognition RectBox
labelDialogOption=Pop-up Label Input Dialog
undo=Undo
undoLastPoint=Undo Last Point
autoSaveMode=Auto Export Label Mode
autoSaveMode=Auto Export Label Mode
lockBox=Lock selected box/Unlock all boxes
lockBoxDetail=Lock selected box/Unlock all boxes


@ -104,4 +104,6 @@ singleRe=重识别此区块
labelDialogOption=弹出标记输入框
undo=撤销
undoLastPoint=撤销上个点
autoSaveMode=自动导出标记结果
autoSaveMode=自动导出标记结果
lockBox=锁定框/解除锁定框
lockBoxDetail=若当前没有框处于锁定状态则锁定选中的框,若存在锁定框则解除所有锁定框的锁定状态


@ -33,7 +33,7 @@ setup(
package_dir={'PPOCRLabel': ''},
include_package_data=True,
entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]},
version='1.0.0',
version='1.0.2',
install_requires=requirements,
license='Apache License 2.0',
description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models',


@ -24,7 +24,7 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools
**Recent updates**
- 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR) and 3 DocVQA algorithms (LayoutLM、LayoutLMv2LayoutXLM).
- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)) and 3 DocVQA algorithms (LayoutLM, LayoutLMv2, LayoutXLM, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)).
- PaddleOCR R&D team would like to share the key points of PP-OCRv2, at 20:15 pm on September 8th, [Course Address](https://aistudio.baidu.com/aistudio/education/group/info/6758).
- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile.
- 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files).
@ -39,7 +39,7 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools
- General PP-OCR server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M
- Support Chinese, English, and digit recognition, vertical text recognition, and long text recognition
- Support multi-language recognition: about 80 languages like Korean, Japanese, German, French, etc
- document structurize system PP-Structure
- PP-Structure: a document structurize system
- support layout analysis and table recognition (support export to Excel)
- support key information extraction
- support DocVQA
@ -90,7 +90,7 @@ Mobile DEMO experience (based on EasyEdge and Paddle-Lite, supports iOS and Andr
| Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model |
| ------------------------------------------------------------ | ---------------------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| Chinese and English ultra-lightweight PP-OCRv2 model (11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/ch/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)|
| Chinese and English ultra-lightweight PP-OCRv2 model (11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)|
| Chinese and English ultra-lightweight PP-OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) |
| Chinese and English general PP-OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_traingit.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) |
@ -102,11 +102,11 @@ For a new language request, please refer to [Guideline for new language_requests
## Tutorials
- [Environment Preparation](./doc/doc_en/environment_en.md)
- [Quick Start](./doc/doc_en/quickstart_en.md)
- [PaddleOCR Overview and Installation](./doc/doc_en/paddleOCR_overview_en.md)
- [PaddleOCR Overview and Project Clone](./doc/doc_en/paddleOCR_overview_en.md)
- PP-OCR Industry Landing: from Training to Deployment
- [PP-OCR Model and Configuration](./doc/doc_en/models_and_config_en.md)
- [PP-OCR Model Zoo](./doc/doc_en/models_en.md)
- [PP-OCR Model Download](./doc/doc_en/models_list_en.md)
- [Python Inference for PP-OCR Model Library](./doc/doc_en/inference_ppocr_en.md)
- [Python Inference for PP-OCR Model Zoo](./doc/doc_en/inference_ppocr_en.md)
- [PP-OCR Training](./doc/doc_en/training_en.md)
- [Text Detection](./doc/doc_en/detection_en.md)
- [Text Recognition](./doc/doc_en/recognition_en.md)
@ -120,6 +120,8 @@ For a new language request, please refer to [Guideline for new language_requests
- [PP-Structure: Information Extraction](./ppstructure/README.md)
- [Layout Parser](./ppstructure/layout/README.md)
- [Table Recognition](./ppstructure/table/README.md)
- [DocVQA](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)
- [Key Information Extraction](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)
- Academic Circles
- [Two-stage Algorithm](./doc/doc_en/algorithm_overview_en.md)
- [PGNet Algorithm](./doc/doc_en/pgnet_en.md)


@ -19,8 +19,8 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库助力
## 近期更新
- 2021.12.21 《OCR十讲》课程开讲12月21日起每晚八点半线上授课 【免费】报名地址:https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法PSENet3种文本识别算法NRTR、SEED、SAR文档结构化算法新增1种关键信息提取算法SDMGR3种DocVQA算法LayoutLM、LayoutLMv2LayoutXLM
- 2021.12.21《动手学OCR · 十讲》课程开讲12月21日起每晚八点半线上授课[免费报名地址](https://aistudio.baidu.com/aistudio/course/introduce/25207)。
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法PSENet3种文本识别算法NRTR、SEED、SAR文档结构化算法新增1种关键信息提取算法SDMGR[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)3种DocVQA算法LayoutLM、LayoutLMv2LayoutXLM[文档](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa))。
- PaddleOCR研发团队对最新发版内容技术深入解读9月8日晚上20:15[课程回放](https://aistudio.baidu.com/aistudio/education/group/info/6758)。
- 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](#PP-OCRv2)CPU推理速度相比于PP-OCR server提升220%效果相比于PP-OCR mobile 提升7%。
- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README_ch.md)工具包支持版面分析与表格识别含Excel导出
@ -54,8 +54,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库助力
- 加入社区:微信扫描下方二维码加入官方交流群,与各行各业开发者充分交流,期待您的加入。
- 社区贡献:[社区贡献](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等是官方为社区开发者打造的荣誉墙、也是帮助优质项目宣传的广播站。如果您的OCR项目未被收集在文档中可根据文档说明与我们联系。最新社区贡献可查看[此处](#社区贡献)。
- 社区常规赛作为社区贡献的具体承载形式社区常规赛是面向OCR开发者的积分赛事。首届社区常规赛与《动手学OCR · 十讲》课程联合推广,课程详情可参考[链接](https://aistudio.baidu.com/aistudio/course/introduce/25207),课程奖励与作业说明可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。
- 社区常规赛作为社区贡献的具体承载形式社区常规赛是面向OCR开发者的积分赛事。首届社区常规赛与[《动手学OCR · 十讲》课程](https://aistudio.baidu.com/aistudio/course/introduce/25207)联合推广。社区常规赛的赛题详情与报名方法可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。
<div align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/dygraph/doc/joinus.PNG" width = "200" height = "200" />
@ -64,22 +63,33 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库助力
## 零代码体验
- 在线网站体验超轻量PP-OCR mobile模型体验地址https://www.paddlepaddle.org.cn/hub/scene/ocr
- 移动端:[安装包DEMO下载地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统)
<a name="模型下载"></a>
## PP-OCR系列模型列表更新中
| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 |
| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| 中英文超轻量PP-OCRv2模型13.0M | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
| 中英文超轻量PP-OCR mobile模型9.4M | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
| 中英文通用PP-OCR server模型143.4M | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
更多模型下载(包括多语言),可以参考[PP-OCR 系列模型下载](./doc/doc_ch/models_list.md)
## 文档教程
- [运行环境准备](./doc/doc_ch/environment.md)
- [快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md)
- [PaddleOCR全景图与项目克隆](./doc/doc_ch/paddleOCR_overview.md)
- PP-OCR产业落地从训练到部署
- [PP-OCR模型与配置文件](./doc/doc_ch/models_and_config.md)
- [PP-OCR模型库](./doc/doc_ch/models.md)
- [PP-OCR模型下载](./doc/doc_ch/models_list.md)
- [PP-OCR模型库快速推理](./doc/doc_ch/inference_ppocr.md)
- [PP-OCR模型训练](./doc/doc_ch/training.md)
- [文本检测](./doc/doc_ch/detection.md)
- [文本识别](./doc/doc_ch/recognition.md)
- [文本方向分类器](./doc/doc_ch/angle_class.md)
- [知识蒸馏](./doc/doc_ch/knowledge_distillation.md)
- [配置文件内容与生成](./doc/doc_ch/config.md)
- PP-OCR模型推理部署
- [基于C++预测引擎推理](./deploy/cpp_infer/readme.md)
@ -89,6 +99,8 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库助力
- [PP-Structure信息提取](./ppstructure/README_ch.md)
- [版面分析](./ppstructure/layout/README_ch.md)
- [表格识别](./ppstructure/table/README_ch.md)
- [DocVQA](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)
- [关键信息提取](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)
- OCR学术圈
- [两阶段模型介绍与下载](./doc/doc_ch/algorithm_overview.md)
- [端到端PGNet算法](./doc/doc_ch/pgnet.md)
@ -119,7 +131,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库助力
</div>
[1] PP-OCR是一个实用的超轻量OCR系统。主要由DB文本检测、检测框矫正和CRNN文本识别三部分组成。该系统从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面采用19个有效策略对各个模块的模型进行效果调优和瘦身(如绿框所示)最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考PP-OCR技术方案 https://arxiv.org/abs/2009.09941
[2] PP-OCRv2在PP-OCR的基础上进一步在5个方面重点优化检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和Enhanced CTC loss损失函数改进如上图红框所示进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)。
[2] PP-OCRv2在PP-OCR的基础上进一步在5个方面重点优化检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和[Enhanced CTC loss](./doc/doc_ch/enhanced_ctc_loss.md)损失函数改进如上图红框所示进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)。
<a name="效果展示"></a>


@ -21,6 +21,7 @@ Architecture:
model_type: det
Models:
Teacher:
pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
freeze_params: true
return_all_feats: false
model_type: det
@ -36,6 +37,7 @@ Architecture:
name: DBHead
k: 50
Student:
pretrained:
freeze_params: false
return_all_feats: false
model_type: det
@ -52,6 +54,7 @@ Architecture:
name: DBHead
k: 50
Student2:
pretrained:
freeze_params: false
return_all_feats: false
model_type: det


@ -18,6 +18,7 @@ Global:
Architecture:
name: DistillationModel
algorithm: Distillation
model_type: det
Models:
Student:
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained


@ -18,6 +18,7 @@ Global:
Architecture:
name: DistillationModel
algorithm: Distillation
model_type: det
Models:
Student:
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained


@ -18,8 +18,8 @@ python3.7 -m pip install paddle2onnx
- 安装 ONNX
```
# 建议安装 1.4.0 版本,可根据环境更换版本号
python3.7 -m pip install onnxruntime==1.4.0
# 建议安装 1.9.0 版本,可根据环境更换版本号
python3.7 -m pip install onnxruntime==1.9.0
```
## 2. 模型转换
@ -47,13 +47,15 @@ paddle2onnx --model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ \
--params_filename=inference.pdiparams \
--save_file=./inference/det_mobile_onnx/model.onnx \
--opset_version=10 \
--input_shape_dict="{'x': [-1, 3, -1, -1]}" \
--enable_onnx_checker=True
```
执行完毕后ONNX 模型会被保存在 `./inference/det_mobile_onnx/` 路径下
* 注意:以下几个模型暂不支持转换为 ONNX 模型:
NRTR、SAR、RARE、SRN
* 注意对于OCR模型转化过程中必须采用动态shape的形式即加入选项--input_shape_dict="{'x': [-1, 3, -1, -1]}"否则预测结果可能与直接使用Paddle预测有细微不同。
另外,以下几个模型暂不支持转换为 ONNX 模型:
NRTR、SAR、RARE、SRN
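A quick way to confirm that a detection model exported with the dynamic-shape option really accepts variable input sizes (a sketch that is not part of the original doc; it assumes onnxruntime is installed and the model was exported to `./inference/det_mobile_onnx/model.onnx` as above):

```python
# Sanity check (assumption): a dynamic-shape export should accept different H/W.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("./inference/det_mobile_onnx/model.onnx")
input_name = sess.get_inputs()[0].name          # expected to be "x" for this export
for h, w in [(640, 640), (960, 544)]:           # both multiples of 32 (assumed stride)
    x = np.random.rand(1, 3, h, w).astype("float32")
    out = sess.run(None, {input_name: x})[0]
    print(f"input 1x3x{h}x{w} -> output shape {out.shape}")
```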
## 3. onnx 预测
@ -72,5 +74,3 @@ root INFO: 1.jpg [[[291, 295], [334, 292], [348, 844], [305, 847]], [[344, 296]
The predict time of ../../doc/imgs/1.jpg: 0.06162881851196289
The visualized image saved in ./inference_results/det_res_1.jpg
```
* 注意ONNX暂时不支持变长预测需要将输入resize到固定输入预测结果可能与直接使用Paddle预测有细微不同。


@ -57,7 +57,7 @@ PaddleOCR基于动态图开源的文本识别算法列表
- [x] SAR([paper](https://arxiv.org/abs/1811.00751v2))
- [x] SEED([paper](https://arxiv.org/pdf/2005.10977.pdf))
参考[DTRB][3](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程使用MJSynth和SynthText两个文字识别数据集训练在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估算法效果如下
参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程使用MJSynth和SynthText两个文字识别数据集训练在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估算法效果如下
|模型|骨干网络|Avg Accuracy|模型存储命名|下载链接|
|---|---|---|---|---|


@ -146,7 +146,7 @@ PaddleOCR欢迎大家向repo中积极贡献代码下面给出一些贡献代
- 将 `远程仓库` Clone到本地
```
# 拉取develop分支的代码
# 拉取dygraph分支的代码
git clone https://github.com/{your_name}/PaddleOCR.git -b dygraph
cd PaddleOCR
```
@ -191,11 +191,11 @@ git checkout -b new_branch
也可以基于远程或者上游的分支创建新的分支,命令如下。
```
# 基于用户远程仓库(origin)的develop创建new_branch分支
git checkout -b new_branch origin/develop
# 基于上游远程仓库(upstream)的develop创建new_branch分支
# 基于用户远程仓库(origin)的dygraph创建new_branch分支
git checkout -b new_branch origin/dygraph
# 基于上游远程仓库(upstream)的dygraph创建new_branch分支
# 如果需要从upstream创建新的分支需要首先使用git fetch upstream获取上游代码
git checkout -b new_branch upstream/develop
git checkout -b new_branch upstream/dygraph
```
最终会显示切换到新的分支,输出信息如下
@ -246,8 +246,8 @@ git commit -m "your commit info"
```
git fetch upstream
# 如果是希望提交到其他分支则需要从upstream的其他分支pull代码这里是develop
git pull upstream develop
# 如果是希望提交到其他分支则需要从upstream的其他分支pull代码这里是dygraph
git pull upstream dygraph
```
#### 3.2.7 push到远程仓库

View File

@ -1,4 +1,4 @@
## 数据合成工具
# 数据合成工具
除了开源数据,用户还可使用合成工具自行合成。这里整理了常用的数据合成工具,持续更新中,欢迎各位小伙伴贡献工具~
- [text_renderer](https://github.com/Sanster/text_renderer)
- [SynthText](https://github.com/ankush-me/SynthText)
@ -6,3 +6,4 @@
- [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator)
- [SynthText3D](https://github.com/MhLiao/SynthText3D)
- [UnrealText](https://github.com/Jyouhou/UnrealText/)
- [SynthTIGER](https://github.com/clovaai/synthtiger)


@ -78,11 +78,11 @@ json.dumps编码前的图像标注信息是包含多个字典的list字典中
cd PaddleOCR/
# 根据backbone的不同选择下载对应的预训练模型
# 下载MobileNetV3的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams
# 或下载ResNet18_vd的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams
# 或下载ResNet50_vd的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams
```
<a name="2-----"></a>


@ -281,4 +281,274 @@ paddle.save(s_params, "ch_PP-OCRv2_rec_train/student.pdparams")
### 2.2 检测配置文件解析
* coming soon!
检测模型蒸馏的配置文件在PaddleOCR/configs/det/ch_PP-OCRv2/目录下,包含三个蒸馏配置文件:
- ch_PP-OCRv2_det_cml.yml采用cml蒸馏采用一个大模型蒸馏两个小模型且两个小模型互相学习的方法
- ch_PP-OCRv2_det_dml.yml采用DML的蒸馏两个Student模型互蒸馏的方法
- ch_PP-OCRv2_det_distill.yml采用Teacher大模型蒸馏小模型Student的方法
#### 2.2.1 模型结构
知识蒸馏任务中,模型结构配置如下所示:
```
Architecture:
name: DistillationModel # 结构名称蒸馏任务中为DistillationModel用于构建对应的结构
algorithm: Distillation # 算法名称
Models: # 模型,包含子网络的配置信息
Student: # 子网络名称,至少需要包含`pretrained`与`freeze_params`信息,其他的参数为子网络的构造参数
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
freeze_params: false # 是否需要固定参数
return_all_feats: false # 子网络的参数表示是否需要返回所有的features如果为False则只返回最后的输出
model_type: det
algorithm: DB
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
disable_se: True
Neck:
name: DBFPN
out_channels: 96
Head:
name: DBHead
k: 50
Teacher: # 另外一个子网络,这里给的是普通大模型蒸小模型的蒸馏示例,
pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
freeze_params: true # Teacher模型是训练好的不需要参与训练freeze_params设置为True
return_all_feats: false
model_type: det
algorithm: DB
Transform:
Backbone:
name: ResNet
layers: 18
Neck:
name: DBFPN
out_channels: 256
Head:
name: DBHead
k: 50
```
如果是采用DML即两个小模型互相学习的方法上述配置文件里的Teacher网络结构需要设置为Student模型一样的配置具体参考配置文件[ch_PP-OCRv2_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_dml.yml)。

下面介绍[ch_PP-OCRv2_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)的配置文件参数:
```
Architecture:
name: DistillationModel
algorithm: Distillation
model_type: det
Models:
Teacher: # CML蒸馏的Teacher模型配置
pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
freeze_params: true # Teacher 不训练
return_all_feats: false
model_type: det
algorithm: DB
Transform:
Backbone:
name: ResNet
layers: 18
Neck:
name: DBFPN
out_channels: 256
Head:
name: DBHead
k: 50
Student: # CML蒸馏的Student模型配置
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
freeze_params: false
return_all_feats: false
model_type: det
algorithm: DB
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
disable_se: True
Neck:
name: DBFPN
out_channels: 96
Head:
name: DBHead
k: 50
Student2: # CML蒸馏的Student2模型配置
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
freeze_params: false
return_all_feats: false
model_type: det
algorithm: DB
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
disable_se: True
Neck:
name: DBFPN
out_channels: 96
Head:
name: DBHead
k: 50
```
蒸馏模型`DistillationModel`类的具体实现代码可以参考[distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py)。
最终模型`forward`输出为一个字典key为所有的子网络名称例如这里为`Student`与`Teacher`value为对应子网络的输出可以为`Tensor`(只返回该网络的最后一层)和`dict`(也返回了中间的特征信息)。
在蒸馏任务中,为了方便添加蒸馏损失函数,每个网络的输出保存为`dict`,其中包含子模块输出。每个子网络的输出结果均为`dict`key包含`backbone_out`,`neck_out`, `head_out``value`为对应模块的tensor最终对于上述配置文件`DistillationModel`的输出格式如下。
```json
{
"Teacher": {
"backbone_out": tensor,
"neck_out": tensor,
"head_out": tensor,
},
"Student": {
"backbone_out": tensor,
"neck_out": tensor,
"head_out": tensor,
}
}
```
#### 2.1.2 损失函数
知识蒸馏任务中检测ch_PP-OCRv2_det_distill.yml蒸馏损失函数配置如下所示。
```yaml
Loss:
name: CombinedLoss # 损失函数名称,基于改名称,构建用于损失函数的类
loss_config_list: # 损失函数配置文件列表为CombinedLoss的必备函数
- DistillationDilaDBLoss: # 基于蒸馏的DB损失函数继承自标准的DBloss
weight: 1.0 # 损失函数的权重loss_config_list中每个损失函数的配置都必须包含该字段
model_name_pairs: # 对于蒸馏模型的预测结果提取这两个子网络的输出计算Teacher模型和Student模型输出的loss
- ["Student", "Teacher"]
key: maps # 取子网络输出dict中该key对应的tensor
balance_loss: true # 以下几个参数为标准DBloss的配置参数
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
- DistillationDBLoss: # 基于蒸馏的DB损失函数继承自标准的DBloss用于计算Student和GT之间的loss
weight: 1.0
model_name_list: ["Student"] # 模型名字只有Student表示计算Student和GT之间的loss
name: DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
```
同理检测ch_PP-OCRv2_det_cml.yml蒸馏损失函数配置如下所示。相比较于ch_PP-OCRv2_det_distill.yml的损失函数配置cml蒸馏的损失函数配置做了3个改动
```yaml
Loss:
name: CombinedLoss
loss_config_list:
- DistillationDilaDBLoss:
weight: 1.0
model_name_pairs:
- ["Student", "Teacher"]
- ["Student2", "Teacher"] # 改动1计算两个Student和Teacher的损失
key: maps
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
- DistillationDMLLoss: # 改动2增加计算两个Student之间的损失
model_name_pairs:
- ["Student", "Student2"]
maps_name: "thrink_maps"
weight: 1.0
# act: None
key: maps
- DistillationDBLoss:
weight: 1.0
model_name_list: ["Student", "Student2"] # 改动3计算两个Student和GT之间的损失
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
```
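As a rough mental model (a schematic only, not the actual PaddleOCR implementation; see the references below for the real code), a `CombinedLoss`-style configuration boils down to a weighted sum of sub-losses, where each sub-loss picks sub-network outputs by name from the `DistillationModel` prediction dict:

```python
# Schematic of a CombinedLoss-style aggregation (illustrative names, not the real API).
def combined_loss(predicts, batch, loss_items):
    total = 0.0
    for item in loss_items:
        # Student-Teacher (or Student-Student2) style losses use model_name_pairs
        for a, b in item.get("model_name_pairs", []):
            total += item["weight"] * item["fn"](predicts[a], predicts[b], batch)
        # Student-vs-ground-truth style losses use model_name_list
        for name in item.get("model_name_list", []):
            total += item["weight"] * item["fn"](predicts[name], batch)
    return total
```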
关于`DistillationDilaDBLoss`更加具体的实现可以参考: [distillation_loss.py](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppocr/losses/distillation_loss.py#L185)。关于`DistillationDBLoss`等蒸馏损失函数更加具体的实现可以参考[distillation_loss.py](https://github.com/PaddlePaddle/PaddleOCR/blob/04c44974b13163450dfb6bd2c327863f8a194b3c/ppocr/losses/distillation_loss.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L148)。
#### 2.1.3 后处理
知识蒸馏任务中,检测蒸馏后处理配置如下所示。
```yaml
PostProcess:
name: DistillationDBPostProcess # DB检测蒸馏任务的CTC解码后处理继承自标准的DBPostProcess类
model_name: ["Student", "Student2", "Teacher"] # 对于蒸馏模型的预测结果提取多个子网络的输出进行解码不需要后处理的网络可以不在model_name中设置
thresh: 0.3
box_thresh: 0.6
max_candidates: 1000
unclip_ratio: 1.5
```
以上述配置为例,最终会同时计算`Student``Student2`和`Teacher` 3个子网络的输出做后处理计算。同时由于有多个输入后处理返回的输出也有多个
关于`DistillationDBPostProcess`更加具体的实现可以参考: [db_postprocess.py](../../ppocr/postprocess/db_postprocess.py#L195)
#### 2.1.4 蒸馏指标计算
知识蒸馏任务中,检测蒸馏指标计算配置如下所示。
```yaml
Metric:
name: DistillationMetric
base_metric_name: DetMetric
main_indicator: hmean
key: "Student"
```
由于蒸馏需要包含多个网络甚至多个Student网络在计算指标的时候只需要计算一个Student网络的指标即可`key`字段设置为`Student`则表示只计算`Student`网络的精度。
#### 2.1.5 检测蒸馏模型finetune
检测蒸馏有三种方式:
- 采用ch_PP-OCRv2_det_distill.ymlTeacher模型设置为PaddleOCR提供的模型或者您训练好的大模型
- 采用ch_PP-OCRv2_det_cml.yml采用cml蒸馏同样Teacher模型设置为PaddleOCR提供的模型或者您训练好的大模型
- 采用ch_PP-OCRv2_det_dml.yml采用DML的蒸馏两个Student模型互蒸馏的方法在PaddleOCR采用的数据集上大约有1.7%的精度提升。
在具体finetune时需要在网络结构的`pretrained`参数中设置要加载的预训练模型。
在精度提升方面cml的精度>dml的精度>distill蒸馏方法的精度。当数据量不足或者Teacher模型精度与Student精度相差不大的时候这个结论或许会改变。
另外由于PaddleOCR提供的蒸馏预训练模型包含了多个模型的参数如果您希望提取Student模型的参数可以参考如下代码
```
# 下载蒸馏训练模型的参数
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar
```
```python
import paddle
# 加载预训练模型
all_params = paddle.load("ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams")
# 查看权重参数的keys
print(all_params.keys())
# 学生模型的权重提取
s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key}
# 查看学生模型权重参数的keys
print(s_params.keys())
# 保存
paddle.save(s_params, "ch_PP-OCRv2_det_distill_train/student.pdparams")
```
最终`Student`模型的参数将会保存在`ch_PP-OCRv2_det_distill_train/student.pdparams`中用于模型的fine-tune。


@ -1,47 +0,0 @@
# PP-OCR模型与配置文件
PP-OCR模型与配置文件一章主要补充一些OCR模型的基本概念、配置文件的内容与作用以便对模型后续的参数调整和训练中拥有更好的体验。
本章包含三个部分,首先在[PP-OCR模型下载](./models_list.md)中解释PP-OCR模型的类型概念并提供所有模型的下载链接。然后在[配置文件内容与生成](./config.md)中详细说明调整PP-OCR模型所需的参数。最后的[模型库快速使用](./inference_ppocr.md)是对第一节PP-OCR模型库使用方法的介绍可以通过Python推理引擎快速利用丰富的模型库模型获得测试结果。
------
下面我们首先了解一些OCR相关的基本概念
- [1. OCR 简要介绍](#1-ocr-----)
* [1.1 OCR 检测模型基本概念](#11-ocr---------)
* [1.2 OCR 识别模型基本概念](#12-ocr---------)
* [1.3 PP-OCR模型](#13-pp-ocr--)
<a name="1-ocr-----"></a>
## 1. OCR 简要介绍
本节简要介绍OCR检测模型、识别模型的基本概念并介绍PaddleOCR的PP-OCR模型。
OCROptical Character Recognition光学字符识别目前是文字识别的统称已不限于文档或书本文字识别更包括识别自然场景下的文字又可以称为STRScene Text Recognition
OCR文字识别一般包括两个部分文本检测和文本识别文本检测首先利用检测算法检测到图像中的文本行然后检测到的文本行用识别算法去识别到具体文字。
<a name="11-ocr---------"></a>
### 1.1 OCR 检测模型基本概念
文本检测就是要定位图像中的文字区域,然后通常以边界框的形式将单词或文本行标记出来。传统的文字检测算法多是通过手工提取特征的方式,特点是速度快,简单场景效果好,但是面对自然场景,效果会大打折扣。当前多是采用深度学习方法来做。
基于深度学习的文本检测算法可以大致分为以下几类:
1. 基于目标检测的方法一般是预测得到文本框后通过NMS筛选得到最终文本框多是四点文本框对弯曲文本场景效果不理想。典型算法为EAST、Text Box等方法。
2. 基于分割的方法将文本行当成分割目标然后通过分割结果构建外接文本框可以处理弯曲文本对于文本交叉场景问题效果不理想。典型算法为DB、PSENet等方法。
3. 混合目标检测和分割的方法;
<a name="12-ocr---------"></a>
### 1.2 OCR 识别模型基本概念
OCR识别算法的输入数据一般是文本行背景信息不多文字占据主要部分识别算法目前可以分为两类算法
1. 基于CTC的方法即识别算法的文字预测模块是基于CTC的常用的算法组合为CNN+RNN+CTC。目前也有一些算法尝试在网络中加入transformer模块等等。
2. 基于Attention的方法即识别算法的文字预测模块是基于Attention的常用算法组合是CNN+RNN+Attention。
<a name="13-pp-ocr--"></a>
### 1.3 PP-OCR模型
PaddleOCR 中集成了很多OCR算法文本检测算法有DB、EAST、SAST等等文本识别算法有CRNN、RARE、StarNet、Rosetta、SRN等算法。
其中PaddleOCR针对中英文自然场景通用OCR推出了PP-OCR系列模型PP-OCR模型由DB+CRNN算法组成利用海量中文数据训练加上模型调优方法在中文场景上具备较高的文本检测识别能力。并且PaddleOCR推出了高精度超轻量PP-OCRv2模型检测模型仅3M识别模型仅8.5M,利用[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)的模型量化方法可以在保持精度不降低的情况下将检测模型压缩到0.8M识别压缩到3M更加适用于移动端部署场景。


@ -143,8 +143,10 @@ PaddleOCR主要聚焦通用OCR如果有垂类需求您可以用PaddleOCR+
具体的训练教程可点击下方链接跳转:
\- [文本检测模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/detection.md)
- [文本检测模型训练](./detection.md)
\- [文本识别模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/recognition.md)
- [文本识别模型训练](./recognition.md)
- [文本方向分类器训练](./angle_class.md)
- [知识蒸馏](./knowledge_distillation.md)
\- [文本方向分类器训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/angle_class.md)


@ -9,3 +9,4 @@ There are the commonly used data synthesis tools, which will be continuously upd
* [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator)
* [SynthText3D](https://github.com/MhLiao/SynthText3D)
* [UnrealText](https://github.com/Jyouhou/UnrealText/)
* [SynthTIGER](https://github.com/clovaai/synthtiger)


@ -67,11 +67,11 @@ And the responding download link of backbone pretrain weights can be found in (h
```shell
cd PaddleOCR/
# Download the pre-trained model of MobileNetV3
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams
# or, download the pre-trained model of ResNet18_vd
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams
# or, download the pre-trained model of ResNet50_vd
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams
```


@ -1,48 +0,0 @@
# PP-OCR Model and Configuration
The chapter on PP-OCR model and configuration file mainly adds some basic concepts of OCR model and the content and role of configuration file to have a better experience in the subsequent parameter adjustment and training of the model.
This chapter contains three parts. Firstly, [PP-OCR Model Download](. /models_list_en.md) explains the concept of PP-OCR model types and provides links to download all models. Then in [Yml Configuration](. /config_en.md) details the parameters needed to fine-tune the PP-OCR models. The final [Python Inference for PP-OCR Model Library](. /inference_ppocr_en.md) is an introduction to the use of the PP-OCR model library in the first section, which can quickly utilize the rich model library models to obtain test results through the Python inference engine.
------
Let's first understand some basic concepts.
- [INTRODUCTION ABOUT OCR](#introduction-about-ocr)
* [BASIC CONCEPTS OF OCR DETECTION MODEL](#basic-concepts-of-ocr-detection-model)
* [Basic concepts of OCR recognition model](#basic-concepts-of-ocr-recognition-model)
* [PP-OCR model](#pp-ocr-model)
* [And a table of contents](#and-a-table-of-contents)
* [On the right](#on-the-right)
## 1. INTRODUCTION ABOUT OCR
This section briefly introduces the basic concepts of OCR detection model and recognition model, and introduces PaddleOCR's PP-OCR model.
OCR (Optical Character Recognition, Optical Character Recognition) is currently the general term for text recognition. It is not limited to document or book text recognition, but also includes recognizing text in natural scenes. It can also be called STR (Scene Text Recognition).
OCR text recognition generally includes two parts, text detection and text recognition. The text detection module first uses detection algorithms to detect text lines in the image. And then the recognition algorithm to identify the specific text in the text line.
### 1.1 BASIC CONCEPTS OF OCR DETECTION MODEL
Text detection can locate the text area in the image, and then usually mark the word or text line in the form of a bounding box. Traditional text detection algorithms mostly extract features manually, which are characterized by fast speed and good effect in simple scenes, but the effect will be greatly reduced when faced with natural scenes. Currently, deep learning methods are mostly used.
Text detection algorithms based on deep learning can be roughly divided into the following categories:
1. Method based on target detection. Generally, after the text box is predicted, the final text box is filtered through NMS, which is mostly four-point text box, which is not ideal for curved text scenes. Typical algorithms are methods such as EAST and Text Box.
2. Method based on text segmentation. The text line is regarded as the segmentation target, and then the external text box is constructed through the segmentation result, which can handle curved text, and the effect is not ideal for the text cross scene problem. Typical algorithms are DB, PSENet and other methods.
3. Hybrid target detection and segmentation method.
### 1.2 Basic concepts of OCR recognition model
The input of the OCR recognition algorithm is generally text lines images which has less background information, and the text information occupies the main part. The recognition algorithm can be divided into two types of algorithms:
1. CTC-based method. The text prediction module of the recognition algorithm is based on CTC, and the commonly used algorithm combination is CNN+RNN+CTC. There are also some algorithms that try to add transformer modules to the network and so on.
2. Attention-based method. The text prediction module of the recognition algorithm is based on Attention, and the commonly used algorithm combination is CNN+RNN+Attention.
### 1.3 PP-OCR model
PaddleOCR integrates many OCR algorithms, text detection algorithms include DB, EAST, SAST, etc., text recognition algorithms include CRNN, RARE, StarNet, Rosetta, SRN and other algorithms.
Among them, PaddleOCR has released the PP-OCR series model for the general OCR in Chinese and English natural scenes. The PP-OCR model is composed of the DB+CRNN algorithm. It uses massive Chinese data training and model tuning methods to have high text detection and recognition capabilities in Chinese scenes. And PaddleOCR has launched a high-precision and ultra-lightweight PP-OCRv2 model. The detection model is only 3M, and the recognition model is only 8.5M. Using [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)'s model quantification method, the detection model can be compressed to 0.8M without reducing the accuracy. The recognition is compressed to 3M, which is more suitable for mobile deployment scenarios.

(Binary image file changed: 186 KiB → 200 KiB)

@ -0,0 +1,433 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"![](https://ai-studio-static-online.cdn.bcebos.com/72b2077605dd49b78f7f647d6821d10231f6bc52d7ed463da451a6a0bd1fc5ff)\n",
"*注:以上图片来自网络*\n",
"\n",
"# 1. OCR技术背景\n",
"## 1.1 OCR技术的应用场景\n",
"\n",
"* **<font color=red>OCR是什么</font>**\n",
"\n",
"OCROptical Character Recognition光学字符识别是计算机视觉重要方向之一。传统定义的OCR一般面向扫描文档类对象现在我们常说的OCR一般指场景文字识别Scene Text RecognitionSTR主要面向自然场景如下图中所示的牌匾等各种自然场景可见的文字。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/c87c0e6f6c0a42cdbc552a4f973c1b0217c369194c1243558753896f3e66032c)\n",
"<center>图1 文档场景文字识别 VS. 自然场景文字识别</center>\n",
"\n",
"<br>\n",
"\n",
"* **<font color=red>OCR有哪些应用场景</font>**\n",
"\n",
"OCR技术有着丰富的应用场景一类典型的场景是日常生活中广泛应用的面向垂类的结构化文本识别比如车牌识别、银行卡信息识别、身份证信息识别、火车票信息识别等等。这些小垂类的共同特点是格式固定因此非常适合使用OCR技术进行自动化可以极大的减轻人力成本提升效率。\n",
"\n",
"这种面向垂类的结构化文本识别是目前ocr应用最广泛、并且技术相对较成熟的场景。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/56e0df91d0d34443aacb17c9a1c5c186608ee675092648a693503df7fe45e535)\n",
"<center>图2 OCR技术的应用场景</center>\n",
"\n",
"除了面向垂类的结构化文本识别通用OCR技术也有广泛的应用并且常常和其他技术结合完成多模态任务例如在视频场景中经常使用OCR技术进行字幕自动翻译、内容安全监控等等或者与视觉特征相结合完成视频理解、视频搜索等任务。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/ca2341a51eb242ee8e1afe121ce3ebbc87a113cef1b643ed9bba92d0c8ee4f0f)\n",
"<center>图3 多模态场景中的通用OCR</center>\n",
"\n",
"## 1.2 OCR技术挑战\n",
"OCR的技术难点可以分为算法层和应用层两方面。\n",
"\n",
"* **<font color=red>算法层</font>**\n",
"\n",
"OCR丰富的应用场景决定了它会存在很多技术难点。这里给出了常见的8种问题\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a56831fbf0c449fe9156a893002cadfe110ccfea835b4d90854a7ce4b1df2a4f)\n",
"<center>图4 OCR算法层技术难点</center>\n",
"\n",
"这些问题给文本检测和文本识别都带来了巨大的技术挑战可以看到这些挑战主要都是面向自然场景目前学术界的研究也主要聚焦在自然场景OCR领域在学术上的常用数据集也都是自然场景。针对这些问题的研究很多相对来说识别比检测面临更大的挑战。\n",
"\n",
"* **<font color=red>应用层</font>**\n",
"\n",
"在实际应用中尤其是在广泛的通用场景下除了上一节总结的仿射变换、尺度问题、光照不足、拍摄模糊等算法层面的技术难点OCR技术还面临两大落地难点\n",
"1. **海量数据要求OCR能够实时处理。** OCR应用常对接海量数据我们要求或希望数据能够得到实时处理模型的速度做到实时是一个不小的挑战。\n",
"2. **端侧应用要求OCR模型足够轻量识别速度足够快。** OCR应用常部署在移动端或嵌入式硬件端侧OCR应用一般有两种模式上传到服务器 vs. 端侧直接识别考虑到上传到服务器的方式对网络有要求实时性较低并且请求量过大时服务器压力大以及数据传输的安全性问题我们希望能够直接在端侧完成OCR识别而端侧的存储空间和计算能力有限因此对OCR模型的大小和预测速度有很高的要求。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/5bafdc3da1614c41a95ae39a2c36632f95e2893031a64929b9f49d4a4985cd2d)\n",
"<center>图5 OCR应用层技术难点</center>\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 2. OCR前沿算法\n",
"\n",
"虽然OCR是一个相对具体的任务但涉及了多方面的技术包括文本检测、文本识别、端到端文本识别、文档分析等等。学术上关于OCR各项相关技术的研究层出不穷下文将简要介绍OCR任务中的几种关键技术的相关工作。\n",
"\n",
"## 2.1 文本检测\n",
"\n",
"文本检测的任务是定位出输入图像中的文字区域。近年来学术界关于文本检测的研究非常丰富一类方法将文本检测视为目标检测中的一个特定场景基于通用目标检测算法进行改进适配如TextBoxes[1]基于一阶段目标检测器SSD[2]算法调整目标框使之适合极端长宽比的文本行CTPN[3]则是基于Faster RCNN[4]架构改进而来。但是文本检测与目标检测在目标信息以及任务本身上仍存在一些区别如文本一般长宽比较大往往呈“条状”文本行之间可能比较密集弯曲文本等因此又衍生了很多专用于文本检测的算法如EAST[5]、PSENet[6]、DBNet[7]等等。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/548b50212935402abb2e671c158c204737c2c64b9464442a8f65192c8a31b44d\" width=\"500\"></center>\n",
"<center>图6 文本检测任务示例</center>\n",
"\n",
"<br>\n",
"\n",
"目前较为流行的文本检测算法可以大致分为**基于回归**和**基于分割**的两大类文本检测算法也有一些算法将二者相结合。基于回归的算法借鉴通用物体检测算法通过设定anchor回归检测框或者直接做像素回归这类方法对规则形状文本检测效果较好但是对不规则形状的文本检测效果会相对差一些比如CTPN[3]对水平文本的检测效果较好但对倾斜、弯曲文本的检测效果较差SegLink[8]对长文本比较好但对分布稀疏的文本效果较差基于分割的算法引入了Mask-RCNN[9],这类算法在各种场景、对各种形状文本的检测效果都可以达到一个更高的水平,但缺点就是后处理一般会比较复杂,因此常常存在速度问题,并且无法解决重叠文本的检测问题。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/4f4ea65578384900909efff93d0b7386e86ece144d8c4677b7bc94b4f0337cfb\" width=\"800\"></center>\n",
"<center>图7 文本检测算法概览</center>\n",
"\n",
"<br>\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/473ba28cd0274d568f90eb8ca9e78864d994f3ebffe6419cb638e193c607b7b3)|![](https://ai-studio-static-online.cdn.bcebos.com/e968807b3ed9493cab20f3be0d8dc07b0baf8b8cecb24ee99ccda9d3a241832a)|![](https://ai-studio-static-online.cdn.bcebos.com/53b9e85ce46645c08481d7d7377720f5eea5ac30e37e4e9c9930e1f26b02e278)\n",
"|---|---|---|\n",
"<center>图8 基于回归的CTPN[3]算法优化anchor 基于分割的DB[7]算法优化后处理 (右)回归+分割的SAST[10]算法</center>\n",
"\n",
"<br>\n",
"\n",
"文本检测相关技术将在第二章进行详细解读和实战。\n",
"\n",
"## 2.2 文本识别\n",
"\n",
"文本识别的任务是识别出图像中的文字内容,一般输入来自于文本检测得到的文本框截取出的图像文字区域。文本识别一般可以根据待识别文本形状分为**规则文本识别**和**不规则文本识别**两大类。规则文本主要指印刷字体、扫描文本等,文本大致处在水平线位置;不规则文本往往不在水平位置,存在弯曲、遮挡、模糊等问题。不规则文本场景具有很大的挑战性,也是目前文本识别领域的主要研究方向。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/b292f21e50c94debab7496d4ced96a93774a8525c12346f49cb151bde2a58fe8)\n",
"<center>图9 (左)规则文本 VS. (右)不规则文本</center>\n",
"\n",
"<br>\n",
"\n",
"规则文本识别的算法根据解码方式的不同可以大致分为基于CTC和Sequence2Sequence两种将网络学习到的序列特征 转化为 最终的识别结果 的处理方式不同。基于CTC的算法以经典的CRNN[11]为代表。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/403ca85c59d344f88d3b1229ca14b1e90c5c73c9f1d248b7aa94103f9d0af597)\n",
"<center>图10 基于CTC的识别算法 VS. 基于Attention的识别算法</center>\n",
"\n",
"不规则文本的识别算法相比更为丰富如STAR-Net[12]等方法通过加入TPS等矫正模块将不规则文本矫正为规则的矩形后再进行识别RARE[13]等基于Attention的方法增强了对序列之间各部分相关性的关注基于分割的方法将文本行的各字符作为独立个体相比与对整个文本行做矫正后识别识别分割出的单个字符更加容易此外随着近年来Transfomer[14]的快速发展和在各类任务中的有效性验证也出现了一批基于Transformer的文本识别算法这类方法利用transformer结构解决CNN在长依赖建模上的局限性问题也取得了不错的效果。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/0fa30c3789424473ad9be1c87a4f742c1db69e3defb64651906e5334ed9571a8)\n",
"<center>图11 基于字符分割的识别算法[15]</center>\n",
"\n",
"<br>\n",
"\n",
"文本识别相关技术将在第三章进行详细解读和实战。\n",
"\n",
"## 2.3 文档结构化识别\n",
"\n",
"传统意义上的OCR技术可以解决文字的检测和识别需求但在实际应用场景中最终需要获取的往往是结构化的信息如身份证、发票的信息格式化抽取表格的结构化识别等等多在快递单据抽取、合同内容比对、金融保理单信息比对、物流业单据识别等场景下应用。OCR结果+后处理是一种常用的结构化方案但流程往往比较复杂并且后处理需要精细设计泛化性也比较差。在OCR技术逐渐成熟、结构化信息抽取需求日益旺盛的背景下版面分析、表格识别、关键信息提取等关于智能文档分析的各种技术受到了越来越多的关注和研究。\n",
"\n",
"* **版面分析**\n",
"\n",
"版面分析Layout Analysis主要是对文档图像进行内容分类类别一般可分为纯文本、标题、表格、图片等。现有方法一般将文档中不同的板式当做不同的目标进行检测或分割如Soto Carlos[16]在目标检测算法Faster R-CNN的基础上结合上下文信息并利用文档内容的固有位置信息来提高区域检测性能Sarkar Mausoom[17]等人提出了一种基于先验的分割机制,在非常高的分辨率的图像上训练文档分割模型,解决了过度缩小原始图像导致的密集区域不同结构无法区分进而合并的问题。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/dedb212e8972497998685ff51af7bfe03fdea57f6acd450281ad100807086e1a)\n",
"<center>图12 版面分析任务示意图</center>\n",
"\n",
"<br>\n",
"\n",
"* **表格识别**\n",
"\n",
"表格识别Table Recognition的任务就是将文档里的表格信息进行识别和转换到excel文件中。文本图像中表格种类和样式复杂多样例如不同的行列合并不同的内容文本类型等除此之外文档的样式和拍摄时的光照环境等都为表格识别带来了极大的挑战。这些挑战使得表格识别一直是文档理解领域的研究难点。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/47119a2a2f9a45788390d6506f90d5de7449738008aa4c0ab619b18f37bd8d57)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/22ca5749441441e69dc0eaeb670832a5d0ae0ce522f34731be7d609a2d36e8c1)\n",
"<center>图13 表格识别任务示意图</center>\n",
"\n",
"<br>\n",
"\n",
"表格识别的方法种类较为丰富早期的基于启发式规则的传统算法如Kieninger[18]等人提出的T-Rect等算法一般通过人工设计规则连通域检测分析处理近年来随着深度学习的发展开始涌现一些基于CNN的表格结构识别算法如Siddiqui Shoaib Ahmed[19]等人提出的DeepTabStRRaja Sachin[20]等人提出的TabStruct-Net等此外随着图神经网络Graph Neural Network的兴起也有一些研究者尝试将图神经网络应用到表格结构识别问题上基于图神经网络将表格识别看作图重建问题如Xue Wenyuan[21]等人提出的TGRNet基于端到端的方法直接使用网络完成表格结构的HTML表示输出端到端的方法大多采用Seq2Seq方法来完成表格结构的预测如一些基于Attention或Transformer的方法如TableMaster[22]。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a9a3c91898c84f03b382583859526c4b451ace862dbc4a15838f5dde4d0ea657)\n",
"<center>图14 表格识别方法示意图</center>\n",
"\n",
"<br>\n",
"\n",
"* **关键信息提取**\n",
"\n",
"关键信息提取Key Information ExtractionKIE是Document VQA中的一个重要任务主要从图像中提取所需要的关键信息如从身份证中提取出姓名和公民身份号码信息这类信息的种类往往在特定任务下是固定的但是在不同任务间是不同的。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/8af011647bb4464f80d07f3efeac469baed27c8185ef4c4883a19f40e8ba91f5)\n",
"<center>图15 DocVQA任务示意图</center>\n",
"\n",
"<br>\n",
"\n",
"KIE通常分为两个子任务进行研究\n",
"\n",
"- SER: 语义实体识别 (Semantic Entity Recognition),对每一个检测到的文本进行分类,如将其分为姓名,身份证。如下图中的黑色框和红色框。\n",
"- RE: 关系抽取 (Relation Extraction),对每一个检测到的文本进行分类,如将其分为问题和的答案。然后对每一个问题找到对应的答案。如下图中的红色框和黑色框分别代表问题和答案,黄色线代表问题和答案之间的对应关系。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/2f1bc1a3e4a341ab9552bbf5f6c2be71ba78d7d65da64818b776efe0691e310b)\n",
"<center>图16 ser与re任务</center>\n",
"\n",
"<br>\n",
"\n",
"一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)[4]来研究,但是这类方法只利用了图像中的文本信息,缺少对视觉和结构信息的使用,因此精度不高。在此基础上,近几年的方法都开始将视觉和结构信息与文本信息融合到一起,按照对多模态信息进行融合时所采用的的原理可以将这些方法分为下面四种:\n",
"\n",
"- 基于Grid的方法\n",
"- 基于Token的方法\n",
"- 基于GCN的方法\n",
"- 基于End to End 的方法\n",
"\n",
"<br>\n",
"\n",
"文档分析相关技术将在第六章进行详细解读和实战。\n",
"\n",
"## 2.4 其他相关技术\n",
"\n",
"前面主要介绍了OCR领域的三种关键技术文本检测、文本识别、文档结构化识别更多其他OCR相关前沿技术介绍包括端到端文本识别、OCR中的图像预处理技术、OCR数据合成等可参考教程第七章和第八章。\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 3. OCR技术的产业实践\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/3d5f18f7598f405884fa2fab041c95ce415af40712e9489996747f9d122c3d90)\n",
"\n",
"> 你是小王,该怎么办? \n",
"> 1. 我不会,我不行,我不干了😭\n",
"> 2. 建议老板找外包公司或者商业化方案,反正花老板的钱😊\n",
"> 3. 网上找找类似项目面向Github编程😏\n",
"\n",
"<br>\n",
"\n",
"OCR技术最终还是要落到产业实践当中。虽然学术上关于OCR技术的研究很多OCR技术的商业化应用相比于其他AI技术也已经相对成熟但在实际的产业应用中还是存在一些难点与挑战。下文将从技术和产业实践两个角度进行分析。\n",
"\n",
"\n",
"## 3.1 产业实践难点\n",
"\n",
"在实际的产业实践中,开发者常常需要依托开源社区资源启动或推进项目,而开发者使用开源模型又往往面临三大难题:\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/7e5e79240b9c4f13b675b56bc12edf540f159c922bf24e3cbc4a0635a356c7f9)\n",
"<center>图17 OCR技术产业实践三大难题</center>\n",
"\n",
"**1. 找不到、选不出**\n",
"\n",
"开源社区资源丰富,但是信息不对称导致开发者并不能高效地解决痛点问题。一方面,开源社区资源过于丰富,开发者面对一项需求,无法快速从海量的代码仓库中找到匹配业务需求的项目,即存在“找不到”的问题;另一方面,在算法选型时,英文公开数据集上的指标,无法给开发者常常面对的中文场景提供直接的参考,逐个算法验证需要耗费大量时间和人力,且不能保证选出最合适的算法,即“选不出”。\n",
"\n",
"**2. 不适用产业场景**\n",
"\n",
"开源社区中的工作往往更多地偏向效果优化如学术论文代码开源或复现一般更侧重算法效果平衡考虑模型大小和速度的工作相比就少很多而模型大小和预测耗时在产业实践中是两项不容忽视的指标其重要程度不亚于模型效果。无论是移动端和服务器端待识别的图像数目往往非常多都希望模型更小精度更高预测速度更快。GPU太贵最好使用CPU跑起来更经济。在满足业务需求的前提下模型越轻量占用的资源越少。\n",
"\n",
"**3. 优化难、训练部署问题多**\n",
"\n",
"直接使用开源算法或模型一般无法直接满足业务需求实际业务场景中OCR面临的问题多种多样业务场景个性化往往需要自定义数据集重新训练现有的开源项目上实验各种优化方法的成本较高。此外OCR应用场景十分丰富服务端和各种移动端设备上都有着广泛的应用需求硬件环境多样化就需要支持丰富的部署方式而开源社区的项目更侧重算法和模型在预测部署这部分明显支撑不足。要把OCR技术从论文上的算法做到技术落地应用对开发者的算法和工程能力都有很高的要求。\n",
"\n",
"## 3.2 产业级OCR开发套件PaddleOCR\n",
"\n",
"OCR产业实践需要一套完整全流程的解决方案来加快研发进度节约宝贵的研发时间。也就是说超轻量模型及其全流程解决方案尤其对于算力、存储空间有限的移动端、嵌入式设备而言可以说是刚需。\n",
"\n",
"在此背景下产业级OCR开发套件[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)应运而生。\n",
"\n",
"PaddleOCR的建设思路从用户画像和需求出发依托飞桨核心框架精选并复现丰富的前沿算法基于复现的算法研发更适用于产业落地的PP特色模型并打通训推一体提供多种预测部署方式满足实际应用的不同需求场景。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/e09929b4a31e44f9b5e3d542d12411332669d2e1a21d45ad88b1dd91142ec86c)\n",
"<center>图18 PaddleOCR开发套件全景图</center>\n",
"\n",
"<br>\n",
"\n",
"从全景图可以看出PaddleOCR依托于飞桨核心框架在模型算法、预训练模型库、工业级部署等层面均提供了丰富的解决方案并且提供了数据合成、半自动数据标注工具满足开发者的数据生产需求。\n",
"\n",
"**在模型算法层面**PaddleOCR对**文字检测识别**和**文档结构化分析**两类任务分别提供了解决方案。在文字检测识别方面PaddleOCR复现或开源了4种文本检测算法、8种文本识别算法、1种端到端文本识别算法并在此基础上研发了PP-OCR系列的通用文本检测识别解决方案在文档结构化分析方面PaddleOCR提供了版面分析、表格识别、关键信息抽取、命名实体识别等算法并在此基础提出了PP-Structure文档分析解决方案。丰富的精选算法可以满足开发者不同业务场景的需求代码框架的统一也方便开发者进行不同算法的优化和性能对比。\n",
"\n",
"**在预训练模型库层面**基于PP-OCR和PP-Structure解决方案PaddleOCR研发并开源了适用于产业实践的PP系列特色模型包括通用、超轻量和多语言的文本检测识别模型和复杂文档分析模型。PP系列特色模型均在原始算法上进行了深度优化使其在效果和性能上均能达到产业实用级别开发者既可以直接应用于业务场景也可以用业务数据进行简单的finetune便可以轻松研发出适用于自己业务需求的“实用模型”。\n",
"\n",
"**在工业级部署层面**PaddleOCR提供了基于Paddle Inference的服务器端预测方案基于Paddle Serving的服务化部署方案以及基于Paddle-Lite的端侧部署方案满足不同硬件环境下的部署需求同时提供了基于PaddleSlim的模型压缩方案可以进一步压缩模型大小。以上部署方式都完成了训推一体全流程打通以保障开发者可以高效部署稳定可靠。\n",
"\n",
"**在数据工具层面**PaddleOCR提供了半自动数据标注工具PPOCRLabel和数据合成工具Style-Text助力开发者更方便的生产模型训练所需的数据集和标注信息。PPOCRLabel作为业界首个开源的半自动OCR数据标注工具针对标注过程枯燥繁琐、机械性高大量训练数据所需人工标记时间金钱成本昂贵的问题内置PP-OCR模型实现预标注+人工校验的标注模式可以极大提升标注效率节省人力成本。数据合成工具Style-Text主要解决实际场景真实数据严重不足传统合成算法无法合成文字风格字体、颜色、间距、背景的问题只需要少许目标场景图像就可以批量合成大量与目标场景风格相近的文本图像。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/90a358d6a62c49b7b8db47e18c77878c60f80cf9c81541bfa3befea68d9dbc0f)\n",
"<center>图19 PPOCRLabel使用示意图</center>\n",
"\n",
"<br>\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/b63b10bc530c42bea3d3b923da6000f1cfef006d7eec4ff3bdc0439bd9c333c9)\n",
"<center>图20 Style-Text合成效果示例</center>\n",
"\n",
"<br>\n",
"\n",
"### 3.2.1 PP-OCR与PP-Structrue\n",
"\n",
"PP系列特色模型是飞桨各视觉开发套件针对产业实践需求进行深度优化的模型力求速度与精度平衡。PaddleOCR中的PP系列特色模型包括针对文字检测识别任务的PP-OCR系列模型和针对文档分析的PP-Structure系列模型。\n",
"\n",
"**1PP-OCR中英文模型**\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/3372558042044d43983b815069e1e43cb84432b993ed400f946976e75bd51f38)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/f0a0b936382c42dd8809e98759b4c84434d79386606b4d5b8a86416db6dbaeee)\n",
"<center>图21 PP-OCR中英文模型识别结果示例</center>\n",
"\n",
"<br>\n",
"\n",
"PP-OCR中英文模型采用的典型的两阶段OCR算法即检测模型+识别模型的组成方式,具体的算法框架如下:\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/8af1371b5e3c486bb90a041903200c7c666c8bbc98c245dc802ff8c4da98617e)\n",
"<center>图22 PP-OCR系统pipeline示意图</center>\n",
"\n",
"<br>\n",
"\n",
"可以看到除输入输出外PP-OCR核心框架包含了3个模块分别是文本检测模块、检测框矫正模块、文本识别模块。\n",
"- 文本检测模块:核心是一个基于[DB](https://arxiv.org/abs/1911.08947)检测算法训练的文本检测模型,检测出图像中的文字区域;\n",
"- 检测框矫正模块:将检测到的文本框输入检测框矫正模块,在这一阶段,将四点表示的文本框矫正为矩形框,方便后续进行文本识别,另一方面会进行文本方向判断和校正,例如如果判断文本行是倒立的情况,则会进行转正,该功能通过训练一个文本方向分类器实现;\n",
"- 文本识别模块最后文本识别模块对矫正后的检测框进行文本识别得到每个文本框内的文字内容PP-OCR中使用的经典文本识别算法[CRNN](https://arxiv.org/abs/1507.05717)。\n",
"\n",
"PaddleOCR先后推出了PP-OCR[23]和PP-OCRv2[24]模型。\n",
"\n",
"PP-OCR模型分为mobile版轻量版和server版通用版其中mobile版模型主要基于轻量级骨干网络MobileNetV3进行优化优化后模型检测模型+文本方向分类模型+识别模型大小仅8.1MCPU上平均单张图像预测耗时350msT4 GPU上约110ms裁剪量化后可在精度不变的情况下进一步压缩到3.5M便于端侧部署在骁龙855上测试预测耗时仅260ms。更多的PP-OCR评估数据可参考[benchmark](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/benchmark.md)。\n",
"\n",
"PP-OCRv2保持了PP-OCR的整体框架主要做了效果上的进一步策略优化。提升包括3个方面\n",
"- 在模型效果上相对于PP-OCR mobile版本提升超7%\n",
"- 在速度上相对于PP-OCR server版本提升超过220%\n",
"- 在模型大小上11.6M的总大小,服务器端和移动端都可以轻松部署。\n",
"\n",
"PP-OCR和PP-OCRv2的具体优化策略将在第四章中进行详细解读。\n",
"\n",
"除了中英文模型PaddleOCR也基于不同的数据集训练并开源了英文数字模型、多语言识别模型以上均为超轻量模型适用于不同的语言场景。\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/5978652a826647b98344cf61aa1c2027662af989b73e4a0e917d83718422eeb0)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/1a8a8e24b5a440d388dae767adf0ea9c049335b04e964abbb176f58c5b028d7e)\n",
"<center>图23 PP-OCR的英文数字模型和多语言模型识别效果示意图</center>\n",
"\n",
"<br>\n",
"\n",
"**2PP-Structure文档分析模型**\n",
"\n",
"PP-Structure支持版面分析layout analysis、表格识别table recognition、文档视觉问答DocVQA三种子任务。\n",
"\n",
"PP-Structure核心功能点如下\n",
"- 支持对图片形式的文档进行版面分析可以划分文字、标题、表格、图片以及列表5类区域与Layout-Parser联合使用\n",
"- 支持文字、标题、图片以及列表区域提取为文字字段与PP-OCR联合使用\n",
"- 支持表格区域进行结构化分析最终结果输出Excel文件\n",
"- 支持Python whl包和命令行两种方式简单易用\n",
"- 支持版面分析和表格结构化两类任务自定义训练\n",
"- 支持VQA任务-SER和RE\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/129708c265644dbc90d6c8f7db224b3a6f11f37bb586463a82e7ccb50bcc2e76)\n",
"<center>图24 PP-Structure系统示意图本图仅含版面分析+表格识别)</center>\n",
"\n",
"<br>\n",
"\n",
"PP-Structure的具体方案将在第六章中进行详细解读。\n",
"\n",
"### 3.2.2 工业级部署方案\n",
"\n",
"飞桨支持全流程、全场景推理部署模型来源主要分为三种第一种使用PaddlePaddle API构建网络结构进行训练所得第二种是基于飞桨套件系列飞桨套件提供了丰富的模型库、简洁易用的API具备开箱即用包括视觉模型库PaddleCV、智能语音库PaddleSpeech以及自然语言处理库PaddleNLP等第三种采用X2Paddle工具从第三方框架PyTorh、ONNX、TensorFlow等产出的模型。\n",
"\n",
"飞桨模型可以选用PaddleSlim工具进行压缩、量化以及蒸馏支持五种部署方案分别为服务化Paddle Serving、服务端/云端Paddle Inference、移动端/边缘端Paddle Lite、网页前端Paddle.js, 对于Paddle不支持的硬件比如MCU、地平线、鲲云等国产芯片可以借助Paddle2ONNX转化为支持ONNX的第三方框架。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/c9ffe78e7db14e4eb103e7f393a16fbf2ab438540250474a8e0e7adc4aeb7ee0)\n",
"<center>图25 飞桨支持部署方式</center>\n",
"\n",
"<br>\n",
"\n",
"Paddle Inference支持服务端和云端部署具备高性能与通用性针对不同平台和不同应用场景进行了深度的适配和优化Paddle Inference是飞桨的原生推理库保证模型在服务器端即训即用快速部署适用于高性能硬件上使用多种应用语言环境部署算法复杂的模型硬件覆盖x86 CPU、Nvidia GPU、以及百度昆仑XPU、华为昇腾等AI加速器。\n",
"\n",
"Paddle Lite 是端侧推理引擎具有轻量化和高性能特点针对端侧设备和各应用场景进行了深度的设配和优化。当前支持Android、IOS、嵌入式Linux设备、macOS 等多个平台硬件覆盖ARM CPU和GPU、X86 CPU和新硬件如百度昆仑、华为昇腾与麒麟、瑞芯微等。\n",
"\n",
"Paddle Serving是一套高性能服务框架旨在帮助用户几个步骤快速将模型在云端服务化部署。目前Paddle Serving支持自定义前后处理、模型组合、模型热加载更新、多机多卡多模型、分布式推理、K8S部署、安全网关和模型加密部署、支持多语言多客户端访问等功能Paddle Serving官方还提供了包括PaddleOCR在内的40多种模型的部署示例以帮助用户更快上手。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/4d8063d74194434ea9b7c9f81c7fbdfd2131e13770124d2e99c1b9670f12e019)\n",
"<center>图26 飞桨支持部署方式</center>\n",
"\n",
"<br>\n",
"\n",
"以上部署方案将在第五章中基于PP-OCRv2模型进行详细解读与实战。"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 4. 总结\n",
"\n",
"本节首先介绍了OCR技术的应用场景和前沿算法然后分析了OCR技术在产业实践中的难点与三大挑战。\n",
"\n",
"本教程后续章节内容安排如下:\n",
"\n",
"* 第二、三章分别介绍检测、识别技术并实践;\n",
"* 第四章介绍PP-OCR优化策略 \n",
"* 第五章进行预测部署实战; \n",
"* 第六章介绍文档结构化; \n",
"* 第七章介绍端到端、数据预处理、数据合成等其他OCR相关算法 \n",
"* 第八章介绍OCR相关数据集和数据合成工具。\n",
"\n",
"# 参考文献\n",
"\n",
"[1] Liao, Minghui, et al. \"Textboxes: A fast text detector with a single deep neural network.\" Thirty-first AAAI conference on artificial intelligence. 2017.\n",
"\n",
"[2] Liu W, Anguelov D, Erhan D, et al. Ssd: Single shot multibox detector[C]//European conference on computer vision. Springer, Cham, 2016: 21-37.\n",
"\n",
"[3] Tian, Zhi, et al. \"Detecting text in natural image with connectionist text proposal network.\" European conference on computer vision. Springer, Cham, 2016.\n",
"\n",
"[4] Ren S, He K, Girshick R, et al. Faster r-cnn: Towards real-time object detection with region proposal networks[J]. Advances in neural information processing systems, 2015, 28: 91-99.\n",
"\n",
"[5] Zhou, Xinyu, et al. \"East: an efficient and accurate scene text detector.\" Proceedings of the IEEE conference on Computer Vision and Pattern Recognition. 2017.\n",
"\n",
"[6] Wang, Wenhai, et al. \"Shape robust text detection with progressive scale expansion network.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.\n",
"\n",
"[7] Liao, Minghui, et al. \"Real-time scene text detection with differentiable binarization.\" Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34. No. 07. 2020.\n",
"\n",
"[8] Deng, Dan, et al. \"Pixellink: Detecting scene text via instance segmentation.\" Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 32. No. 1. 2018.\n",
"\n",
"[9] He K, Gkioxari G, Dollár P, et al. Mask r-cnn[C]//Proceedings of the IEEE international conference on computer vision. 2017: 2961-2969.\n",
"\n",
"[10] Wang P, Zhang C, Qi F, et al. A single-shot arbitrarily-shaped text detector based on context attended multi-task \n",
"learning[C]//Proceedings of the 27th ACM international conference on multimedia. 2019: 1277-1285.\n",
"\n",
"[11] Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE transactions on pattern analysis and machine intelligence, 39(11), 2298-2304.\n",
"\n",
"[12] Star-Net Max Jaderberg, Karen Simonyan, Andrew Zisserman, et al. Spa- tial transformer networks. In Advances in neural information processing systems, pages 20172025, 2015.\n",
"\n",
"[13] Shi, B., Wang, X., Lyu, P., Yao, C., & Bai, X. (2016). Robust scene text recognition with automatic rectification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4168-4176).\n",
"\n",
"[14] Sheng, F., Chen, Z., & Xu, B. (2019, September). NRTR: A no-recurrence sequence-to-sequence model for scene text recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) (pp. 781-786). IEEE.\n",
"\n",
"[15] Lyu P, Liao M, Yao C, et al. Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes[C]//Proceedings of the European Conference on Computer Vision (ECCV). 2018: 67-83.\n",
"\n",
"[16] Soto C, Yoo S. Visual detection with context for document layout analysis[C]//Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). 2019: 3464-3470.\n",
"\n",
"[17] Sarkar M, Aggarwal M, Jain A, et al. Document Structure Extraction using Prior based High Resolution Hierarchical Semantic Segmentation[C]//European Conference on Computer Vision. Springer, Cham, 2020: 649-666.\n",
"\n",
"[18] Kieninger T, Dengel A. A paper-to-HTML table converting system[C]//Proceedings of document analysis systems (DAS). 1998, 98: 356-365.\n",
"\n",
"[19] Siddiqui S A, Fateh I A, Rizvi S T R, et al. Deeptabstr: Deep learning based table structure recognition[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1403-1409.\n",
"\n",
"[20] Raja S, Mondal A, Jawahar C V. Table structure recognition using top-down and bottom-up cues[C]//European Conference on Computer Vision. Springer, Cham, 2020: 70-86.\n",
"\n",
"[21] Xue W, Yu B, Wang W, et al. TGRNet: A Table Graph Reconstruction Network for Table Structure Recognition[J]. arXiv preprint arXiv:2106.10598, 2021.\n",
"\n",
"[22] Ye J, Qi X, He Y, et al. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML[J]. arXiv preprint arXiv:2105.01848, 2021.\n",
"\n",
"[23] Du Y, Li C, Guo R, et al. PP-OCR: A practical ultra lightweight OCR system[J]. arXiv preprint arXiv:2009.09941, 2020.\n",
"\n",
"[24] Du Y, Li C, Guo R, et al. PP-OCRv2: Bag of Tricks for Ultra Lightweight OCR System[J]. arXiv preprint arXiv:2109.03144, 2021.\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -0,0 +1,171 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 文本检测FAQ\n",
"\n",
"本节罗列一些开发者们使用PaddleOCR的文本检测模型常遇到的一些问题并给出相应的问题解决方法或建议。\n",
"\n",
"FAQ分两个部分来介绍分别是\n",
" - 文本检测训练相关\n",
" - 文本检测预测相关"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 1. 文本检测训练相关FAQ\n",
"\n",
"**1.1 PaddleOCR提供的文本检测算法包括哪些**\n",
"\n",
"**A**PaddleOCR中包含多种文本检测模型包括基于回归的文本检测方法EAST、SAST和基于分割的文本检测方法DBPSENet。\n",
"\n",
"\n",
"**1.2请问PaddleOCR项目中的中文超轻量和通用模型用了哪些数据集训练多少样本gpu什么配置跑了多少个epoch大概跑了多久**\n",
"\n",
"**A**对于超轻量DB检测模型训练数据包括开源数据集lsvtrctwCASIACCPDMSRAMLTBornDigitiflytekSROIE和合成的数据集等总数据量越10W数据集分为5个部分训练时采用随机采样策略在4卡V100GPU上约训练500epoch耗时3天。\n",
"\n",
"\n",
"**1.3 文本检测训练标签是否需要具体文本标注,标签中的”###”是什么意思?**\n",
"\n",
"**A**文本检测训练只需要文本区域的坐标即可标注可以是四点或者十四点按照左上右上右下左下的顺序排列。PaddleOCR提供的标签文件中包含文本字段对于文本区域文字不清晰会使用###代替。训练检测模型时,不会用到标签中的文本字段。\n",
" \n",
"**1.4 对于文本行较紧密的情况下训练的文本检测模型效果较差?**\n",
"\n",
"**A**使用基于分割的方法如DB检测密集文本行时最好收集一批数据进行训练并且在训练时并将生成二值图像的[shrink_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/ppocr/data/imaug/make_shrink_map.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L37)参数调小一些。另外,在预测的时候,可以适当减小[unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L59)参数unclip_ratio参数值越大检测框就越大。\n",
"\n",
"\n",
"**1.5 对于一些尺寸较大的文档类图片, DB在检测时会有较多的漏检怎么避免这种漏检的问题呢**\n",
"\n",
"**A**:首先,需要确定是模型没有训练好的问题还是预测时处理的问题。如果是模型没有训练好,建议多加一些数据进行训练,或者在训练的时候多加一些数据增强。\n",
"如果是预测图像过大的问题,可以增大预测时输入的最长边设置参数[det_limit_side_len](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L47)默认为960。\n",
"其次,可以通过可视化后处理的分割图观察漏检的文字是否有分割结果,如果没有分割结果,说明是模型没有训练好。如果有完整的分割区域,说明是预测后处理的问题,建议调整[DB后处理参数](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L51-L53)。\n",
"\n",
"\n",
"**1.6 DB模型弯曲文本如略微形变的文档图像漏检问题?**\n",
"\n",
"**A**: DB后处理中计算文本框平均得分时是求rectangle区域的平均分数容易造成弯曲文本漏检已新增求polygon区域的平均分数会更准确但速度有所降低可按需选择在相关pr中可查看[可视化对比效果](https://github.com/PaddlePaddle/PaddleOCR/pull/2604)。该功能通过参数 [det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L51)进行选择,参数值可选[`fast`(默认)、`slow`]`fast`对应原始的rectangle方式`slow`对应polygon方式。感谢用户[buptlihang](https://github.com/buptlihang)提[pr](https://github.com/PaddlePaddle/PaddleOCR/pull/2574)帮助解决该问题。\n",
"\n",
"\n",
"**1.7 简单的对于精度要求不高的OCR任务数据集需要准备多少张呢**\n",
"\n",
"**A**1训练数据的数量和需要解决问题的复杂度有关系。难度越大精度要求越高则数据集需求越大而且一般情况实际中的训练数据越多效果越好。\n",
"\n",
"2对于精度要求不高的场景检测任务和识别任务需要的数据量是不一样的。对于检测任务500张图像可以保证基本的检测效果。对于识别任务需要保证识别字典中每个字符出现在不同场景的行文本图像数目需要大于200张举例如果有字典中有5个字每个字都需要出现在200张图片以上那么最少要求的图像数量应该在200-1000张之间这样可以保证基本的识别效果。\n",
"\n",
"\n",
"**1.8 当训练数据量少时,如何获取更多的数据?**\n",
"\n",
"**A**当训练数据量少时可以尝试以下三种方式获取更多的数据1人工采集更多的训练数据最直接也是最有效的方式。2基于PIL和opencv基本图像处理或者变换。例如PIL中ImageFont, Image, ImageDraw三个模块将文字写到背景中opencv的旋转仿射变换高斯滤波等。3利用数据生成算法合成数据例如pix2pix等算法。\n",
"\n",
"\n",
"**1.9 如何更换文本检测/识别的backbone**\n",
"\n",
"A无论是文字检测还是文字识别骨干网络的选择是预测效果和预测效率的权衡。一般选择更大规模的骨干网络例如ResNet101_vd则检测或识别更准确但预测耗时相应也会增加。而选择更小规模的骨干网络例如MobileNetV3_small_x0_35则预测更快但检测或识别的准确率会大打折扣。幸运的是不同骨干网络的检测或识别效果与在ImageNet数据集图像1000分类任务效果正相关。飞桨图像分类套件PaddleClas汇总了ResNet_vd、Res2Net、HRNet、MobileNetV3、GhostNet等23种系列的分类网络结构在上述图像分类任务的top1识别准确率GPU(V100和T4)和CPU(骁龙855)的预测耗时以及相应的117个预训练模型下载地址。\n",
"\n",
"1文字检测骨干网络的替换主要是确定类似与ResNet的4个stages以方便集成后续的类似FPN的检测头。此外对于文字检测问题使用ImageNet训练的分类预训练模型可以加速收敛和效果提升。\n",
"\n",
"2文字识别的骨干网络的替换需要注意网络宽高stride的下降位置。由于文本识别一般宽高比例很大因此高度下降频率少一些宽度下降频率多一些。可以参考[PaddleOCR中MobileNetV3骨干网络的改动](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/ppocr/modeling/backbones/rec_mobilenet_v3.py)。\n",
"\n",
"\n",
"**1.10 如何对检测模型finetune比如冻结前面的层或某些层使用小的学习率学习**\n",
"\n",
"**A**如果是冻结某些层可以将变量的stop_gradient属性设置为True这样计算这个变量之前的所有参数都不会更新了参考https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/faq/train_cn.html#id4\n",
"\n",
"如果对某些层使用更小的学习率学习静态图里还不是很方便一个方法是在参数初始化的时候给权重的属性设置固定的学习率参考https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/fluid/param_attr/ParamAttr_cn.html#paramattr\n",
"\n",
"实际上我们实验发现直接加载模型去fine-tune不设置某些层不同学习率效果也都不错\n",
"\n",
"**1.11 DB的预处理部分图片的长和宽为什么要处理成32的倍数**\n",
"\n",
"**A**和网络下采样的倍数stride有关。以检测中的resnet骨干网络为例图像输入网络之后需要经过5次2倍降采样共32倍因此建议输入的图像尺寸为32的倍数。\n",
"\n",
"\n",
"**1.12 在PP-OCR系列的模型中文本检测的骨干网络为什么没有使用SEBlock**\n",
"\n",
"**A**SE模块是MobileNetV3网络一个重要模块目的是估计特征图每个特征通道重要性给特征图每个特征分配权重提高网络的表达能力。但是对于文本检测输入网络的分辨率比较大一般是640\\*640利用SE模块估计特征图每个特征通道重要性比较困难网络提升能力有限但是该模块又比较耗时因此在PP-OCR系统中文本检测的骨干网络没有使用SE模块。实验也表明当去掉SE模块超轻量模型大小可以减小40%文本检测效果基本不受影响。详细可以参考PP-OCR技术文章https://arxiv.org/abs/2009.09941.\n",
"\n",
"\n",
"**1.13 PP-OCR检测效果不好该如何优化**\n",
"\n",
"A 具体问题具体分析:\n",
"- 如果在你的场景上检测效果不可用首选是在你的数据上做finetune训练\n",
"- 如果图像过大文字过于密集建议不要过度压缩图像可以尝试修改检测预处理的resize逻辑防止图像被过度压缩\n",
"- 检测框大小过于紧贴文字或检测框过大可以调整db_unclip_ratio这个参数加大参数可以扩大检测框减小参数可以减小检测框大小\n",
"- 检测框存在很多漏检问题可以减小DB检测后处理的阈值参数det_db_box_thresh防止一些检测框被过滤掉也可以尝试设置det_db_score_mode为'slow';\n",
"- 其他方法可以选择use_dilation为True对检测输出的feature map做膨胀处理一般情况下会有效果改善\n",
"\n",
"\n",
"## 2. 文本检测预测相关FAQ\n",
"\n",
"**2.1 DB有些框太贴文本了反而去掉了一些文本的边角影响识别这个问题有什么办法可以缓解吗**\n",
"\n",
"**A**:可以把后处理的参数[unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/d80afce9b51f09fd3d90e539c40eba8eb5e50dd6/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L52)适当调大一点,该参数越大文本框越大。\n",
"\n",
"\n",
"**2.2 为什么PaddleOCR检测预测是只支持一张图片测试即test_batch_size_per_card=1**\n",
"\n",
"**A**预测的时候对图像等比例缩放最长边960不同图像等比例缩放后长宽不一致无法组成batch所以设置为test_batch_size为1。\n",
"\n",
"\n",
"**2.3 在CPU上加速PaddleOCR的文本检测模型预测**\n",
"\n",
"**A**x86 CPU可以使用mkldnnOneDNN进行加速在支持mkldnn加速的CPU上开启[enable_mkldnn](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py#L105)参数。另外配合增加CPU上预测使用的[线程数num_threads](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py#L106)可以有效加快CPU上的预测速度。\n",
"\n",
"**2.4 在GPU上加速PaddleOCR的文本检测模型预测**\n",
"\n",
"**A**GPU加速预测推荐使用TensorRT。\n",
"- 1. 从[链接](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html)下载带TensorRT的Paddle安装包或者预测库。\n",
"- 2. 从Nvidia官网下载TensorRT版本注意下载的TensorRT版本与paddle安装包中编译的TensorRT版本一致。\n",
"- 3. 设置环境变量LD_LIBRARY_PATH指向TensorRT的lib文件夹\n",
"```\n",
"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<TensorRT-${version}/lib>\n",
"```\n",
"- 4. 开启PaddleOCR预测的[tensorrt选项](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L38)。\n",
"\n",
"**2.5 如何在移动端部署PaddleOCR模型**\n",
"\n",
"**A**: 飞桨Paddle有专门针对移动端部署的工具[PaddleLite](https://github.com/PaddlePaddle/Paddle-Lite)并且PaddleOCR提供了DB+CRNN为demo的android arm部署代码参考[链接](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/deploy/lite/readme.md)。\n",
"\n",
"\n",
"**2.6 如何使用PaddleOCR多进程预测**\n",
"\n",
"**A**: 近期PaddleOCR新增了[多进程预测控制参数](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L111)`use_mp`表示是否使用多进程,`total_process_num`表示在使用多进程时的进程数。具体使用方式请参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/doc/doc_ch/inference.md#1-%E8%B6%85%E8%BD%BB%E9%87%8F%E4%B8%AD%E6%96%87ocr%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)。\n",
"\n",
"**2.7 预测时显存爆炸、内存泄漏问题?**\n",
"\n",
"**A**: 如果是训练模型的预测由于模型太大或者输入图像太大导致显存不够用可以参考代码在主函数运行前加上paddle.no_grad()即可减小显存占用。如果是inference模型预测时显存占用过高可以配置Config时加入[config.enable_memory_optim()](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L267)用于减小内存占用。\n",
"\n",
"另外关于使用Paddle预测时出现内存泄漏的问题建议安装paddle最新版本内存泄漏已修复。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# OCR七日课之文本检测综述\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 1. 文本检测\n",
"\n",
"文本检测任务是找出图像或视频中的文字位置。不同于目标检测任务,目标检测不仅要解决定位问题,还要解决目标分类问题。\n",
"\n",
"文本在图像中的表现形式可以视为一种‘目标‘,通用的目标检测的方法也适用于文本检测,从任务本身上来看:\n",
"\n",
"- 目标检测给定图像或者视频找出目标的位置box并给出目标的类别\n",
"- 文本检测:给定输入图像或者视频,找出文本的区域,可以是单字符位置或者整个文本行位置;\n",
"\n",
"\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/af2d8eca913a4d5a968945ae6cac180b009c6cc94abc43bfbaf1ba6a3de98125\" width=\"400\" ></center>\n",
"\n",
"<br><center>图1 目标检测示意图</center>\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/400b9100573b4286b40b0a668358bcab9627f169ab934133a1280361505ddd33\" width=\"1000\" ></center>\n",
"\n",
"<br><center>图2 文本检测示意图</center>\n",
"\n",
"目标检测和文本检测同属于“定位”问题。但是文本检测无需对目标分类,并且文本形状复杂多样。\n",
"\n",
"当前所说的文本检测一般是自然场景文本检测,其难点在于:\n",
"\n",
"1. 自然场景中文本具有多样性:文本检测受到文字颜色、大小、字体、形状、方向、语言、以及文本长度的影响;\n",
"2. 复杂的背景和干扰;文本检测受到图像失真,模糊,低分辨率,阴影,亮度等因素的影响;\n",
"3. 文本密集甚至重叠会影响文字的检测;\n",
"4. 文字存在局部一致性,文本行的一小部分,也可视为是独立的文本;\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/072f208f2aff47e886cf2cf1378e23c648356686cf1349c799b42f662d8ced00\"\n",
"width=\"1000\" ></center>\n",
"\n",
"<br><center>图3 文本检测场景</center>\n",
"\n",
"针对以上问题,衍生了很多基于深度学习的文本检测算法,解决自然场景文字检测问题,这些方法可以分为基于回归和基于分割的文本检测方法。\n",
"\n",
"下一节将简要介绍基于深度学习技术的经典文字检测算法。"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 2. 文本检测方法介绍\n",
"\n",
"\n",
"近些年来基于深度学习的文本检测算法层出不穷,这些方法大致可以分为两类:\n",
"1. 基于回归的文本检测方法\n",
"2. 基于分割的文本检测方法\n",
"\n",
"\n",
"本节筛选了2017-2021年的常用文本检测方法按照如上两类方法分类如下表格所示\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/22314238b70b486f942701107ffddca48b87235a473c4d8db05b317f132daea0\"\n",
"width=\"600\" ></center>\n",
"<br><center>图4 文本检测算法</center>\n",
"\n",
"\n",
"### 2.1 基于回归的文本检测\n",
"\n",
"基于回归文本检测方法和目标检测算法的方法相似,文本检测方法只有两个类别,图像中的文本视为待检测的目标,其余部分视为背景。\n",
"\n",
"#### 2.1.1 水平文本检测\n",
"\n",
"早期基于深度学习的文本检测算法是从目标检测的方法改进而来支持水平文本检测。比如TextBoxes算法基于SSD算法改进而来CTPN根据二阶段目标检测Fast-RCNN算法改进而来。\n",
"\n",
"在TextBoxes[1]算法根据一阶段目标检测器SSD调整将默认文本框更改为适应文本方向和宽高比的规格的四边形提供了一种端对端训练的文字检测方法并且无需复杂的后处理。\n",
"- 采用更大长宽比的预选框\n",
"- 卷积核从3x3变成了1x5更适合长文本检测\n",
"- 采用多尺度输入\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/3864ccf9d009467cbc04225daef0eb562ac0c8c36f9b4f5eab036c319e5f05e7\" width=\"1000\" ></center>\n",
"<br><center>图5 textbox框架图</center>\n",
"\n",
"CTPN[3]基于Fast-RCNN算法扩展RPN模块并且设计了基于CRNN的模块让整个网络从卷积特征中检测到文本序列二阶段的方法通过ROI Pooling获得了更准确的特征定位。但是TextBoxes和CTPN只支持检测横向文本。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/452833c2016e4cf7b35291efd09740c13c4bfb8f7c56446b8f7a02fc7eb3e901\" width=\"1000\" ></center>\n",
"<br><center>图6 CTPN框架图</center>\n",
"\n",
"#### 2.1.2 任意角度文本检测\n",
"\n",
"TextBoxes++[2]在TextBoxes基础上进行改进支持检测任意角度的文本。从结构上来说不同于TextBoxesTextBoxes++针对多角度文本进行检测首先修改预选框的宽高比调整宽高比aspect ratio为1、2、3、5、1/2、1/3、1/5。其次是将$1*5$的卷积核改为$3*5$更好的学习倾斜文本的特征最后TextBoxes++的输出旋转框的表示信息。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/ae96e3acbac04be296b6d54a4d72e5881d592fcc91f44882b24bc7d38b9d2658\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图7 TextBoxes++框架图</center>\n",
"\n",
"\n",
"EAST[4]针对倾斜文本的定位问题提出了two-stage的文本检测方法包含 FCN特征提取和NMS部分。EAST提出了一种新的文本检测pipline结构可以端对端训练并且支持检测任意朝向的文本并且具有结构简单性能高的特点。FCN支持输出倾斜的矩形框和水平框可以自由选择输出格式。\n",
"- 如果输出检测形状为RBox则输出Box旋转角度以及AABB文本形状信息AABB表示到文本框上下左右边的偏移。RBox可以旋转矩形的文本。\n",
"- 如果输出检测框为四点框则输出的最后一个维度为8个数字表示从四边形的四个角顶点的位置偏移。该输出方式可以预测不规则四边形的文本。\n",
"\n",
"考虑到FCN输出的文本框是比较冗余的比如一个文本区域的邻近的像素生成的框重合度较高但不是同一个文本生成的检测框重合度都很小因此EAST提出先按行合并预测框最后再把剩下的四边形用原始的NMS筛选。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/d7411ada08714adab73fa0edf7555a679327b71e29184446a33d81cdd910e4fc\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图8 EAST框架图</center> \n",
"\n",
"\n",
"MOST[15]提出TFAM模块动态的调整粗粒度的检测结果的感受野另外提出PA-NMS根据位置信息合并可靠的检测预测结果。此外训练中还提出 Instance-wise IoU 损失函数用于平衡训练以处理不同尺度的文本实例。该方法可以和EAST方法结合在检测极端长宽比和不同尺度的文本有更好的检测效果和性能。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/73052d9439714bba86ffe4a959d58c523b07baf3f1d74882b4517e71f5a645fe\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图9 MOST框架图</center>\n",
"\n",
"\n",
"#### 2.1.3 弯曲文本检测\n",
"\n",
"利用回归的方法解决弯曲文本的检测问题,一个简单的思路是用多点坐标描述弯曲文本的边界多边形,然后直接预测多边形的顶点坐标。\n",
"\n",
"CTD[6]提出了直接预测弯曲文本14个顶点的边界多边形网络中利用Bi-LSTM[13]层以细化顶点的预测坐标,实现了基于回归方法的弯曲文本检测。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/6e33d76ebb814cac9ebb2942b779054af160857125294cd69481680aca2fa98a\"\n",
"width=\"600\" ></center>\n",
"<br><center>图10 CTD框架图</center>\n",
"\n",
"\n",
"\n",
"LOMO[19]针对长文本和弯曲文本问题提出迭代的优化文本定位特征获取更精细的文本定位该方法包括三个部分坐标回归模块DR迭代优化模块IRM以及任意形状表达模块SEM。分别用于生成文本大致区域迭代优化文本定位特征预测文本区域、文本中心线以及文本边界。迭代的优化文本特征可以更好的解决长文本定位问题以及获得更精确的文本区域定位。\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/e90adf3ca25a45a0af0b84a181fbe2c4954be1fcca8f4049957128548b7131ef\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图11 LOMO框架图</center>\n",
"\n",
"\n",
"Contournet[18]基于提出对文本轮廓点建模获取弯曲文本检测框该方法首先使用Adaptive-RPN获取文本区域的proposal特征然后设计了局部正交纹理感知LOTM模块学习水平与竖直方向的纹理特征并用轮廓点表示最后通过同时考虑两个正交方向上的特征响应利用Point Re-Scoring算法可以有效地滤除强单向或弱正交激活的预测最终文本轮廓可以用一组高质量的轮廓点表示出来。\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/1f59ab5db899412f8c70ba71e8dd31d4ea9480d6511f498ea492c97dd2152384\"\n",
"width=\"600\" ></center>\n",
"<br><center>图12 Contournet框架图</center>\n",
"\n",
"\n",
"PCR[14]提出渐进式的坐标回归处理弯曲文本检测问题总体分为三个阶段首先大致检测到文本区域获得文本框另外通过所设计的Contour Localization Mechanism预测文本最小包围框的角点坐标然后通过叠加多个CLM模块和RCLM模块预测得到弯曲文本。该方法利用文本轮廓信息聚合得到丰富的文本轮廓特征表示不仅能抑制冗余的噪声点对坐标回归的影响还能更精确的定位文本区域。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/c677c4602cee44999ae4b38bd780b69795887f2ae10747968bb084db6209b6cc\"\n",
"width=\"600\" ></center>\n",
"<br><center>图13 PCR框架图</center>\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"\n",
"### 2.2 基于分割的文本检测\n",
"\n",
"基于回归的方法虽然在文本检测上取得了很好的效果,但是对解决弯曲文本往往难以得到平滑的文本包围曲线,并且模型较为复杂不具备性能优势。于是研究者们提出了基于图像分割的文本分割方法,先从像素层面做分类,判别每一个像素点是否属于一个文本目标,得到文本区域的概率图,通过后处理方式得到文本分割区域的包围曲线。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/fb9e50c410984c339481869ba11c1f39f80a4d74920b44b084601f2f8a23099f\"\n",
"width=\"600\" ></center>\n",
"<br><center>图14 文本分割算法示意图</center>\n",
"\n",
"\n",
"此类方法通常是基于分割的方法实现文本检测基于分割的方法对不规则形状的文本检测有着天然的优势。基于分割的文本检测方法主体思想为通过分割方法得到图像中文本区域再利用opencvpolygon等后处理得到文本区域的最小包围曲线。\n",
"\n",
"\n",
"Pixellink[7]采用分割的方法解决文本检测问题分割对象为文本区域将同属于一个文本行单词中的像素链接在一起来分割文本直接从分割结果中提取文本边界框无需位置回归就能达到基于回归的文本检测的效果。但是基于分割的方法存在一个问题对于位置相近的文本文本分割区域容易出现“粘连“问题。Wu, Yue等人[8]提出分割文本的同时学习文本的边界位置用于更好的区分文本区域。另外Tian等人[9]提出将同一文本的像素映射到映射空间,在映射空间中令统一文本的映射向量距离相近,不同文本的映射向量距离变远。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/462b5e1472824452a2c530939cda5e59ada226b2d0b745d19dd56068753a7f97\"\n",
"width=\"600\" ></center>\n",
"<br><center>图15 PixelLink框架图</center>\n",
"\n",
"MSR[20]针对文本检测的多尺度问题提出提取相同图像的多个scale的特征然后将这些特征融合并上采样到原图尺寸网络最后预测文本中心区域、文本中心区域每个点到最近的边界点的x坐标偏移和y坐标偏移最终可以得到文本区域的轮廓坐标集合。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/9597efd68a224d60b74d7c51c99f7ff0ba9939e5cdb84fb79209b7e213f7d039\"\n",
"width=\"600\" ></center>\n",
"<br><center>图16 MSR框架图</center>\n",
" \n",
"针对基于分割的文本算法难以区分相邻文本的问题PSENet[10]提出渐进式的尺度扩张网络学习文本分割区域,预测不同收缩比例的文本区域,并逐个扩大检测到的文本区域,该方法本质上是边界学习方法的变体,可以有效解决任意形状相邻文本的检测问题。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/fa870b69a2a5423cad7422f64c32e0645dfc31a4ecc94a52832cf8742cded5ba\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图17 PSENet框架图</center>\n",
"\n",
"假设PSENet后处理用了3个不同尺度的kernel如上图s1,s2,s3所示。首先从最小kernel s1开始计算文本分割区域的连通域得到(b)然后对连通域沿着上下左右做尺度扩张对于扩张区域属于s2但不属于s1的像素进行归类遇到冲突点时采用“先到先得”原则重复尺度扩张的操作最终可以得到不同文本行的独立的分割区域。\n",
"\n",
"\n",
"Seglink++[17]针对弯曲文本和密集文本问题提出了一种文本块单元之间的吸引关系和排斥关系的表征然后设计了一种最小生成树算法进行单元组合得到最终的文本检测框并提出instance-aware 损失函数使Seglink++方法可以端对端训练。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/1a16568361c0468db537ac25882eed096bca83f9c1544a92aee5239890f9d8d9\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图18 Seglink++框架图</center>\n",
"\n",
"虽然分割方法解决了弯曲文本的检测问题,但是复杂的后处理逻辑以及预测速度也是需要优化的目标。\n",
"\n",
"PAN[11]针对文本检测预测速度慢的问题从网络设计和后处理方面进行改进提升算法性能。首先PAN使用了轻量级的ResNet18作为Backbone另外设计了轻量级的特征增强模块FPEM和特征融合模块FFM增强Backbone提取的特征。在后处理方面采用像素聚类方法沿着预测的文本中心kernel四周合并与kernel的距离小于阈值d的像素。PAN保证高精度的同时具有更快的预测速度。\n",
"\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/a76771f91db246ee8be062f96fa2a8abc7598dd87e6d4755b63fac71a4ebc170\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图19 PAN框架图</center>\n",
"\n",
"DBNet[12]针对基于分割的方法需要使用阈值进行二值化处理而导致后处理耗时的问题,提出了可学习阈值并巧妙地设计了一个近似于阶跃函数的二值化函数,使得分割网络在训练的时候能端对端的学习文本分割的阈值。自动调节阈值不仅带来精度的提升,同时简化了后处理,提高了文本检测的性能。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/0d6423e3c79448f8b09090cf2dcf9d0c7baa0f6856c645808502678ae88d2917\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图20 DB框架图</center>\n",
"\n",
"FCENet[16]提出将文本包围曲线用傅立叶变换的参数表示,由于傅里叶系数表示在理论上可以拟合任意的封闭曲线,通过设计合适的模型预测基于傅里叶变换的任意形状文本包围框表示,从而实现了自然场景文本检测中对于高度弯曲文本实例的检测精度的提升。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/45e9a374d97145689a961977f896c8f9f470a66655234c1498e1c8477e277954\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图21 FCENet框架图</center>\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 3. 总结\n",
"\n",
"本节介绍了近几年来文本检测领域的发展包括基于回归、分割的文本检测方法并分别列举并介绍了一些经典论文的方法思路。下一节以PaddleOCR开源库为例详细介绍DBNet的算法原理以及核心代码实现。"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 参考文献\n",
"1. Liao, Minghui, et al. \"Textboxes: A fast text detector with a single deep neural network.\" Thirty-first AAAI conference on artificial intelligence. 2017.\n",
"2. Liao, Minghui, Baoguang Shi, and Xiang Bai. \"Textboxes++: A single-shot oriented scene text detector.\" IEEE transactions on image processing 27.8 (2018): 3676-3690.\n",
"3. Tian, Zhi, et al. \"Detecting text in natural image with connectionist text proposal network.\" European conference on computer vision. Springer, Cham, 2016.\n",
"4. Zhou, Xinyu, et al. \"East: an efficient and accurate scene text detector.\" Proceedings of the IEEE conference on Computer Vision and Pattern Recognition. 2017.\n",
"5. Wang, Fangfang, et al. \"Geometry-aware scene text detection with instance transformation network.\" Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2018.\n",
"6. Yuliang, Liu, et al. \"Detecting curve text in the wild: New dataset and new solution.\" arXiv preprint arXiv:1712.02170 (2017).\n",
"7. Deng, Dan, et al. \"Pixellink: Detecting scene text via instance segmentation.\" Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 32. No. 1. 2018.\n",
"8. Wu, Yue, and Prem Natarajan. \"Self-organized text detection with minimal post-processing via border learning.\" Proceedings of the IEEE International Conference on Computer Vision. 2017.\n",
"9. Tian, Zhuotao, et al. \"Learning shape-aware embedding for scene text detection.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.\n",
"10. Wang, Wenhai, et al. \"Shape robust text detection with progressive scale expansion network.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.\n",
"11. Wang, Wenhai, et al. \"Efficient and accurate arbitrary-shaped text detection with pixel aggregation network.\" Proceedings of the IEEE/CVF International Conference on Computer Vision. 2019.\n",
"12. Liao, Minghui, et al. \"Real-time scene text detection with differentiable binarization.\" Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34. No. 07. 2020.\n",
"13. Hochreiter, Sepp, and Jürgen Schmidhuber. \"Long short-term memory.\" Neural computation 9.8 (1997): 1735-1780.\n",
"14. Dai, Pengwen, et al. \"Progressive Contour Regression for Arbitrary-Shape Scene Text Detection.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2021.\n",
"15. He, Minghang, et al. \"MOST: A Multi-Oriented Scene Text Detector with Localization Refinement.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2021.\n",
"16. Zhu, Yiqin, et al. \"Fourier contour embedding for arbitrary-shaped text detection.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2021.\n",
"17. Tang, Jun, et al. \"Seglink++: Detecting dense and arbitrary-shaped scene text by instance-aware component grouping.\" Pattern recognition 96 (2019): 106954.\n",
"18. Wang, Yuxin, et al. \"Contournet: Taking a further step toward accurate arbitrary-shaped scene text detection.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2020.\n",
"19. Zhang, Chengquan, et al. \"Look more than once: An accurate detector for text of arbitrary shapes.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.\n",
"20. Xue C, Lu S, Zhang W. Msr: Multi-scale shape regression for scene text detection[J]. arXiv preprint arXiv:1901.02596, 2019. \n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,234 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"\n",
"# 文本识别算法理论\n",
"\n",
"本章主要介绍文本识别算法的理论知识,包括背景介绍、算法分类和部分经典论文思路。\n",
"\n",
"通过本章的学习,你可以掌握:\n",
"\n",
"1. 文本识别的目标\n",
"\n",
"2. 文本识别算法的分类\n",
"\n",
"3. 各类算法的典型思想\n",
"\n",
"\n",
"## 1 背景介绍\n",
"\n",
"文本识别是OCROptical Character Recognition的一个子任务其任务为识别一个固定区域的的文本内容。在OCR的两阶段方法里它接在文本检测后面将图像信息转换为文字信息。\n",
"\n",
"具体地,模型输入一张定位好的文本行,由模型预测出图片中的文字内容和置信度,可视化结果如下图所示:\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/a7c3404f778b489db9c1f686c7d2ff4d63b67c429b454f98b91ade7b89f8e903 width=\"600\"></center>\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/e72b1d6f80c342ac951d092bc8c325149cebb3763ec849ec8a2f54e7c8ad60ca width=\"600\"></center>\n",
"\n",
"\n",
"文本识别的应用场景很多,有文档识别、路标识别、车牌识别、工业编号识别等等,根据实际场景可以把文本识别任务分为两个大类:**规则文本识别**和**不规则文本识别**。\n",
"\n",
"* 规则文本识别:主要指印刷字体、扫描文本等,认为文本大致处在水平线位置\n",
"\n",
"* 不规则文本识别: 往往出现在自然场景中,且由于文本曲率、方向、变形等方面差异巨大,文字往往不在水平位置,存在弯曲、遮挡、模糊等问题。\n",
"\n",
"\n",
"下图展示的是 IC15 和 IC13 的数据样式,它们分别代表了不规则文本和规则文本。可以看出不规则文本往往存在扭曲、模糊、字体差异大等问题,更贴近真实场景,也存在更大的挑战性。\n",
"\n",
"因此目前各大算法都试图在不规则数据集上获得更高的指标。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/bae4fce1370b4751a3779542323d0765a02a44eace7b44d2a87a241c13c6f8cf width=\"400\">\n",
"<br><center>IC15 图片样例(不规则文本)</center>\n",
"<img src=https://ai-studio-static-online.cdn.bcebos.com/b55800d3276f4f5fad170ea1b567eb770177fce226f945fba5d3247a48c15c34 width=\"400\"></center>\n",
"<br><center>IC13 图片样例(规则文本)</center>\n",
"\n",
"\n",
"不同的识别算法在对比能力时,往往也在这两大类公开数据集上比较。对比多个维度上的效果,目前较为通用的英文评估集合分类如下:\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/4d0aada261064031a16816b39a37f2ff6af70dbb57004cb7a106ae6485f14684 width=\"600\"></center>\n",
"\n",
"## 2 文本识别算法分类\n",
"\n",
"在传统的文本识别方法中任务分为3个步骤即图像预处理、字符分割和字符识别。需要对特定场景进行建模一旦场景变化就会失效。面对复杂的文字背景和场景变动基于深度学习的方法具有更优的表现。\n",
"\n",
"多数现有的识别算法可用如下统一框架表示算法流程被划分为4个阶段\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a2750f4170864f69a3af36fc13db7b606d851f2f467d43cea6fbf3521e65450f)\n",
"\n",
"\n",
"我们整理了主流的算法类别和主要论文,参考下表:\n",
"\n",
"<center>\n",
" \n",
"| 算法类别 | 主要思路 | 主要论文 |\n",
"| -------- | --------------- | -------- |\n",
"| 传统算法 | 滑动窗口、字符提取、动态规划 | - |\n",
"| ctc | 基于ctc的方法序列不对齐更快速识别 | CRNN, Rosetta |\n",
"| Attention | 基于attention的方法应用于非常规文本 | RARE, DAN, PREN |\n",
"| Transformer | 基于transformer的方法 | SRN, NRTR, Master, ABINet |\n",
"| 校正 | 校正模块学习文本边界并校正成水平方向 | RARE, ASTER, SAR | \n",
"| 分割 | 基于分割的方法,提取字符位置再做分类 | Text Scanner Mask TextSpotter |\n",
" \n",
"</center>\n",
"\n",
"\n",
"### 2.1 规则文本识别\n",
"\n",
"\n",
"文本识别的主流算法有两种,分别是基于 CTC (Conectionist Temporal Classification) 的算法和 Sequence2Sequence 算法,区别主要在解码阶段。\n",
"\n",
"基于 CTC 的算法是将编码产生的序列接入 CTC 进行解码;基于 Sequence2Sequence 的方法则是把序列接入循环神经网络(Recurrent Neural Network, RNN)模块进行循环解码,两种方式都验证有效也是主流的两大做法。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/f64eee66e4a6426f934c1befc3b138629324cf7360c74f72bd6cf3c0de9d49bd width=\"600\"></center>\n",
"<br><center>左基于CTC的方法基于Sequece2Sequence的方法 </center>\n",
"\n",
"\n",
"#### 2.1.1 基于CTC的算法\n",
"\n",
"基于 CTC 最典型的算法是CRNN (Convolutional Recurrent Neural Network)[1]它的特征提取部分使用主流的卷积结构常用的有ResNet、MobileNet、VGG等。由于文本识别任务的特殊性输入数据中存在大量的上下文信息卷积神经网络的卷积核特性使其更关注于局部信息缺乏长依赖的建模能力因此仅使用卷积网络很难挖掘到文本之间的上下文联系。为了解决这一问题CRNN文本识别算法引入了双向 LSTM(Long Short-Term Memory) 用来增强上下文建模通过实验证明双向LSTM模块可以有效的提取出图片中的上下文信息。最终将输出的特征序列输入到CTC模块直接解码序列结果。该结构被验证有效并广泛应用在文本识别任务中。Rosetta[2]是FaceBook提出的识别网络由全卷积模型和CTC组成。Gao Y[3]等人使用CNN卷积替代LSTM参数更少性能提升精度持平。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/d3c96dd9e9794fddb12fa16f926abdd3485194f0a2b749e792e436037490899b width=\"600\"></center>\n",
"<center> CRNN 结构图 </center>\n",
"\n",
"\n",
"#### 2.1.2 Sequence2Sequence 算法\n",
"\n",
"Sequence2Sequence 算法是由编码器 Encoder 把所有的输入序列都编码成一个统一的语义向量然后再由解码器Decoder解码。在解码器Decoder解码的过程中不断地将前一个时刻的输出作为后一个时刻的输入循环解码直到输出停止符为止。一般编码器是一个RNN对于每个输入的词编码器输出向量和隐藏状态并将隐藏状态用于下一个输入的单词循环得到语义向量解码器是另一个RNN它接收编码器输出向量并输出一系列字以创建转换。受到 Sequence2Sequence 在翻译领域的启发, Shi[4]提出了一种基于注意的编解码框架来识别文本,通过这种方式rnn能够从训练数据中学习隐藏在字符串中的字符级语言模型。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/f575333696b7438d919975dc218e61ccda1305b638c5497f92b46a7ec3b85243 width=\"400\" hight=\"500\"></center>\n",
"<center> Sequence2Sequence 结构图 </center>\n",
"\n",
"以上两个算法在规则文本上都有很不错的效果,但由于网络设计的局限性,这类方法很难解决弯曲和旋转的不规则文本识别任务。为了解决这类问题,部分算法研究人员在以上两类算法的基础上提出了一系列改进算法。\n",
"\n",
"### 2.2 不规则文本识别\n",
"\n",
"* 不规则文本识别算法可以被分为4大类基于校正的方法基于 Attention 的方法;基于分割的方法;基于 Transformer 的方法。\n",
"\n",
"#### 2.2.1 基于校正的方法\n",
"\n",
"基于校正的方法利用一些视觉变换模块,将非规则的文本尽量转换为规则文本,然后使用常规方法进行识别。\n",
"\n",
"RARE[4]模型首先提出了对不规则文本的校正方案整个网络分为两个主要部分一个空间变换网络STN(Spatial Transformer Network) 和一个基于Sequence2Squence的识别网络。其中STN就是校正模块不规则文本图像进入STN通过TPS(Thin-Plate-Spline)变换成一个水平方向的图像,该变换可以一定程度上校正弯曲、透射变换的文本,校正后送入序列识别网络进行解码。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/66406f89507245e8a57969b9bed26bfe0227a8cf17a84873902dd4a464b97bb5 width=\"600\"></center>\n",
"<center> RARE 结构图 </center>\n",
"\n",
"RARE论文指出该方法在不规则文本数据集上有较大的优势特别比较了CUTE80和SVTP这两个数据集相较CRNN高出5个百分点以上证明了校正模块的有效性。基于此[6]同样结合了空间变换网络(STN)和基于注意的序列识别网络的文本识别系统。\n",
"\n",
"基于校正的方法有较好的迁移性除了RARE这类基于Attention的方法外STAR-Net[5]将校正模块应用到基于CTC的算法上相比传统CRNN也有很好的提升。\n",
"\n",
"#### 2.2.2 基于Attention的方法\n",
"\n",
"基于 Attention 的方法主要关注的是序列之间各部分的相关性该方法最早在机器翻译领域提出认为在文本翻译的过程中当前词的结果主要由某几个单词影响的因此需要给有决定性的单词更大的权重。在文本识别领域也是如此将编码后的序列解码时每一步都选择恰当的context来生成下一个状态这样有利于得到更准确的结果。\n",
"\n",
"R^2AM [7] 首次将 Attention 引入文本识别领域该模型首先将输入图像通过递归卷积层提取编码后的图像特征然后利用隐式学习到的字符级语言统计信息通过递归神经网络解码输出字符。在解码过程中引入了Attention 机制实现了软特征选择,以更好地利用图像特征,这一有选择性的处理方式更符合人类的直觉。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/a64ef10d4082422c8ac81dcda4ab75bf1db285d6b5fd462a8f309240445654d5 width=\"600\"></center>\n",
"<center> R^2AM 结构图 </center>\n",
"\n",
"后续有大量算法在Attention领域进行探索和更新例如SAR[8]将1D attention拓展到2D attention上校正模块提到的RARE也是基于Attention的方法。实验证明基于Attention的方法相比CTC的方法有很好的精度提升。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/4e2507fb58d94ec7a9b4d17151a986c84c5053114e05440cb1e7df423d32cb02 width=\"600\"></center>\n",
"\n",
"\n",
"#### 2.2.3 基于分割的方法\n",
"\n",
"基于分割的方法是将文本行的各字符作为独立个体相比与对整个文本行做矫正后识别识别分割出的单个字符更加容易。它试图从输入的文本图像中定位每个字符的位置并应用字符分类器来获得这些识别结果将复杂的全局问题简化成了局部问题解决在不规则文本场景下有比较不错的效果。然而这种方法需要字符级别的标注数据获取上存在一定的难度。Lyu[9]等人提出了一种用于单词识别的实例分词模型,该模型在其识别部分使用了基于 FCN(Fully Convolutional Network) 的方法。[10]从二维角度考虑文本识别问题设计了一个字符注意FCN来解决文本识别问题当文本弯曲或严重扭曲时该方法对规则文本和非规则文本都具有较优的定位结果。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/fd3e8ef0d6ce4249b01c072de31297ca5d02fc84649846388f890163b624ff10 width=\"800\"></center>\n",
"<center> Mask TextSpotter 结构图 </center>\n",
"\n",
"\n",
"\n",
"#### 2.2.4 基于Transformer的方法\n",
"\n",
"随着 Transformer 的快速发展,分类和检测领域都验证了 Transformer 在视觉任务中的有效性。如规则文本识别部分所说CNN在长依赖建模上存在局限性Transformer 结构恰好解决了这一问题它可以在特征提取器中关注全局信息并且可以替换额外的上下文建模模块LSTM。\n",
"\n",
"一部分文本识别算法使用 Transformer 的 Encoder 结构和卷积共同提取序列特征Encoder 由多个 MultiHeadAttentionLayer 和 Positionwise Feedforward Layer 堆叠而成的block组成。MulitHeadAttention 中的 self-attention 利用矩阵乘法模拟了RNN的时序计算打破了RNN中时序长时依赖的障碍。也有一部分算法使用 Transformer 的 Decoder 模块解码相比传统RNN可获得更强的语义信息同时并行计算具有更高的效率。\n",
"\n",
"SRN[11] 算法将Transformer的Encoder模块接在ResNet50后增强了2D视觉特征。并提出了一个并行注意力模块将读取顺序用作查询使得计算与时间无关最终并行输出所有时间步长的对齐视觉特征。此外SRN还利用Transformer的Eecoder作为语义模块将图片的视觉信息和语义信息做融合在遮挡、模糊等不规则文本上有较大的收益。\n",
"\n",
"NRTR[12] 使用了完整的Transformer结构对输入图片进行编码和解码只使用了简单的几个卷积层做高层特征提取在文本识别上验证了Transformer结构的有效性。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/e7859f4469a842f0bd450e7e793a679d6e828007544241d09785c9b4ea2424a2 width=\"800\"></center>\n",
"<center> NRTR 结构图 </center>\n",
"\n",
"SRACN[13]使用Transformer的解码器替换LSTM再一次验证了并行训练的高效性和精度优势。\n",
"\n",
"## 3 总结\n",
"\n",
"本节主要介绍了文本识别相关的理论知识和主流算法包括基于CTC的方法、基于Sequence2Sequence的方法以及基于分割的方法并分别列举了经典论文的思路和贡献。下一节将基于CRNN算法进行实践课程讲解从组网到优化完成整个训练过程\n",
"\n",
"## 4 参考文献\n",
"\n",
"\n",
"[1]Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE transactions on pattern analysis and machine intelligence, 39(11), 2298-2304.\n",
"\n",
"[2]Fedor Borisyuk, Albert Gordo, and Viswanath Sivakumar. Rosetta: Large scale system for text detection and recognition in images. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pages 7179. ACM, 2018.\n",
"\n",
"[3]Gao, Y., Chen, Y., Wang, J., & Lu, H. (2017). Reading scene text with attention convolutional sequence modeling. arXiv preprint arXiv:1709.04303.\n",
"\n",
"[4]Shi, B., Wang, X., Lyu, P., Yao, C., & Bai, X. (2016). Robust scene text recognition with automatic rectification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4168-4176).\n",
"\n",
"[5] Star-Net Max Jaderberg, Karen Simonyan, Andrew Zisserman, et al. Spa- tial transformer networks. In Advances in neural information processing systems, pages 20172025, 2015.\n",
"\n",
"[6]Baoguang Shi, Mingkun Yang, XingGang Wang, Pengyuan Lyu, Xiang Bai, and Cong Yao. Aster: An attentional scene text recognizer with flexible rectification. IEEE transactions on pattern analysis and machine intelligence, 31(11):855868, 2018.\n",
"\n",
"[7] Lee C Y , Osindero S . Recursive Recurrent Nets with Attention Modeling for OCR in the Wild[C]// IEEE Conference on Computer Vision & Pattern Recognition. IEEE, 2016.\n",
"\n",
"[8]Li, H., Wang, P., Shen, C., & Zhang, G. (2019, July). Show, attend and read: A simple and strong baseline for irregular text recognition. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 33, No. 01, pp. 8610-8617).\n",
"\n",
"[9]P. Lyu, C. Yao, W. Wu, S. Yan, and X. Bai. Multi-oriented scene text detection via corner localization and region segmentation. In Proc. CVPR, pages 75537563, 2018.\n",
"\n",
"[10] Liao, M., Zhang, J., Wan, Z., Xie, F., Liang, J., Lyu, P., ... & Bai, X. (2019, July). Scene text recognition from two-dimensional perspective. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 33, No. 01, pp. 8714-8721).\n",
"\n",
"[11] Yu, D., Li, X., Zhang, C., Liu, T., Han, J., Liu, J., & Ding, E. (2020). Towards accurate scene text recognition with semantic reasoning networks. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 12113-12122).\n",
"\n",
"[12] Sheng, F., Chen, Z., & Xu, B. (2019, September). NRTR: A no-recurrence sequence-to-sequence model for scene text recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) (pp. 781-786). IEEE.\n",
"\n",
"[13]Yang, L., Wang, P., Li, H., Li, Z., & Zhang, Y. (2020). A holistic representation guided attention network for scene text recognition. Neurocomputing, 414, 67-75.\n",
"\n",
"[14]Wang, T., Zhu, Y., Jin, L., Luo, C., Chen, X., Wu, Y., ... & Cai, M. (2020, April). Decoupled attention network for text recognition. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 34, No. 07, pp. 12216-12224).\n",
"\n",
"[15] Wang, Y., Xie, H., Fang, S., Wang, J., Zhu, S., & Zhang, Y. (2021). From two to one: A new scene text recognizer with visual language modeling network. In Proceedings of the IEEE/CVF International Conference on Computer Vision (pp. 14194-14203).\n",
"\n",
"[16] Fang, S., Xie, H., Wang, Y., Mao, Z., & Zhang, Y. (2021). Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 7098-7107).\n",
"\n",
"[17] Yan, R., Peng, L., Xiao, S., & Yao, G. (2021). Primitive Representation Learning for Scene Text Recognition. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 284-293)."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,464 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 文档分析技术\n",
"\n",
"本章主要介绍文档分析技术的理论知识,包括背景介绍、算法分类和对应思路。\n",
"\n",
"通过本章的学习,你可以掌握:\n",
"\n",
"1. 版面分析的分类和典型思想\n",
"2. 表格识别的分类和典型思想\n",
"3. 信息提取的分类和典型思想"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"\n",
"作为信息承载工具,文档的不同布局代表了各种不同的信息,如清单和身份证。文档分析是一个从文档中阅读、解释和提取信息的自动化过程。文档分析常包含以下几个研究方向:\n",
"\n",
"1. 版面分析模块: 将每个文档页面划分为不同的内容区域。该模块不仅可用于划定相关区域和不相关区域,还可用于对其识别的内容类型进行分类。\n",
"2. 光学字符识别 (OCR) 模块: 定位并识别文档中存在的所有文本。\n",
"3. 表格识别模块: 将文档里的表格信息进行识别和转换到excel文件中。\n",
"4. 信息提取模块: 借助OCR结果和图像信息来理解和识别文档中表达的特定信息或信息之间的关系。\n",
"\n",
"由于OCR模块在前面的章节中进行了详细的介绍接下来将针对上面版面分析、表格识别和信息提取三个模块做单独的介绍。对于每一个模块会介绍该模块的经典或常用方法以及数据集。"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 1. 版面分析\n",
"\n",
"### 1.1 背景介绍\n",
"\n",
"版面分析主要用于文档检索,关键信息提取,内容分类等,其任务主要是对文档图像进行内容分类,内容的类别一般可分为纯文本、标题、表格、图片和列表等。但是文档布局、格式的多样性和复杂性,文档图像质量差,大规模的带标注的数据集的缺少等问题使得版面分析仍然是一个很有挑战性的任务。\n",
"版面分析任务的可视化如下图所示:\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/2510dc76c66c49b8af079f25d08a9dcba726b2ce53d14c8ba5cd9bd57acecf19\" width=\"1000\"/></center>\n",
"<center>图 1版面分析效果图</center>\n",
"\n",
"现有的解决办法一般是基于目标检测或语义分割的方法,这类方法基将文档中不同的板式当做不同的目标进行检测或分割。\n",
"\n",
"一些代表性论文被划分为上述两个类别中,具体如下表所示:\n",
"\n",
"| 类别 | 主要论文 |\n",
"| ---------------- | -------- |\n",
"| 基于目标检测的方法 | [Visual Detection with Context](https://aclanthology.org/D19-1348.pdf)[Object Detection](https://arxiv.org/pdf/2003.13197v1.pdf)[VSR](https://arxiv.org/pdf/2105.06220v1.pdf)|\n",
"| 基于语义分割的方法 |[Semantic Segmentation](https://arxiv.org/pdf/1911.12170v2.pdf) |\n",
"\n",
"\n",
"### 1.2 基于目标检测的方法 \n",
"\n",
"Soto Carlos[1]在目标检测算法Faster R-CNN的基础上结合上下文信息并利用文档内容的固有位置信息来提高区域检测性能。Li Kai [2]等人也提出了一种基于目标检测的文档分析方法通过引入了特征金字塔对齐模块区域对齐模块渲染层对齐模块来解决跨域的问题这三个模块相互补充并从一般的图像角度和特定的文档图像角度调整域从而解决了大型标记训练数据集与目标域不同的问题。下图是一个基于目标检测Faster R-CNN算法进行版面分析的流程图。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/d396e0d6183243898c0961250ee7a49bc536677079fb4ba2ac87c653f5472f01\" width=\"800\"/></center>\n",
"<center>图 2基于Faster R-CNN的版面分析流程图</center>\n",
"\n",
"### 1.3 基于语义分割的方法 \n",
"\n",
"Sarkar Mausoom[3]等人提出了一种基于先验的分割机制在非常高的分辨率的图像上训练文档分割模型解决了过度缩小原始图像导致的密集区域不同结构无法区分进而合并的问题。Zhang Peng[4]等人结合文档中的视觉、语义和关系提出了一个统一的框架VSRVision, Semantics and Relations用于文档布局分析该框架使用一个双流网络来提取特定模态的视觉和语义特征并通过自适应聚合模块自适应地融合这些特征解决了现有基于CV的方法不同模态融合效率低下和布局组件之间缺乏关系建模的局限性。\n",
"\n",
"### 1.4 数据集\n",
"\n",
"虽然现有的方法可以在一定程度上解决版面分析任务,但是该类方法依赖于大量有标记的训练数据。最近也有很多数据集被提出用于文档分析任务。\n",
"\n",
"1. PubLayNet[5]: 该数据集包含50万张文档图像其中40万用于训练5万用于验证5万用于测试,共标记了表格,文本,图像,标题和列表五种形式\n",
"2. HJDataset[6]: 数据集包含2271张文档图像, 除了内容区域的边界框和掩码之外,它还包括布局元素的层次结构和阅读顺序。\n",
"\n",
"PubLayNet数据集样例如下图所示:\n",
"<center class=\"two\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/4b153117c9384f98a0ce5a6c6e7c205a4b1c57e95c894ccb9688cbfc94e68a1c\" width=\"400\"/><img src=\"https://ai-studio-static-online.cdn.bcebos.com/efb9faea39554760b280f9e0e70631d2915399fa97774eecaa44ee84411c4994\" width=\"400\"/>\n",
"</center>\n",
"<center>图 3PubLayNet样例</center>\n",
"参考文献:\n",
"\n",
"[1]Soto C, Yoo S. Visual detection with context for document layout analysis[C]//Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). 2019: 3464-3470.\n",
"\n",
"[2]Li K, Wigington C, Tensmeyer C, et al. Cross-domain document object detection: Benchmark suite and method[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2020: 12915-12924.\n",
"\n",
"[3]Sarkar M, Aggarwal M, Jain A, et al. Document Structure Extraction using Prior based High Resolution Hierarchical Semantic Segmentation[C]//European Conference on Computer Vision. Springer, Cham, 2020: 649-666.\n",
"\n",
"[4]Zhang P, Li C, Qiao L, et al. VSR: A Unified Framework for Document Layout Analysis combining Vision, Semantics and Relations[J]. arXiv preprint arXiv:2105.06220, 2021.\n",
"\n",
"[5]Zhong X, Tang J, Yepes A J. Publaynet: largest dataset ever for document layout analysis[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1015-1022.\n",
"\n",
"[6]Li M, Xu Y, Cui L, et al. DocBank: A benchmark dataset for document layout analysis[J]. arXiv preprint arXiv:2006.01038, 2020.\n",
"\n",
"[7]Shen Z, Zhang K, Dell M. A large dataset of historical japanese documents with complex layouts[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. 2020: 548-549."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 2. 表格识别\n",
"\n",
"### 2.1 背景介绍\n",
"\n",
"表格是各类文档中常见的页面元素,随着各类文档的爆炸性增长,如何高效地从文档中找到表格并获取内容与结构信息即表格识别,成为了一个亟需解决的问题。表格识别的难点总结如下:\n",
"\n",
"1. 表格种类和样式复杂多样,例如*不同的行列合并,不同的内容文本类型*等。\n",
"2. 文档的样式本身的样式多样。\n",
"3. 拍摄时的光照环境等\n",
"\n",
"表格识别的任务就是将文档里的表格信息转换到excel文件中任务可视化如下\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/99faa017e28b4928a408573406870ecaa251b626e0e84ab685e4b6f06f601a5f\" width=\"1600\"/></center>\n",
"\n",
"\n",
"<center>图 4表格识别示例图其中左边为原图右边为表格识别后的结果图以Excel形式呈现</center>\n",
"\n",
"现有的表格识别算法根据表格结构重建的原理可以分为下面四大类:\n",
"1. 基于启发式规则的方法\n",
"2. 基于CNN的方法\n",
"3. 基于GCN的方法\n",
"4. 基于End to End的方法\n",
"\n",
"一些代表性论文被划分为上述四个类别中,具体如下表所示:\n",
"| 类别 | 思路 | 主要论文 |\n",
"| ---------------- | ---- | -------- |\n",
"|基于启发式规则的方法|人工设计规则,连通域检测分析处理|[T-Rect](https://www.researchgate.net/profile/Andreas-Dengel/publication/249657389_A_Paper-to-HTML_Table_Converting_System/links/0c9605322c9a67274d000000/A-Paper-to-HTML-Table-Converting-System.pdf)[pdf2table](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.724.7272&rep=rep1&type=pdf)|\n",
"| 基于CNN的方法 | 目标检测,语义分割 | [CascadeTabNet](https://arxiv.org/pdf/2004.12629v2.pdf), [Multi-Type-TD-TSR](https://arxiv.org/pdf/2105.11021v1.pdf), [LGPMA](https://arxiv.org/pdf/2105.06224v2.pdf), [tabstruct-net](https://arxiv.org/pdf/2010.04565v1.pdf), [CDeC-Net](https://arxiv.org/pdf/2008.10831v1.pdf), [TableNet](https://arxiv.org/pdf/2001.01469v1.pdf), [TableSense](https://arxiv.org/pdf/2106.13500v1.pdf), [Deepdesrt](https://www.dfki.de/fileadmin/user_upload/import/9672_PID4966073.pdf), [Deeptabstr](https://www.dfki.de/fileadmin/user_upload/import/10649_DeepTabStR.pdf), [GTE](https://arxiv.org/pdf/2005.00589v2.pdf), [Cycle-CenterNet](https://arxiv.org/pdf/2109.02199v1.pdf), [FCN](https://www.researchgate.net/publication/339027294_Rethinking_Semantic_Segmentation_for_Table_Structure_Recognition_in_Documents)|\n",
"| 基于GCN的方法 | 基于图神经网络,将表格识别看作图重建问题 | [GNN](https://arxiv.org/pdf/1905.13391v2.pdf), [TGRNet](https://arxiv.org/pdf/2106.10598v3.pdf), [GraphTSR](https://arxiv.org/pdf/1908.04729v2.pdf)|\n",
"| 基于End to End的方法 | 利用attention机制 | [Table-Master](https://arxiv.org/pdf/2105.01848v1.pdf)|\n",
"\n",
"### 2.2 基于启发式规则的传统算法\n",
"早期的表格识别研究主要是基于启发式规则的方法。例如由Kieninger[1]等人提出的T-Rect系统使用自底向上的方法对文档图像进行连通域分析然后按照定义的规则进行合并得到逻辑文本块。而之后由Yildiz[2]等人提出的pdf2table则是第一个在PDF文档上进行表格识别的方法它利用了PDF文件的一些特有信息例如文字、绘制路径等图像文档中难以获取的信息来协助表格识别。而在最近的工作中Koci[3]等人将页面中的布局区域表示为图Graph的形式然后使用了Remove and Conquer(RAC)算法从中将表格作为一个子图识别出来。\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/66aeedb3f0924d80aee15f185e6799cc687b51fc20b74b98b338ca2ea25be3f3\" width=\"1000\"/></center>\n",
"<center>图 5启发式算法示意图</center>\n",
"\n",
"### 2.3 基于深度学习CNN的方法\n",
"随着深度学习技术在计算机视觉、自然语言处理、语音处理等领域的飞速发展,研究者将深度学习技术应用到表格识别领域并取得了不错的效果。\n",
"\n",
"Siddiqui Shoaib Ahmed[12]等人在DeepTabStR算法中将表格结构识别问题表述为对象检测问题并利用可变形卷积来进更好的进行表格单元格的检测。Raja Sachin[6]等人提出TabStruct-Net将单元格检测和结构识别在视觉上结合起来进行表格结构识别解决了现有方法由于表格布局发生较大变化而识别错误的问题但是该方法无法处理行列出现较多空单元格的问题。\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/838be28836444bc1835ac30a25613d8b045a1b5aedd44b258499fe9f93dd298f\" width=\"1600\"/></center>\n",
"<center>图 6基于深度学习CNN的算法示意图</center>\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/4c40dda737bd44b09a533e1b1dd2e4c6a90ceea083bf4238b7f3c7b21087f409\" width=\"1600\"/></center>\n",
"<center>图 7基于深度学习CNN的算法错误示例</center>\n",
"\n",
"之前的表格结构识别方法一般是从不同粒度(行/列、文本区域的元素开始处理问题容易忽略空单元格合并的问题。Qiao Liang[10]等人提出了一个新框架LGPMA通过掩码重评分策略充分利用来自局部和全局特征的信息进而可以获得更可靠的对齐单元格区域最后引入了包括单元格匹配、空单元格搜索和空单元格合并的表格结构复原pipeline来处理表格结构识别问题。\n",
"\n",
"除了以上单独做表格识别的算法外也有部分方法将表格检测和表格识别在一个模型里完成Schreiber Sebastian[11]等人提出了DeepDeSRT通过Faster RCNN进行表格检测通过FCN语义分割模型用于表格结构行列检测但是该方法是用两个独立的模型来解决这两个问题。Prasad Devashish[4]等人提出了一种基于端到端深度学习的方法CascadeTabNet使用Cascade Mask R-CNN HRNet模型同时进行表格检测和结构识别解决了以往方法使用独立的两个方法处理表格识别问题的不足。Paliwal Shubham[8]等人提出一种新颖的端到端深度多任务架构TableNet用于表格检测和结构识别同时在训练期间向TableNet添加额外的空间语义特征进一步提高了模型性能。Zheng Xinyi[13]等人提出了表格识别的系统框架GTE利用单元格检测网络来指导表格检测网络的训练同时提出了一种层次网络和一种新的基于聚类的单元格结构识别算法该框架可以接入到任何目标检测模型的后面方便训练不同的表格识别算法。之前的研究主要集中在从扫描的PDF文档中解析具有简单布局的对齐良好的表格图像但是现实场景中的表格一般很复杂可能存在严重变形弯曲或者遮挡等问题因此Long Rujiao[14]等人同时构造了一个现实复杂场景下的表格识别数据集WTW并提出了一种Cycle-CenterNet方法它利用循环配对模块优化和提出的新配对损失将离散单元精确地分组到结构化表中提高了表格识别的性能。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/a01f714cbe1f42fc9c45c6658317d9d7da2cec9726844f6b9fa75e30cadc9f76\" width=\"1600\"/></center>\n",
"<center>图 8端到端算法示意图</center>\n",
"\n",
"基于CNN的方法对跨行列的表格无法很好的处理因此在后续的方法中分为了两个研究方法来解决表格中跨行列的问题。\n",
"\n",
"### 2.4 基于深度学习GCN的方法\n",
"近些年来随着图卷积神经网络Graph Convolutional Network的兴起也有一些研究者尝试将图神经网络应用到表格结构识别问题上。Qasim Shah Rukh[20]等人将表格结构识别问题转换为与图神经网络兼容的图问题并设计了一种新颖的可微架构该架构既可以利用卷积神经网络提取特征的优点也可以利用图神经网络顶点之间有效交互的优点但是该方法只使用了单元格的位置特征没有利用语义特征。Chi Zewen[19]等人提出了一种新颖的图神经网络GraphTSR用于PDF文件中的表格结构识别它以表格中的单元格为输入然后通过利用图的边和节点相连的特性来预测单元格之间的关系来识别表格结构一定程度上解决了跨行或者跨列的单元格识别问题。Xue Wenyuan[21]等人将表格结构识别的问题重新表述为表图重建并提出了一种用于表格结构识别的端到端方法TGRNet该方法包含单元格检测分支和单元格逻辑位置分支这两个分支共同预测不同单元格的空间位置和逻辑位置解决了之前方法没有关注单元格逻辑位置的问题。\n",
"\n",
"GraphTSR表格识别算法示意图\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/8ff89661142045a8aef54f8a7a2c69b1d243f8269034406a9e66bee2149f730f\" width=\"1600\"/></center>\n",
"<center>图 9GraphTSR表格识别算法示意图</center>\n",
"\n",
"### 2.5 基于端到端的方法\n",
"\n",
"和其他使用后处理完成表格结构的重建不同基于端到端的方法直接使用网络完成表格结构的HTML表示输出\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/7865e58a83824facacfaa91bec12ccf834217cb706454dc5a0c165c203db79fb) | ![](https://ai-studio-static-online.cdn.bcebos.com/77d913b1b92f4a349b8f448e08ba78458d687eef4af142678a073830999f3edc))\n",
"---|---\n",
"图 10端到端方法的输入输出|图 11Image Caption示例\n",
"\n",
"端到端的方法大多采用Image Caption(看图说话)的Seq2Seq方法来完成表格结构的预测如一些基于Attention或Transformer的方法。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/3571280a9c364d3499a062e3edc724294fb5eaef8b38440991941e87f0af0c3b\" width=\"800\"/></center>\n",
"<center>图 12Seq2Seq示意图</center>\n",
"\n",
"Ye Jiaquan[22]在TableMaster中通过改进基于Transformer的Master文字算法来得到表格结构输出模型。此外还添加了一个分支进行框的坐标回归作者并没有在最后一层将模型拆分为两个分支而是在第一个 Transformer 解码层之后就将序列预测和框回归解耦为两个分支。其网络结构和原始Master网络的对比如下图所示\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/f573709447a848b4ba7c73a2e297f0304caaca57c5c94588aada1f4cd893946c\" width=\"800\"/></center>\n",
"<center>图 13master网络图TableMaster网络图</center>\n",
"\n",
"\n",
"### 2.6 数据集\n",
"\n",
"由于深度学习方法是数据驱动的方法,需要大量的标注数据对模型进行训练,而现有的数据集规模偏小也是一个重要的制约因素,因此也有一些数据集被提出。\n",
"\n",
"1. PubTabNet[16]: 包含568k表格图像和相应的结构化HTML表示。\n",
"2. PubMed Tables(PubTables-1M)[17]:表格结构识别数据集包含高度详细的结构注释460,589张pdf图像用于表格检测任务 947,642张表格图像用于表格识别任务。\n",
"3. TableBank[18]: 表格检测和识别数据集使用互联网上Word和Latex文档构建了包含417K高质量标注的表格数据。\n",
"4. SciTSR[19]: 表格结构识别数据集图像大部分从论文中转换而来其中包含来自PDF文件的15,000个表格及其相应的结构标签。\n",
"5. TabStructDB[12]: 包括1081个表格区域这些区域用行和列信息密集标记。\n",
"6. WTW[14]: 大规模数据集场景表格检测识别数据集该数据集包含各种变形弯曲和遮挡等情况下的表格数据共包含14,581 张图像。\n",
"\n",
"数据集示例\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/c9763df56e67434f97cd435100d50ded71ba66d9d4f04d7f8f896d613cdf02b0\" /></center>\n",
"<center>图 14PubTables-1M数据集样例图</center>\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/64de203bbe584642a74f844ac4b61d1ec3c5a38cacb84443ac961fbcc54a66ce\" width=\"600\"/></center>\n",
"<center>图 15WTW数据集样例图</center>\n",
"\n",
"\n",
"\n",
"参考文献\n",
"\n",
"[1]Kieninger T, Dengel A. A paper-to-HTML table converting system[C]//Proceedings of document analysis systems (DAS). 1998, 98: 356-365.\n",
"\n",
"[2]Yildiz B, Kaiser K, Miksch S. pdf2table: A method to extract table information from pdf files[C]//IICAI. 2005: 1773-1785.\n",
"\n",
"[3]Koci E, Thiele M, Lehner W, et al. Table recognition in spreadsheets via a graph representation[C]//2018 13th IAPR International Workshop on Document Analysis Systems (DAS). IEEE, 2018: 139-144.\n",
"\n",
"[4]Prasad D, Gadpal A, Kapadni K, et al. CascadeTabNet: An approach for end to end table detection and structure recognition from image-based documents[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. 2020: 572-573.\n",
"\n",
"[5]Fischer P, Smajic A, Abrami G, et al. Multi-Type-TD-TSRExtracting Tables from Document Images Using a Multi-stage Pipeline for Table Detection and Table Structure Recognition: From OCR to Structured Table Representations[C]//German Conference on Artificial Intelligence (Künstliche Intelligenz). Springer, Cham, 2021: 95-108.\n",
"\n",
"[6]Raja S, Mondal A, Jawahar C V. Table structure recognition using top-down and bottom-up cues[C]//European Conference on Computer Vision. Springer, Cham, 2020: 70-86.\n",
"\n",
"[7]Agarwal M, Mondal A, Jawahar C V. Cdec-net: Composite deformable cascade network for table detection in document images[C]//2020 25th International Conference on Pattern Recognition (ICPR). IEEE, 2021: 9491-9498.\n",
"\n",
"[8]Paliwal S S, Vishwanath D, Rahul R, et al. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 128-133.\n",
"\n",
"[9]Dong H, Liu S, Han S, et al. Tablesense: Spreadsheet table detection with convolutional neural networks[C]//Proceedings of the AAAI Conference on Artificial Intelligence. 2019, 33(01): 69-76.\n",
"\n",
"[10]Qiao L, Li Z, Cheng Z, et al. LGPMA: Complicated Table Structure Recognition with Local and Global Pyramid Mask Alignment[J]. arXiv preprint arXiv:2105.06224, 2021.\n",
"\n",
"[11]Schreiber S, Agne S, Wolf I, et al. Deepdesrt: Deep learning for detection and structure recognition of tables in document images[C]//2017 14th IAPR international conference on document analysis and recognition (ICDAR). IEEE, 2017, 1: 1162-1167.\n",
"\n",
"[12]Siddiqui S A, Fateh I A, Rizvi S T R, et al. Deeptabstr: Deep learning based table structure recognition[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1403-1409.\n",
"\n",
"[13]Zheng X, Burdick D, Popa L, et al. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context[C]//Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision. 2021: 697-706.\n",
"\n",
"[14]Long R, Wang W, Xue N, et al. Parsing Table Structures in the Wild[C]//Proceedings of the IEEE/CVF International Conference on Computer Vision. 2021: 944-952.\n",
"\n",
"[15]Siddiqui S A, Khan P I, Dengel A, et al. Rethinking semantic segmentation for table structure recognition in documents[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1397-1402.\n",
"\n",
"[16]Zhong X, ShafieiBavani E, Jimeno Yepes A. Image-based table recognition: data, model, and evaluation[C]//Computer VisionECCV 2020: 16th European Conference, Glasgow, UK, August 2328, 2020, Proceedings, Part XXI 16. Springer International Publishing, 2020: 564-580.\n",
"\n",
"[17]Smock B, Pesala R, Abraham R. PubTables-1M: Towards a universal dataset and metrics for training and evaluating table extraction models[J]. arXiv preprint arXiv:2110.00061, 2021.\n",
"\n",
"[18]Li M, Cui L, Huang S, et al. Tablebank: Table benchmark for image-based table detection and recognition[C]//Proceedings of the 12th Language Resources and Evaluation Conference. 2020: 1918-1925.\n",
"\n",
"[19]Chi Z, Huang H, Xu H D, et al. Complicated table structure recognition[J]. arXiv preprint arXiv:1908.04729, 2019.\n",
"\n",
"[20]Qasim S R, Mahmood H, Shafait F. Rethinking table recognition using graph neural networks[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 142-147.\n",
"\n",
"[21]Xue W, Yu B, Wang W, et al. TGRNet: A Table Graph Reconstruction Network for Table Structure Recognition[J]. arXiv preprint arXiv:2106.10598, 2021.\n",
"\n",
"[22]Ye J, Qi X, He Y, et al. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML[J]. arXiv preprint arXiv:2105.01848, 2021.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 3. Document VQA\n",
"\n",
"老板派任务:开发一个身份证识别系统\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/63bbe893465e4f98b3aec80a042758b520d43e1a993a47e39bce1123c2d29b3f\" width=\"1600\"/></center>\n",
"\n",
"\n",
"> 如何选择方案 \n",
"> 1. 文字检测之后用规则来进行信息提取\n",
 2. 文字检测之后用规">
"> 2. 文字检测之后用模型来进行信息提取\n",
"> 3. 外包出去\n",
"\n",
"\n",
"### 3.1 背景介绍\n",
"在VQA(Visual Question Answering)任务中主要针对图像内容进行提问和回答但是对于文本图像来说关注的内容是图像中的文字信息因此这类方法可以分为自然场景的Text-VQA和扫描文档场景的DocVQA三者的关系如下图所示。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/a91cfd5152284152b020ca8a396db7a21fd982e3661540d5998cc19c17d84861\" width=\"600\"/></center>\n",
"<center>图 16: VQA层级</center>\n",
"\n",
"VQAText-VQA和DocVQA的示例图如下图所示。\n",
"\n",
"|任务类型|VQA | Text-VQA | DocVQA| \n",
"|---|---|---|---|\n",
"|任务描述|针对**图片内容**提出问题|针对**图片上的文字内容**提出问题|针对**文档图像的文字内容**提出问题|\n",
"|示例图片|![vqa](https://ai-studio-static-online.cdn.bcebos.com/fc21b593276247249591231b3373608151ed8ae7787f4d6ba39e8779fdd12201)|![textvqa](https://ai-studio-static-online.cdn.bcebos.com/cd2404edf3bf430b89eb9b2509714499380cd02e4aa74ec39ca6d7aebcf9a559)|![docvqa](https://ai-studio-static-online.cdn.bcebos.com/0eec30a6f91b4f949c56729b856f7ff600d06abee0774642801c070303edfe83)|\n",
"\n",
"DocVQA由于其更加贴近实际应用场景涌现出了大批学术界和工业界的工作。在常用的场景中DocVQA里提问的问题都是固定的比如身份证场景下的问题一般为\n",
"1. 公民身份号码是什么?\n",
"2. 姓名是什么?\n",
"3. 名族是什么?\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/2d2b86468daf47c98be01f44b8d6efa64bc09e43cd764298afb127f19b07aede\" width=\"800\"/></center>\n",
"<center>图 17: 身份证示例</center>\n",
"\n",
"\n",
"基于这样的先验知识DocVQA的 研究开始偏向Key Information Extraction(KIE)任务本次我们也主要讨论KIE相关的研究KIE任务主要从图像中提取所需要的关键信息如从身份证中提取出姓名和公民身份号码信息。\n",
"\n",
"KIE通常分为两个子任务进行研究\n",
"1. SER: 语义实体识别 (Semantic Entity Recognition),对每一个检测到的文本进行分类,如将其分为姓名,身份证。如下图中的黑色框和红色框。\n",
"2. RE: 关系抽取 (Relation Extraction),对每一个检测到的文本进行分类,如将其分为问题和的答案。然后对每一个问题找到对应的答案。如下图中的红色框和黑色框分别代表问题和答案,黄色线代表问题和答案之间的对应关系。\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/899470ba601349fbbc402a4c83e6cdaee08aaa10b5004977b1f684f346ebe31f\" width=\"800\"/></center>\n",
"<center>图 18: SER,RE任务示例</center>\n",
"\n",
"一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)[4]来研究,但是这类方法只利用了图像中的文本信息,缺少对视觉和结构信息的使用,因此精度不高。在此基础上,近几年的方法都开始将视觉和结构信息与文本信息融合到一起,按照对多模态信息进行融合时所采用的的原理可以将这些方法分为下面三种:\n",
"\n",
"1. 基于Grid的方法\n",
"1. 基于Token的方法\n",
"2. 基于GCN的方法\n",
"3. 基于End to End 的方法\n",
"\n",
"一些代表性论文被划分为上述三个类别中,具体如下表所示:\n",
"| 类别 | 思路 | 主要论文 |\n",
"| ---------------- | ---- | -------- |\n",
"| 基于Grid的方法 |在图像上多模态信息的融合(文本,布局,图像)| [Chargrid](https://arxiv.org/pdf/1809.08799) |\n",
"| 基于Token的方法 |利用Bert这类方法进行多模态信息的融合|[LayoutLM](https://arxiv.org/pdf/1912.13318), [LayoutLMv2](https://arxiv.org/pdf/2012.14740), [StrucText](https://arxiv.org/pdf/2108.02923), |\n",
"| 基于GCN的方法 |利用图网络结构进行多模态信息的融合 |[GCN](https://arxiv.org/pdf/1903.11279), [PICK](https://arxiv.org/pdf/2004.07464), [SDMG-R](https://arxiv.org/pdf/2103.14470)[SERA](https://arxiv.org/pdf/2110.09915) |\n",
"| 基于End to End的方法 |将OCR和关键信息提取统一到一个网络 |[Trie](https://arxiv.org/pdf/2005.13118) |\n",
"\n",
"### 3.2 基于Grid的方法\n",
"\n",
"基于Grid的方法在图像层面进行多模态信息的融合。Chargrid[5]首先对图像进行字符级的文字检测和识别然后通过将类别的one-hot编码填充到对应的字符区域(下图中右图的非黑色部分)内来完成对网络输入的构建输入最后通过encoder-decoder结构的CNN网络来进行关键信息的坐标检测和类别分类。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/f248841769ec4312a9015b4befda37bf29db66226431420ca1faad517783875e\" width=\"800\"/></center>\n",
"<center>图 19: Chargrid数据示例</center>\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/0682e52e275b4187a0e74f54961a50091fd3a0cdff734e17bedcbc993f6e29f9\" width=\"800\"/></center>\n",
"<center>图 20: Chargrid网络</center>\n",
"\n",
"\n",
"相比于传统的仅基于文本的方法,该方法能够同时利用文本信息和结构信息,因此能够取得一定的精度提升,但是该方法对文本和结构信息的融合只是做了简单的嵌入,并没有很好的将二者进行融合\n",
"\n",
"### 3.3 基于Token的方法\n",
"LayoutLM[6]将2D位置信息和文本信息一起编码到BERT模型中并且借鉴NLP中Bert的预训练思想在大规模的数据集上进行预训练在下游任务中LayoutLM还加入了图像信息来进一步提升模型性能。LayoutLM虽然将文本位置和图像信息做了融合但是图像信息是在下游任务的训练中进行融合这样对三种信息的多模态融合并不充分。LayoutLMv2[7]在LayoutLM的基础上通过transformers在预训练阶段将图像信息和文本layout信息进行融合还在Transformer中加入空间感知自注意力机制辅助模型更好地融合视觉和文本特征。LayoutLMv2虽然在预训练阶段对文本位置和图像信息做了融合但是由于预训练任务的限制模型学到的视觉特征不够精细。StrucTexT[8]在以往多模态方法的基础上在预训练任务提出Sentence Length Prediction (SLP) 和Paired Boxes Direction (PBD)两个新任务来帮助网络学习精细的视觉特征其中SLP任务让模型学习文本段的长度PDB任务让模型学习Box方向之间的匹配关系。通过这两个新的预训练任务能够加速文本、视觉和布局信息之间的深度跨模态融合。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/17a26ade09ee4311b90e49a1c61d88a72a82104478434f9dabd99c27a65d789b) | ![](https://ai-studio-static-online.cdn.bcebos.com/d75addba67ef4b06a02ae40145e609d3692d613ff9b74cec85123335b465b3cc))\n",
"---|---\n",
"图 21transformer算法流程图|图 22LayoutLMv2算法流程图\n",
"\n",
"### 3.4 基于GCN的方法\n",
"\n",
"现有的基于GCN的方法[10]虽然利用了文字和结构信息但是没有对图像信息进行很好的利用。PICK[11]在GCN网络中加入了图像信息并且提出graph learning module来自动学习edge的类型。SDMG-R [12]将图像编码为双模态图图的节点为文字区域的视觉和文本信息边表示相邻文本直接的空间关系通过迭代地沿边传播信息和推理图节点类别SDMG-R解决了现有的方法对没见过的模板无能为力的问题。\n",
"\n",
"\n",
"PICK流程图如下图所示\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/d3282959e6b2448c89b762b3b9bbf6197a0364b101214a1f83cf01a28623c01c\" width=\"800\"/></center>\n",
"<center>图 23PICK算法流程图</center>\n",
"\n",
"SERA[10]将依存句法分析里的biaffine parser引入到文档关系抽取中并且使用GCN来融合文本和视觉信息。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/a97b7647968a4fa59e7b14b384dd7ffe812f158db8f741459b6e6bb0e8b657c7\" width=\"800\"/></center>\n",
"<center>图 24SERA算法流程图</center>\n",
"\n",
"### 3.5 基于End to End 的方法\n",
"\n",
"现有的方法将KIE分为两个独立的任务文本读取和信息提取然而他们主要关注于改进信息提取任务而忽略了文本读取和信息提取是相互关联的因此Trie[9]提出了一个统一的端到端网络,可以同时学习这两个任务,并且在学习过程中相互加强。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/6e4a3b0f65254f6b9d40cea0875854d4f47e1dca6b1e408cad435b3629600608\" width=\"1300\"/></center>\n",
"<center>图 25: Trie算法流程图</center>\n",
"\n",
"\n",
"### 3.6 数据集\n",
"用于KIE的数据集主要有下面两个\n",
"1. SROIE: SROIE数据集[2]的任务3旨在从扫描收据中提取四个预定义的信息公司、日期、地址或总数。数据集中有626个样本用于训练347个样本用于测试。\n",
"2. FUNSD: FUNSD数据集[3]是一个用于从扫描文档中提取表单信息的数据集。它包含199个标注好的真实扫描表单。199个样本中149个用于训练50个用于测试。FUNSD数据集为每个单词分配一个语义实体标签问题、答案、标题或其他。\n",
"3. XFUN: XFUN数据集是微软提出的一个多语言数据集包含7种语言每种语言包含149张训练集50张测试集。\n",
"\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/dfdf530d79504761919c1f093f9a86dac21e6db3304c4892998ea1823f3187c6) | ![](https://ai-studio-static-online.cdn.bcebos.com/3b2a9f9476be4e7f892b73bd7096ce8d88fe98a70bae47e6ab4c5fcc87e83861))\n",
"---|---\n",
"图 26: sroie示例图|图 27: xfun示例图\n",
"\n",
"参考文献:\n",
"\n",
"[1]Mathew M, Karatzas D, Jawahar C V. Docvqa: A dataset for vqa on document images[C]//Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision. 2021: 2200-2209.\n",
"\n",
"[2]Huang Z, Chen K, He J, et al. Icdar2019 competition on scanned receipt ocr and information extraction[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1516-1520.\n",
"\n",
"[3]Jaume G, Ekenel H K, Thiran J P. Funsd: A dataset for form understanding in noisy scanned documents[C]//2019 International Conference on Document Analysis and Recognition Workshops (ICDARW). IEEE, 2019, 2: 1-6.\n",
"\n",
"[4]Lample G, Ballesteros M, Subramanian S, et al. Neural architectures for named entity recognition[J]. arXiv preprint arXiv:1603.01360, 2016.\n",
"\n",
"[5]Katti A R, Reisswig C, Guder C, et al. Chargrid: Towards understanding 2d documents[J]. arXiv preprint arXiv:1809.08799, 2018.\n",
"\n",
"[6]Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200.\n",
"\n",
"[7]Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020.\n",
"\n",
"[8]Li Y, Qian Y, Yu Y, et al. StrucTexT: Structured Text Understanding with Multi-Modal Transformers[C]//Proceedings of the 29th ACM International Conference on Multimedia. 2021: 1912-1920.\n",
"\n",
"[9]Zhang P, Xu Y, Cheng Z, et al. Trie: End-to-end text reading and information extraction for document understanding[C]//Proceedings of the 28th ACM International Conference on Multimedia. 2020: 1413-1422.\n",
"\n",
"[10]Liu X, Gao F, Zhang Q, et al. Graph convolution for multimodal information extraction from visually rich documents[J]. arXiv preprint arXiv:1903.11279, 2019.\n",
"\n",
"[11]Yu W, Lu N, Qi X, et al. Pick: Processing key information extraction from documents using improved graph learning-convolutional networks[C]//2020 25th International Conference on Pattern Recognition (ICPR). IEEE, 2021: 4363-4370.\n",
"\n",
"[12]Sun H, Kuang Z, Yue X, et al. Spatial Dual-Modality Graph Reasoning for Key Information Extraction[J]. arXiv preprint arXiv:2103.14470, 2021."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 4. 总结\n",
"本节我们主要介绍了文档分析技术相关的三个子模块的理论知识版面分析、表格识别和信息提取。下面我们会基于PaddleOCR框架对这表格识别和DOC-VQA进行实战教程的讲解。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -0,0 +1,83 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 1. 课程预备知识\n",
"\n",
"本课所涉及的OCR模型建立在深度学习的基础之上因此与其相关的基础知识、环境配置、项目工程与其他资料将在本节介绍尤其对深度学习不熟悉的读者可以查看和学习相应内容。\n",
"\n",
"### 1.1 预备知识\n",
"\n",
"深度学习的“学习”由机器学习中的神经元、感知机、多层神经网络等内容一路发展而来,因此了解基础的机器学习算法对于深度学习的理解和应用有很大帮助。而深度学习的“深”则体现在对大量信息处理过程中使用的卷积、池化等一系列以向量为基础的数学运算。如果缺乏这两者的理论基础,可以学习李宏毅老师的[线性代数](https://aistudio.baidu.com/aistudio/course/introduce/2063)和[机器学习](https://aistudio.baidu.com/aistudio/course/introduce/1978)课程。\n",
"\n",
"对于深度学习本身的理解,可以参考百度杰出架构师毕然老师的零基础课程:[百度架构师手把手带你零基础实践深度学习](https://aistudio.baidu.com/aistudio/course/introduce/1297),其中覆盖了深度学习的发展历史,通过一个经典案例介绍深度学习的完整组成部分,是一套以实践为导向的深度学习课程。\n",
"\n",
"对于理论知识的实践,[Python基础知识](https://aistudio.baidu.com/aistudio/course/introduce/1224)必不可少同时为了快速复现深度学习模型本课程使用的深度学习框架为飞桨PaddlePaddle。如果你已经使用过其他框架通过[快速上手文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/practices/quick_start/hello_paddle.html)可以迅速了解飞桨的使用方法。\n",
"\n",
"### 1.2 基础环境准备\n",
"\n",
"如果你想在本地环境运行本课程的代码且之前未搭建过Python环境可以根据[零基础运行环境准备](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/environment.md)根据自己的操作系统安装Anaconda或docker环境。\n",
"\n",
"如果你没有本地资源可以通过AI Studio实训平台完成代码运行其中的每个项目都通过Notebook的方式呈现方便开发者学习。若对Notebook的相关操作不熟悉可以参考[AI Studio项目说明](https://ai.baidu.com/ai-doc/AISTUDIO/0k3e2tfzm)。\n",
"\n",
"### 1.3 获取和运行代码\n",
"\n",
"本课程依托PaddleOCR的代码库形成首先克隆PaddleOCR的完整项目\n",
"\n",
"```bash\n",
"#【推荐】\n",
"git clone https://github.com/PaddlePaddle/PaddleOCR\n",
"\n",
"# 如果因为网络问题无法pull成功也可选择使用码云上的托管\n",
"git clone https://gitee.com/paddlepaddle/PaddleOCR\n",
"```\n",
"\n",
 注码云托管代码">
"> 注码云托管代码可能无法实时同步本github项目更新存在3~5天延时请优先使用推荐方式。\n",
">\n",
 \t\t如果你不熟悉">
"> 如果你不熟悉git操作可以直接在PaddleOCR首页的 `Code` 中下载压缩包\n",
"\n",
"然后安装第三方库:\n",
"\n",
"```\n",
"cd PaddleOCR\n",
"pip3 install -r requirements.txt\n",
"```\n",
"\n",
"\n",
"\n",
"### 1.4 查阅资料\n",
"\n",
"[PaddleOCR使用文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/README_ch.md#%E6%96%87%E6%A1%A3%E6%95%99%E7%A8%8B) (中文) 中详细介绍了如何使用PaddleOCR完成模型应用、训练和部署。文档内容丰富大多数用户的问题都在文档或FAQ中有所描述尤其在[FAQ(中文)](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/FAQ.md)中,按照深度学习的应用过程沉淀了用户的常见问题,建议大家仔细阅读。\n",
"\n",
"### 1.5 寻求帮助\n",
"\n",
"如果你在使用PaddleOCR的过程中遇到BUG、易用性或者文档相关的问题可通过[Github issue](https://github.com/PaddlePaddle/PaddleOCR/issues)与官方联系请按照issue模板尽可能多的提供信息以便官方人员迅速定位问题。同时微信群是广大PaddleOCR用户的日常交流阵地更适合提问一些咨询类问题除了有PaddleOCR团队成员以外还会有热心开发者回答大家的问题。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -345,7 +345,7 @@ class KieLabelEncode(object):
max_num = 300
temp_bboxes = np.zeros([max_num, 4])
h, _ = bboxes.shape
temp_bboxes[:h, :h] = bboxes
temp_bboxes[:h, :] = bboxes
temp_relations = np.zeros([max_num, max_num, 5])
temp_relations[:h, :h, :] = relations

View File

@ -23,7 +23,6 @@ import sys
import six
import cv2
import numpy as np
import fasttext
class DecodeImage(object):
@ -136,6 +135,7 @@ class ToCHWImage(object):
class Fasttext(object):
def __init__(self, path="None", **kwargs):
import fasttext
self.fast_model = fasttext.load_model(path)
def __call__(self, data):

View File

@ -138,13 +138,16 @@ def load_pretrained_params(model, path):
params = paddle.load(path + '.pdparams')
state_dict = model.state_dict()
new_state_dict = {}
for k1, k2 in zip(state_dict.keys(), params.keys()):
if list(state_dict[k1].shape) == list(params[k2].shape):
new_state_dict[k1] = params[k2]
for k1 in params.keys():
if k1 not in state_dict.keys():
logger.warning("The pretrained params {} not in model".format(k1))
else:
logger.warning(
"The shape of model params {} {} not matched with loaded params {} {} !".
format(k1, state_dict[k1].shape, k2, params[k2].shape))
if list(state_dict[k1].shape) == list(params[k1].shape):
new_state_dict[k1] = params[k1]
else:
logger.warning(
"The shape of model params {} {} not matched with loaded params {} {} !".
format(k1, state_dict[k1].shape, k1, params[k1].shape))
model.set_state_dict(new_state_dict)
logger.info("load pretrain successful from {}".format(path))
return model

View File

@ -3,7 +3,7 @@ model_name:PPOCRv2_ocr_det_kl
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_model:./inference/ch_PP-OCRv2_det_infer
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
infer_quant:True
inference:tools/infer/predict_det.py

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:null
quant_export:null
fpgm_export:deploy/slim/prune/export_prune_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
quant_export:null
fpgm_export:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:null
quant_export:null
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_FPGM/rec_chinese_lite_train_v2.0.yml -o

View File

@ -13,7 +13,7 @@ inference:tools/infer/predict_rec.py
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--det_model_dir:
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/ch_ppocr_server_v2.0_det/det_r50_vd_db.yml -o
quant_export:null
fpgm_export:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c configs/det/det_mv3_db.yml -o
quant_export:null
fpgm_export:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_east_v2.0/det_mv3_east.yml -o
quant_export:null
fpgm_export:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_pse_v2.0/det_mv3_pse.yml -o
quant_export:null
fpgm_export:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c configs/det/det_r50_vd_db.yml -o
quant_export:null
fpgm_export:null
@ -34,7 +34,7 @@ distill_export:null
export1:null
export2:null
##
train_model:./inference/ch_ppocr_server_v2.0_det_train/best_accuracy
train_model:./inference/det_r50_vd_db_v2.0_train/best_accuracy
infer_export:tools/export_model.py -c configs/det/det_r50_vd_db.yml -o
infer_quant:False
inference:tools/infer/predict_det.py

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_r50_vd_east_v2.0/det_r50_vd_east.yml -o
quant_export:null
fpgm_export:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_r50_vd_pse_v2.0/det_r50_vd_pse.yml -o
quant_export:null
fpgm_export:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_r50_vd_sast_icdar15_v2.0/det_r50_vd_sast_icdar2015.yml -o
quant_export:null
fpgm_export:null
@ -43,7 +43,7 @@ inference:tools/infer/predict_det.py
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32|fp16|int8
--precision:fp32|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_r50_vd_sast_totaltext_v2.0/det_r50_vd_sast_totaltext.yml -o
quant_export:null
fpgm_export:null
@ -43,7 +43,7 @@ inference:tools/infer/predict_det.py
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32|fp16|int8
--precision:fp32|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null

View File

@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c configs/e2e/e2e_r50_vd_pg.yml -o
quant_export:null
fpgm_export:null

View File

@ -0,0 +1,50 @@
# Web 端基础预测功能测试
Web 端主要基于 Jest-Puppeteer 完成 e2e 测试,其中 Puppeteer 操作 Chrome 完成推理流程Jest 完成测试流程。
>Puppeteer 是一个 Node 库,它提供了一个高级 API 来通过 DevTools 协议控制 Chromium 或 Chrome
>Jest 是一个 JavaScript 测试框架,旨在确保任何 JavaScript 代码的正确性。
#### 环境准备
* 安装 Node包含 npm https://nodejs.org/zh-cn/download/
* 确认是否安装成功,在命令行执行
```sh
# 显示所安装 node 版本号,即表示成功安装
node -v
```
* 确认 npm 是否安装成功
```sh
# npm 随着 node 一起安装,一般无需额外安装
# 显示所安装 npm 版本号,即表示成功安装
npm -v
```
#### 使用
```sh
# web 测试环境准备
bash test_tipc/prepare_js.sh 'js_infer'
# web 推理测试
bash test_tipc/test_inference_js.sh
```
#### 流程设计
###### paddlejs prepare
1. 判断 node, npm 是否安装
2. 下载测试模型,当前检测模型是 ch_PP-OCRv2_det_infer ,识别模型是 ch_PP-OCRv2_rec_infer[1, 3, 32, 320]。如果需要替换模型可直接将模型文件放在test_tipc/web/models/目录下。
- 文本检测模型https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
- 文本识别模型https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar
- 文本识别模型[1, 3, 32, 320]https://paddlejs.bj.bcebos.com/models/ch_PP-OCRv2_rec_infer.tar
- 保证较为准确的识别效果需要将文本识别模型导出为输入shape是[1, 3, 32, 320]的静态模型
3. 转换模型:将 model.pdmodel 和 model.pdiparams 转换为 model.json 和 chunk.dat检测模型保存地址test_tipc/web/models/ch_PP-OCRv2/det识别模型保存地址test_tipc/web/models/ch_PP-OCRv2/rec
4. 安装最新版本 ocr sdk @paddlejs-models/ocr@latest
5. 安装测试环境依赖 puppeteer、jest、jest-puppeteer如果检查到已经安装则不会进行二次安装
###### paddlejs infer test
1. Jest 执行 server command `python3 -m http.server 9811` 开启本地服务
2. 启动 Jest 测试服务,通过 jest-puppeteer 插件完成 chrome 操作,加载 @paddlejs-models/ocr 脚本完成推理流程
3. 测试用例为原图识别后的文本结果与预期文本结果expect.json进行对比测试通过有两个标准
* 原图识别结果逐字符与预期结果对比,误差不超过 **10个字符**
* 原图识别结果每个文本框字符内容与预期结果进行相似度对比,相似度不小于 0.9全部一致则相似度为1
只有满足上述两个标准,才视为测试通过。测试通过时显示如下:
<img width="600" src="https://user-images.githubusercontent.com/43414102/146406599-80b30c66-f2f8-4f57-a68a-007c479ff0f7.png">

View File

@ -179,7 +179,7 @@ elif [ ${MODE} = "whole_infer" ];then
cd ./inference/ && tar xf rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar && cd ../
fi
if [ ${model_name} == "ch_ppocr_server_v2.0_rec" ]; then
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/ch_ppocr_server_v2.0_rec_train.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar --no-check-certificate
cd ./inference/ && tar xf ch_ppocr_server_v2.0_rec_train.tar && cd ../
fi
if [ ${model_name} == "ch_ppocr_mobile_v2.0_rec" ]; then
@ -239,18 +239,21 @@ fi
if [ ${MODE} = "klquant_whole_infer" ]; then
wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015_lite.tar --no-check-certificate
cd ./train_data/ && tar xf icdar2015_lite.tar
ln -s ./icdar2015_lite ./icdar2015 && cd ../
cd ./train_data/ && tar xf icdar2015_lite.tar && rm -rf ./icdar2015 && ln -s ./icdar2015_lite ./icdar2015 && cd ../
if [ ${model_name} = "ch_ppocr_mobile_v2.0_det_KL" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar --no-check-certificate
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate
cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_det_data_50.tar && cd ../
fi
if [ ${model_name} = "ch_PPOCRv2_det" ]; then
eval_model_name="ch_PP-OCRv2_det_infer"
if [ ${model_name} = "PPOCRv2_ocr_rec_kl" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar --no-check-certificate
cd ./inference && tar xf rec_inference.tar && tar xf ch_PP-OCRv2_rec_infer.tar && cd ../
fi
if [ ${model_name} = "PPOCRv2_ocr_det_kl" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar --no-check-certificate
cd ./inference && tar xf ${eval_model_name}.tar && tar xf ch_det_data_50.tar && cd ../
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar --no-check-certificate
cd ./inference && tar xf ch_PP-OCRv2_det_infer.tar && tar xf ch_det_data_50.tar && cd ../
fi
if [ ${model_name} = "ch_ppocr_mobile_v2.0_rec_KL" ]; then
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar --no-check-certificate

View File

@ -0,0 +1,92 @@
#!/bin/bash
set -o errexit
set -o nounset
shopt -s extglob
# paddlejs prepare 主要流程
# 1. 判断 node, npm 是否安装
# 2. 下载测试模型,当前检测模型是 ch_PP-OCRv2_det_infer ,识别模型是 ch_PP-OCRv2_rec_infer [1, 3, 32, 320]。如果需要替换模型可直接将模型文件放在test_tipc/web/models/目录下。
# - 文本检测模型https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
# - 文本识别模型https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar
# - 文本识别模型[1, 3, 32, 320]https://paddlejs.bj.bcebos.com/models/ch_PP-OCRv2_rec_infer.tar
# - 保证较为准确的识别效果需要将文本识别模型导出为输入shape[1, 3, 32, 320]的静态模型
# 3. 转换模型, model.pdmodel model.pdiparams 转换为 model.json chunk.dat检测模型保存地址test_tipc/web/models/ch_PP-OCRv2/det识别模型保存地址test_tipc/web/models/ch_PP-OCRv2/rec
# 4. 安装最新版本 ocr sdk @paddlejs-models/ocr@latest
# 5. 安装测试环境依赖 puppeteer、jest、jest-puppeteer如果检查到已经安装则不会进行二次安装
# 判断是否安装了node
if ! type node >/dev/null 2>&1; then
echo -e "\033[31m node 未安装 \033[0m"
exit
fi
# 判断是否安装了npm
if ! type npm >/dev/null 2>&1; then
echo -e "\033[31m npm 未安装 \033[0m"
exit
fi
# MODE be 'js_infer'
MODE=$1
# js_infer MODE , load model file and convert model to js_infer
if [ ${MODE} != "js_infer" ];then
echo "Please change mode to 'js_infer'"
exit
fi
# saved_model_name
det_saved_model_name=ch_PP-OCRv2_det_infer
rec_saved_model_name=ch_PP-OCRv2_rec_infer
# model_path
model_path=test_tipc/web/models/
rm -rf $model_path
echo ${model_path}${det_saved_model_name}
echo ${model_path}${rec_saved_model_name}
# download ocr_det inference model
wget -nc -P $model_path https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
cd $model_path && tar xf ch_PP-OCRv2_det_infer.tar && cd ../../../
# download ocr_rec inference model
wget -nc -P $model_path https://paddlejs.bj.bcebos.com/models/ch_PP-OCRv2_rec_infer.tar
cd $model_path && tar xf ch_PP-OCRv2_rec_infer.tar && cd ../../../
MYDIR=`pwd`
echo $MYDIR
pip3 install paddlejsconverter
# convert inference model to web model: model.json、chunk.dat
paddlejsconverter \
--modelPath=$model_path$det_saved_model_name/inference.pdmodel \
--paramPath=$model_path$det_saved_model_name/inference.pdiparams \
--outputDir=$model_path$det_saved_model_name/ \
paddlejsconverter \
--modelPath=$model_path$rec_saved_model_name/inference.pdmodel \
--paramPath=$model_path$rec_saved_model_name/inference.pdiparams \
--outputDir=$model_path$rec_saved_model_name/ \
# always install latest ocr sdk
cd test_tipc/web
echo -e "\033[33m Installing the latest ocr sdk... \033[0m"
npm install @paddlejs-models/ocr@latest
npm info @paddlejs-models/ocr
echo -e "\033[32m The latest ocr sdk installed completely.!~ \033[0m"
# install dependencies
if [ `npm list --depth 0 | grep puppeteer | wc -l` -ne 0 ] && [ `npm list --depth 0 | grep jest | wc -l` -ne 0 ];then
echo -e "\033[32m Dependencies have installed \033[0m"
else
echo -e "\033[33m Installing dependencies ... \033[0m"
npm install jest jest-puppeteer puppeteer
echo -e "\033[32m Dependencies installed completely.!~ \033[0m"
fi
# del package-lock.json
rm package-lock.json

View File

@ -0,0 +1,8 @@
#!/bin/bash
set -o errexit
set -o nounset
cd test_tipc/web
# run ocr test in chrome
./node_modules/.bin/jest --config ./jest.config.js

View File

@ -183,7 +183,7 @@ function func_inference(){
if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then
continue
fi
if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
if [[ ${use_trt} = "False" && ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
continue
fi
for batch_size in ${batch_size_list[*]}; do
@ -227,7 +227,12 @@ if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
for infer_model in ${infer_model_dir_list[*]}; do
# run export
if [ ${infer_run_exports[Count]} != "null" ];then
save_infer_dir=$(dirname $infer_model)
if [ ${MODE} = "klquant_whole_infer" ]; then
save_infer_dir="${infer_model}_klquant"
fi
if [ ${MODE} = "whole_infer" ]; then
save_infer_dir="${infer_model}"
fi
set_export_weight=$(func_set_params "${export_weight}" "${infer_model}")
set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}")
export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}"
@ -259,7 +264,6 @@ else
env=""
elif [ ${#gpu} -le 1 ];then
env="export CUDA_VISIBLE_DEVICES=${gpu}"
eval ${env}
elif [ ${#gpu} -le 15 ];then
IFS=","
array=(${gpu})
@ -280,6 +284,7 @@ else
set_amp_config=" "
fi
for trainer in ${trainer_list[*]}; do
eval ${env}
flag_quant=False
if [ ${trainer} = ${pact_key} ]; then
run_train=${pact_trainer}
@ -332,7 +337,6 @@ else
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
fi
# run train
eval "unset CUDA_VISIBLE_DEVICES"
eval $cmd
status_check $? "${cmd}" "${status_log}"

View File

@ -0,0 +1,20 @@
{
"text": [
"纯臻营养护发素",
"产品信息/参数",
"45元/每公斤100公斤起订",
"每瓶22元1000瓶起订",
"【品牌】:代加工方式/OEMODM",
"【品名】:纯臻营养护发素",
"【产品编号】YM-X-3011",
"ODMOEM",
"【净含量】220ml",
"【适用人群】:适合所有肤质",
"【主要成分】:鲸蜡硬脂醇、燕麦β-葡聚",
"糖、椰油酰胺丙基甜菜碱、泛醌",
"(成品包材)",
"【主要功能】:可紧致头发磷层,从而达到",
"即时持久改善头发光泽的效果,给干燥的头",
"发足够的滋养"
]
}

View File

@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>ocr test</title>
</head>
<body>
<img id="ocr" src="./test.jpg" />
</body>
<script src="./node_modules/@paddlejs-models/ocr/lib/index.js"></script>
</html>

View File

@ -0,0 +1,82 @@
const expectData = require('./expect.json');
describe('e2e test ocr model', () => {
beforeAll(async () => {
await page.goto(PATH);
});
it('ocr infer and diff test', async () => {
page.on('console', msg => console.log('PAGE LOG:', msg.text()));
const text = await page.evaluate(async () => {
const $ocr = document.querySelector('#ocr');
const ocr = paddlejs['ocr'];
await ocr.init('./models/ch_PP-OCRv2_det_infer', './models/ch_PP-OCRv2_rec_infer');
const res = await ocr.recognize($ocr);
return res.text;
});
// 模型文字识别结果与预期结果diff的字符数
let diffNum = 0;
// 文本框字符串相似度
let similarity = 0;
// 预期字符diff数
const expectedDiffNum = 10;
// 预期文本框字符串相似度
const expecteSimilarity = 0.9;
// 预期文本内容
const expectResult = expectData.text;
expectResult && expectResult.forEach((item, index) => {
const word = text[index];
// 逐字符对比
for(let i = 0; i < item.length; i++) {
if (item[i] !== word[i]) {
console.log('expect: ', item[i], ' word: ', word[i]);
diffNum++;
}
}
// 文本框字符串相似度对比
const s = similar(item, word);
similarity += s;
});
similarity = similarity / expectResult.length;
expect(diffNum).toBeLessThanOrEqual(expectedDiffNum);
expect(similarity).toBeGreaterThanOrEqual(expecteSimilarity);
function similar(string, expect) {
if (!string || !expect) {
return 0;
}
const length = string.length > expect.length ? string.length : expect.length;
const n = string.length;
const m = expect.length;
let data = [];
const min = (a, b, c) => {
return a < b ? (a < c ? a : c) : (b < c ? b : c);
};
let i, j, si, ej, cost;
if (n === 0) return m;
if (m === 0) return n;
for (i = 0; i <= n; i++) {
data[i] = [];
data[i][0] = i;
}
for (j = 0; j <= m; j++) {
data[0][j] = j;
}
for (i = 1; i <= n; i++) {
si = string.charAt(i - 1);
for (j = 1; j <= m; j++) {
ej = expect.charAt(j - 1);
cost = si === ej ? 0 : 1;
data[i][j] = min(data[i - 1][j] + 1, data[i][j - 1] + 1, data[i - 1][j - 1] + cost);
}
}
return (1 - data[n][m] / length);
}
});
});

View File

@ -0,0 +1,14 @@
// jest-puppeteer.config.js
module.exports = {
launch: {
headless: false,
product: 'chrome'
},
browserContext: 'default',
server: {
command: 'python3 -m http.server 9811',
port: 9811,
launchTimeout: 10000,
debug: true
}
};

View File

@ -0,0 +1,111 @@
// For a detailed explanation regarding each configuration property and type check, visit:
// https://jestjs.io/docs/en/configuration.html
module.exports = {
preset: 'jest-puppeteer',
// All imported modules in your tests should be mocked automatically
// automock: false,
// Automatically clear mock calls and instances between every test
clearMocks: true,
// An object that configures minimum threshold enforcement for coverage results
// coverageThreshold: undefined,
// A set of global variables that need to be available in all test environments
globals: {
PATH: 'http://localhost:9811'
},
// The maximum amount of workers used to run your tests. Can be specified as % or a number. E.g. maxWorkers: 10% will use 10% of your CPU amount + 1 as the maximum worker number. maxWorkers: 2 will use a maximum of 2 workers.
// maxWorkers: "50%",
// An array of directory names to be searched recursively up from the requiring module's location
// moduleDirectories: [
// "node_modules"
// ],
// An array of file extensions your modules use
moduleFileExtensions: [
'js',
'json',
'jsx',
'ts',
'tsx',
'node'
],
// The root directory that Jest should scan for tests and modules within
// rootDir: undefined,
// A list of paths to directories that Jest should use to search for files in
roots: [
'<rootDir>'
],
// Allows you to use a custom runner instead of Jest's default test runner
// runner: "jest-runner",
// The paths to modules that run some code to configure or set up the testing environment before each test
// setupFiles: [],
// A list of paths to modules that run some code to configure or set up the testing framework before each test
// setupFilesAfterEnv: [],
// The number of seconds after which a test is considered as slow and reported as such in the results.
// slowTestThreshold: 5,
// A list of paths to snapshot serializer modules Jest should use for snapshot testing
// snapshotSerializers: [],
// The test environment that will be used for testing
// testEnvironment: 'jsdom',
// Options that will be passed to the testEnvironment
// testEnvironmentOptions: {},
// An array of regexp pattern strings that are matched against all test paths, matched tests are skipped
testPathIgnorePatterns: [
'/node_modules/'
],
// The regexp pattern or array of patterns that Jest uses to detect test files
testRegex: '.(.+)\\.test\\.(js|ts)$',
// This option allows the use of a custom results processor
// testResultsProcessor: undefined,
// This option allows use of a custom test runner
// testRunner: "jest-circus/runner",
// This option sets the URL for the jsdom environment. It is reflected in properties such as location.href
testURL: 'http://localhost:9898/',
// Setting this value to "fake" allows the use of fake timers for functions such as "setTimeout"
// timers: "real",
// A map from regular expressions to paths to transformers
transform: {
'^.+\\.js$': 'babel-jest'
},
// An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation
transformIgnorePatterns: [
'/node_modules/',
'\\.pnp\\.[^\\/]+$'
],
// An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them
// unmockedModulePathPatterns: undefined,
// Indicates whether each individual test should be reported during the run
verbose: true,
// An array of regexp patterns that are matched against all source file paths before re-running tests in watch mode
// watchPathIgnorePatterns: [],
// Whether to use watchman for file crawling
// watchman: true,
testTimeout: 50000
};

Binary file not shown.


View File

@ -101,17 +101,22 @@ class TextDetector(object):
else:
logger.info("unknown det_algorithm:{}".format(self.det_algorithm))
sys.exit(0)
if self.use_onnx:
pre_process_list[0] = {
'DetResizeForTest': {
'image_shape': [640, 640]
}
}
self.preprocess_op = create_operators(pre_process_list)
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor(
args, 'det', logger)
if self.use_onnx:
img_h, img_w = self.input_tensor.shape[2:]
if img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
pre_process_list[0] = {
'DetResizeForTest': {
'image_shape': [img_h, img_w]
}
}
self.preprocess_op = create_operators(pre_process_list)
if args.benchmark:
import auto_log
pid = os.getpid()

View File

@ -109,7 +109,10 @@ class TextRecognizer(object):
assert imgC == img.shape[2]
imgW = int((32 * max_wh_ratio))
if self.use_onnx:
imgW = 100
w = self.input_tensor.shape[3:][0]
if w is not None and w > 0:
imgW = w
h, w = img.shape[:2]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:

View File

@ -15,6 +15,7 @@
import argparse
import os
import sys
import platform
import cv2
import numpy as np
import paddle
@ -313,6 +314,10 @@ def create_predictor(args, mode, logger):
def get_infer_gpuid():
sysstr = platform.system()
if sysstr == "Windows":
return 0
if not paddle.fluid.core.is_compiled_with_rocm():
cmd = "env | grep CUDA_VISIBLE_DEVICES"
else: