This is a combination of 59 commits from v2.7.0 to v2.7.1 in ()

release/2.7.1 branch

Co-authored-by: ToddBear <43341135+ToddBear@users.noreply.github.com>
pull/11836/head v2.7.4
jzhang533 2024-03-29 10:39:03 +08:00 committed by GitHub
parent ddaa85ddaf
commit 0b91f4dd50
77 changed files with 4101 additions and 565 deletions


@ -0,0 +1,17 @@
---
name: New Feature Issue template
about: Issue template for new features.
title: ''
labels: 'Code PR is needed'
assignees: 'shiyutang'
---
## Background
Through the feature request collection (https://github.com/PaddlePaddle/PaddleOCR/issues/10334) and discussion at the weekly technical seminar (https://github.com/PaddlePaddle/PaddleOCR/issues/10223), the XXXX task has been confirmed.
## Steps
1. Convert the network structure and evaluation metrics based on the open-source code. Code link: XXXX
2. Follow the [paper reproduction guide](https://github.com/PaddlePaddle/models/blob/release%2F2.2/tutorials/article-implementation/ArticleReproduction_CV.md) to align the forward and backward passes and reach the metrics reported in Table 1 of the paper.
3. Submit the code as a PR to ppocr following the [PR submission guidelines](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/code_and_doc.md).


@ -40,4 +40,3 @@ repos:
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --no-cache]


@ -17,48 +17,43 @@ def isCreateOrDeleteFolder(path, flag):
return flagAbsPath
def splitTrainVal(root, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag):
# 按照指定的比例划分训练集、验证集、测试集
dataAbsPath = os.path.abspath(root)
def splitTrainVal(root, abs_train_root_path, abs_val_root_path, abs_test_root_path, train_txt, val_txt, test_txt, flag):
data_abs_path = os.path.abspath(root)
label_file_name = args.detLabelFileName if flag == "det" else args.recLabelFileName
label_file_path = os.path.join(data_abs_path, label_file_name)
if flag == "det":
labelFilePath = os.path.join(dataAbsPath, args.detLabelFileName)
elif flag == "rec":
labelFilePath = os.path.join(dataAbsPath, args.recLabelFileName)
with open(label_file_path, "r", encoding="UTF-8") as label_file:
label_file_content = label_file.readlines()
random.shuffle(label_file_content)
label_record_len = len(label_file_content)
labelFileRead = open(labelFilePath, "r", encoding="UTF-8")
labelFileContent = labelFileRead.readlines()
random.shuffle(labelFileContent)
labelRecordLen = len(labelFileContent)
for index, label_record_info in enumerate(label_file_content):
image_relative_path, image_label = label_record_info.split('\t')
image_name = os.path.basename(image_relative_path)
for index, labelRecordInfo in enumerate(labelFileContent):
imageRelativePath = labelRecordInfo.split('\t')[0]
imageLabel = labelRecordInfo.split('\t')[1]
imageName = os.path.basename(imageRelativePath)
if flag == "det":
image_path = os.path.join(data_abs_path, image_name)
elif flag == "rec":
image_path = os.path.join(data_abs_path, args.recImageDirName, image_name)
if flag == "det":
imagePath = os.path.join(dataAbsPath, imageName)
elif flag == "rec":
imagePath = os.path.join(dataAbsPath, "{}\\{}".format(args.recImageDirName, imageName))
train_val_test_ratio = args.trainValTestRatio.split(":")
train_ratio = eval(train_val_test_ratio[0]) / 10
val_ratio = train_ratio + eval(train_val_test_ratio[1]) / 10
cur_ratio = index / label_record_len
# 按预设的比例划分训练集、验证集、测试集
trainValTestRatio = args.trainValTestRatio.split(":")
trainRatio = eval(trainValTestRatio[0]) / 10
valRatio = trainRatio + eval(trainValTestRatio[1]) / 10
curRatio = index / labelRecordLen
if curRatio < trainRatio:
imageCopyPath = os.path.join(absTrainRootPath, imageName)
shutil.copy(imagePath, imageCopyPath)
trainTxt.write("{}\t{}".format(imageCopyPath, imageLabel))
elif curRatio >= trainRatio and curRatio < valRatio:
imageCopyPath = os.path.join(absValRootPath, imageName)
shutil.copy(imagePath, imageCopyPath)
valTxt.write("{}\t{}".format(imageCopyPath, imageLabel))
else:
imageCopyPath = os.path.join(absTestRootPath, imageName)
shutil.copy(imagePath, imageCopyPath)
testTxt.write("{}\t{}".format(imageCopyPath, imageLabel))
if cur_ratio < train_ratio:
image_copy_path = os.path.join(abs_train_root_path, image_name)
shutil.copy(image_path, image_copy_path)
train_txt.write("{}\t{}\n".format(image_copy_path, image_label))
elif cur_ratio >= train_ratio and cur_ratio < val_ratio:
image_copy_path = os.path.join(abs_val_root_path, image_name)
shutil.copy(image_path, image_copy_path)
val_txt.write("{}\t{}\n".format(image_copy_path, image_label))
else:
image_copy_path = os.path.join(abs_test_root_path, image_name)
shutil.copy(image_path, image_copy_path)
test_txt.write("{}\t{}\n".format(image_copy_path, image_label))
# 删掉存在的文件
@ -148,4 +143,4 @@ if __name__ == "__main__":
help="the name of the folder where the cropped recognition dataset is located"
)
args = parser.parse_args()
genDetRecTrainVal(args)
genDetRecTrainVal(args)
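The refactor above keeps the original splitting strategy: the label file is shuffled, a ratio string such as `6:2:2` is parsed, and each record is routed to train/val/test by comparing its index position against the cumulative ratios. Below is a self-contained sketch of just that assignment logic (it uses `int()` rather than `eval()` for parsing, purely for illustration; this is not the repository's code):

```python
import random

def assign_splits(records, ratio_str="6:2:2", seed=0):
    """Assign each record to train/val/test by its index position after shuffling."""
    train_part, val_part, _ = (int(x) for x in ratio_str.split(":"))
    train_ratio = train_part / 10            # e.g. 0.6
    val_ratio = train_ratio + val_part / 10  # e.g. 0.8 (cumulative)

    records = list(records)
    random.Random(seed).shuffle(records)

    splits = {"train": [], "val": [], "test": []}
    total = len(records)
    for index, record in enumerate(records):
        cur_ratio = index / total
        if cur_ratio < train_ratio:
            splits["train"].append(record)
        elif cur_ratio < val_ratio:
            splits["val"].append(record)
        else:
            splits["test"].append(record)
    return splits

if __name__ == "__main__":
    demo = ["img_{}.jpg\tlabel_{}".format(i, i) for i in range(10)]
    print({k: len(v) for k, v in assign_splits(demo).items()})  # {'train': 6, 'val': 2, 'test': 2}
```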


@ -68,12 +68,10 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库助力
<a name="技术交流合作"></a>
## 📖 技术交流合作
- 飞桨低代码开发工具PaddleX—— 面向国内外主流AI硬件的飞桨精选模型一站式开发工具。包含如下核心优势
- 【产业高精度模型库】覆盖10个主流AI任务 40+精选模型,丰富齐全。
- 【特色模型产线】:提供融合大小模型的特色模型产线,精度更高,效果更好。
- 【低代码开发模式】:图形化界面支持统一开发范式,便捷高效。
- 【私有化部署多硬件支持】适配国内外主流AI硬件支持本地纯离线使用满足企业安全保密需要。
- 飞桨AI套件([PaddleX](http://10.136.157.23:8080/paddle/paddleX))提供了飞桨模型训压推一站式全流程高效率开发平台其使命是助力AI技术快速落地愿景是使人人成为AI Developer
- PaddleX 目前覆盖图像分类、目标检测、图像分割、3D、OCR和时序预测等领域方向已内置了36种基础单模型例如RT-DETR、PP-YOLOE、PP-HGNet、PP-LCNet、PP-LiteSeg等集成了12种实用的产业方案例如PP-OCRv4、PP-ChatOCR、PP-ShiTu、PP-TS、车载路面垃圾检测、野生动物违禁制品识别等。
- PaddleX 提供了“工具箱”和“开发者”两种AI开发模式。工具箱模式可以无代码调优关键超参开发者模式可以低代码进行单模型训压推和多模型串联推理同时支持云端和本地端。
- PaddleX 还支持联创开发,利润分成!目前 PaddleX 正在快速迭代,欢迎广大的个人开发者和企业开发者参与进来,共创繁荣的 AI 技术生态!
- PaddleX官网地址https://aistudio.baidu.com/intro/paddlex

README_ch.md

@ -0,0 +1,254 @@
[English](README.md) | 简体中文 | [हिन्दी](./doc/doc_i18n/README_हिन्द.md) | [日本語](./doc/doc_i18n/README_日本語.md) | [한국인](./doc/doc_i18n/README_한국어.md) | [Pу́сский язы́к](./doc/doc_i18n/README_Ру́сский_язы́к.md)
<p align="center">
<img src="./doc/PaddleOCR_log.png" align="middle" width = "600"/>
<p align="center">
<p align="left">
<a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
<a href="https://github.com/PaddlePaddle/PaddleOCR/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/PaddleOCR?color=ffa"></a>
<a href=""><img src="https://img.shields.io/badge/python-3.7+-aff.svg"></a>
<a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg"></a>
<a href=""><img src="https://img.shields.io/pypi/format/PaddleOCR?color=c77"></a>
<a href="https://pypi.org/project/PaddleOCR/"><img src="https://img.shields.io/pypi/dm/PaddleOCR?color=9cf"></a>
<a href="https://github.com/PaddlePaddle/PaddleOCR/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf"></a>
</p>
## 简介
PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库助力开发者训练出更好的模型并应用落地。
<div align="center">
<img src="./doc/imgs_results/ch_ppocr_mobile_v2.0/test_add_91.jpg" width="800">
</div>
<div align="center">
<img src="./doc/imgs_results/ch_ppocr_mobile_v2.0/00006737.jpg" width="800">
</div>
## 📣 近期更新
- **🔥2023.3.10 PaddleOCR集成了高性能、全场景模型部署方案FastDeploy欢迎参考[指南](https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph/deploy/fastdeploy)试用注意使用dygraph分支。**
- 📚**2022.12 发布[《OCR产业范例20讲》电子书](./applications/README.md)**,新增蒙古文、身份证、液晶屏缺陷等**7个场景应用范例**
- 🔨**2022.11 新增实现[4种前沿算法](doc/doc_ch/algorithm_overview.md)**:文本检测 [DRRG](doc/doc_ch/algorithm_det_drrg.md), 文本识别 [RFL](doc/doc_ch/algorithm_rec_rfl.md), 文本超分[Text Telescope](doc/doc_ch/algorithm_sr_telescope.md),公式识别[CAN](doc/doc_ch/algorithm_rec_can.md)
- **2022.10 优化[JS版PP-OCRv3模型](./deploy/paddlejs/README_ch.md)**模型大小仅4.3M预测速度提升8倍配套web demo开箱即用
- **💥 直播回放PaddleOCR研发团队详解PP-StructureV2优化策略**。微信扫描[下方二维码](#开源社区)关注公众号并填写问卷后进入官方交流群获取直播回放链接与20G重磅OCR学习大礼包内含PDF转Word应用程序、10种垂类模型、《动手学OCR》电子书等
- **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)**
- 发布[PP-StructureV2](./ppstructure/README_ch.md),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery/README_ch.md),支持**一行命令完成PDF转Word**
- [版面分析](./ppstructure/layout/README_ch.md)模型优化模型存储减少95%速度提升11倍平均CPU耗时仅需41ms
- [表格识别](./ppstructure/table/README_ch.md)模型优化设计3大优化策略预测耗时不变情况下模型精度提升6%
- [关键信息抽取](./ppstructure/kie/README_ch.md)模型优化设计视觉无关模型结构语义实体识别精度提升2.8%关系抽取精度提升9.1%。
- **2022.8 发布 [OCR场景应用集合](./applications)**包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等**9个垂类模型**覆盖通用制造、金融、交通行业的主要OCR垂类应用。
- **2022.8 新增实现[8种前沿算法](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_overview.md)**
- 文本检测:[FCENet](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_det_fcenet.md), [DB++](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_det_db.md)
- 文本识别:[ViTSTR](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_vitstr.md), [ABINet](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_abinet.md), [VisionLAN](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_visionlan.md), [SPIN](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_spin.md), [RobustScanner](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_robustscanner.md)
- 表格识别:[TableMaster](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_table_master.md)
- **2022.5.9 发布 PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)**
- 发布[PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3)速度可比情况下中文场景效果相比于PP-OCRv2再提升5%英文场景提升11%80语种多语言模型平均识别准确率提升5%以上;
- 发布半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能;
- 发布OCR产业落地工具集打通22种训练部署软硬件环境与方式覆盖企业90%的训练部署环境需求;
- 发布交互式OCR开源电子书[《动手学OCR》](./doc/doc_ch/ocr_book.md)覆盖OCR全栈技术的前沿理论与代码实践并配套教学视频。
> [更多](./doc/doc_ch/update.md)
## 🌟 特性
支持多种OCR相关前沿算法在此基础上打造产业级特色模型[PP-OCR](./doc/doc_ch/ppocr_introduction.md)和[PP-Structure](./ppstructure/README_ch.md),并打通数据生产、模型训练、压缩、预测部署全流程。
<div align="center">
<img src="https://user-images.githubusercontent.com/25809855/186170862-b8f80f6c-fee7-4b26-badc-de9c327c76ce.png">
</div>
> 上述内容的使用方法建议从文档教程中的快速开始体验
## ⚡ 快速开始
- 在线网站体验超轻量PP-OCR mobile模型体验地址https://www.paddlepaddle.org.cn/hub/scene/ocr
- 移动端demo体验[安装包DEMO下载地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统)
- 一行命令快速使用：[快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md)（最小 Python 调用示例见下文）
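Complementing the quick-start links above, the pip package `paddleocr` exposes a small Python API. The sketch below is a minimal, illustrative usage example; it assumes `paddlepaddle` and `paddleocr` are installed and that the sample image path exists, and the exact structure of the returned list can vary slightly between paddleocr versions:

```python
from paddleocr import PaddleOCR

# Downloads the default PP-OCR detection/classification/recognition models on first run.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")

result = ocr.ocr("doc/imgs/11.jpg", cls=True)
for line in result[0]:            # result holds one entry per input image
    box, (text, score) = line     # detected box, recognized text and its confidence
    print(text, round(score, 3))
```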
<a name="电子书"></a>
## 📚《动手学OCR》电子书
- [《动手学OCR》电子书](./doc/doc_ch/ocr_book.md)
<a name="开源社区"></a>
## 👫 开源社区
- **📑项目合作:** 如果您是企业开发者且有明确的OCR垂类应用需求填写[问卷](https://paddle.wjx.cn/vj/QwF7GKw.aspx)后可免费与官方团队展开不同层次的合作。
- **👫加入社区:** **微信扫描二维码并填写问卷之后加入交流群领取20G重磅OCR学习大礼包**
- **包括《动手学OCR》电子书** 配套讲解视频和notebook项目**PaddleOCR历次发版直播课回放链接**
- **OCR场景应用模型集合** 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等垂类模型覆盖通用制造、金融、交通行业的主要OCR垂类应用。
- PDF2Word应用程序OCR社区优秀开发者项目分享视频。
- **🏅️社区项目**[社区项目](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等,是官方为社区开发者打造的荣誉墙,也是帮助优质项目宣传的广播站。
- **🎁社区常规赛**社区常规赛是面向OCR开发者的积分赛事覆盖文档、代码、模型和应用四大类型以季度为单位评选并发放奖励赛题详情与报名方法可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。
<div align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/dygraph/doc/joinus.PNG" width = "150" height = "150",caption='' />
<p>PaddleOCR官方交流群二维码</p>
</div>
<a name="模型下载"></a>
## 🛠️ PP-OCR系列模型列表更新中
| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 |
| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| 中英文超轻量PP-OCRv3模型16.2M | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
| 英文超轻量PP-OCRv3模型13.4M | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
- 超轻量OCR系列更多模型下载（包括多语言），可以参考[PP-OCR系列模型下载](./doc/doc_ch/models_list.md)，文档分析相关模型参考[PP-Structure系列模型下载](./ppstructure/docs/models_list.md)（下载脚本示例见下文）
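As referenced in the bullet above, the links in the model table are plain `.tar` archives. A small download-and-unpack sketch follows; the URL is copied from the table, while the local `./inference` directory is an arbitrary choice that matches the default paths used elsewhere in the docs:

```python
import tarfile
import urllib.request
from pathlib import Path

url = "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar"
target_dir = Path("./inference")
target_dir.mkdir(parents=True, exist_ok=True)

tar_path = target_dir / Path(url).name
urllib.request.urlretrieve(url, str(tar_path))   # download the archive
with tarfile.open(tar_path) as tar:
    tar.extractall(target_dir)                   # creates ./inference/ch_PP-OCRv3_det_infer/
```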
### PaddleOCR场景应用模型
| 行业 | 类别 | 亮点 | 文档说明 | 模型下载 |
| ---- | ------------ | ---------------------------------- | ------------------------------------------------------------ | --------------------------------------------- |
| 制造 | 数码管识别 | 数码管数据合成、漏识别调优 | [光功率计数码管字符识别](./applications/光功率计数码管字符识别/光功率计数码管字符识别.md) | [下载链接](./applications/README.md#模型下载) |
| 金融 | 通用表单识别 | 多模态通用表单结构化提取 | [多模态表单识别](./applications/多模态表单识别.md) | [下载链接](./applications/README.md#模型下载) |
| 交通 | 车牌识别 | 多角度图像处理、轻量模型、端侧部署 | [轻量级车牌识别](./applications/轻量级车牌识别.md) | [下载链接](./applications/README.md#模型下载) |
- 更多制造、金融、交通行业的主要OCR垂类应用模型如电表、液晶屏、高精度SVTR模型等可参考[场景应用模型下载](./applications)
<a name="文档教程"></a>
## 📖 文档教程
- [运行环境准备](./doc/doc_ch/environment.md)
- [PP-OCR文本检测识别🔥](./doc/doc_ch/ppocr_introduction.md)
- [快速开始](./doc/doc_ch/quickstart.md)
- [模型库](./doc/doc_ch/models_list.md)
- [模型训练](./doc/doc_ch/training.md)
- [文本检测](./doc/doc_ch/detection.md)
- [文本识别](./doc/doc_ch/recognition.md)
- [文本方向分类器](./doc/doc_ch/angle_class.md)
- 模型压缩
- [模型量化](./deploy/slim/quantization/README.md)
- [模型裁剪](./deploy/slim/prune/README.md)
- [知识蒸馏](./doc/doc_ch/knowledge_distillation.md)
- [推理部署](./deploy/README_ch.md)
- [基于Python预测引擎推理](./doc/doc_ch/inference_ppocr.md)
- [基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md)
- [服务化部署](./deploy/pdserving/README_CN.md)
- [端侧部署](./deploy/lite/readme.md)
- [Paddle2ONNX模型转化与预测](./deploy/paddle2onnx/readme.md)
- [云上飞桨部署工具](./deploy/paddlecloud/README.md)
- [Benchmark](./doc/doc_ch/benchmark.md)
- [PP-Structure文档分析🔥](./ppstructure/README_ch.md)
- [快速开始](./ppstructure/docs/quickstart.md)
- [模型库](./ppstructure/docs/models_list.md)
- [模型训练](./doc/doc_ch/training.md)
- [版面分析](./ppstructure/layout/README_ch.md)
- [表格识别](./ppstructure/table/README_ch.md)
- [关键信息提取](./ppstructure/kie/README_ch.md)
- [推理部署](./deploy/README_ch.md)
- [基于Python预测引擎推理](./ppstructure/docs/inference.md)
- [基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md)
- [服务化部署](./deploy/hubserving/readme.md)
- [前沿算法与模型🚀](./doc/doc_ch/algorithm_overview.md)
- [文本检测算法](./doc/doc_ch/algorithm_overview.md)
- [文本识别算法](./doc/doc_ch/algorithm_overview.md)
- [端到端OCR算法](./doc/doc_ch/algorithm_overview.md)
- [表格识别算法](./doc/doc_ch/algorithm_overview.md)
- [关键信息抽取算法](./doc/doc_ch/algorithm_overview.md)
- [使用PaddleOCR架构添加新算法](./doc/doc_ch/add_new_algorithm.md)
- [场景应用](./applications)
- 数据标注与合成
- [半自动标注工具PPOCRLabel](./PPOCRLabel/README_ch.md)
- [数据合成工具Style-Text](./StyleText/README_ch.md)
- [其它数据标注工具](./doc/doc_ch/data_annotation.md)
- [其它数据合成工具](./doc/doc_ch/data_synthesis.md)
- 数据集
- [通用中英文OCR数据集](doc/doc_ch/dataset/datasets.md)
- [手写中文OCR数据集](doc/doc_ch/dataset/handwritten_datasets.md)
- [垂类多语言OCR数据集](doc/doc_ch/dataset/vertical_and_multilingual_datasets.md)
- [版面分析数据集](doc/doc_ch/dataset/layout_datasets.md)
- [表格识别数据集](doc/doc_ch/dataset/table_datasets.md)
- [关键信息提取数据集](doc/doc_ch/dataset/kie_datasets.md)
- [代码组织结构](./doc/doc_ch/tree.md)
- [效果展示](#效果展示)
- [《动手学OCR》电子书📚](./doc/doc_ch/ocr_book.md)
- [开源社区](#开源社区)
- FAQ
- [通用问题](./doc/doc_ch/FAQ.md)
- [PaddleOCR实战问题](./doc/doc_ch/FAQ.md)
- [参考文献](./doc/doc_ch/reference.md)
- [许可证书](#许可证书)
<a name="效果展示"></a>
## 👀 效果展示 [more](./doc/doc_ch/visualization.md)
<details open>
<summary>PP-OCRv3 中文模型</summary>
<div align="center">
<img src="doc/imgs_results/PP-OCRv3/ch/PP-OCRv3-pic001.jpg" width="800">
<img src="doc/imgs_results/PP-OCRv3/ch/PP-OCRv3-pic002.jpg" width="800">
<img src="doc/imgs_results/PP-OCRv3/ch/PP-OCRv3-pic003.jpg" width="800">
</div>
</details>
<details open>
<summary>PP-OCRv3 英文模型</summary>
<div align="center">
<img src="doc/imgs_results/PP-OCRv3/en/en_1.png" width="800">
<img src="doc/imgs_results/PP-OCRv3/en/en_2.png" width="800">
</div>
</details>
<details open>
<summary>PP-OCRv3 多语言模型</summary>
<div align="center">
<img src="doc/imgs_results/PP-OCRv3/multi_lang/japan_2.jpg" width="800">
<img src="doc/imgs_results/PP-OCRv3/multi_lang/korean_1.jpg" width="800">
</div>
</details>
<details open>
<summary>PP-Structure 文档分析</summary>
- 版面分析+表格识别
<div align="center">
<img src="./ppstructure/docs/table/ppstructure.GIF" width="800">
</div>
- SER语义实体识别
<div align="center">
<img src="https://user-images.githubusercontent.com/14270174/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808.jpg" width="600">
</div>
<div align="center">
<img src="https://user-images.githubusercontent.com/14270174/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a.png" width="600">
</div>
<div align="center">
<img src="https://user-images.githubusercontent.com/14270174/197464552-69de557f-edff-4c7f-acbf-069df1ba097f.png" width="600">
</div>
- RE关系提取
<div align="center">
<img src="https://user-images.githubusercontent.com/14270174/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb.jpg" width="600">
</div>
<div align="center">
<img src="https://user-images.githubusercontent.com/14270174/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f.jpg" width="600">
</div>
<div align="center">
<img src="https://user-images.githubusercontent.com/25809855/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d.png" width="600">
</div>
</details>
<a name="许可证书"></a>
## 许可证书
本项目的发布受<a href="https://github.com/PaddlePaddle/PaddleOCR/blob/master/LICENSE">Apache 2.0 license</a>许可认证。


@ -17,6 +17,7 @@ https://github.com/zcswdt/Color_OCR_image_generator
"""
import os
import random
import PIL
from PIL import Image, ImageDraw, ImageFont
import json
import argparse
@ -55,8 +56,11 @@ def get_horizontal_text_picture(image_file, chars, fonts_list, cf):
ch_w = []
ch_h = []
for ch in chars:
left, top, right, bottom = font.getbbox(ch)
wt, ht = right - left, bottom - top
if int(PIL.__version__.split('.')[0]) < 10:
wt, ht = font.getsize(ch)
else:
left, top, right, bottom = font.getbbox(ch)
wt, ht = right - left, bottom - top
ch_w.append(wt)
ch_h.append(ht)
f_w = sum(ch_w)
@ -102,8 +106,11 @@ def get_vertical_text_picture(image_file, chars, fonts_list, cf):
ch_w = []
ch_h = []
for ch in chars:
left, top, right, bottom = font.getbbox(ch)
wt, ht = right - left, bottom - top
if int(PIL.__version__.split('.')[0]) < 10:
wt, ht = font.getsize(ch)
else:
left, top, right, bottom = font.getbbox(ch)
wt, ht = right - left, bottom - top
ch_w.append(wt)
ch_h.append(ht)
f_w = max(ch_w)
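The version check introduced in this diff exists because `font.getsize()` was removed in Pillow 10, with `font.getbbox()` as the replacement. Below is a standalone compatibility helper expressing the same idea (a sketch, not the generator's own function; the font path in the usage comment is only an assumption):

```python
import PIL
from PIL import ImageFont

def char_size(font: ImageFont.FreeTypeFont, ch: str):
    """Return (width, height) of a single character for both old and new Pillow."""
    if int(PIL.__version__.split(".")[0]) < 10:
        # Pillow < 10 still provides getsize().
        return font.getsize(ch)
    # Pillow >= 10: derive the size from the bounding box instead.
    left, top, right, bottom = font.getbbox(ch)
    return right - left, bottom - top

# Example usage (assumes a TTF font file exists at this path):
# font = ImageFont.truetype("fonts/simfang.ttf", 32)
# print(char_size(font, "字"))
```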


@ -0,0 +1,144 @@
Global:
debug: false
use_gpu: true
epoch_num: 200
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/svtr_large/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: true
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: false
infer_img: doc/imgs_words/ch/word_1.jpg
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: &max_text_length 40
infer_mode: false
use_space_char: true
distributed: true
save_res_path: ./output/rec/predicts_svtr_large.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 1.0e-08
weight_decay: 0.05
no_weight_decay_name: norm pos_embed char_node_embed pos_node_embed char_pos_embed vis_pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.00025 # 8gpus 64bs
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR_LCNet
Transform: null
Backbone:
name: SVTRNet
img_size:
- 48
- 320
out_char_num: 40
out_channels: 512
patch_merging: Conv
embed_dim: [192, 256, 512]
depth: [6, 6, 9]
num_heads: [6, 8, 16]
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv','Conv','Conv','Conv','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
local_mixer: [[5, 5], [5, 5], [5, 5]]
last_stage: False
prenorm: True
Head:
name: MultiHead
use_pool: true
use_pos: true
head_list:
- CTCHead:
Neck:
name: svtr
dims: 256
depth: 2
hidden_dims: 256
kernel_size: [1, 3]
use_guide: True
Head:
fc_decay: 0.00001
- NRTRHead:
nrtr_dim: 512
max_text_length: *max_text_length
Loss:
name: MultiLoss
loss_config_list:
- CTCLoss:
- NRTRLoss:
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
ignore_space: true
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/
ext_op_transform_idx: 1
label_file_list:
- ./train_data/train_list.txt
transforms:
- DecodeImage:
img_mode: BGR
channel_first: false
- RecAug:
- MultiLabelEncode:
gtc_encode: NRTRLabelEncode
- RecResizeImg:
image_shape: [3, 48, 320]
- KeepKeys:
keep_keys:
- image
- label_ctc
- label_gtc
- length
- valid_ratio
loader:
shuffle: true
batch_size_per_card: 64
drop_last: true
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data
label_file_list:
- ./train_data/val_list.txt
transforms:
- DecodeImage:
img_mode: BGR
channel_first: false
- MultiLabelEncode:
gtc_encode: NRTRLabelEncode
- SVTRRecResizeImg:
image_shape: [3, 48, 320]
- KeepKeys:
keep_keys:
- image
- label_ctc
- label_gtc
- length
- valid_ratio
loader:
shuffle: false
drop_last: false
batch_size_per_card: 128
num_workers: 4


@ -10,13 +10,13 @@ Global:
cal_metric_during_train: True
pretrained_model: ./pretrain_models/abinet_vl_pretrained
checkpoints:
save_inference_dir:
save_inference_dir: ./output/rec/r45_abinet/infer
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
character_type: en
max_text_length: 25
max_text_length: &max_text_length 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_abinet.txt
@ -29,7 +29,7 @@ Optimizer:
lr:
name: Piecewise
decay_epochs: [6]
values: [0.0001, 0.00001]
values: [0.0001, 0.00001]
regularizer:
name: 'L2'
factor: 0.
@ -45,7 +45,9 @@ Architecture:
name: ABINetHead
use_lang: True
iter_size: 3
max_length: *max_text_length
image_size: [ &h 32, &w 128 ] # [ h, w ]
Loss:
name: CELoss
@ -70,7 +72,7 @@ Train:
- ABINetLabelEncode: # Class handling label
ignore_index: *ignore_index
- ABINetRecResizeImg:
image_shape: [3, 32, 128]
image_shape: [3, *h, *w]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
@ -90,7 +92,7 @@ Eval:
- ABINetLabelEncode: # Class handling label
ignore_index: *ignore_index
- ABINetRecResizeImg:
image_shape: [3, 32, 128]
image_shape: [3, *h, *w]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
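The changes above lean on standard YAML anchors and aliases: `&max_text_length`, `&h`, and `&w` define values once, and `*max_text_length`, `*h`, `*w` reuse them in the head and the resize transforms, so the image size only needs to be edited in one place. A toy PyYAML snippet (not a PaddleOCR config loader) showing how such aliases resolve:

```python
import yaml  # PyYAML

snippet = """
Global:
  max_text_length: &max_text_length 25
Architecture:
  image_size: [ &h 32, &w 128 ]
  max_length: *max_text_length
Transform:
  image_shape: [3, *h, *w]
"""

cfg = yaml.safe_load(snippet)
print(cfg["Architecture"]["max_length"])  # 25
print(cfg["Transform"]["image_shape"])    # [3, 32, 128]
```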


@ -0,0 +1,113 @@
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/svtr_cppd_base_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_svtr_cppd_base_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 1.e-8
weight_decay: 0.05
no_weight_decay_name: norm pos_embed char_node_embed pos_node_embed char_pos_embed vis_pos_embed
one_dim_param_no_weight_decay: True
lr:
name: Cosine
learning_rate: 0.0005 # 4gpus 128bs
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: CPPD
Transform:
Backbone:
name: SVTRNet
img_size: [32, 256]
patch_merging: 'Conv'
embed_dim: [128, 256, 384]
depth: [6, 6, 4]
num_heads: [4, 8, 12]
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
local_mixer: [[5, 5], [5, 5], [5, 5]]
last_stage: False
prenorm: True
Head:
name: CPPDHead
dim: 384
vis_seq: 128
ch: &ch True
Loss:
name: CPPDLoss
ignore_index: &ignore_index 7000 # must be greater than the number of character classes
smoothing: True
sideloss_weight: 1.0
PostProcess:
name: CPPDLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/training/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CPPDLabelEncode: # Class handling label
ignore_index: *ignore_index
ch: *ch
- SVTRRecResizeImg:
image_shape: [3, 32, 256]
padding: True
- KeepKeys:
keep_keys: ['image', 'label', 'label_node', 'label_index', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 256
drop_last: True
num_workers: 8
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CPPDLabelEncode: # Class handling label
ignore_index: *ignore_index
ch: *ch
- SVTRRecResizeImg:
image_shape: [3, 32, 256]
padding: True
- KeepKeys:
keep_keys: ['image', 'label', 'label_node','label_index','length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
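The `ignore_index` comment above requires the value to exceed the number of character classes, which for this config is derived from the line count of `character_dict_path` plus a few special tokens. A small sanity-check sketch along those lines (illustrative only; the exact token bookkeeping inside `CPPDLabelEncode` may differ):

```python
def check_ignore_index(character_dict_path: str, ignore_index: int) -> None:
    with open(character_dict_path, "r", encoding="utf-8") as f:
        num_chars = sum(1 for _ in f)
    # A few extra slots are typically reserved for special tokens (blank/space/etc.).
    if ignore_index <= num_chars:
        raise ValueError(
            "ignore_index={} must be greater than the number of character "
            "classes ({}) in {}".format(ignore_index, num_chars, character_dict_path)
        )

# Example: check_ignore_index("ppocr/utils/ppocr_keys_v1.txt", 7000)
```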


@ -0,0 +1,112 @@
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/svtr_cppd_base/
save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_svtr_cppd_base.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 1.e-8
weight_decay: 0.05
no_weight_decay_name: norm pos_embed char_node_embed pos_node_embed char_pos_embed vis_pos_embed
one_dim_param_no_weight_decay: True
lr:
name: Cosine
learning_rate: 0.0005 # 4gpus 256bs
warmup_epoch: 2
Architecture:
model_type: rec
algorithm: CPPD
Transform:
Backbone:
name: SVTRNet
img_size: [32, 100]
patch_merging: 'Conv'
embed_dim: [128, 256, 384]
depth: [6, 6, 4]
num_heads: [4, 8, 12]
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
local_mixer: [[5, 5], [5, 5], [5, 5]]
last_stage: False
prenorm: True
Head:
name: CPPDHead
dim: 384
vis_seq: 50
num_layer: 3
Loss:
name: CPPDLoss
ignore_index: &ignore_index 100 # must be greater than the number of character classes
smoothing: True
sideloss_weight: 1.0
PostProcess:
name: CPPDLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/training/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CPPDLabelEncode: # Class handling label
ignore_index: *ignore_index
- SVTRRecResizeImg:
image_shape: [3, 32, 100]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 256
drop_last: True
num_workers: 8
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CPPDLabelEncode: # Class handling label
ignore_index: *ignore_index
- SVTRRecResizeImg:
image_shape: [3, 32, 100]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'label_node','length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2


@ -0,0 +1,116 @@
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 20
print_batch_step: 5
save_model_dir: ./output/rec/parseq
save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [0, 500]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path: ppocr/utils/dict/parseq_dict.txt
character_type: en
max_text_length: 25
num_heads: 8
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_parseq.txt
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
lr:
name: OneCycle
max_lr: 0.0007
Architecture:
model_type: rec
algorithm: ParseQ
in_channels: 3
Transform:
Backbone:
name: ViTParseQ
img_size: [32, 128]
patch_size: [4, 8]
embed_dim: 384
depth: 12
num_heads: 6
mlp_ratio: 4
in_channels: 3
Head:
name: ParseQHead
# Architecture
max_text_length: 25
embed_dim: 384
dec_num_heads: 12
dec_mlp_ratio: 4
dec_depth: 1
# Training
perm_num: 6
perm_forward: true
perm_mirrored: true
dropout: 0.1
# Decoding mode (test)
decode_ar: true
refine_iters: 1
Loss:
name: ParseQLoss
PostProcess:
name: ParseQLabelDecode
Metric:
name: RecMetric
main_indicator: acc
is_filter: True
Train:
dataset:
name: LMDBDataSet
data_dir:
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- ParseQRecAug:
aug_type: 0 # or 1
- ParseQLabelEncode:
- SVTRRecResizeImg:
image_shape: [3, 32, 128]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 192
drop_last: True
num_workers: 4
Eval:
dataset:
name: LMDBDataSet
data_dir:
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- ParseQLabelEncode: # Class handling label
- SVTRRecResizeImg:
image_shape: [3, 32, 128]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 384
num_workers: 4


@ -17,7 +17,7 @@
#include "paddle_api.h"
#include "paddle_inference_api.h"
#include <include/ocr_cls.h>
#include <include/preprocess_op.h>
#include <include/utility.h>
namespace PaddleOCR {


@ -23,7 +23,7 @@ namespace PaddleOCR {
class PPOCR {
public:
explicit PPOCR();
~PPOCR();
~PPOCR() = default;
std::vector<std::vector<OCRPredictResult>> ocr(std::vector<cv::Mat> img_list,
bool det = true,
@ -47,9 +47,9 @@ protected:
std::vector<OCRPredictResult> &ocr_results);
private:
DBDetector *detector_ = nullptr;
Classifier *classifier_ = nullptr;
CRNNRecognizer *recognizer_ = nullptr;
std::unique_ptr<DBDetector> detector_;
std::unique_ptr<Classifier> classifier_;
std::unique_ptr<CRNNRecognizer> recognizer_;
};
} // namespace PaddleOCR


@ -23,7 +23,7 @@ namespace PaddleOCR {
class PaddleStructure : public PPOCR {
public:
explicit PaddleStructure();
~PaddleStructure();
~PaddleStructure() = default;
std::vector<StructurePredictResult> structure(cv::Mat img,
bool layout = false,
@ -37,8 +37,8 @@ private:
std::vector<double> time_info_table = {0, 0, 0};
std::vector<double> time_info_layout = {0, 0, 0};
StructureTableRecognizer *table_model_ = nullptr;
StructureLayoutRecognizer *layout_model_ = nullptr;
std::unique_ptr<StructureTableRecognizer> table_model_;
std::unique_ptr<StructureLayoutRecognizer> layout_model_;
void layout(cv::Mat img,
std::vector<StructurePredictResult> &structure_result);


@ -82,7 +82,7 @@ void check_params() {
}
void ocr(std::vector<cv::String> &cv_all_img_names) {
PPOCR ocr = PPOCR();
PPOCR ocr;
if (FLAGS_benchmark) {
ocr.reset_timer();
@ -120,7 +120,7 @@ void ocr(std::vector<cv::String> &cv_all_img_names) {
}
void structure(std::vector<cv::String> &cv_all_img_names) {
PaddleOCR::PaddleStructure engine = PaddleOCR::PaddleStructure();
PaddleOCR::PaddleStructure engine;
if (FLAGS_benchmark) {
engine.reset_timer();


@ -20,12 +20,9 @@ void Classifier::Run(std::vector<cv::Mat> img_list,
std::vector<int> &cls_labels,
std::vector<float> &cls_scores,
std::vector<double> &times) {
std::chrono::duration<float> preprocess_diff =
std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
std::chrono::duration<float> inference_diff =
std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
std::chrono::duration<float> postprocess_diff =
std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
std::chrono::duration<float> preprocess_diff = std::chrono::duration<float>::zero();
std::chrono::duration<float> inference_diff = std::chrono::duration<float>::zero();
std::chrono::duration<float> postprocess_diff = std::chrono::duration<float>::zero();
int img_num = img_list.size();
std::vector<int> cls_image_shape = {3, 48, 192};


@ -20,12 +20,9 @@ void CRNNRecognizer::Run(std::vector<cv::Mat> img_list,
std::vector<std::string> &rec_texts,
std::vector<float> &rec_text_scores,
std::vector<double> &times) {
std::chrono::duration<float> preprocess_diff =
std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
std::chrono::duration<float> inference_diff =
std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
std::chrono::duration<float> postprocess_diff =
std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
std::chrono::duration<float> preprocess_diff = std::chrono::duration<float>::zero();
std::chrono::duration<float> inference_diff = std::chrono::duration<float>::zero();
std::chrono::duration<float> postprocess_diff = std::chrono::duration<float>::zero();
int img_num = img_list.size();
std::vector<float> width_list;


@ -21,28 +21,28 @@ namespace PaddleOCR {
PPOCR::PPOCR() {
if (FLAGS_det) {
this->detector_ = new DBDetector(
this->detector_.reset(new DBDetector(
FLAGS_det_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem,
FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_limit_type,
FLAGS_limit_side_len, FLAGS_det_db_thresh, FLAGS_det_db_box_thresh,
FLAGS_det_db_unclip_ratio, FLAGS_det_db_score_mode, FLAGS_use_dilation,
FLAGS_use_tensorrt, FLAGS_precision);
FLAGS_use_tensorrt, FLAGS_precision));
}
if (FLAGS_cls && FLAGS_use_angle_cls) {
this->classifier_ = new Classifier(
this->classifier_.reset(new Classifier(
FLAGS_cls_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem,
FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_cls_thresh,
FLAGS_use_tensorrt, FLAGS_precision, FLAGS_cls_batch_num);
FLAGS_use_tensorrt, FLAGS_precision, FLAGS_cls_batch_num));
}
if (FLAGS_rec) {
this->recognizer_ = new CRNNRecognizer(
this->recognizer_.reset(new CRNNRecognizer(
FLAGS_rec_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem,
FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_rec_char_dict_path,
FLAGS_use_tensorrt, FLAGS_precision, FLAGS_rec_batch_num,
FLAGS_rec_img_h, FLAGS_rec_img_w);
FLAGS_rec_img_h, FLAGS_rec_img_w));
}
};
}
std::vector<std::vector<OCRPredictResult>>
PPOCR::ocr(std::vector<cv::Mat> img_list, bool det, bool rec, bool cls) {
@ -51,7 +51,7 @@ PPOCR::ocr(std::vector<cv::Mat> img_list, bool det, bool rec, bool cls) {
if (!det) {
std::vector<OCRPredictResult> ocr_result;
ocr_result.resize(img_list.size());
if (cls && this->classifier_ != nullptr) {
if (cls && this->classifier_) {
this->cls(img_list, ocr_result);
for (int i = 0; i < img_list.size(); i++) {
if (ocr_result[i].cls_label % 2 == 1 &&
@ -92,7 +92,7 @@ std::vector<OCRPredictResult> PPOCR::ocr(cv::Mat img, bool det, bool rec,
img_list.push_back(crop_img);
}
// cls
if (cls && this->classifier_ != nullptr) {
if (cls && this->classifier_) {
this->cls(img_list, ocr_result);
for (int i = 0; i < img_list.size(); i++) {
if (ocr_result[i].cls_label % 2 == 1 &&
@ -190,16 +190,4 @@ void PPOCR::benchmark_log(int img_num) {
}
}
PPOCR::~PPOCR() {
if (this->detector_ != nullptr) {
delete this->detector_;
}
if (this->classifier_ != nullptr) {
delete this->classifier_;
}
if (this->recognizer_ != nullptr) {
delete this->recognizer_;
}
};
} // namespace PaddleOCR


@ -21,20 +21,20 @@ namespace PaddleOCR {
PaddleStructure::PaddleStructure() {
if (FLAGS_layout) {
this->layout_model_ = new StructureLayoutRecognizer(
this->layout_model_.reset(new StructureLayoutRecognizer(
FLAGS_layout_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem,
FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_layout_dict_path,
FLAGS_use_tensorrt, FLAGS_precision, FLAGS_layout_score_threshold,
FLAGS_layout_nms_threshold);
FLAGS_layout_nms_threshold));
}
if (FLAGS_table) {
this->table_model_ = new StructureTableRecognizer(
this->table_model_.reset(new StructureTableRecognizer(
FLAGS_table_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem,
FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_table_char_dict_path,
FLAGS_use_tensorrt, FLAGS_precision, FLAGS_table_batch_num,
FLAGS_table_max_len, FLAGS_merge_no_span_structure);
FLAGS_table_max_len, FLAGS_merge_no_span_structure));
}
};
}
std::vector<StructurePredictResult>
PaddleStructure::structure(cv::Mat srcimg, bool layout, bool table, bool ocr) {
@ -65,7 +65,7 @@ PaddleStructure::structure(cv::Mat srcimg, bool layout, bool table, bool ocr) {
}
return structure_results;
};
}
void PaddleStructure::layout(
cv::Mat img, std::vector<StructurePredictResult> &structure_result) {
@ -123,7 +123,7 @@ void PaddleStructure::table(cv::Mat img,
structure_result.cell_box = structure_boxes[i];
structure_result.html_score = structure_scores[i];
}
};
}
std::string
PaddleStructure::rebuild_table(std::vector<std::string> structure_html_tags,
@ -286,10 +286,4 @@ void PaddleStructure::benchmark_log(int img_num) {
}
}
PaddleStructure::~PaddleStructure() {
if (this->table_model_ != nullptr) {
delete this->table_model_;
}
};
} // namespace PaddleOCR
} // namespace PaddleOCR


@ -42,7 +42,7 @@ docker logs -f paddle_ocr
```
## 4.测试服务
a. 计算待识别图片的Base64编码如果只是测试一下效果可以通过免费的在线工具实现http://tool.chinaz.com/tools/imgtobase/
a. 计算待识别图片的Base64编码如果只是测试一下效果可以通过免费的在线工具实现http://tool.chinaz.com/tools/imgtobase/
b. 发送服务请求可参见sample_request.txt中的值
```
curl -H "Content-Type:application/json" -X POST --data "{\"images\": [\"填入图片Base64编码(需要删除'data:image/jpg;base64,'\"]}" http://localhost:8868/predict/ocr_system


@ -3,7 +3,7 @@
- [基于PaddleHub Serving的服务部署](#基于paddlehub-serving的服务部署)
- [1. 近期更新](#1-近期更新)
- [2. 快速启动服务](#2-快速启动服务)
- [2.1 准备环境](#21-准备环境)
- [2.1 安装PaddleHub](#21-安装PaddleHub)
- [2.2 下载推理模型](#22-下载推理模型)
- [2.3 安装服务模块](#23-安装服务模块)
- [2.4 启动服务](#24-启动服务)
@ -15,8 +15,8 @@
PaddleOCR提供2种服务部署方式
- 基于PaddleHub Serving的部署代码路径为"`./deploy/hubserving`",按照本教程使用;
- 基于PaddleServing的部署代码路径为"`./deploy/pdserving`",使用方法参考[文档](../../deploy/pdserving/README_CN.md)。
- 基于PaddleHub Serving的部署代码路径为`./deploy/hubserving`,按照本教程使用;
- 基于PaddleServing的部署代码路径为`./deploy/pdserving`,使用方法参考[文档](../../deploy/pdserving/README_CN.md)。
# 基于PaddleHub Serving的服务部署
@ -51,120 +51,77 @@ deploy/hubserving/ocr_system/
## 2. 快速启动服务
以下步骤以检测+识别2阶段串联服务为例如果只需要检测服务或识别服务替换相应文件路径即可。
### 2.1 准备环境
```shell
# 安装paddlehub
# paddlehub 需要 python>3.6.2
### 2.1 安装PaddleHub
paddlehub 需要 python>3.6.2
```bash
pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple
```
### 2.2 下载推理模型
安装服务模块前需要准备推理模型并放到正确路径。默认使用的是PP-OCRv3模型默认模型路径为
| 模型 | 路径 |
| ------- | - |
| 检测模型 | `./inference/ch_PP-OCRv3_det_infer/` |
| 识别模型 | `./inference/ch_PP-OCRv3_rec_infer/` |
| 方向分类器 | `./inference/ch_ppocr_mobile_v2.0_cls_infer/` |
| 版面分析模型 | `./inference/picodet_lcnet_x1_0_fgd_layout_infer/` |
| 表格结构识别模型 | `./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/` |
| 关键信息抽取SER模型 | `./inference/ser_vi_layoutxlm_xfund_infer/` |
| 关键信息抽取RE模型 | `./inference/re_vi_layoutxlm_xfund_infer/` |
```
检测模型:./inference/ch_PP-OCRv3_det_infer/
识别模型:./inference/ch_PP-OCRv3_rec_infer/
方向分类器:./inference/ch_ppocr_mobile_v2.0_cls_infer/
版面分析模型:./inference/picodet_lcnet_x1_0_fgd_layout_infer/
表格结构识别模型:./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/
关键信息抽取SER模型./inference/ser_vi_layoutxlm_xfund_infer/
关键信息抽取RE模型./inference/re_vi_layoutxlm_xfund_infer/
```
**模型路径可在`params.py`中查看和修改。**
**模型路径可在`params.py`中查看和修改。** 更多模型可以从PaddleOCR提供的模型库[PP-OCR](../../doc/doc_ch/models_list.md)和[PP-Structure](../../ppstructure/docs/models_list.md)下载,也可以替换成自己训练转换好的模型。
更多模型可以从PaddleOCR提供的模型库[PP-OCR](../../doc/doc_ch/models_list.md)和[PP-Structure](../../ppstructure/docs/models_list.md)下载,也可以替换成自己训练转换好的模型。
### 2.3 安装服务模块
PaddleOCR提供5种服务模块根据需要安装所需模块。
* 在Linux环境下安装示例如下
```shell
# 安装检测服务模块:
hub install deploy/hubserving/ocr_det/
# 或,安装分类服务模块:
hub install deploy/hubserving/ocr_cls/
# 或,安装识别服务模块:
hub install deploy/hubserving/ocr_rec/
# 或,安装检测+识别串联服务模块:
hub install deploy/hubserving/ocr_system/
# 或,安装表格识别服务模块:
hub install deploy/hubserving/structure_table/
# 或安装PP-Structure服务模块
hub install deploy/hubserving/structure_system/
# 或,安装版面分析服务模块:
hub install deploy/hubserving/structure_layout/
# 或安装关键信息抽取SER服务模块
hub install deploy/hubserving/kie_ser/
# 或安装关键信息抽取SER+RE服务模块
hub install deploy/hubserving/kie_ser_re/
```
* 在Windows环境下(文件夹的分隔符为`\`),安装示例如下:
```shell
# 安装检测服务模块:
hub install deploy\hubserving\ocr_det\
# 或,安装分类服务模块:
hub install deploy\hubserving\ocr_cls\
# 或,安装识别服务模块:
hub install deploy\hubserving\ocr_rec\
# 或,安装检测+识别串联服务模块:
hub install deploy\hubserving\ocr_system\
# 或,安装表格识别服务模块:
hub install deploy\hubserving\structure_table\
# 或安装PP-Structure服务模块
hub install deploy\hubserving\structure_system\
# 或,安装版面分析服务模块:
hub install deploy\hubserving\structure_layout\
# 或安装关键信息抽取SER服务模块
hub install deploy\hubserving\kie_ser\
# 或安装关键信息抽取SER+RE服务模块
hub install deploy\hubserving\kie_ser_re\
```
在Linux环境（Windows环境请将`/`替换为`\`）下，安装模块命令如下表：
| 服务模块 | 命令 |
| ------- | - |
| 检测 | `hub install deploy/hubserving/ocr_det` |
| 分类 | `hub install deploy/hubserving/ocr_cls` |
| 识别 | `hub install deploy/hubserving/ocr_rec` |
| 检测+识别串联 | `hub install deploy/hubserving/ocr_system` |
| 表格识别 | `hub install deploy/hubserving/structure_table` |
| PP-Structure | `hub install deploy/hubserving/structure_system` |
| 版面分析 | `hub install deploy/hubserving/structure_layout` |
| 关键信息抽取SER | `hub install deploy/hubserving/kie_ser` |
| 关键信息抽取SER+RE | `hub install deploy/hubserving/kie_ser_re` |
### 2.4 启动服务
#### 2.4.1. 命令行命令启动仅支持CPU
**启动命令:**
```shell
$ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \
--port XXXX \
--use_multiprocess \
--workers \
**启动命令:**
```bash
hub serving start --modules Module1==Version1, Module2==Version2, ... \
--port 8866 \
--use_multiprocess \
--workers \
```
**参数:**
**参数:**
|参数|用途|
|---|---|
|`--modules`/`-m`|PaddleHub Serving预安装模型以多个Module==Version键值对的形式列出<br>**当不指定Version时默认选择最新版本**|
|`--port`/`-p`|服务端口默认为8866|
|`--use_multiprocess`|是否启用并发方式默认为单进程方式推荐多核CPU机器使用此方式<br>**Windows操作系统只支持单进程方式**|
|`--workers`|在并发方式下指定的并发任务数,默认为`2*cpu_count-1`,其中`cpu_count`为CPU核数|
|参数|用途|
|---|---|
|--modules/-m|PaddleHub Serving预安装模型以多个Module==Version键值对的形式列出<br>*`当不指定Version时默认选择最新版本`*|
|--port/-p|服务端口默认为8866|
|--use_multiprocess|是否启用并发方式默认为单进程方式推荐多核CPU机器使用此方式<br>*`Windows操作系统只支持单进程方式`*|
|--workers|在并发方式下指定的并发任务数,默认为`2*cpu_count-1`,其中`cpu_count`为CPU核数|
如启动串联服务: ```hub serving start -m ocr_system```
如启动串联服务:
```bash
hub serving start -m ocr_system
```
这样就完成了一个服务化API的部署使用默认端口号8866。
#### 2.4.2 配置文件启动支持CPU、GPU
**启动命令:**
```hub serving start -c config.json```
**启动命令:**
```bash
hub serving start -c config.json
```
其中,`config.json`格式如下:
```python
```json
{
"modules_info": {
"ocr_system": {
@ -182,48 +139,59 @@ $ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \
}
```
- `init_args`中的可配参数与`module.py`中的`_initialize`函数接口一致。其中,**当`use_gpu`为`true`时表示使用GPU启动服务**。
- `init_args`中的可配参数与`module.py`中的`_initialize`函数接口一致。
**当`use_gpu`为`true`时表示使用GPU启动服务。**
- `predict_args`中的可配参数与`module.py`中的`predict`函数接口一致。
**注意:**
**注意**
- 使用配置文件启动服务时,其他参数会被忽略。
- 如果使用GPU预测(即,`use_gpu`置为`true`)则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量```export CUDA_VISIBLE_DEVICES=0```,否则不用设置。
- 如果使用GPU预测(即,`use_gpu`置为`true`)则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量
```bash
export CUDA_VISIBLE_DEVICES=0
```
- **`use_gpu`不可与`use_multiprocess`同时为`true`**。
使用GPU 3号卡启动串联服务
```shell
使用GPU 3号卡启动串联服务
```bash
export CUDA_VISIBLE_DEVICES=3
hub serving start -c deploy/hubserving/ocr_system/config.json
```
## 3. 发送预测请求
配置好服务端,可使用以下命令发送预测请求,获取预测结果:
配置好服务端,可使用以下命令发送预测请求,获取预测结果:
```bash
python tools/test_hubserving.py --server_url=server_url --image_dir=image_path
```
```python tools/test_hubserving.py --server_url=server_url --image_dir=image_path```
需要给脚本传递2个参数
- `server_url`:服务地址,格式为`http://[ip_address]:[port]/predict/[module_name]`
需要给脚本传递2个参数
- **server_url**:服务地址,格式为
`http://[ip_address]:[port]/predict/[module_name]`
例如,如果使用配置文件启动分类,检测、识别,检测+分类+识别3阶段表格识别和PP-Structure服务那么发送请求的url将分别是
`http://127.0.0.1:8865/predict/ocr_det`
`http://127.0.0.1:8866/predict/ocr_cls`
`http://127.0.0.1:8867/predict/ocr_rec`
`http://127.0.0.1:8868/predict/ocr_system`
`http://127.0.0.1:8869/predict/structure_table`
`http://127.0.0.1:8870/predict/structure_system`
`http://127.0.0.1:8870/predict/structure_layout`
`http://127.0.0.1:8871/predict/kie_ser`
`http://127.0.0.1:8872/predict/kie_ser_re`
- **image_dir**:测试图像路径,可以是单张图片路径,也可以是图像集合目录路径
- **visualize**是否可视化结果默认为False
- **output**:可视化结果保存路径,默认为`./hubserving_result`
例如,如果使用配置文件启动分类,检测、识别,检测+分类+识别3阶段表格识别和PP-Structure服务
访问示例:
```python tools/test_hubserving.py --server_url=http://127.0.0.1:8868/predict/ocr_system --image_dir=./doc/imgs/ --visualize=false```
并为每个服务修改了port那么发送请求的url将分别是
```
http://127.0.0.1:8865/predict/ocr_det
http://127.0.0.1:8866/predict/ocr_cls
http://127.0.0.1:8867/predict/ocr_rec
http://127.0.0.1:8868/predict/ocr_system
http://127.0.0.1:8869/predict/structure_table
http://127.0.0.1:8870/predict/structure_system
http://127.0.0.1:8870/predict/structure_layout
http://127.0.0.1:8871/predict/kie_ser
http://127.0.0.1:8872/predict/kie_ser_re
```
- `image_dir`:测试图像路径,可以是单张图片路径,也可以是图像集合目录路径
- `visualize`是否可视化结果默认为False
- `output`:可视化结果保存路径,默认为`./hubserving_result`
访问示例:
```bash
python tools/test_hubserving.py --server_url=http://127.0.0.1:8868/predict/ocr_system --image_dir=./doc/imgs/ --visualize=false
```
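Besides `tools/test_hubserving.py`, the service can also be called directly: hubserving modules expect a JSON body of the form `{"images": ["<base64-encoded image>"]}`. The sketch below uses the third-party `requests` library and assumes the detection+recognition service is running at the default `ocr_system` URL; the `results` key follows the usual PaddleHub Serving response layout and may need adjusting for other versions:

```python
import base64
import requests

server_url = "http://127.0.0.1:8868/predict/ocr_system"

with open("doc/imgs/11.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(server_url, json={"images": [img_b64]}, timeout=60)
resp.raise_for_status()
for item in resp.json()["results"][0]:
    # For ocr_system each item carries 'text', 'confidence' and 'text_region' fields (see the table below).
    print(item["text"], item["confidence"])
```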
## 4. 返回结果格式说明
返回结果为列表list列表中的每一项为词典dict词典一共可能包含3种字段信息如下
|字段名称|数据类型|意义|
|---|---|---|
|angle|str|文本角度|
@ -231,41 +199,52 @@ hub serving start -c deploy/hubserving/ocr_system/config.json
|confidence|float| 文本识别置信度或文本角度分类置信度|
|text_region|list|文本位置坐标|
|html|str|表格的html字符串|
|regions|list|版面分析+表格识别+OCR的结果每一项为一个list包含表示区域坐标的`bbox`,区域类型的`type`和区域结果的`res`三个字段|
|regions|list|版面分析+表格识别+OCR的结果每一项为一个list<br>包含表示区域坐标的`bbox`,区域类型的`type`和区域结果的`res`三个字段|
|layout|list|版面分析的结果每一项一个dict包含版面区域坐标的`bbox`,区域类型的`label`|
不同模块返回的字段不同,如,文本识别服务模块返回结果不含`text_region`字段,具体信息如下:
| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | Structure_layout | kie_ser | kie_re |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|angle| | ✔ | | ✔ | |||
|text| | |✔|✔| | ✔ | | ✔ | ✔ |
|confidence| |✔ |✔| | | ✔| |✔ | ✔ |
|text_region| ✔| | |✔ | | ✔| |✔ | ✔ |
|html| | | | |✔ |✔||| |
|regions| | | | |✔ |✔ | || |
|layout| | | | | | | ✔ || |
|ser_res| | | | | | | | ✔ | |
|re_res| | | | | | | | | ✔ |
|字段名/模块名 |ocr_det |ocr_cls |ocr_rec |ocr_system |structure_table |structure_system |structure_layout |kie_ser |kie_re |
|--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |
|angle | |✔ | |✔ | | | |
|text | | |✔ |✔ | |✔ | |✔ |✔ |
|confidence | |✔ |✔ |✔ | |✔ | |✔ |✔ |
|text_region |✔ | | |✔ | |✔ | |✔ |✔ |
|html | | | | |✔ |✔ | | | |
|regions | | | | |✔ |✔ | | | |
|layout | | | | | | |✔ | | |
|ser_res | | | | | | | |✔ | |
|re_res | | | | | | | | |✔ |
**说明:** 如果需要增加、删除、修改返回字段,可在相应模块的`module.py`文件中进行修改,完整流程参考下一节自定义修改服务模块。
## 5. 自定义修改服务模块
如果需要修改服务逻辑,一般需要操作以下步骤(以修改`ocr_system`为例):
如果需要修改服务逻辑,一般需要操作以下步骤(以修改`deploy/hubserving/ocr_system`为例):
- 1、 停止服务
```hub serving stop --port/-p XXXX```
1. 停止服务:
```bash
hub serving stop --port/-p XXXX
```
2. 到`deploy/hubserving/ocr_system`下的`module.py`和`params.py`等文件中根据实际需求修改代码。
- 2、 到相应的`module.py`和`params.py`等文件中根据实际需求修改代码。
例如,如果需要替换部署服务所用模型,则需要到`params.py`中修改模型路径参数`det_model_dir`和`rec_model_dir`,如果需要关闭文本方向分类器,则将参数`use_angle_cls`置为`False`,当然,同时可能还需要修改其他相关参数,请根据实际情况修改调试。 **强烈建议修改后先直接运行`module.py`调试,能正确运行预测后再启动服务测试。**
**注意** PPOCR-v3识别模型使用的图片输入shape为`3,48,320`,因此需要修改`params.py`中的`cfg.rec_image_shape = "3, 48, 320"`如果不使用PPOCR-v3识别模型则无需修改该参数。
例如,如果需要替换部署服务所用模型,则需要到`params.py`中修改模型路径参数`det_model_dir`和`rec_model_dir`,如果需要关闭文本方向分类器,则将参数`use_angle_cls`置为`False`
- 3、 卸载旧服务包
```hub uninstall ocr_system```
当然,同时可能还需要修改其他相关参数,请根据实际情况修改调试。
- 4、 安装修改后的新服务包
```hub install deploy/hubserving/ocr_system/```
**强烈建议修改后先直接运行`module.py`调试,能正确运行预测后再启动服务测试。**
- 5、重新启动服务
```hub serving start -m ocr_system```
**注意:** PPOCR-v3识别模型使用的图片输入shape为`3,48,320`,因此需要修改`params.py`中的`cfg.rec_image_shape = "3, 48, 320"`如果不使用PPOCR-v3识别模型则无需修改该参数。
3. (可选)如果想要重命名模块需要更改`module.py`文件中的以下行:
- [`from deploy.hubserving.ocr_system.params import read_params`中的`ocr_system`](https://github.com/PaddlePaddle/PaddleOCR/blob/a923f35de57b5e378f8dd16e54d0a3e4f51267fd/deploy/hubserving/ocr_system/module.py#L35)
- [`name="ocr_system",`中的`ocr_system`](https://github.com/PaddlePaddle/PaddleOCR/blob/a923f35de57b5e378f8dd16e54d0a3e4f51267fd/deploy/hubserving/ocr_system/module.py#L39)
4. (可选)可能需要删除`__pycache__`目录以强制刷新CPython缓存
```bash
find deploy/hubserving/ocr_system -name '__pycache__' -exec rm -r {} \;
```
5. 安装修改后的新服务包:
```bash
hub install deploy/hubserving/ocr_system
```
6. 重新启动服务:
```bash
hub serving start -m ocr_system
```


@ -3,24 +3,23 @@ English | [简体中文](readme.md)
- [Service deployment based on PaddleHub Serving](#service-deployment-based-on-paddlehub-serving)
- [1. Update](#1-update)
- [2. Quick start service](#2-quick-start-service)
- [2.1 Prepare the environment](#21-prepare-the-environment)
- [2.1 Install PaddleHub](#21-install-paddlehub)
- [2.2 Download inference model](#22-download-inference-model)
- [2.3 Install Service Module](#23-install-service-module)
- [2.4 Start service](#24-start-service)
- [2.4.1 Start with command line parameters (CPU only)](#241-start-with-command-line-parameters-cpu-only)
- [2.4.2 Start with configuration fileCPUGPU](#242-start-with-configuration-filecpugpu)
- [2.4.2 Start with configuration file (CPU and GPU)](#242-start-with-configuration-filecpugpu)
- [3. Send prediction requests](#3-send-prediction-requests)
- [4. Returned result format](#4-returned-result-format)
- [5. User defined service module modification](#5-user-defined-service-module-modification)
- [5. User-defined service module modification](#5-user-defined-service-module-modification)
PaddleOCR provides 2 service deployment methods:
- Based on **PaddleHub Serving**: Code path is "`./deploy/hubserving`". Please follow this tutorial.
- Based on **PaddleServing**: Code path is "`./deploy/pdserving`". Please refer to the [tutorial](../../deploy/pdserving/README.md) for usage.
- Based on **PaddleHub Serving**: Code path is `./deploy/hubserving`. Please follow this tutorial.
- Based on **PaddleServing**: Code path is `./deploy/pdserving`. Please refer to the [tutorial](../../deploy/pdserving/README.md) for usage.
# Service deployment based on PaddleHub Serving
# Service deployment based on PaddleHub Serving
The hubserving service deployment directory includes seven service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, layout analysis, table recognition and PP-Structure. Please select the corresponding service package to install and start service according to your needs. The directory is as follows:
The hubserving service deployment directory includes seven service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, layout analysis, table recognition, and PP-Structure. Please select the corresponding service package to install and start the service according to your needs. The directory is as follows:
```
deploy/hubserving/
└─ ocr_det text detection module service package
@ -34,13 +33,13 @@ deploy/hubserving/
└─ kie_ser_re KIE(SER+RE) service package
```
Each service pack contains 3 files. Take the 2-stage series connection service package as an example, the directory is as follows:
Each service pack contains 3 files. Take the 2-stage series connection service package as an example, the directory is as follows:
```
deploy/hubserving/ocr_system/
└─ __init__.py Empty file, required
└─ config.json Configuration file, optional, passed in as a parameter when using configuration to start the service
└─ module.py Main module file, required, contains the complete logic of the service
└─ params.py Parameter file, required, including parameters such as model path, pre- and post-processing parameters
└─ params.py Parameter file, required, including parameters such as model path, pre and post-processing parameters
```
## 1. Update
@ -49,124 +48,76 @@ deploy/hubserving/ocr_system/
* 2022.03.30 add PP-Structure and table recognition services.
* 2022.05.05 add PP-OCRv3 text detection and recognition services.
## 2. Quick start service
The following steps take the 2-stage series service as an example. If only the detection service or recognition service is needed, replace the corresponding file path.
### 2.1 Prepare the environment
```shell
# Install paddlehub
# python>3.6.2 is required bt paddlehub
pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
### 2.1 Install PaddleHub
```bash
pip3 install paddlehub==2.1.0 --upgrade
```
### 2.2 Download inference model
Before installing the service module, you need to prepare the inference model and put it in the correct path. By default, the PP-OCRv3 models are used, and the default model path is:
```
text detection model: ./inference/ch_PP-OCRv3_det_infer/
text recognition model: ./inference/ch_PP-OCRv3_rec_infer/
text angle classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/
layout parse model: ./inference/picodet_lcnet_x1_0_fgd_layout_infer/
tanle recognition: ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/
KIE(SER): ./inference/ser_vi_layoutxlm_xfund_infer/
KIE(SER+RE): ./inference/re_vi_layoutxlm_xfund_infer/
```
Before installing the service module, you need to prepare the inference model and put it in the correct path. By default, the PP-OCRv3 models are used, and the default model path is:
| Model | Path |
| ------- | - |
| text detection model | ./inference/ch_PP-OCRv3_det_infer/ |
| text recognition model | ./inference/ch_PP-OCRv3_rec_infer/ |
| text angle classifier | ./inference/ch_ppocr_mobile_v2.0_cls_infer/ |
| layout analysis model | ./inference/picodet_lcnet_x1_0_fgd_layout_infer/ |
| table recognition model | ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ |
| KIE(SER) | ./inference/ser_vi_layoutxlm_xfund_infer/ |
| KIE(SER+RE) | ./inference/re_vi_layoutxlm_xfund_infer/ |
**The model path can be found and modified in `params.py`.** More models provided by PaddleOCR can be obtained from the [model library](../../doc/doc_en/models_list_en.md). You can also use models trained by yourself.
**The model path can be found and modified in `params.py`.**
More models provided by PaddleOCR can be obtained from the [model library](../../doc/doc_en/models_list_en.md). You can also use models trained by yourself.
### 2.3 Install Service Module
PaddleOCR provides 5 kinds of service modules, install the required modules according to your needs.
* On the Linux platform (replace `/` with `\` if using Windows), the installation commands are listed in the following table:

| Service module | Command |
| --- | --- |
| text detection | `hub install deploy/hubserving/ocr_det` |
| text angle classification | `hub install deploy/hubserving/ocr_cls` |
| text recognition | `hub install deploy/hubserving/ocr_rec` |
| 2-stage series | `hub install deploy/hubserving/ocr_system` |
| table recognition | `hub install deploy/hubserving/structure_table` |
| PP-Structure | `hub install deploy/hubserving/structure_system` |
| layout analysis | `hub install deploy/hubserving/structure_layout` |
| KIE(SER) | `hub install deploy/hubserving/kie_ser` |
| KIE(SER+RE) | `hub install deploy/hubserving/kie_ser_re` |
### 2.4 Start service
#### 2.4.1 Start with command line parameters (CPU only)
**start command**
```shell
$ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \
--port XXXX \
--use_multiprocess \
--workers \
```
**Parameters:**

|parameters|usage|
|---|---|
|`--modules`/`-m`|PaddleHub Serving pre-installed model, listed in the form of multiple Module==Version key-value pairs<br>**When Version is not specified, the latest version is selected by default**|
|`--port`/`-p`|Service port, default is 8866|
|`--use_multiprocess`|Enable concurrent mode; single-process mode is used by default. Concurrent mode is recommended for multi-core CPU machines<br>**Windows only supports single-process mode**|
|`--workers`|The number of concurrent tasks specified in concurrent mode, the default is `2*cpu_count-1`, where `cpu_count` is the number of CPU cores|
For example, start the 2-stage series service:
```bash
hub serving start -m ocr_system
```
This completes the deployment of a service API, using the default port number 8866.
#### 2.4.2 Start with configuration file (CPU and GPU)
**Start command:**
```bash
hub serving start --config/-c config.json
```
The format of `config.json` is as follows:
```json
{
"modules_info": {
"ocr_system": {
@ -183,51 +134,61 @@ Wherein, the format of `config.json` is as follows:
"workers": 2
}
```
- The configurable parameters in `init_args` are consistent with the `_initialize` function interface in `module.py`.
**When `use_gpu` is `true`, it means that the GPU is used to start the service**.
- The configurable parameters in `predict_args` are consistent with the `predict` function interface in `module.py`.
**Note:**
- When using the configuration file to start the service, other parameters will be ignored.
- If you use GPU prediction (that is, `use_gpu` is set to `true`), you need to set the environment variable CUDA_VISIBLE_DEVICES before starting the service, such as:
```bash
export CUDA_VISIBLE_DEVICES=0
```
- **`use_gpu` and `use_multiprocess` cannot be `true` at the same time.**
For example, use GPU card No. 3 to start the 2-stage series service:
```bash
export CUDA_VISIBLE_DEVICES=3
hub serving start -c deploy/hubserving/ocr_system/config.json
```
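For reference, the sketch below writes a `config.json` of this shape from Python. The field values are illustrative only (they mirror the structure shown above, with GPU prediction enabled for `ocr_system`); check the `config.json` shipped with each module for the exact keys it expects.
```python
# Illustrative sketch: generate a config.json for the ocr_system module.
# The keys follow the modules_info / init_args / predict_args layout shown
# above; verify them against the config.json bundled with the module you use.
import json

config = {
    "modules_info": {
        "ocr_system": {
            "init_args": {"version": "1.0.0", "use_gpu": True},  # use_gpu=True starts a GPU service
            "predict_args": {},
        }
    },
    "port": 8868,               # service port
    "use_multiprocess": False,  # must remain False when use_gpu is True
    "workers": 2,
}

with open("config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)
```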
## 3. Send prediction requests
After the service starts, you can use the following command to send a prediction request to obtain the prediction result:
```bash
python tools/test_hubserving.py --server_url=server_url --image_dir=image_path
```
Two parameters need to be passed to the script:
- **server_url**: service address, the format of which is
`http://[ip_address]:[port]/predict/[module_name]`
For example, if you use the configuration files to start the text angle classification, text detection, text recognition, detection+classification+recognition 3-stage, table recognition and PP-Structure services and also modify the port of each service, then the `server_url` to send the request to will be:
```
http://127.0.0.1:8865/predict/ocr_det
http://127.0.0.1:8866/predict/ocr_cls
http://127.0.0.1:8867/predict/ocr_rec
http://127.0.0.1:8868/predict/ocr_system
http://127.0.0.1:8869/predict/structure_table
http://127.0.0.1:8870/predict/structure_system
http://127.0.0.1:8870/predict/structure_layout
http://127.0.0.1:8871/predict/kie_ser
http://127.0.0.1:8872/predict/kie_ser_re
```
- **image_dir**: Test image path, which can be a single image path or an image directory path
- **visualize**: Whether to visualize the results, the default value is False
- **output**: The folder to save the visualization result, the default value is `./hubserving_result`

Example:
```bash
python tools/test_hubserving.py --server_url=http://127.0.0.1:8868/predict/ocr_system --image_dir=./doc/imgs/ --visualize=false
```
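If you prefer to call the service without `tools/test_hubserving.py`, the sketch below sends a request with the `requests` library. It assumes the bundled modules accept a JSON body of the form `{"images": [<base64 string>, ...]}`; adjust the URL and image path to your own deployment.
```python
# Minimal client sketch; assumes the module accepts {"images": [base64, ...]}.
import base64
import json

import requests

SERVER_URL = "http://127.0.0.1:8868/predict/ocr_system"  # adjust to your service
IMAGE_PATH = "./doc/imgs/11.jpg"                          # adjust to your test image

with open(IMAGE_PATH, "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    SERVER_URL,
    headers={"Content-Type": "application/json"},
    data=json.dumps({"images": [image_b64]}),
)
# The predictions are typically wrapped under a "results" key in the response.
print(response.json())
```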
## 4. Returned result format
The returned result is a list. Each item in the list is a dictionary which may contain the fields described below:
|field name|data type|description|
|----|----|----|
@ -235,45 +196,54 @@ The returned result is a list. Each item in the list is a dict. The dict may con
|text|str|text content|
|confidence|float|text recognition confidence|
|text_region|list|text location coordinates|
|html|str|table HTML string|
|regions|list|The result of layout analysis + table recognition + OCR, each item is a list<br>including `bbox` indicating area coordinates, `type` of area type and `res` of area results|
|layout|list|The result of layout analysis, each item is a dict, including `bbox` indicating area coordinates, `label` of area type|
The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`. The detailed table is as follows:
|field name/module name |ocr_det |ocr_cls |ocr_rec |ocr_system |structure_table |structure_system |structure_layout |kie_ser |kie_re |
|--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |
|angle | |✔ | |✔ | | | | | |
|text | | |✔ |✔ | |✔ | |✔ |✔ |
|confidence | |✔ |✔ |✔ | |✔ | |✔ |✔ |
|text_region |✔ | | |✔ | |✔ | |✔ |✔ |
|html | | | | |✔ |✔ | | | |
|regions | | | | |✔ |✔ | | | |
|layout | | | | | | |✔ | | |
|ser_res | | | | | | | |✔ | |
|re_res | | | | | | | | |✔ |
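To make the field tables concrete, a single-image result from the `ocr_system` module might look roughly like the Python literal below (values are made up for illustration; the exact wrapping of the HTTP response depends on PaddleHub Serving):
```python
# Illustrative shape of one ocr_system result; every value here is invented.
ocr_system_result = [
    {
        "text": "Hello PaddleOCR",
        "confidence": 0.98,
        "text_region": [[24, 36], [304, 34], [304, 72], [24, 74]],
    },
    {
        "text": "a second detected text line",
        "confidence": 0.91,
        "text_region": [[30, 90], [410, 88], [410, 130], [30, 132]],
    },
]
```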
**Note:** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification service module in the next section.
## 5. User-defined service module modification
If you need to modify the service logic, the following steps are generally required (take the modification of `deploy/hubserving/ocr_system` for example):
1. Stop service:
```bash
hub serving stop --port/-p XXXX
```
2. Modify the code in the corresponding files under `deploy/hubserving/ocr_system`, such as `module.py` and `params.py`, to suit your actual needs.
For example, if you need to replace the model used by the deployed service, modify the model path parameters `det_model_dir` and `rec_model_dir` in `params.py`. If you want to turn off the text direction classifier, set the parameter `use_angle_cls` to `False`. Other related parameters may need to be modified at the same time; please modify and debug according to your actual situation. A sketch of the relevant `params.py` fields is shown after this list.
**It is suggested to run `module.py` directly for debugging after modification before starting the service test.**
**Note:** The image input shape used by the PP-OCRv3 recognition model is `3, 48, 320`, so you need to set `cfg.rec_image_shape = "3, 48, 320"` in `params.py`. If you do not use the PP-OCRv3 recognition model, this parameter does not need to be modified.
3. (Optional) If you want to rename the module, the following lines should be modified:
- [`ocr_system` within `from deploy.hubserving.ocr_system.params import read_params`](https://github.com/PaddlePaddle/PaddleOCR/blob/a923f35de57b5e378f8dd16e54d0a3e4f51267fd/deploy/hubserving/ocr_system/module.py#L35)
- [`ocr_system` within `name="ocr_system",`](https://github.com/PaddlePaddle/PaddleOCR/blob/a923f35de57b5e378f8dd16e54d0a3e4f51267fd/deploy/hubserving/ocr_system/module.py#L39)
4. (Optional) You may need to delete the `__pycache__` directories to force CPython to rebuild its bytecode cache:
```bash
find deploy/hubserving/ocr_system -name '__pycache__' -exec rm -r {} \;
```
5. Install modified service module:
```bash
hub install deploy/hubserving/ocr_system/
```
6. Restart service:
```bash
hub serving start -m ocr_system
```
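As referenced in step 2, the fragment below sketches the kind of fields exposed by `deploy/hubserving/ocr_system/params.py`. The attribute names `det_model_dir`, `rec_model_dir`, `use_angle_cls` and `rec_image_shape` come from the steps above; the concrete values are placeholders, so keep the rest of your local `params.py` unchanged.
```python
# Sketch of the commonly edited fields in params.py (placeholder values).
class Config(object):
    """Plain attribute container, mirroring the pattern used by the module."""
    pass


def read_params():
    cfg = Config()
    # Point these at your own inference models.
    cfg.det_model_dir = "./inference/ch_PP-OCRv3_det_infer/"
    cfg.rec_model_dir = "./inference/ch_PP-OCRv3_rec_infer/"
    # Required input shape for the PP-OCRv3 recognition model.
    cfg.rec_image_shape = "3, 48, 320"
    # Set to False to disable the text direction classifier.
    cfg.use_angle_cls = True
    return cfg
```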
View File
@ -106,13 +106,13 @@ python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_rec_infer/ \
检测模型转换完成后,会在当前文件夹多出`ppocr_det_v3_serving` 和`ppocr_det_v3_client`的文件夹,具备如下格式:
```
|- ppocr_det_v3_serving/
  |- __model__
  |- __params__
  |- serving_server_conf.prototxt
  |- serving_server_conf.stream.prototxt
|- ppocr_det_v3_client
  |- serving_client_conf.prototxt
  |- serving_client_conf.stream.prototxt
```
@ -232,6 +232,7 @@ cp -rf general_detection_op.cpp Serving/core/general-server/op
# 启动服务运行日志保存在log.txt
python3 -m paddle_serving_server.serve --model ppocr_det_v3_serving ppocr_rec_v3_serving --op GeneralDetectionOp GeneralInferOp --port 8181 &>log.txt &
```
成功启动服务后log.txt中会打印类似如下日志
![](./imgs/start_server.png)
View File
@ -188,7 +188,7 @@ A可以看下训练的尺度和预测的尺度是否相同如果训练的
#### Q: 如何识别招牌或者广告图中的艺术字?
**A**: 招牌或者广告图中的艺术字是文本识别一个非常有挑战性的难题,因为艺术字中的单字和印刷体相比,变化非常大。如果需要识别的艺术字是在一个词典列表内,可以将每个词典认为是一个待识别图像模板通过通用图像检索识别系统解决识别问题。可以尝试使用PaddleClas的图像识别系统PP-shituV2
#### Q: 图像正常识别出来的文字是OK的旋转90度后识别出来的结果就比较差有什么方法可以优化
@ -400,7 +400,7 @@ StyleText的用途主要是提取style_image中的字体、背景等style信
A无论是文字检测还是文字识别骨干网络的选择是预测效果和预测效率的权衡。一般选择更大规模的骨干网络例如ResNet101_vd则检测或识别更准确但预测耗时相应也会增加。而选择更小规模的骨干网络例如MobileNetV3_small_x0_35则预测更快但检测或识别的准确率会大打折扣。幸运的是不同骨干网络的检测或识别效果与在ImageNet数据集图像1000分类任务效果正相关。飞桨图像分类套件PaddleClas汇总了ResNet_vd、Res2Net、HRNet、MobileNetV3、GhostNet等23种系列的分类网络结构在上述图像分类任务的top1识别准确率GPU(V100和T4)和CPU(骁龙855)的预测耗时以及相应的117个预训练模型下载地址。
1文字检测骨干网络的替换主要是确定类似ResNet的4个stages以方便集成后续的类似FPN的检测头。此外对于文字检测问题使用ImageNet训练的分类预训练模型可以加速收敛和效果提升。
2文字识别的骨干网络的替换需要注意网络宽高stride的下降位置。由于文本识别一般宽高比例很大因此高度下降频率少一些宽度下降频率多一些。可以参考PaddleOCR中MobileNetV3骨干网络的改动。
View File
@ -30,7 +30,7 @@ PP-OCRv3检测训练包括两个步骤
### 2.2 训练教师模型
教师模型训练的配置文件是[ch_PP-OCRv3_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.5/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml)。教师模型模型结构的Backbone、Neck、Head分别为Resnet50, LKPAN, DBHead采用DML的蒸馏方法训练。有关配置文件的详细介绍参考[文档](./knowledge_distillation.md)。
下载ImageNet预训练模型
View File
@ -86,6 +86,8 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型**欢迎广
- [x] [SPIN](./algorithm_rec_spin.md)
- [x] [RobustScanner](./algorithm_rec_robustscanner.md)
- [x] [RFL](./algorithm_rec_rfl.md)
- [x] [ParseQ](./algorithm_rec_parseq.md)
- [x] [CPPD](./algorithm_rec_cppd.md)
参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程使用MJSynth和SynthText两个文字识别数据集训练在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估算法效果如下
@ -110,6 +112,8 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型**欢迎广
|SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) |
|RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)|
|RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) |
|ParseQ|VIT| 91.24% | rec_vit_parseq_synth | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_synth.tgz) |
|CPPD|SVTR-Base| 93.8% | rec_svtrnet_cppd_base_en | [训练模型](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar) |
<a name="13"></a>
View File
@ -0,0 +1,188 @@
# 场景文本识别算法-CPPD
- [1. 算法简介](#1)
- [2. 环境配置](#2)
- [3. 模型训练、评估、预测](#3)
- [3.1 训练](#3-1)
- [3.2 评估](#3-2)
- [3.3 预测](#3-3)
- [4. 推理部署](#4)
- [4.1 Python推理](#4-1)
- [4.2 C++推理](#4-2)
- [4.3 Serving服务化部署](#4-3)
- [4.4 更多推理部署](#4-4)
<a name="1"></a>
## 1. 算法简介
论文信息:
> [Context Perception Parallel Decoder for Scene Text Recognition](https://arxiv.org/abs/2307.12270)
> Yongkun Du and Zhineng Chen and Caiyan Jia and Xiaoting Yin and Chenxia Li and Yuning Du and Yu-Gang Jiang
### CPPD算法简介
基于深度学习的场景文本识别模型通常是Encoder-Decoder结构其中decoder可以分为两种(1)CTC(2)Attention-based。目前SOTA模型大多使用Attention-based的decoder而attention-based可以分为AR和PD两种一般来说AR解码器识别精度优于PD而PD解码速度快于ARCPPD通过精心设计的CO和CC模块达到了“AR的精度PD的速度”的效果。
<a name="model"></a>
CPPD在场景文本识别公开数据集上的精度(%)和模型文件如下:
* 英文训练集和测试集来自于[PARSeq](https://github.com/baudm/parseq)。
| 模型 |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg | 下载链接 |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|
| CPPD Tiny | 97.1 | 94.4 | 96.6 | 86.6 | 88.5 | 90.3 | 92.25 | [英文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_tiny_en_train.tar) |
| CPPD Base | 98.2 | 95.5 | 97.6 | 87.9 | 90.0 | 92.7 | 93.80 | [英文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar)|
| CPPD Base 48*160 | 97.5 | 95.5 | 97.7 | 87.7 | 92.4 | 93.7 | 94.10 | [英文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_48_160_en_train.tar) |
* 英文合成数据集(MJ+ST)训练英文Union14M-L benchmark测试结果[U14m](https://github.com/Mountchicken/Union14M/)。
| 模型 |Curve | Multi-<br/>Oriented |Artistic |Contextless| Salient | Multi-<br/>word | General | Avg | 下载链接 |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|
| CPPD Tiny | 52.4 | 12.3 | 48.2 | 54.4 | 61.5 | 53.4 | 61.4 | 49.10 | 同上表 |
| CPPD Base | 65.5 | 18.6 | 56.0 | 61.9 | 71.0 | 57.5 | 65.8 | 56.63 | 同上表 |
| CPPD Base 48*160 | 71.9 | 22.1 | 60.5 | 67.9 | 78.3 | 63.9 | 67.1 | 61.69 | 同上表 |
* Union14M-L 训练集训练,英文测试结果。
| 模型 |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg | 下载链接 |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|
| CPPD Base 32*128 | 98.7 | 98.5 | 99.4 | 91.7 | 96.7 | 99.7 | 97.44 | [英文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_u14m_train.tar) |
| 模型 |Curve | Multi-<br/>Oriented |Artistic |Contextless| Salient | Multi-<br/>word | General | Avg | 下载链接 |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|
| CPPD Base 32*128 | 87.5 | 70.7 | 78.2 | 82.9 | 85.5 | 85.4 | 84.3 | 82.08 | 同上表 |
* 中文训练集和测试集来自于[Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition)。
| 模型 | Scene | Web | Document | Handwriting | Avg | 下载链接 |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|
| CPPD Base | 74.4 | 76.1 | 98.6 | 55.3 | 76.10 | [中文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_ch_train.tar) |
| CPPD Base + STN | 78.4 | 79.3 | 98.9 | 57.6 | 78.55 | [中文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_stn_ch_train.tar) |
<a name="2"></a>
## 2. 环境配置
请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境参考[《项目克隆》](./clone.md)克隆项目代码。
<a name="3"></a>
## 3. 模型训练、评估、预测
<a name="3-1"></a>
### 3.1 模型训练
#### 数据集准备
[英文数据集下载](https://github.com/baudm/parseq)
[Union14M-L 下载](https://github.com/Mountchicken/Union14M)
[中文数据集下载](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
#### 启动训练
请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化训练`CPPD`识别模型时需要**更换配置文件**为`CPPD`的[配置文件](../../configs/rec/rec_svtrnet_cppd_base_en.yml)。
具体地,在完成数据准备后,便可以启动训练,训练命令如下:
```shell
#单卡训练(训练周期长,不建议)
python3 tools/train.py -c configs/rec/rec_svtrnet_cppd_base_en.yml
#多卡训练,通过--gpus参数指定卡号
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_svtrnet_cppd_base_en.yml
```
<a name="3-2"></a>
### 3.2 评估
可下载`CPPD`提供的模型文件和配置文件:[下载地址](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar) ,以`CPPD-B`为例,使用如下命令进行评估:
```shell
# 下载包含CPPD-B的模型文件和配置文件的tar压缩包并解压
wget https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar && tar xf rec_svtr_cppd_base_en_train.tar
# 注意将pretrained_model的路径设置为本地路径。
python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c ./rec_svtr_cppd_base_en_train/rec_svtrnet_cppd_base_en.yml -o Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model
```
<a name="3-3"></a>
### 3.3 预测
使用如下命令进行单张图片预测:
```shell
# 注意将pretrained_model的路径设置为本地路径。
python3 tools/infer_rec.py -c ./rec_svtr_cppd_base_en_train/rec_svtrnet_cppd_base_en.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model
# 预测文件夹下所有图像时可修改infer_img为文件夹如 Global.infer_img='./doc/imgs_words_en/'。
```
<a name="4"></a>
## 4. 推理部署
<a name="4-1"></a>
### 4.1 Python推理
首先将训练得到best模型转换成inference model。下面以基于`CPPD-B`,在英文数据集训练的模型为例([模型和配置文件下载地址](https://paddleocr.bj.bcebos.com/CPPD/rec_svtr_cppd_base_en_train.tar),可以使用如下命令进行转换:
**注意:**
- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否为所正确的字典文件。
执行如下命令进行模型导出和推理:
```shell
# 注意将pretrained_model的路径设置为本地路径。
# export model
# en
python3 tools/export_model.py -c configs/rec/rec_svtrnet_cppd_base_en.yml -o Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model.pdparams Global.save_inference_dir=./rec_svtr_cppd_base_en_infer
# ch
python3 tools/export_model.py -c configs/rec/rec_svtrnet_cppd_base_ch.yml -o Global.pretrained_model=./rec_svtr_cppd_base_ch_train/best_model.pdparams Global.save_inference_dir=./rec_svtr_cppd_base_ch_infer
# speed test
# docker image https://hub.docker.com/r/paddlepaddle/paddle/tags/: sudo docker pull paddlepaddle/paddle:2.4.2-gpu-cuda11.2-cudnn8.2-trt8.0
# install auto_log: pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl
# en
python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_en_infer/' --rec_algorithm='CPPD' --rec_image_shape='3,32,100' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True
# ch
python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_ch_infer/' --rec_algorithm='CPPDPadding' --rec_image_shape='3,32,256' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True
# stn_ch
python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_stn_ch_infer/' --rec_algorithm='CPPD' --rec_image_shape='3,64,256' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True
```
导出成功后,在目录下有三个文件:
```
/inference/rec_svtr_cppd_base_en_infer/
├── inference.pdiparams # 识别inference模型的参数文件
├── inference.pdiparams.info # 识别inference模型的参数信息可忽略
└── inference.pdmodel # 识别inference模型的program文件
```
<a name="4-2"></a>
### 4.2 C++推理部署
由于C++预处理后处理还未支持CPPD所以暂未支持
<a name="4-3"></a>
### 4.3 Serving服务化部署
暂不支持
<a name="4-4"></a>
### 4.4 更多推理部署
暂不支持
## 引用
```bibtex
@article{Du2023CPPD,
title = {Context Perception Parallel Decoder for Scene Text Recognition},
author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang},
booktitle = {Arxiv},
year = {2023},
url = {https://arxiv.org/abs/2307.12270}
}
```
View File
@ -0,0 +1,124 @@
# ParseQ
- [1. 算法简介](#1)
- [2. 环境配置](#2)
- [3. 模型训练、评估、预测](#3)
- [3.1 训练](#3-1)
- [3.2 评估](#3-2)
- [3.3 预测](#3-3)
- [4. 推理部署](#4)
- [4.1 Python推理](#4-1)
- [4.2 C++推理](#4-2)
- [4.3 Serving服务化部署](#4-3)
- [4.4 更多推理部署](#4-4)
- [5. FAQ](#5)
<a name="1"></a>
## 1. 算法简介
论文信息:
> [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/abs/2207.06966)
> Darwin Bautista, Rowel Atienza
> ECCV, 2022
原论文分别使用真实文本识别数据集(Real)和合成文本识别数据集(Synth)进行训练在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估。
其中:
- 真实文本识别数据集(Real)包含COCO-Text, RCTW17, Uber-Text, ArT, LSVT, MLT19, ReCTS, TextOCR, OpenVINO数据集
- 合成文本识别数据集(Synth)包含MJSynth和SynthText数据集
在不同数据集上训练的算法的复现效果如下:
|数据集|模型|骨干网络|配置文件|Acc|下载链接|
| --- | --- | --- | --- | --- | --- |
|Synth|ParseQ|VIT|[rec_vit_parseq.yml](../../configs/rec/rec_vit_parseq.yml)|91.24%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_synth.tgz)|
|Real|ParseQ|VIT|[rec_vit_parseq.yml](../../configs/rec/rec_vit_parseq.yml)|94.74%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_real.tgz)|
|||||||
<a name="2"></a>
## 2. 环境配置
请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境参考[《项目克隆》](./clone.md)克隆项目代码。
<a name="3"></a>
## 3. 模型训练、评估、预测
请参考[文本识别教程](./recognition.md)。PaddleOCR对代码进行了模块化训练不同的识别模型只需要**更换配置文件**即可。
训练
具体地,在完成数据准备后,便可以启动训练,训练命令如下:
```
#单卡训练(训练周期长,不建议)
python3 tools/train.py -c configs/rec/rec_vit_parseq.yml
#多卡训练,通过--gpus参数指定卡号
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_vit_parseq.yml
```
评估
```
# GPU 评估, Global.pretrained_model 为待测权重
python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
```
预测:
```
# 预测使用的配置文件必须与训练一致
python3 tools/infer_rec.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
```
<a name="4"></a>
## 4. 推理部署
<a name="4-1"></a>
### 4.1 Python推理
首先将ParseQ文本识别训练过程中保存的模型转换成inference model。 [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_real.tgz) ),可以使用如下命令进行转换:
```
python3 tools/export_model.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model=./rec_vit_parseq_real/best_accuracy Global.save_inference_dir=./inference/rec_parseq
```
ParseQ文本识别模型推理可以执行如下命令
```
python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_parseq/" --rec_image_shape="3, 32, 128" --rec_algorithm="ParseQ" --rec_char_dict_path="ppocr/utils/dict/parseq_dict.txt" --max_text_length=25 --use_space_char=False
```
<a name="4-2"></a>
### 4.2 C++推理
由于C++预处理后处理还未支持ParseQ所以暂未支持
<a name="4-3"></a>
### 4.3 Serving服务化部署
暂不支持
<a name="4-4"></a>
### 4.4 更多推理部署
暂不支持
<a name="5"></a>
## 5. FAQ
## 引用
```bibtex
@InProceedings{bautista2022parseq,
title={Scene Text Recognition with Permuted Autoregressive Sequence Models},
author={Bautista, Darwin and Atienza, Rowel},
booktitle={European Conference on Computer Vision},
pages={178--196},
month={10},
year={2022},
publisher={Springer Nature Switzerland},
address={Cham},
doi={10.1007/978-3-031-19815-1_11},
url={https://doi.org/10.1007/978-3-031-19815-1_11}
}
```
View File
@ -30,12 +30,12 @@ PaddleOCR场景应用覆盖通用制造、金融、交通行业的主要OCR
| 类别 | 亮点 | 类别 | 亮点 |
| -------------- | ------------------------ | ------------ | --------------------- |
| 表单VQA | 多模态通用表单结构化提取 | 通用卡证识别 | 通用结构化提取 |
| 增值税发票 | 请期待 | 身份证识别 | 结构化提取、图像阴影 |
| 印章检测与识别 | 端到端弯曲文本识别 | 合同比对 | 密集文本检测、NLP串联 |
## 交通
| 类别 | 亮点 | 类别 | 亮点 |
| ----------------- | ------------------------------ | ---------- | -------- |
| 车牌识别 | 多角度图像、轻量模型、端侧部署 | 快递单识别 | 请期待 |
| 驾驶证/行驶证识别 | 请期待 | | |
View File
@ -223,4 +223,4 @@ PaddleOCR目前已支持80种除中文外语种识别`configs/rec/multi
| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 斯拉夫字母 |
| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 梵文字母 |
更多支持语种请参考: [多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/multi_languages.md)
View File
@ -27,4 +27,4 @@ PaddleOCR提供了检测和识别模型的串联工具可以将训练好的
```
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/det/" --rec_model_dir="./inference/rec/"
```
更多的文本检测、识别串联推理使用方式请参考文档教程中的[基于预测引擎推理](./algorithm_inference.md)。
View File
@ -205,7 +205,7 @@ Architecture:
name: LayoutXLMForSer
pretrained: True
mode: vi
# 由于采用BIO标注假设字典中包含n个字段包含other则类别数为2n-1; 假设字典中包含n个字段不含other则类别数为2n+1。否则在train过程会报IndexError: (OutOfRange) label value should less than the shape of axis dimension 。
num_classes: &num_classes 7
PostProcess:
View File
@ -69,7 +69,7 @@ PaddleOCR中集成了知识蒸馏的算法具体地有以下几个主要
```yaml
Architecture:
model_type: &model_type "rec" # 模型类别rec、det等每个子网络的模型类别
name: DistillationModel # 结构名称蒸馏任务中为DistillationModel用于构建对应的结构
algorithm: Distillation # 算法名称
Models: # 模型,包含子网络的配置信息
View File
@ -107,6 +107,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|en_number_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) |
|en_number_mobile_v2.0_rec|原始超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) |
**注意:** 所有英文识别模型的字典文件均为`ppocr/utils/en_dict.txt`
<a name="多语言识别模型"></a>
### 2.3 多语言识别模型(更多语言持续更新中...
@ -152,3 +153,4 @@ Paddle-Lite 是一个高性能、轻量级、灵活性强且易于扩展的深
|PP-OCRv2(slim)|蒸馏版超轻量中文OCR移动端模型|4.9M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_opt.nb)|v2.9|
|V2.0|ppocr_v2.0超轻量中文OCR移动端模型|7.8M|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9|
|V2.0(slim)|ppocr_v2.0超轻量中文OCR移动端模型|3.3M|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb)|v2.9|
View File
@ -83,6 +83,8 @@ Supported text recognition algorithms (Click the link to get the tutorial):
- [x] [SPIN](./algorithm_rec_spin_en.md)
- [x] [RobustScanner](./algorithm_rec_robustscanner_en.md)
- [x] [RFL](./algorithm_rec_rfl_en.md)
- [x] [ParseQ](./algorithm_rec_parseq.md)
- [x] [CPPD](./algorithm_rec_cppd_en.md)
Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow:
@ -107,6 +109,9 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r
|SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) |
|RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)|
|RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) |
|ParseQ|VIT| 91.24% | rec_vit_parseq_synth | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_synth.tgz) |
|CPPD|SVTR-Base| 93.8% | rec_svtrnet_cppd_base_en | [trained model](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar) |
<a name="13"></a>
View File
@ -0,0 +1,171 @@
# CPPD
- [1. Introduction](#1)
- [2. Environment](#2)
- [3. Model Training / Evaluation / Prediction](#3)
- [3.1 Training](#3-1)
- [3.2 Evaluation](#3-2)
- [3.3 Prediction](#3-3)
- [4. Inference and Deployment](#4)
- [4.1 Python Inference](#4-1)
- [4.2 C++ Inference](#4-2)
- [4.3 Serving](#4-3)
- [4.4 More](#4-4)
<a name="1"></a>
## 1. Introduction
Paper:
> [Context Perception Parallel Decoder for Scene Text Recognition](https://arxiv.org/abs/2307.12270)
> Yongkun Du and Zhineng Chen and Caiyan Jia and Xiaoting Yin and Chenxia Li and Yuning Du and Yu-Gang Jiang
<a name="model"></a>
Scene text recognition models based on deep learning typically follow an Encoder-Decoder structure, where the decoder can be categorized into two types: (1) CTC and (2) Attention-based. Currently, most state-of-the-art (SOTA) models use an Attention-based decoder, which can be further divided into AR and PD types. In general, AR decoders achieve higher recognition accuracy than PD, while PD decoders are faster than AR. CPPD, with carefully designed CO and CC modules, achieves a balance between the accuracy of AR and the speed of PD.
<a name="model"></a>
The accuracy (%) and model files of CPPD on the public dataset of scene text recognition are as follows:
* English dataset from [PARSeq](https://github.com/baudm/parseq).
| Model |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg | Download |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|
| CPPD Tiny | 97.1 | 94.4 | 96.6 | 86.6 | 88.5 | 90.3 | 92.25 | [en](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_tiny_en_train.tar) |
| CPPD Base | 98.2 | 95.5 | 97.6 | 87.9 | 90.0 | 92.7 | 93.80 | [en](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar)|
| CPPD Base 48*160 | 97.5 | 95.5 | 97.7 | 87.7 | 92.4 | 93.7 | 94.10 | [en](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_48_160_en_train.tar) |
* Trained on the synthetic dataset (MJ+ST) and tested on the Union14M-L benchmark from [U14m](https://github.com/Mountchicken/Union14M/).
| Model |Curve | Multi-<br/>Oriented |Artistic |Contextless| Salient | Multi-<br/>word | General | Avg | Download |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|
| CPPD Tiny | 52.4 | 12.3 | 48.2 | 54.4 | 61.5 | 53.4 | 61.4 | 49.10 | Same as the table above. |
| CPPD Base | 65.5 | 18.6 | 56.0 | 61.9 | 71.0 | 57.5 | 65.8 | 56.63 | Same as the table above. |
| CPPD Base 48*160 | 71.9 | 22.1 | 60.5 | 67.9 | 78.3 | 63.9 | 67.1 | 61.69 | Same as the table above. |
* Trained on Union14M-L training dataset.
| Model |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg | Download |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|
| CPPD Base 32*128 | 98.7 | 98.5 | 99.4 | 91.7 | 96.7 | 99.7 | 97.44 | [en](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_u14m_train.tar) |
| Model |Curve | Multi-<br/>Oriented |Artistic |Contextless| Salient | Multi-<br/>word | General | Avg | Download |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|
| CPPD Base 32*128 | 87.5 | 70.7 | 78.2 | 82.9 | 85.5 | 85.4 | 84.3 | 82.08 | Same as the table above. |
* Chinese dataset from [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).
| Model | Scene | Web | Document | Handwriting | Avg | Download |
|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|
| CPPD Base | 74.4 | 76.1 | 98.6 | 55.3 | 76.10 | [ch](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_ch_train.tar) |
| CPPD Base + STN | 78.4 | 79.3 | 98.9 | 57.6 | 78.55 | [ch](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_stn_ch_train.tar) |
<a name="2"></a>
## 2. Environment
Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
#### Dataset Preparation
[English dataset download](https://github.com/baudm/parseq)
[Union14M-Benchmark download](https://github.com/Mountchicken/Union14M)
[Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
<a name="3"></a>
## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training:
Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
```
#Single GPU training (long training period, not recommended)
python3 tools/train.py -c configs/rec/rec_svtrnet_cppd_base_en.yml
#Multi GPU training, specify the gpu number through the --gpus parameter
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_svtrnet_cppd_base_en.yml
```
Evaluation:
You can download the model files and configuration files provided by `CPPD`: [download link](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar), take `CPPD-B` as an example, using the following command to evaluate:
```
# Download the tar archive containing the model files and configuration files of CPPD-B and extract it
wget https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar && tar xf rec_svtr_cppd_base_en_train.tar
# GPU evaluation
python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c ./rec_svtr_cppd_base_en_train/rec_svtrnet_cppd_base_en.yml -o Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model
```
Prediction:
```
python3 tools/infer_rec.py -c ./rec_svtr_cppd_base_en_train/rec_svtrnet_cppd_base_en.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model
```
<a name="4"></a>
## 4. Inference and Deployment
<a name="4-1"></a>
### 4.1 Python Inference
First, the model saved during the CPPD text recognition training process is converted into an inference model ([model download link](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar)). You can use the following command to convert it:
```
# export model
# en
python3 tools/export_model.py -c configs/rec/rec_svtrnet_cppd_base_en.yml -o Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model.pdparams Global.save_inference_dir=./rec_svtr_cppd_base_en_infer
# ch
python3 tools/export_model.py -c configs/rec/rec_svtrnet_cppd_base_ch.yml -o Global.pretrained_model=./rec_svtr_cppd_base_ch_train/best_model.pdparams Global.save_inference_dir=./rec_svtr_cppd_base_ch_infer
# speed test
# docker image https://hub.docker.com/r/paddlepaddle/paddle/tags/: sudo docker pull paddlepaddle/paddle:2.4.2-gpu-cuda11.2-cudnn8.2-trt8.0
# install auto_log: pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl
# en
python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_en_infer/' --rec_algorithm='CPPD' --rec_image_shape='3,32,100' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True
# ch
python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_ch_infer/' --rec_algorithm='CPPDPadding' --rec_image_shape='3,32,256' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True
# stn_ch
python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_stn_ch_infer/' --rec_algorithm='CPPD' --rec_image_shape='3,64,256' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True
```
**Note:**
- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file.
After the conversion is successful, there are three files in the directory:
```
/inference/rec_svtr_cppd_base_en_infer/
├── inference.pdiparams
├── inference.pdiparams.info
└── inference.pdmodel
```
<a name="4-2"></a>
### 4.2 C++ Inference
Not supported
<a name="4-3"></a>
### 4.3 Serving
Not supported
<a name="4-4"></a>
### 4.4 More
Not supported
## Citation
```bibtex
@article{Du2023CPPD,
title = {Context Perception Parallel Decoder for Scene Text Recognition},
author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang},
booktitle = {Arxiv},
year = {2023},
url = {https://arxiv.org/abs/2307.12270}
}
```
View File
@ -0,0 +1,123 @@
# ParseQ
- [1. Introduction](#1)
- [2. Environment](#2)
- [3. Model Training / Evaluation / Prediction](#3)
- [3.1 Training](#3-1)
- [3.2 Evaluation](#3-2)
- [3.3 Prediction](#3-3)
- [4. Inference and Deployment](#4)
- [4.1 Python Inference](#4-1)
- [4.2 C++ Inference](#4-2)
- [4.3 Serving](#4-3)
- [4.4 More](#4-4)
- [5. FAQ](#5)
<a name="1"></a>
## 1. Introduction
Paper:
> [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/abs/2207.06966)
> Darwin Bautista, Rowel Atienza
> ECCV, 2022
The original paper trains on real text recognition datasets (Real) and synthetic text recognition datasets (Synth) respectively, and evaluates on the IIIT, SVT, IC03, IC13, IC15, SVTP and CUTE datasets.
- The real datasets include the COCO-Text, RCTW17, Uber-Text, ArT, LSVT, MLT19, ReCTS, TextOCR and OpenVINO datasets.
- The synthetic datasets include the MJSynth and SynthText datasets.
The reproduced results are as follows:
|Training Dataset|Model|Backbone|config|Acc|Download link|
| --- | --- | --- | --- | --- | --- |
|Synth|ParseQ|VIT|[rec_vit_parseq.yml](../../configs/rec/rec_vit_parseq.yml)|91.24%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_synth.tgz)|
|Real|ParseQ|VIT|[rec_vit_parseq.yml](../../configs/rec/rec_vit_parseq.yml)|94.74%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_real.tgz)|
|||||||
<a name="2"></a>
## 2. Environment
Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
<a name="3"></a>
## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training:
Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
```
#Single GPU training (long training period, not recommended)
python3 tools/train.py -c configs/rec/rec_vit_parseq.yml
#Multi GPU training, specify the gpu number through the --gpus parameter
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_vit_parseq.yml
```
Evaluation:
```
# GPU evaluation
python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
```
Prediction:
```
# The configuration file used for prediction must match the training
python3 tools/infer_rec.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
```
<a name="4"></a>
## 4. Inference and Deployment
<a name="4-1"></a>
### 4.1 Python Inference
First, the model saved during the ParseQ text recognition training process is converted into an inference model ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_real.tgz)). You can use the following command to convert it:
```
python3 tools/export_model.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model=./rec_vit_parseq_real/best_accuracy Global.save_inference_dir=./inference/rec_parseq
```
For ParseQ text recognition model inference, the following command can be executed:
```
python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_parseq/" --rec_image_shape="3, 32, 128" --rec_algorithm="ParseQ" --rec_char_dict_path="ppocr/utils/dict/parseq_dict.txt" --max_text_length=25 --use_space_char=False
```
<a name="4-2"></a>
### 4.2 C++ Inference
Not supported
<a name="4-3"></a>
### 4.3 Serving
Not supported
<a name="4-4"></a>
### 4.4 More
Not supported
<a name="5"></a>
## 5. FAQ
## Citation
```bibtex
@InProceedings{bautista2022parseq,
title={Scene Text Recognition with Permuted Autoregressive Sequence Models},
author={Bautista, Darwin and Atienza, Rowel},
booktitle={European Conference on Computer Vision},
pages={178--196},
month={10},
year={2022},
publisher={Springer Nature Switzerland},
address={Cham},
doi={10.1007/978-3-031-19815-1_11},
url={https://doi.org/10.1007/978-3-031-19815-1_11}
}
```
View File
@ -1,8 +1,8 @@
# OCR Model List (V3, updated on 2022.4.28)
> **Note**
> 1. Compared with model v2, the 3rd version of the detection model has an improvement in accuracy, and the 2.1 version of the recognition model has optimizations in accuracy and speed with CPU.
> 2. Compared with [models 1.1](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/models_list_en.md), which are trained with static graph programming paradigm, models 2.0 or higher are the dynamic graph trained version and achieve close performance.
> 3. All models in this tutorial are from the PaddleOCR series, for more introduction to algorithms and models based on the public dataset, you can refer to [algorithm overview tutorial](./algorithm_overview_en.md).
- [OCR Model List (V3, updated on 2022.4.28)]()
- [1. Text Detection Model](#1-text-detection-model)
@ -16,15 +16,15 @@
- [3. Text Angle Classification Model](#3-text-angle-classification-model)
- [4. Paddle-Lite Model](#4-paddle-lite-model)
The downloadable models provided by PaddleOCR include the `inference model`, `trained model`, `pre-trained model` and `nb model`. The differences between the models are as follows:
|model type|model format|description|
|--- | --- | --- |
|inference model|inference.pdmodel、inference.pdiparams|Used for inference based on Paddle inference engine[detail](./inference_ppocr_en.md)|
|trained model, pre-trained model|\*.pdparams、\*.pdopt、\*.states |The checkpoints model saved in the training process, which stores the parameters of the model, is mostly used for model evaluation and continuous training.|
|nb model|\*.nb| Model optimized by Paddle-Lite, which is suitable for mobile-side deployment scenarios (Paddle-Lite is needed for nb model deployment). |
The relationship of the above models is as follows.
![](../imgs_en/model_prod_flow_en.png)
@ -51,10 +51,10 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|en_PP-OCRv3_det_slim | [New] Slim quantization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) |
|en_PP-OCRv3_det | [New] Original lightweight detection model, supporting English |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) |
* Note: English configuration file is the same as Chinese except for training data, here we only provide one configuration file.
<a name="1.3"></a>
@ -62,10 +62,10 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
| ml_PP-OCRv3_det_slim | [New] Slim quantization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [trained model ](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) |
| ml_PP-OCRv3_det |[New] Original lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) |
* Note: English configuration file is the same as Chinese except for training data, here we only provide one configuration file.
<a name="Recognition"></a>
## 2. Text Recognition Model
@ -75,27 +75,29 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv3_rec_slim | [New] Slim quantization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
|ch_PP-OCRv3_rec| [New] Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
|ch_PP-OCRv2_rec_slim| Slim quantization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9.0M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
|ch_PP-OCRv2_rec| Original lightweight model, supporting Chinese, English, and multilingual text recognition |[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
|ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6.0M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) |
|ch_ppocr_mobile_v2.0_rec|Original lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
|ch_ppocr_server_v2.0_rec|General model, supporting Chinese, English and number recognition|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
**Note:** The `trained model` is fine-tuned on the `pre-trained model` with real data and synthesized vertical text data, which gives better performance in real scenes. The `pre-trained model` is trained directly on the full set of real and synthesized data, which makes it more suitable for fine-tuning on your own dataset.
<a name="English"></a>
### 2.2 English Recognition Model
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|en_PP-OCRv3_rec_slim | [New] Slim quantization with distillation lightweight model, supporting English text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) |
|en_PP-OCRv3_rec| [New] Original lightweight model, supporting English and multilingual text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
|en_number_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) |
|en_number_mobile_v2.0_rec|Original lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) |
**Note:** The dictionary file for all English recognition models is `ppocr/utils/en_dict.txt`.
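A minimal whl-package sketch of pointing the recognizer at this dictionary (the image path is illustrative, and `rec_char_dict_path` only needs to be set when overriding the default):

```python
from paddleocr import PaddleOCR

# lang='en' selects the English recognition model; the dictionary override below
# simply makes the default ppocr/utils/en_dict.txt explicit.
ocr = PaddleOCR(lang='en', rec_char_dict_path='./ppocr/utils/en_dict.txt')
print(ocr.ocr('./doc/imgs_en/img_12.jpg'))
```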
<a name="Multilingual"></a>
### 2.3 Multilingual Recognition Model (Updating...)
@ -112,7 +114,7 @@ Relationship of the above models is as follows.
| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | Lightweight model for cyrillic recognition | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) |
| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt | Lightweight model for devanagari recognition | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.9M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) |
For a complete list of languages and tutorials, please refer to [Multi-language model](./multi_languages_en.md)
<a name="Angle"></a>
## 3. Text Angle Classification Model
@ -125,9 +127,9 @@ For a complete list of languages and tutorials, please refer to : [Multi-l
<a name="Paddle-Lite"></a>
## 4. Paddle-Lite Model
Paddle Lite is an updated version of Paddle-Mobile, an open-source deep learning framework designed to make it easy to perform inference on mobile, embedded, and IoT devices. It can further optimize the inference model and generate the `nb model` used for edge devices. It's suggested to optimize the quantization model using Paddle-Lite because the `INT8` format is used for model storage and inference.
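The `nb model` is produced by the `paddle_lite_opt` tool from the `paddlelite` pip package; a rough sketch of the conversion (input/output paths are placeholders):

```python
import subprocess

# Convert a Paddle inference model into an .nb file for Paddle-Lite deployment.
subprocess.run([
    "paddle_lite_opt",
    "--model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel",
    "--param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams",
    "--optimize_out=./ch_PP-OCRv3_rec_slim_opt",
    "--valid_targets=arm",
], check=True)
```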
This chapter lists OCR nb models with PP-OCRv2 or earlier versions. You can access the latest nb models from the above tables.
|Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch|
|---|---|---|---|---|---|---|
View File
@ -80,7 +80,7 @@ PaddleOCR has built-in dictionaries, which can be used on demand.
`ppocr/utils/ppocr_keys_v1.txt` is a Chinese dictionary with 6623 characters.
`ppocr/utils/ic15_dict.txt` is an English dictionary with 36 characters
`ppocr/utils/dict/french_dict.txt` is a French dictionary with 118 characters
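Each dictionary is a plain text file with one character per line; a small sketch for inspecting one (run from the repo root):

```python
# Count the entries in a built-in dictionary; ic15_dict.txt should yield 36.
with open('ppocr/utils/ic15_dict.txt', 'r', encoding='utf-8') as f:
    chars = [line.rstrip('\n') for line in f if line.strip()]
print(len(chars), chars[:10])
```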
View File
@ -45,7 +45,8 @@ tools = _import_file(
ppocr = importlib.import_module('ppocr', 'paddleocr')
ppstructure = importlib.import_module('ppstructure', 'paddleocr')
from ppocr.utils.logging import get_logger
from tools.infer import predict_system
logger = get_logger()
from ppocr.utils.utility import check_and_read, get_image_file_list, alpha_to_color, binarize_img
from ppocr.utils.network import maybe_download, download_with_progressbar, is_link, confirm_model_dir_url
from tools.infer.utility import draw_ocr, str2bool, check_gpu
@ -59,7 +60,7 @@ __all__ = [
]
SUPPORT_DET_MODEL = ['DB']
VERSION = '2.7.4'
SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet']
BASE_DIR = os.path.expanduser("~/.paddleocr/")
@ -408,6 +409,7 @@ def parse_args(mMain=True):
parser.add_argument("--det", type=str2bool, default=True)
parser.add_argument("--rec", type=str2bool, default=True)
parser.add_argument("--type", type=str, default='ocr')
parser.add_argument("--savefile", type=str2bool, default=False)
parser.add_argument(
"--ocr_version",
type=str,
@ -515,7 +517,19 @@ def img_decode(content: bytes):
return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
def check_img(img, alpha_color=(255, 255, 255)):
"""
Check the image data. If it is another type of image file, try to decode it into a numpy array.
The inference network requires three-channel images, so the following channel conversions are applied:
single-channel image: gray to RGB (R=Y, G=Y, B=Y)
four-channel image: alpha_to_color (the alpha channel is blended onto alpha_color)
args:
img: image data
file format: jpg, png and other image formats that opencv can decode, as well as gif and pdf
storage type: binary image data, web image file, local image file
alpha_color: background color used for images in RGBA format
return: numpy.array (h, w, 3)
"""
if isinstance(img, bytes):
img = img_decode(img)
if isinstance(img, str):
@ -549,9 +563,12 @@ def check_img(img):
if img is None:
logger.error("error in loading image:{}".format(image_file))
return None
# single channel image array.shape:h,w
if isinstance(img, np.ndarray) and len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
# four channel image array.shape:h,w,c
if isinstance(img, np.ndarray) and len(img.shape) == 3 and img.shape[2] == 4:
img = alpha_to_color(img, alpha_color)
return img
@ -616,17 +633,11 @@ class PaddleOCR(predict_system.TextSystem):
super().__init__(params)
self.page_num = params.page_num
def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_color=(255, 255, 255)):
"""
OCR with PaddleOCR
args:
img: image for OCR; supports ndarray, img_path, and a list of ndarray
det: use text detection or not. If False, only rec will be executed. Default is True
rec: use text recognition or not. If False, only det will be executed. Default is True
@ -644,12 +655,13 @@ class PaddleOCR(predict_system.TextSystem):
'Since the angle classifier is not initialized, it will not be used during the forward process'
)
img = check_img(img, alpha_color)
# for infer pdf file
if isinstance(img, list):
if self.page_num > len(img) or self.page_num == 0:
imgs = img
else:
imgs = img[:self.page_num]
else:
imgs = [img]
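A short usage sketch of the new `alpha_color` argument and the whole-document PDF behaviour (file paths are illustrative):

```python
from paddleocr import PaddleOCR

# page_num=0 keeps all pages of a PDF; RGBA inputs are flattened onto alpha_color
# and grayscale inputs are expanded to three channels before inference.
ocr = PaddleOCR(use_angle_cls=True, lang='ch', page_num=0)
result = ocr.ocr('./doc/imgs/rgba_logo.png', alpha_color=(255, 255, 255))
for page in result:
    for line in page:
        print(line)
```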
@ -761,8 +773,8 @@ class PPStructure(StructureSystem):
logger.debug(params)
super().__init__(params)
def __call__(self, img, return_ocr_result_in_table=False, img_idx=0, alpha_color=(255, 255, 255)):
img = check_img(img, alpha_color)
res, _ = super().__call__(
img, return_ocr_result_in_table, img_idx=img_idx)
return res
@ -799,10 +811,25 @@ def main():
inv=args.invert,
alpha_color=args.alphacolor)
if result is not None:
lines = []
for idx in range(len(result)):
res = result[idx]
for line in res:
logger.info(line)
val = '['
for box in line[0]:
val += str(box[0]) + ',' + str(box[1]) + ','
val = val[:-1]
val += '],' + line[1][0] + ',' + str(line[1][1]) + '\n'
lines.append(val)
if args.savefile:
if not os.path.exists(args.output):
os.mkdir(args.output)
outfile = args.output + '/' + img_name + '.txt'
with open(outfile, 'w', encoding='utf-8') as f:
f.writelines(lines)
elif args.type == 'structure':
img, flag_gif, flag_pdf = check_and_read(img_path)
if not flag_gif and not flag_pdf:
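The text format written by the new `--savefile` flag can also be reproduced from the Python API; a sketch (paths and file names are illustrative):

```python
import os
from paddleocr import PaddleOCR

ocr = PaddleOCR(lang='ch')
result = ocr.ocr('./doc/imgs/11.jpg')

lines = []
for page in result:
    for box, (text, score) in page:
        # Flatten the four corner points into "x1,y1,x2,y2,..." as main() does.
        coords = ','.join(str(v) for point in box for v in point)
        lines.append('[{}],{},{}\n'.format(coords, text, score))

os.makedirs('./output', exist_ok=True)
with open(os.path.join('./output', '11.txt'), 'w', encoding='utf-8') as f:
    f.writelines(lines)
```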
View File
@ -27,7 +27,7 @@ from .make_pse_gt import MakePseGt
from .rec_img_aug import BaseDataAugmentation, RecAug, RecConAug, RecResizeImg, ClsResizeImg, \
SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg, \
ABINetRecResizeImg, SVTRRecResizeImg, ABINetRecAug, VLRecResizeImg, SPINRecResizeImg, RobustScannerRecResizeImg, \
RFLRecResizeImg, SVTRRecAug, ParseQRecAug
from .ssl_img_aug import SSLRotateResize
from .randaugment import RandAugment
from .copy_paste import CopyPaste
View File
@ -316,6 +316,35 @@ class CVGaussianNoise(object):
img = np.clip(img + noise, 0, 255).astype(np.uint8)
return img
class CVPossionNoise(object):
def __init__(self, lam=20):
self.lam = lam
if isinstance(lam, numbers.Number):
self.lam = max(int(sample_asym(lam)), 1)
elif isinstance(lam, (tuple, list)) and len(lam) == 2:
self.lam = int(sample_uniform(lam[0], lam[1]))
else:
raise Exception('lam must be number or list with length 2')
def __call__(self, img):
noise = np.random.poisson(lam=self.lam, size=img.shape)
img = np.clip(img + noise, 0, 255).astype(np.uint8)
return img
class CVGaussionBlur(object):
def __init__(self, radius):
self.radius = radius
if isinstance(radius, numbers.Number):
self.radius = max(int(sample_asym(radius)), 1)
elif isinstance(radius, (tuple, list)) and len(radius) == 2:
self.radius = int(sample_uniform(radius[0], radius[1]))
else:
raise Exception('radius must be number or list with length 2')
def __call__(self, img):
fil = cv2.getGaussianKernel(ksize=self.radius, sigma=1, ktype=cv2.CV_32F)
img = cv2.sepFilter2D(img, -1, fil, fil)
return img
class CVMotionBlur(object):
def __init__(self, degrees=12, angle=90):
@ -427,6 +456,29 @@ class SVTRDeterioration(object):
else:
return img
class ParseQDeterioration(object):
def __init__(self, var, degrees, lam, radius, factor, p=0.5):
self.p = p
transforms = []
if var is not None:
transforms.append(CVGaussianNoise(var=var))
if degrees is not None:
transforms.append(CVMotionBlur(degrees=degrees))
if lam is not None:
transforms.append(CVPossionNoise(lam=lam))
if radius is not None:
transforms.append(CVGaussionBlur(radius=radius))
if factor is not None:
transforms.append(CVRescale(factor=factor))
self.transforms = transforms
def __call__(self, img):
if random.random() < self.p:
random.shuffle(self.transforms)
transforms = Compose(self.transforms)
return transforms(img)
else:
return img
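A quick sanity-check sketch for the new deterioration pipeline (assuming the module path ppocr/data/imaug/abinet_aug.py; p=1.0 forces the transforms to run):

```python
import numpy as np
from ppocr.data.imaug.abinet_aug import ParseQDeterioration

# Same hyper-parameters that ParseQRecAug passes in rec_img_aug.py.
aug = ParseQDeterioration(var=20, degrees=6, lam=20, radius=2.0, factor=4, p=1.0)

img = (np.random.rand(32, 128, 3) * 255).astype(np.uint8)
out = aug(img)  # noise / blur / rescale transforms applied in a random order
print(out.shape, out.dtype)
```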
class SVTRGeometry(object):
def __init__(self,
View File
@ -14,18 +14,16 @@
import os
import cv2
import random
import pyclipper
import paddle
import numpy as np
import scipy.io as scio
from PIL import Image
import paddle.vision.transforms as transforms
from ppocr.utils.utility import check_install
class RandomScale():
def __init__(self, short_size=640, **kwargs):
View File
@ -23,6 +23,7 @@ import string
from shapely.geometry import LineString, Point, Polygon
import json
import copy
import random
from random import sample
from ppocr.utils.logging import get_logger
@ -1306,6 +1307,39 @@ class NRTRLabelEncode(BaseRecLabelEncode):
return dict_character
class ParseQLabelEncode(BaseRecLabelEncode):
""" Convert between text-label and text-index """
BOS = '[B]'
EOS = '[E]'
PAD = '[P]'
def __init__(self,
max_text_length,
character_dict_path=None,
use_space_char=False,
**kwargs):
super(ParseQLabelEncode, self).__init__(
max_text_length, character_dict_path, use_space_char)
def __call__(self, data):
text = data['label']
text = self.encode(text)
if text is None:
return None
if len(text) >= self.max_text_len - 2:
return None
data['length'] = np.array(len(text))
text = [self.dict[self.BOS]] + text + [self.dict[self.EOS]]
text = text + [self.dict[self.PAD]] * (self.max_text_len - len(text))
data['label'] = np.array(text)
return data
def add_special_char(self, dict_character):
dict_character = [self.EOS] + dict_character + [self.BOS, self.PAD]
return dict_character
class ViTSTRLabelEncode(BaseRecLabelEncode):
""" Convert between text-label and text-index """
@ -1568,3 +1602,132 @@ class CANLabelEncode(BaseRecLabelEncode):
label.append(self.end_str)
data['label'] = self.encode(label)
return data
class CPPDLabelEncode(BaseRecLabelEncode):
""" Convert between text-label and text-index """
def __init__(self,
max_text_length,
character_dict_path=None,
use_space_char=False,
ch=False,
ignore_index=100,
**kwargs):
super(CPPDLabelEncode, self).__init__(
max_text_length, character_dict_path, use_space_char)
self.ch = ch
self.ignore_index = ignore_index
def __call__(self, data):
text = data['label']
if self.ch:
text, text_node_index, text_node_num = self.encodech(text)
if text is None:
return None
if len(text) > self.max_text_len:
return None
data['length'] = np.array(len(text))
text_pos_node = [1] * (len(text) + 1) + [0] * (self.max_text_len -
len(text))
text.append(0) # eos
text = text + [self.ignore_index] * (self.max_text_len + 1 -
len(text))
data['label'] = np.array(text)
data['label_node'] = np.array(text_node_num + text_pos_node)
data['label_index'] = np.array(text_node_index)
return data
else:
text, text_char_node, ch_order = self.encode(text)
if text is None:
return None
if len(text) >= self.max_text_len:
return None
data['length'] = np.array(len(text))
text_pos_node = [1] * (len(text) + 1) + [0] * (self.max_text_len -
len(text))
text.append(0) # eos
text = text + [self.ignore_index] * (self.max_text_len + 1 -
len(text))
data['label'] = np.array(text)
data['label_node'] = np.array(text_char_node + text_pos_node)
data['label_order'] = np.array(ch_order)
return data
def add_special_char(self, dict_character):
dict_character = ['</s>'] + dict_character
self.num_character = len(dict_character)
return dict_character
def encode(self, text):
"""
"""
if len(text) == 0 or len(text) > self.max_text_len:
return None, None, None
if self.lower:
text = text.lower()
text_node = [0 for _ in range(self.num_character)]
text_node[0] = 1
text_list = []
ch_order = []
order = 1
for char in text:
if char not in self.dict:
continue
text_list.append(self.dict[char])
text_node[self.dict[char]] += 1
ch_order.append(
[self.dict[char], text_node[self.dict[char]], order])
order += 1
no_ch_order = []
for char in self.character:
if char not in text:
no_ch_order.append([self.dict[char], 1, 0])
random.shuffle(no_ch_order)
ch_order = ch_order + no_ch_order
ch_order = ch_order[:self.max_text_len + 1]
if len(text_list) == 0:
return None, None, None
ch_order.sort()  # sort in place; list.sort() returns None, so return the list itself
return text_list, text_node, ch_order
def encodech(self, text):
"""
"""
if len(text) == 0 or len(text) > self.max_text_len:
return None, None, None
if self.lower:
text = text.lower()
text_node_dict = {}
text_node_dict.update({0: 1})
character_index = [_ for _ in range(self.num_character)]
text_list = []
for char in text:
if char not in self.dict:
continue
i_c = self.dict[char]
text_list.append(i_c)
if i_c in text_node_dict.keys():
text_node_dict[i_c] += 1
else:
text_node_dict.update({i_c: 1})
for ic in list(text_node_dict.keys()):
character_index.remove(ic)
none_char_index = sample(character_index,
37 - len(list(text_node_dict.keys())))
for ic in none_char_index:
text_node_dict[ic] = 0
text_node_index = sorted(text_node_dict)
text_node_num = [text_node_dict[k] for k in text_node_index]
if len(text_list) == 0:
return None, None, None
return text_list, text_node_index, text_node_num
View File
@ -18,8 +18,9 @@ import numpy as np
import random
import copy
from PIL import Image
import PIL
from .text_image_aug import tia_perspective, tia_stretch, tia_distort
from .abinet_aug import CVGeometry, CVDeterioration, CVColorJitter, SVTRGeometry, SVTRDeterioration, ParseQDeterioration
from paddle.vision.transforms import Compose
@ -46,7 +47,7 @@ class RecAug(object):
if h >= 20 and w >= 20:
img = tia_distort(img, random.randint(3, 6))
img = tia_stretch(img, random.randint(3, 6))
img = tia_perspective(img)
# bda
data['image'] = img
@ -203,6 +204,36 @@ class SVTRRecAug(object):
data['image'] = img
return data
class ParseQRecAug(object):
def __init__(self,
aug_type=0,
geometry_p=0.5,
deterioration_p=0.25,
colorjitter_p=0.25,
**kwargs):
self.transforms = Compose([
SVTRGeometry(
aug_type=aug_type,
degrees=45,
translate=(0.0, 0.0),
scale=(0.5, 2.),
shear=(45, 15),
distortion=0.5,
p=geometry_p), ParseQDeterioration(
var=20, degrees=6, lam=20, radius=2.0, factor=4, p=deterioration_p),
CVColorJitter(
brightness=0.5,
contrast=0.5,
saturation=0.5,
hue=0.1,
p=colorjitter_p)
])
def __call__(self, data):
img = data['image']
img = self.transforms(img)
data['image'] = img
return data
class ClsResizeImg(object):
def __init__(self, image_shape, **kwargs):
@ -406,7 +437,7 @@ class GrayRecResizeImg(object):
def __init__(self,
image_shape,
resize_type,
inter_type="Image.Resampling.LANCZOS",
scale=True,
padding=False,
**kwargs):
View File
@ -43,6 +43,8 @@ from .rec_rfl_loss import RFLLoss
from .rec_can_loss import CANLoss
from .rec_satrn_loss import SATRNLoss
from .rec_nrtr_loss import NRTRLoss
from .rec_parseq_loss import ParseQLoss
from .rec_cppd_loss import CPPDLoss
# cls loss
from .cls_loss import ClsLoss
@ -76,7 +78,7 @@ def build_loss(config):
'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss',
'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss', 'StrokeFocusLoss',
'SLALoss', 'CTLoss', 'RFLLoss', 'DRRGLoss', 'CANLoss', 'TelescopeLoss',
'SATRNLoss', 'NRTRLoss', 'ParseQLoss', 'CPPDLoss'
]
config = copy.deepcopy(config)
module_name = config.pop('name')
View File
@ -745,6 +745,8 @@ class DistillationDilaDBLoss(DBLoss):
# dilation to teacher prediction
dilation_w = np.array([[1, 1], [1, 1]])
th_shrink_maps = tch_preds[:, 0, :, :]
if hasattr(paddle.Tensor, "contiguous"):
th_shrink_maps = th_shrink_maps.contiguous()
th_shrink_maps = th_shrink_maps.numpy() > 0.3 # thresh = 0.3
dilate_maps = np.zeros_like(th_shrink_maps).astype(np.float32)
for i in range(th_shrink_maps.shape[0]):
View File
@ -0,0 +1,78 @@
# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
class CPPDLoss(nn.Layer):
def __init__(self,
smoothing=False,
ignore_index=100,
sideloss_weight=1.0,
**kwargs):
super(CPPDLoss, self).__init__()
self.edge_ce = nn.CrossEntropyLoss(
reduction='mean', ignore_index=ignore_index)
self.char_node_ce = nn.CrossEntropyLoss(reduction='mean')
self.pos_node_ce = nn.BCEWithLogitsLoss(reduction='mean')
self.smoothing = smoothing
self.ignore_index = ignore_index
self.sideloss_weight = sideloss_weight
def label_smoothing_ce(self, preds, targets):
non_pad_mask = paddle.not_equal(
targets,
paddle.zeros(
targets.shape, dtype=targets.dtype) + self.ignore_index)
tgts = paddle.where(
targets == (paddle.zeros(
targets.shape, dtype=targets.dtype) + self.ignore_index),
paddle.zeros(
targets.shape, dtype=targets.dtype),
targets)
eps = 0.1
n_class = preds.shape[1]
one_hot = F.one_hot(tgts, preds.shape[1])
one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
log_prb = F.log_softmax(preds, axis=1)
loss = -(one_hot * log_prb).sum(axis=1)
loss = loss.masked_select(non_pad_mask).mean()
return loss
def forward(self, pred, batch):
node_feats, edge_feats = pred
node_tgt = batch[2]
char_tgt = batch[1]
loss_char_node = self.char_node_ce(node_feats[0].flatten(0, 1),
node_tgt[:, :-26].flatten(0, 1))
loss_pos_node = self.pos_node_ce(node_feats[1].flatten(
0, 1), node_tgt[:, -26:].flatten(0, 1).cast('float32'))
loss_node = loss_char_node + loss_pos_node
edge_feats = edge_feats.flatten(0, 1)
char_tgt = char_tgt.flatten(0, 1)
if self.smoothing:
loss_edge = self.label_smoothing_ce(edge_feats, char_tgt)
else:
loss_edge = self.edge_ce(edge_feats, char_tgt)
return {
'loss': self.sideloss_weight * loss_node + loss_edge,
'loss_node': self.sideloss_weight * loss_node,
'loss_edge': loss_edge
}
View File
@ -0,0 +1,50 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from paddle import nn
class ParseQLoss(nn.Layer):
def __init__(self, **kwargs):
super(ParseQLoss, self).__init__()
def forward(self, predicts, targets):
label = targets[1] # label
label_len = targets[2]
max_step = paddle.max(label_len).cpu().numpy()[0] + 2
tgt = label[:, :max_step]
logits_list = predicts['logits_list']
pad_id = predicts['pad_id']
eos_id = predicts['eos_id']
tgt_out = tgt[:, 1:]
loss = 0
loss_numel = 0
n = (tgt_out != pad_id).sum().item()
for i, logits in enumerate(logits_list):
loss += n * paddle.nn.functional.cross_entropy(input=logits, label=tgt_out.flatten(), ignore_index=pad_id)
loss_numel += n
if i == 1:
tgt_out = paddle.where(condition=tgt_out == eos_id, x=pad_id, y=tgt_out)
n = (tgt_out != pad_id).sum().item()
loss /= loss_numel
return {'loss': loss}
View File
@ -24,6 +24,7 @@ def build_backbone(config, model_type):
from .det_pp_lcnet import PPLCNet
from .rec_lcnetv3 import PPLCNetV3
from .rec_hgnet import PPHGNet_small
from .rec_vit import ViT
support_dict = [
"MobileNetV3", "ResNet", "ResNet_vd", "ResNet_SAST", "PPLCNet",
"PPLCNetV3", "PPHGNet_small"
@ -50,11 +51,12 @@ def build_backbone(config, model_type):
from .rec_shallow_cnn import ShallowCNN
from .rec_lcnetv3 import PPLCNetV3
from .rec_hgnet import PPHGNet_small
from .rec_vit_parseq import ViTParseQ
support_dict = [
'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB',
'ResNet31', 'ResNet45', 'ResNet_ASTER', 'MicroNet',
'EfficientNetb3_PREN', 'SVTRNet', 'ViTSTR', 'ResNet32', 'ResNetRFL',
'DenseNet', 'ShallowCNN', 'PPLCNetV3', 'PPHGNet_small', 'ViTParseQ', 'ViT'
]
elif model_type == 'e2e':
from .e2e_resnet_vd_pg import ResNet
View File
@ -0,0 +1,258 @@
# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import ParamAttr
from paddle.nn.initializer import KaimingNormal
import numpy as np
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal
trunc_normal_ = TruncatedNormal(std=.02)
normal_ = Normal
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
def drop_path(x, drop_prob=0., training=False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
return x
keep_prob = paddle.to_tensor(1 - drop_prob)
shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
return output
class DropPath(nn.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
class Identity(nn.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
self.dim = dim
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
qkv = paddle.reshape(self.qkv(x), (0, -1, 3, self.num_heads, self.dim //
self.num_heads)).transpose((2, 0, 3, 1, 4))
q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
attn = (q.matmul(k.transpose((0, 1, 3, 2))))
attn = nn.functional.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, -1, self.dim))
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-6,
prenorm=True):
super().__init__()
if isinstance(norm_layer, str):
self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
else:
self.norm1 = norm_layer(dim)
self.mixer = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
if isinstance(norm_layer, str):
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
else:
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp_ratio = mlp_ratio
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
self.prenorm = prenorm
def forward(self, x):
if self.prenorm:
x = self.norm1(x + self.drop_path(self.mixer(x)))
x = self.norm2(x + self.drop_path(self.mlp(x)))
else:
x = x + self.drop_path(self.mixer(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class ViT(nn.Layer):
def __init__(
self,
img_size=[32, 128],
patch_size=[4,4],
in_channels=3,
embed_dim=384,
depth=12,
num_heads=6,
mlp_ratio=4,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.1,
norm_layer='nn.LayerNorm',
epsilon=1e-6,
act='nn.GELU',
prenorm=False,
**kwargs):
super().__init__()
self.embed_dim = embed_dim
self.out_channels = embed_dim
self.prenorm = prenorm
self.patch_embed = nn.Conv2D(in_channels, embed_dim, patch_size, patch_size, padding=(0, 0))
self.pos_embed = self.create_parameter(
shape=[1, 257, embed_dim], default_initializer=zeros_)
self.add_parameter("pos_embed", self.pos_embed)
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = np.linspace(0, drop_path_rate, depth)
self.blocks1 = nn.LayerList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
act_layer=eval(act),
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
epsilon=epsilon,
prenorm=prenorm) for i in range(depth)
])
if not prenorm:
self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
self.avg_pool = nn.AdaptiveAvgPool2D([1, 25])
self.last_conv = nn.Conv2D(
in_channels=embed_dim,
out_channels=self.out_channels,
kernel_size=1,
stride=1,
padding=0,
bias_attr=False)
self.hardswish = nn.Hardswish()
self.dropout = nn.Dropout(p=0.1, mode="downscale_in_infer")
trunc_normal_(self.pos_embed)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
def forward(self, x):
x = self.patch_embed(x).flatten(2).transpose((0, 2, 1))
x = x + self.pos_embed[:, 1:, :] #[:, :paddle.shape(x)[1], :]
x = self.pos_drop(x)
for blk in self.blocks1:
x = blk(x)
if not self.prenorm:
x = self.norm(x)
x = self.avg_pool(x.transpose([0, 2, 1]).reshape(
[0, self.embed_dim, -1, 25]))
x = self.last_conv(x)
x = self.hardswish(x)
x = self.dropout(x)
return x
View File
@ -0,0 +1,304 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/PaddlePaddle/PaddleClas/blob/release%2F2.5/ppcls/arch/backbone/model_zoo/vision_transformer.py
"""
from collections.abc import Callable
import numpy as np
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal
trunc_normal_ = TruncatedNormal(std=.02)
normal_ = Normal
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
def to_2tuple(x):
return tuple([x] * 2)
def drop_path(x, drop_prob=0., training=False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
return x
keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
return output
class DropPath(nn.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
class Identity(nn.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
# B= paddle.shape(x)[0]
N, C = x.shape[1:]
qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C //
self.num_heads)).transpose((2, 0, 3, 1, 4))
q, k, v = qkv[0], qkv[1], qkv[2]
attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
attn = nn.functional.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-5):
super().__init__()
if isinstance(norm_layer, str):
self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
elif isinstance(norm_layer, Callable):
self.norm1 = norm_layer(dim)
else:
raise TypeError(
"The norm_layer must be str or paddle.nn.layer.Layer class")
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
if isinstance(norm_layer, str):
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
elif isinstance(norm_layer, Callable):
self.norm2 = norm_layer(dim)
else:
raise TypeError(
"The norm_layer must be str or paddle.nn.layer.Layer class")
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
super().__init__()
if isinstance(img_size, int):
img_size = to_2tuple(img_size)
if isinstance(patch_size, int):
patch_size = to_2tuple(patch_size)
num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
def forward(self, x):
B, C, H, W = x.shape
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose((0, 2, 1))
return x
class VisionTransformer(nn.Layer):
""" Vision Transformer with support for patch input
"""
def __init__(self,
img_size=224,
patch_size=16,
in_channels=3,
class_num=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer='nn.LayerNorm',
epsilon=1e-5,
**kwargs):
super().__init__()
self.class_num = class_num
self.num_features = self.embed_dim = embed_dim
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_channels,
embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches
self.pos_embed = self.create_parameter(shape=(1, num_patches, embed_dim), default_initializer=zeros_)
self.add_parameter("pos_embed", self.pos_embed)
self.cls_token = self.create_parameter(
shape=(1, 1, embed_dim), default_initializer=zeros_)
self.add_parameter("cls_token", self.cls_token)
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = np.linspace(0, drop_path_rate, depth)
self.blocks = nn.LayerList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
epsilon=epsilon) for i in range(depth)
])
self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
# Classifier head
self.head = nn.Linear(embed_dim,
class_num) if class_num > 0 else Identity()
trunc_normal_(self.pos_embed)
self.out_channels = embed_dim
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
def forward_features(self, x):
B = paddle.shape(x)[0]
x = self.patch_embed(x)
x = x + self.pos_embed
x = self.pos_drop(x)
for blk in self.blocks:
x = blk(x)
x = self.norm(x)
return x
def forward(self, x):
x = self.forward_features(x)
x = self.head(x)
return x
class ViTParseQ(VisionTransformer):
def __init__(self, img_size=[224, 224], patch_size=[16, 16], in_channels=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, qkv_bias=True, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0):
super().__init__(img_size, patch_size, in_channels, embed_dim=embed_dim, depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rate, class_num=0)
def forward(self, x):
return self.forward_features(x)
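A shape sanity check for the new backbone (a sketch; the 32x128 input and 4x8 patches follow the ParseQ recognition config and are an assumption here):

```python
import paddle
from ppocr.modeling.backbones.rec_vit_parseq import ViTParseQ

backbone = ViTParseQ(img_size=[32, 128], patch_size=[4, 8], in_channels=3,
                     embed_dim=384, depth=12, num_heads=6)
# forward() returns the full patch-token sequence from forward_features().
feats = backbone(paddle.randn([1, 3, 32, 128]))
print(feats.shape)  # [1, (32 // 4) * (128 // 8), 384] -> [1, 128, 384]
```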
View File
@ -54,18 +54,11 @@ class NLPBaseModel(nn.Layer):
if checkpoints is not None: # load the trained model
self.model = model_class.from_pretrained(checkpoints)
else: # load the pretrained-model
pretrained_model_name = pretrained_model_dict[base_model_class][mode]
if type == "ser":
self.model = model_class.from_pretrained(pretrained_model_name, num_classes=kwargs["num_classes"], dropout=0)
else:
self.model = model_class.from_pretrained(pretrained_model_name, dropout=0)
self.out_channels = 1
self.use_visual_backbone = True
View File
@ -40,6 +40,8 @@ def build_head(config):
from .rec_rfl_head import RFLHead
from .rec_can_head import CANHead
from .rec_satrn_head import SATRNHead
from .rec_parseq_head import ParseQHead
from .rec_cppd_head import CPPDHead
# cls head
from .cls_head import ClsHead
@ -56,7 +58,8 @@ def build_head(config):
'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead',
'MultiHead', 'ABINetHead', 'TableMasterHead', 'SPINAttentionHead',
'VLHead', 'SLAHead', 'RobustScannerHead', 'CT_Head', 'RFLHead',
'DRRGHead', 'CANHead', 'SATRNHead', 'PFHeadLocal', 'ParseQHead',
'CPPDHead'
]
if config['name'] == 'DRRGHead':
View File
@ -182,11 +182,13 @@ class ABINetHead(nn.Layer):
dropout=0.1,
max_length=25,
use_lang=False,
iter_size=1,
image_size=(32, 128)):
super().__init__()
self.max_length = max_length + 1
h, w = image_size[0] // 4, image_size[1] // 4
self.pos_encoder = PositionalEncoding(
dropout=0.1, dim=d_model, max_len=h * w)
self.encoder = nn.LayerList([
TransformerBlock(
d_model=d_model,
@ -199,7 +201,7 @@ class ABINetHead(nn.Layer):
])
self.decoder = PositionAttention(
max_length=max_length + 1, # additional stop token
mode='nearest', h=h, w=w)
self.out_channels = out_channels
self.cls = nn.Linear(d_model, self.out_channels)
self.use_lang = use_lang
View File
@ -0,0 +1,333 @@
# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
try:
from collections import Callable
except:
from collections.abc import Callable
import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F
from ppocr.modeling.heads.rec_nrtr_head import Embeddings
from ppocr.modeling.backbones.rec_svtrnet import DropPath, Identity, trunc_normal_, zeros_, ones_, Mlp
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.q = nn.Linear(dim, dim, bias_attr=qkv_bias)
self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, q, kv):
N, C = kv.shape[1:]
QN = q.shape[1]
q = self.q(q).reshape(
[-1, QN, self.num_heads, C // self.num_heads]).transpose(
[0, 2, 1, 3])
k, v = self.kv(kv).reshape(
[-1, N, 2, self.num_heads, C // self.num_heads]).transpose(
(2, 0, 3, 1, 4))
attn = q.matmul(k.transpose((0, 1, 3, 2))) * self.scale
attn = F.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, QN, C))
x = self.proj(x)
x = self.proj_drop(x)
return x
class EdgeDecoderLayer(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=[0., 0.],
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-6):
super().__init__()
self.head_dim = dim // num_heads
self.scale = qk_scale or self.head_dim**-0.5
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path1 = DropPath(drop_path[0]) if drop_path[
0] > 0. else Identity()
self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
self.p = nn.Linear(dim, dim)
self.cv = nn.Linear(dim, dim)
self.pv = nn.Linear(dim, dim)
self.dim = dim
self.num_heads = num_heads
self.p_proj = nn.Linear(dim, dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp_ratio = mlp_ratio
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
def forward(self, p, cv, pv):
pN = p.shape[1]
vN = cv.shape[1]
p_shortcut = p
p1 = self.p(p).reshape(
[-1, pN, self.num_heads, self.dim // self.num_heads]).transpose(
[0, 2, 1, 3])
cv1 = self.cv(cv).reshape(
[-1, vN, self.num_heads, self.dim // self.num_heads]).transpose(
[0, 2, 1, 3])
pv1 = self.pv(pv).reshape(
[-1, vN, self.num_heads, self.dim // self.num_heads]).transpose(
[0, 2, 1, 3])
edge = F.softmax(p1.matmul(pv1.transpose((0, 1, 3, 2))), -1) # B h N N
p_c = (edge @cv1).transpose((0, 2, 1, 3)).reshape((-1, pN, self.dim))
x1 = self.norm1(p_shortcut + self.drop_path1(self.p_proj(p_c)))
x = self.norm2(x1 + self.drop_path1(self.mlp(x1)))
return x
class DecoderLayer(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-6):
super().__init__()
if isinstance(norm_layer, str):
self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
self.normkv = eval(norm_layer)(dim, epsilon=epsilon)
elif isinstance(norm_layer, Callable):
self.norm1 = norm_layer(dim)
self.normkv = norm_layer(dim)
else:
raise TypeError(
"The norm_layer must be str or paddle.nn.LayerNorm class")
self.mixer = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
if isinstance(norm_layer, str):
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
elif isinstance(norm_layer, Callable):
self.norm2 = norm_layer(dim)
else:
raise TypeError(
"The norm_layer must be str or paddle.nn.layer.Layer class")
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp_ratio = mlp_ratio
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
def forward(self, q, kv):
x1 = self.norm1(q + self.drop_path(self.mixer(q, kv)))
x = self.norm2(x1 + self.drop_path(self.mlp(x1)))
return x
class CPPDHead(nn.Layer):
def __init__(self,
in_channels,
dim,
out_channels,
num_layer=2,
drop_path_rate=0.1,
max_len=25,
vis_seq=50,
ch=False,
**kwargs):
super(CPPDHead, self).__init__()
self.out_channels = out_channels # none + 26 + 10
self.dim = dim
self.ch = ch
self.max_len = max_len + 1 # max_len + eos
self.char_node_embed = Embeddings(
d_model=dim, vocab=self.out_channels, scale_embedding=True)
self.pos_node_embed = Embeddings(
d_model=dim, vocab=self.max_len, scale_embedding=True)
dpr = np.linspace(0, drop_path_rate, num_layer + 1)
self.char_node_decoder = nn.LayerList([
DecoderLayer(
dim=dim,
num_heads=dim // 32,
mlp_ratio=4.0,
qkv_bias=True,
drop_path=dpr[i]) for i in range(num_layer)
])
self.pos_node_decoder = nn.LayerList([
DecoderLayer(
dim=dim,
num_heads=dim // 32,
mlp_ratio=4.0,
qkv_bias=True,
drop_path=dpr[i]) for i in range(num_layer)
])
self.edge_decoder = EdgeDecoderLayer(
dim=dim,
num_heads=dim // 32,
mlp_ratio=4.0,
qkv_bias=True,
drop_path=dpr[num_layer:num_layer + 1])
self.char_pos_embed = self.create_parameter(
shape=[1, self.max_len, dim], default_initializer=zeros_)
self.add_parameter("char_pos_embed", self.char_pos_embed)
self.vis_pos_embed = self.create_parameter(
shape=[1, vis_seq, dim], default_initializer=zeros_)
self.add_parameter("vis_pos_embed", self.vis_pos_embed)
self.char_node_fc1 = nn.Linear(dim, max_len)
self.pos_node_fc1 = nn.Linear(dim, self.max_len)
self.edge_fc = nn.Linear(dim, self.out_channels)
trunc_normal_(self.char_pos_embed)
trunc_normal_(self.vis_pos_embed)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
def forward(self, x, targets=None, epoch=0):
if self.training:
return self.forward_train(x, targets, epoch)
else:
return self.forward_test(x)
def forward_test(self, x):
visual_feats = x + self.vis_pos_embed
bs = visual_feats.shape[0]
pos_node_embed = self.pos_node_embed(paddle.arange(
self.max_len)).unsqueeze(0) + self.char_pos_embed
pos_node_embed = paddle.tile(pos_node_embed, [bs, 1, 1])
char_vis_node_query = visual_feats
pos_vis_node_query = paddle.concat([pos_node_embed, visual_feats], 1)
for char_decoder_layer, pos_decoder_layer in zip(self.char_node_decoder,
self.pos_node_decoder):
char_vis_node_query = char_decoder_layer(char_vis_node_query,
char_vis_node_query)
pos_vis_node_query = pos_decoder_layer(
pos_vis_node_query, pos_vis_node_query[:, self.max_len:, :])
pos_node_query = pos_vis_node_query[:, :self.max_len, :]
char_vis_feats = char_vis_node_query
pos_node_feats = self.edge_decoder(pos_node_query, char_vis_feats,
char_vis_feats) # B, 26, dim
edge_feats = self.edge_fc(pos_node_feats) # B, 26, 37
edge_logits = F.softmax(edge_feats, -1)
return edge_logits
def forward_train(self, x, targets=None, epoch=0):
visual_feats = x + self.vis_pos_embed
bs = visual_feats.shape[0]
if self.ch:
char_node_embed = self.char_node_embed(targets[-2])
else:
char_node_embed = self.char_node_embed(
paddle.arange(self.out_channels)).unsqueeze(0)
char_node_embed = paddle.tile(char_node_embed, [bs, 1, 1])
counting_char_num = paddle.shape(char_node_embed)[1]
pos_node_embed = self.pos_node_embed(paddle.arange(
self.max_len)).unsqueeze(0) + self.char_pos_embed
pos_node_embed = paddle.tile(pos_node_embed, [bs, 1, 1])
node_feats = []
char_vis_node_query = paddle.concat([char_node_embed, visual_feats], 1)
pos_vis_node_query = paddle.concat([pos_node_embed, visual_feats], 1)
for char_decoder_layer, pos_decoder_layer in zip(self.char_node_decoder,
self.pos_node_decoder):
char_vis_node_query = char_decoder_layer(
char_vis_node_query,
char_vis_node_query[:, counting_char_num:, :])
pos_vis_node_query = pos_decoder_layer(
pos_vis_node_query, pos_vis_node_query[:, self.max_len:, :])
char_node_query = char_vis_node_query[:, :counting_char_num, :]
pos_node_query = pos_vis_node_query[:, :self.max_len, :]
char_vis_feats = char_vis_node_query[:, counting_char_num:, :]
char_node_feats1 = self.char_node_fc1(char_node_query)
pos_node_feats1 = self.pos_node_fc1(pos_node_query)
diag_mask = paddle.eye(pos_node_feats1.shape[1]).unsqueeze(0).tile(
[pos_node_feats1.shape[0], 1, 1])
pos_node_feats1 = (pos_node_feats1 * diag_mask).sum(-1)
node_feats.append(char_node_feats1)
node_feats.append(pos_node_feats1)
pos_node_feats = self.edge_decoder(pos_node_query, char_vis_feats,
char_vis_feats) # B, 26, dim
edge_feats = self.edge_fc(pos_node_feats) # B, 26, 37
return node_feats, edge_feats

View File

@ -22,7 +22,7 @@ from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
from ppocr.modeling.necks.rnn import Im2Seq, EncoderWithRNN, EncoderWithFC, SequenceEncoder, EncoderWithSVTR, trunc_normal_, zeros_
from .rec_ctc_head import CTCHead
from .rec_sar_head import SARHead
from .rec_nrtr_head import Transformer
@ -41,12 +41,28 @@ class FCTranspose(nn.Layer):
else:
return self.fc(x.transpose([0, 2, 1]))
class AddPos(nn.Layer):
def __init__(self, dim, w):
super().__init__()
self.dec_pos_embed = self.create_parameter(
shape=[1, w, dim], default_initializer=zeros_)
self.add_parameter("dec_pos_embed", self.dec_pos_embed)
trunc_normal_(self.dec_pos_embed)
def forward(self,x):
x = x + self.dec_pos_embed[:, :paddle.shape(x)[1], :]
return x
class MultiHead(nn.Layer):
def __init__(self, in_channels, out_channels_list, **kwargs):
super().__init__()
self.head_list = kwargs.pop('head_list')
self.use_pool = kwargs.get('use_pool', False)
self.use_pos = kwargs.get('use_pos', False)
self.in_channels = in_channels
if self.use_pool:
self.pool = nn.AvgPool2D(kernel_size=[3, 2], stride=[3, 2], padding=0)
self.gtc_head = 'sar'
assert len(self.head_list) >= 2
for idx, head_name in enumerate(self.head_list):
@ -61,8 +77,13 @@ class MultiHead(nn.Layer):
max_text_length = gtc_args.get('max_text_length', 25)
nrtr_dim = gtc_args.get('nrtr_dim', 256)
num_decoder_layers = gtc_args.get('num_decoder_layers', 4)
if self.use_pos:
self.before_gtc = nn.Sequential(
nn.Flatten(2), FCTranspose(in_channels, nrtr_dim), AddPos(nrtr_dim, 80))
else:
self.before_gtc = nn.Sequential(
nn.Flatten(2), FCTranspose(in_channels, nrtr_dim))
self.gtc_head = Transformer(
d_model=nrtr_dim,
nhead=nrtr_dim // 32,
@ -88,7 +109,8 @@ class MultiHead(nn.Layer):
'{} is not supported in MultiHead yet'.format(name))
def forward(self, x, targets=None):
if self.use_pool:
x = self.pool(x.reshape([0, 3, -1, self.in_channels]).transpose([0, 3, 1, 2]))
ctc_encoder = self.ctc_encoder(x)
ctc_out = self.ctc_head(ctc_encoder, targets)
head_out = dict()
View File
@ -16,11 +16,9 @@ import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle.nn import LayerList
# from paddle.nn.initializer import XavierNormal as xavier_uniform_
from paddle.nn import Dropout, LayerNorm
import numpy as np
from ppocr.modeling.backbones.rec_svtrnet import Mlp, zeros_
from paddle.nn.initializer import XavierNormal as xavier_normal_
View File
@ -0,0 +1,342 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Code was based on https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
# reference: https://arxiv.org/abs/2207.06966
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import numpy as np
from .self_attention import WrapEncoderForFeature
from .self_attention import WrapEncoder
from collections import OrderedDict
from typing import Optional
import copy
from itertools import permutations
class DecoderLayer(paddle.nn.Layer):
"""A Transformer decoder layer supporting two-stream attention (XLNet)
This implements a pre-LN decoder, as opposed to the post-LN default in PyTorch."""
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='gelu', layer_norm_eps=1e-05):
super().__init__()
self.self_attn = paddle.nn.MultiHeadAttention(d_model, nhead, dropout=dropout, need_weights=True)  # paddle.nn.MultiHeadAttention defaults to batch_first mode
self.cross_attn = paddle.nn.MultiHeadAttention(d_model, nhead, dropout=dropout, need_weights=True)
self.linear1 = paddle.nn.Linear(in_features=d_model, out_features=dim_feedforward)
self.dropout = paddle.nn.Dropout(p=dropout)
self.linear2 = paddle.nn.Linear(in_features=dim_feedforward, out_features=d_model)
self.norm1 = paddle.nn.LayerNorm(normalized_shape=d_model, epsilon=layer_norm_eps)
self.norm2 = paddle.nn.LayerNorm(normalized_shape=d_model, epsilon=layer_norm_eps)
self.norm_q = paddle.nn.LayerNorm(normalized_shape=d_model, epsilon=layer_norm_eps)
self.norm_c = paddle.nn.LayerNorm(normalized_shape=d_model, epsilon=layer_norm_eps)
self.dropout1 = paddle.nn.Dropout(p=dropout)
self.dropout2 = paddle.nn.Dropout(p=dropout)
self.dropout3 = paddle.nn.Dropout(p=dropout)
if activation == 'gelu':
self.activation = paddle.nn.GELU()
def __setstate__(self, state):
if 'activation' not in state:
state['activation'] = paddle.nn.functional.gelu
super().__setstate__(state)
def forward_stream(self, tgt, tgt_norm, tgt_kv, memory, tgt_mask, tgt_key_padding_mask):
"""Forward pass for a single stream (i.e. content or query)
tgt_norm is just a LayerNorm'd tgt. Added as a separate parameter for efficiency.
Both tgt_kv and memory are expected to be LayerNorm'd too.
memory is LayerNorm'd by ViT.
"""
if tgt_key_padding_mask is not None:
tgt_mask1 = (tgt_mask!=float('-inf'))[None,None,:,:] & (tgt_key_padding_mask[:,None,None,:]==False)
tgt2, sa_weights = self.self_attn(tgt_norm, tgt_kv, tgt_kv, attn_mask=tgt_mask1)
else:
tgt2, sa_weights = self.self_attn(tgt_norm, tgt_kv, tgt_kv, attn_mask=tgt_mask)
tgt = tgt + self.dropout1(tgt2)
tgt2, ca_weights = self.cross_attn(self.norm1(tgt), memory, memory)
tgt = tgt + self.dropout2(tgt2)
tgt2 = self.linear2(self.dropout(self.activation(self.linear1(self.norm2(tgt)))))
tgt = tgt + self.dropout3(tgt2)
return tgt, sa_weights, ca_weights
def forward(self, query, content, memory, query_mask=None, content_mask=None, content_key_padding_mask=None, update_content=True):
query_norm = self.norm_q(query)
content_norm = self.norm_c(content)
query = self.forward_stream(query, query_norm, content_norm, memory, query_mask, content_key_padding_mask)[0]
if update_content:
content = self.forward_stream(content, content_norm, content_norm, memory, content_mask, content_key_padding_mask)[0]
return query, content
def get_clones(module, N):
return paddle.nn.LayerList([copy.deepcopy(module) for i in range(N)])
class Decoder(paddle.nn.Layer):
__constants__ = ['norm']
def __init__(self, decoder_layer, num_layers, norm):
super().__init__()
self.layers = get_clones(decoder_layer, num_layers)
self.num_layers = num_layers
self.norm = norm
def forward(self, query, content, memory, query_mask: Optional[paddle.Tensor]=None, content_mask: Optional[paddle.Tensor]=None, content_key_padding_mask: Optional[paddle.Tensor]=None):
for i, mod in enumerate(self.layers):
last = i == len(self.layers) - 1
query, content = mod(query, content, memory, query_mask, content_mask, content_key_padding_mask, update_content=not last)
query = self.norm(query)
return query
class TokenEmbedding(paddle.nn.Layer):
def __init__(self, charset_size: int, embed_dim: int):
super().__init__()
self.embedding = paddle.nn.Embedding(num_embeddings=charset_size, embedding_dim=embed_dim)
self.embed_dim = embed_dim
def forward(self, tokens: paddle.Tensor):
return math.sqrt(self.embed_dim) * self.embedding(tokens.astype(paddle.int64))
def trunc_normal_init(param, **kwargs):
initializer = nn.initializer.TruncatedNormal(**kwargs)
initializer(param, param.block)
def constant_init(param, **kwargs):
initializer = nn.initializer.Constant(**kwargs)
initializer(param, param.block)
def kaiming_normal_init(param, **kwargs):
initializer = nn.initializer.KaimingNormal(**kwargs)
initializer(param, param.block)
class ParseQHead(nn.Layer):
def __init__(self, out_channels, max_text_length, embed_dim, dec_num_heads, dec_mlp_ratio, dec_depth, perm_num, perm_forward, perm_mirrored, decode_ar, refine_iters, dropout, **kwargs):
super().__init__()
self.bos_id = out_channels - 2
self.eos_id = 0
self.pad_id = out_channels - 1
self.max_label_length = max_text_length
self.decode_ar = decode_ar
self.refine_iters = refine_iters
decoder_layer = DecoderLayer(embed_dim, dec_num_heads, embed_dim * dec_mlp_ratio, dropout)
self.decoder = Decoder(decoder_layer, num_layers=dec_depth, norm=paddle.nn.LayerNorm(normalized_shape=embed_dim))
self.rng = np.random.default_rng()
self.max_gen_perms = perm_num // 2 if perm_mirrored else perm_num
self.perm_forward = perm_forward
self.perm_mirrored = perm_mirrored
self.head = paddle.nn.Linear(in_features=embed_dim, out_features=out_channels - 2)
self.text_embed = TokenEmbedding(out_channels, embed_dim)
self.pos_queries = paddle.create_parameter(shape=paddle.empty(shape=[1, max_text_length + 1, embed_dim]).shape, dtype=paddle.empty(shape=[1, max_text_length + 1, embed_dim]).numpy().dtype, default_initializer=paddle.nn.initializer.Assign(paddle.empty(shape=[1, max_text_length + 1, embed_dim])))
self.pos_queries.stop_gradient = False
self.dropout = paddle.nn.Dropout(p=dropout)
self._device = self.parameters()[0].place
trunc_normal_init(self.pos_queries, std=0.02)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, paddle.nn.Linear):
trunc_normal_init(m.weight, std=0.02)
if m.bias is not None:
constant_init(m.bias, value=0.0)
elif isinstance(m, paddle.nn.Embedding):
trunc_normal_init(m.weight, std=0.02)
if m._padding_idx is not None:
m.weight.data[m._padding_idx].zero_()
elif isinstance(m, paddle.nn.Conv2D):
kaiming_normal_init(m.weight, fan_in=None, nonlinearity='relu')
if m.bias is not None:
constant_init(m.bias, value=0.0)
elif isinstance(m, (paddle.nn.LayerNorm, paddle.nn.BatchNorm2D, paddle.nn.GroupNorm)):
constant_init(m.weight, value=1.0)
constant_init(m.bias, value=0.0)
def no_weight_decay(self):
param_names = {'text_embed.embedding.weight', 'pos_queries'}
enc_param_names = {('encoder.' + n) for n in self.encoder.no_weight_decay()}
return param_names.union(enc_param_names)
def encode(self, img):
return self.encoder(img)
def decode(self, tgt, memory, tgt_mask=None, tgt_padding_mask=None, tgt_query=None, tgt_query_mask=None):
N, L = tgt.shape
null_ctx = self.text_embed(tgt[:, :1])
if L != 1:
tgt_emb = self.pos_queries[:, :L - 1] + self.text_embed(tgt[:, 1:])
tgt_emb = self.dropout(paddle.concat(x=[null_ctx, tgt_emb], axis=1))
else:
tgt_emb = self.dropout(null_ctx)
if tgt_query is None:
tgt_query = self.pos_queries[:, :L].expand(shape=[N, -1, -1])
tgt_query = self.dropout(tgt_query)
return self.decoder(tgt_query, tgt_emb, memory, tgt_query_mask, tgt_mask, tgt_padding_mask)
def forward_test(self, memory, max_length=None):
testing = max_length is None
max_length = self.max_label_length if max_length is None else min(max_length, self.max_label_length)
bs = memory.shape[0]
num_steps = max_length + 1
pos_queries = self.pos_queries[:, :num_steps].expand(shape=[bs, -1, -1])
tgt_mask = query_mask = paddle.triu(x=paddle.full(shape=(num_steps, num_steps), fill_value=float('-inf')), diagonal=1)
if self.decode_ar:
tgt_in = paddle.full(shape=(bs, num_steps), fill_value=self.pad_id).astype('int64')
tgt_in[:, 0] = self.bos_id
logits = []
for i in range(paddle.to_tensor(num_steps)):
j = i + 1
tgt_out = self.decode(tgt_in[:, :j], memory, tgt_mask[:j, :j], tgt_query=pos_queries[:, i:j], tgt_query_mask=query_mask[i:j, :j])
p_i = self.head(tgt_out)
logits.append(p_i)
if j < num_steps:
tgt_in[:, j] = p_i.squeeze().argmax(axis=-1)
if testing and (tgt_in == self.eos_id).astype('bool').any(axis=-1).astype('bool').all():
break
logits = paddle.concat(x=logits, axis=1)
else:
tgt_in = paddle.full(shape=(bs, 1), fill_value=self.bos_id).astype('int64')
tgt_out = self.decode(tgt_in, memory, tgt_query=pos_queries)
logits = self.head(tgt_out)
if self.refine_iters:
temp = paddle.triu(x=paddle.ones(shape=[num_steps,num_steps], dtype='bool'), diagonal=2)
posi = np.where(temp.cpu().numpy()==True)
query_mask[posi] = 0
bos = paddle.full(shape=(bs, 1), fill_value=self.bos_id).astype('int64')
for i in range(self.refine_iters):
tgt_in = paddle.concat(x=[bos, logits[:, :-1].argmax(axis=-1)], axis=1)
tgt_padding_mask = (tgt_in == self.eos_id).astype(dtype='int32')
tgt_padding_mask = tgt_padding_mask.cpu()
tgt_padding_mask = tgt_padding_mask.cumsum(axis=-1) > 0
tgt_padding_mask = tgt_padding_mask.cuda().astype(dtype='float32')==1.0
tgt_out = self.decode(tgt_in, memory, tgt_mask, tgt_padding_mask, tgt_query=pos_queries, tgt_query_mask=query_mask[:, :tgt_in.shape[1]])
logits = self.head(tgt_out)
final_output = {"predict":logits}
return final_output
def gen_tgt_perms(self, tgt):
"""Generate shared permutations for the whole batch.
This works because the same attention mask can be used for the shorter sequences
because of the padding mask.
"""
max_num_chars = tgt.shape[1] - 2
if max_num_chars == 1:
return paddle.arange(end=3).unsqueeze(axis=0)
perms = [paddle.arange(end=max_num_chars)] if self.perm_forward else []
max_perms = math.factorial(max_num_chars)
if self.perm_mirrored:
max_perms //= 2
num_gen_perms = min(self.max_gen_perms, max_perms)
if max_num_chars < 5:
if max_num_chars == 4 and self.perm_mirrored:
selector = [0, 3, 4, 6, 9, 10, 12, 16, 17, 18, 19, 21]
else:
selector = list(range(max_perms))
perm_pool = paddle.to_tensor(data=list(permutations(range(max_num_chars), max_num_chars)), place=self._device)[selector]
if self.perm_forward:
perm_pool = perm_pool[1:]
perms = paddle.stack(x=perms)
if len(perm_pool):
i = self.rng.choice(len(perm_pool), size=num_gen_perms - len(perms), replace=False)
perms = paddle.concat(x=[perms, perm_pool[i]])
else:
perms.extend([paddle.randperm(n=max_num_chars) for _ in range(num_gen_perms - len(perms))])
perms = paddle.stack(x=perms)
if self.perm_mirrored:
comp = perms.flip(axis=-1)
x = paddle.stack(x=[perms, comp])
perm_2 = list(range(x.ndim))
perm_2[0] = 1
perm_2[1] = 0
perms = x.transpose(perm=perm_2).reshape((-1, max_num_chars))
bos_idx = paddle.zeros(shape=(len(perms), 1), dtype=perms.dtype)
eos_idx = paddle.full(shape=(len(perms), 1), fill_value=max_num_chars + 1, dtype=perms.dtype)
perms = paddle.concat(x=[bos_idx, perms + 1, eos_idx], axis=1)
if len(perms) > 1:
perms[1, 1:] = max_num_chars + 1 - paddle.arange(end=max_num_chars + 1)
return perms
def generate_attn_masks(self, perm):
"""Generate attention masks given a sequence permutation (includes pos. for bos and eos tokens)
:param perm: the permutation sequence. i = 0 is always the BOS
:return: lookahead attention masks
"""
sz = perm.shape[0]
mask = paddle.zeros(shape=(sz, sz))
for i in range(sz):
query_idx = perm[i].cpu().numpy().tolist()
masked_keys = perm[i + 1:].cpu().numpy().tolist()
if len(masked_keys) == 0:
break
mask[query_idx, masked_keys] = float('-inf')
content_mask = mask[:-1, :-1].clone()
mask[paddle.eye(num_rows=sz).astype('bool')] = float('-inf')
query_mask = mask[1:, :-1]
return content_mask, query_mask
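# Hypothetical illustration (not part of the original diff): for the plain left-to-right
# permutation perm = [0, 1, 2, 3] (BOS, two characters, EOS), query_mask reduces to the
# standard causal look-ahead mask (each query position may attend only to earlier content
# positions), while content_mask is the same lower-triangular mask but still lets each
# content position attend to itself.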
def forward_train(self, memory, tgt):
tgt_perms = self.gen_tgt_perms(tgt)
tgt_in = tgt[:, :-1]
tgt_padding_mask = (tgt_in == self.pad_id) | (tgt_in == self.eos_id)
logits_list = []
final_out = {}
for i, perm in enumerate(tgt_perms):
tgt_mask, query_mask = self.generate_attn_masks(perm)
out = self.decode(tgt_in, memory, tgt_mask, tgt_padding_mask, tgt_query_mask=query_mask)
logits = self.head(out)
if i == 0:
final_out['predict'] = logits
logits = logits.flatten(stop_axis=1)
logits_list.append(logits)
final_out['logits_list'] = logits_list
final_out['pad_id'] = self.pad_id
final_out['eos_id'] = self.eos_id
return final_out
def forward(self, feat, targets=None):
# feat : B, N, C
# targets : labels, labels_len
if self.training:
label = targets[0] # label
label_len = targets[1]
max_step = paddle.max(label_len).cpu().numpy()[0] + 2
crop_label = label[:, :max_step]
final_out = self.forward_train(feat, crop_label)
else:
final_out = self.forward_test(feat)
return final_out

View File

@ -28,7 +28,8 @@ from .fce_postprocess import FCEPostProcess
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \
DistillationCTCLabelDecode, NRTRLabelDecode, SARLabelDecode, \
SEEDLabelDecode, PRENLabelDecode, ViTSTRLabelDecode, ABINetLabelDecode, \
SPINLabelDecode, VLLabelDecode, RFLLabelDecode, SATRNLabelDecode
SPINLabelDecode, VLLabelDecode, RFLLabelDecode, SATRNLabelDecode, ParseQLabelDecode, \
CPPDLabelDecode
from .cls_postprocess import ClsPostProcess
from .pg_postprocess import PGPostProcess
from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess, DistillationSerPostProcess
@ -53,7 +54,7 @@ def build_post_process(config, global_config=None):
'DistillationSerPostProcess', 'DistillationRePostProcess',
'VLLabelDecode', 'PicoDetPostProcess', 'CTPostProcess',
'RFLLabelDecode', 'DRRGPostprocess', 'CANLabelDecode',
'SATRNLabelDecode'
'SATRNLabelDecode', 'ParseQLabelDecode', 'CPPDLabelDecode'
]
if config['name'] == 'PSEPostProcess':

View File

@ -67,7 +67,70 @@ class BaseRecLabelDecode(object):
def add_special_char(self, dict_character):
return dict_character
def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
def get_word_info(self, text, selection):
"""
Group the decoded characters and record the corresponding decoded positions.
Args:
text: the decoded text
selection: the bool array that identifies which columns of features are decoded as non-separated characters
Returns:
word_list: list of the grouped words
word_col_list: list of decoding positions corresponding to each character in the grouped word
state_list: list of markers identifying the type of each grouped word; there are two types:
- 'cn': continuous Chinese characters (e.g., 你好啊)
- 'en&num': continuous English characters (e.g., hello), numbers (e.g., 123, 1.123), or a mix of them connected by '-' (e.g., VGG-16)
The remaining characters in text are treated as separators between groups (e.g., space, '(', ')', etc.).
"""
state = None
word_content = []
word_col_content = []
word_list = []
word_col_list = []
state_list = []
valid_col = np.where(selection==True)[0]
for c_i, char in enumerate(text):
if '\u4e00' <= char <= '\u9fff':
c_state = 'cn'
elif bool(re.search('[a-zA-Z0-9]', char)):
c_state = 'en&num'
else:
c_state = 'splitter'
if char == '.' and state == 'en&num' and c_i + 1 < len(text) and bool(re.search('[0-9]', text[c_i+1])): # grouping floating-point numbers
c_state = 'en&num'
if char == '-' and state == "en&num": # grouping word with '-', such as 'state-of-the-art'
c_state = 'en&num'
if state is None:
state = c_state
if state != c_state:
if len(word_content) != 0:
word_list.append(word_content)
word_col_list.append(word_col_content)
state_list.append(state)
word_content = []
word_col_content = []
state = c_state
if state != "splitter":
word_content.append(char)
word_col_content.append(valid_col[c_i])
if len(word_content) != 0:
word_list.append(word_content)
word_col_list.append(word_col_content)
state_list.append(state)
return word_list, word_col_list, state_list
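# Hypothetical illustration (not part of the original diff): for text = "VGG-16 你好" with a
# selection mask covering every decoded column, get_word_info would return
# word_list = [['V', 'G', 'G', '-', '1', '6'], ['你', '好']], the matching column indices in
# word_col_list, and state_list = ['en&num', 'cn']; the space acts as a splitter between groups.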
def decode(self,
text_index,
text_prob=None,
is_remove_duplicate=False,
return_word_box=False):
""" convert text-index into text-label. """
result_list = []
ignored_tokens = self.get_ignored_tokens()
@ -96,7 +159,15 @@ class BaseRecLabelDecode(object):
if self.reverse: # for arabic rec
text = self.pred_reverse(text)
result_list.append((text, np.mean(conf_list).tolist()))
if return_word_box:
word_list, word_col_list, state_list = self.get_word_info(
text, selection)
result_list.append((text, np.mean(conf_list).tolist(), [
len(text_index[batch_idx]), word_list, word_col_list,
state_list
]))
else:
result_list.append((text, np.mean(conf_list).tolist()))
return result_list
def get_ignored_tokens(self):
@ -111,14 +182,28 @@ class CTCLabelDecode(BaseRecLabelDecode):
super(CTCLabelDecode, self).__init__(character_dict_path,
use_space_char)
def __call__(self, preds, label=None, *args, **kwargs):
def __call__(self,
preds,
label=None,
return_word_box=False,
*args,
**kwargs):
if isinstance(preds, tuple) or isinstance(preds, list):
preds = preds[-1]
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
preds_idx = preds.argmax(axis=2)
preds_prob = preds.max(axis=2)
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True)
text = self.decode(
preds_idx,
preds_prob,
is_remove_duplicate=True,
return_word_box=return_word_box)
if return_word_box:
for rec_idx, rec in enumerate(text):
wh_ratio = kwargs['wh_ratio_list'][rec_idx]
max_wh_ratio = kwargs['max_wh_ratio']
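# rec[2][0] holds the CTC column count, which corresponds to the batch-padded width
# (max_wh_ratio); rescaling by wh_ratio / max_wh_ratio maps it to the number of columns
# actually covered by this crop.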
rec[2][0] = rec[2][0] * (wh_ratio / max_wh_ratio)
if label is None:
return text
label = self.decode(label)
@ -492,6 +577,100 @@ class SRNLabelDecode(BaseRecLabelDecode):
return idx
class ParseQLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
BOS = '[B]'
EOS = '[E]'
PAD = '[P]'
def __init__(self, character_dict_path=None, use_space_char=False,
**kwargs):
super(ParseQLabelDecode, self).__init__(character_dict_path,
use_space_char)
self.max_text_length = kwargs.get('max_text_length', 25)
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, dict):
pred = preds['predict']
else:
pred = preds
char_num = len(self.character_str) + 1  # we do not predict <bos> or <pad>; only <eos> is added
if isinstance(pred, paddle.Tensor):
pred = pred.numpy()
B, L = pred.shape[:2]
pred = np.reshape(pred, [-1, char_num])
preds_idx = np.argmax(pred, axis=1)
preds_prob = np.max(pred, axis=1)
preds_idx = np.reshape(preds_idx, [B, L])
preds_prob = np.reshape(preds_prob, [B, L])
if label is None:
text = self.decode(preds_idx, preds_prob, raw=False)
return text
text = self.decode(preds_idx, preds_prob, raw=False)
label = self.decode(label, None, False)
return text, label
def decode(self, text_index, text_prob=None, raw=False):
""" convert text-index into text-label. """
result_list = []
ignored_tokens = self.get_ignored_tokens()
batch_size = len(text_index)
for batch_idx in range(batch_size):
char_list = []
conf_list = []
index = text_index[batch_idx, :]
prob = None
if text_prob is not None:
prob = text_prob[batch_idx, :]
if not raw:
index, prob = self._filter(index, prob)
for idx in range(len(index)):
if index[idx] in ignored_tokens:
continue
char_list.append(self.character[int(index[idx])])
if text_prob is not None:
conf_list.append(prob[idx])
else:
conf_list.append(1)
text = ''.join(char_list)
result_list.append((text, np.mean(conf_list).tolist()))
return result_list
def add_special_char(self, dict_character):
dict_character = [self.EOS] + dict_character + [self.BOS, self.PAD]
return dict_character
def _filter(self, ids, probs=None):
ids = ids.tolist()
try:
eos_idx = ids.index(self.dict[self.EOS])
except ValueError:
eos_idx = len(ids) # Nothing to truncate.
# Truncate after EOS
ids = ids[:eos_idx]
if probs is not None:
probs = probs[:eos_idx + 1]  # but include the probability for EOS (if it exists)
return ids, probs
def get_ignored_tokens(self):
return [self.dict[self.BOS], self.dict[self.EOS], self.dict[self.PAD]]
class SARLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
@ -1009,3 +1188,34 @@ class CANLabelDecode(BaseRecLabelDecode):
return text
label = self.decode(label)
return text, label
class CPPDLabelDecode(NRTRLabelDecode):
""" Convert between text-label and text-index """
def __init__(self, character_dict_path=None, use_space_char=False,
**kwargs):
super(CPPDLabelDecode, self).__init__(character_dict_path,
use_space_char)
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, tuple):
if isinstance(preds[-1], dict):
preds = preds[-1]['align'][-1].numpy()
else:
preds = preds[-1].numpy()
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
else:
preds = preds
preds_idx = preds.argmax(axis=2)
preds_prob = preds.max(axis=2)
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
if label is None:
return text
label = self.decode(label)
return text, label
def add_special_char(self, dict_character):
dict_character = ['</s>'] + dict_character
return dict_character

View File

@ -0,0 +1,94 @@
0
1
2
3
4
5
6
7
8
9
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
:
;
<
=
>
?
@
[
\
]
^
_
`
{
|
}
~

ppocr/utils/utility.py 100755 → 100644
View File

@ -14,7 +14,6 @@
import logging
import os
import imghdr
import cv2
import random
import numpy as np
@ -57,7 +56,7 @@ def _check_image_file(path):
return any([path.lower().endswith(e) for e in img_end])
def get_image_file_list(img_file):
def get_image_file_list(img_file, infer_list=None):
imgs_lists = []
if img_file is None or not os.path.exists(img_file):
raise Exception("not found any img file in {}".format(img_file))
@ -69,6 +68,7 @@ def get_image_file_list(img_file):
file_path = os.path.join(img_file, single_file)
if os.path.isfile(file_path) and _check_image_file(file_path):
imgs_lists.append(file_path)
if len(imgs_lists) == 0:
raise Exception("not found any img file in {}".format(img_file))
imgs_lists = sorted(imgs_lists)
@ -108,7 +108,6 @@ def check_and_read(img_path):
return imgvalue, True, False
elif os.path.basename(img_path)[-3:].lower() == 'pdf':
from paddle.utils import try_import
fitz = try_import("fitz")
from PIL import Image
imgs = []

View File

@ -14,6 +14,7 @@
import cv2
import os
import numpy as np
import PIL
from PIL import Image, ImageDraw, ImageFont
@ -62,8 +63,13 @@ def draw_box_txt(bbox, text, draw, font, font_size, color):
draw.rectangle(bbox, fill=color)
# draw ocr results
left, top, right, bottom = font.getbbox(text)
tw, th = right - left, bottom - top
if int(PIL.__version__.split('.')[0]) < 10:
tw = font.getsize(text)[0]
th = font.getsize(text)[1]
else:
left, top, right, bottom = font.getbbox(text)
tw, th = right - left, bottom - top
start_y = max(0, bbox[0][1] - th)
draw.rectangle(
[(bbox[0][0] + 1, start_y), (bbox[0][0] + tw + 1, start_y + th)],

View File

@ -50,6 +50,10 @@ PP-StructureV2 supports the independent use or flexible collocation of each modu
The figure shows the pipeline of layout analysis + table recognition. The image is first divided into four kinds of regions (image, text, title, and table) by layout analysis; OCR detection and recognition is then performed on the image, text, and title regions, and table recognition is performed on the table region; the image regions are also stored for later use.
<img src="docs/table/ppstructure.GIF" width="100%"/>
### 3.1.1 Layout recognition returns single-word coordinates
The following figure shows the result of returning single-word coordinates from layout analysis; please refer to the [doc](./return_word_pos.md).
![show_0_mdf_v2](https://github.com/PaddlePaddle/PaddleOCR/assets/43341135/799450d4-d2c5-4b61-b490-e160dc0f515c)
### 3.2 Layout recovery
The following figure shows the effect of layout recovery based on the results of layout analysis and table recognition in the previous section.

View File

@ -52,12 +52,16 @@ PP-StructureV2支持各个模块独立使用或灵活搭配可以单独
The figure below shows the overall pipeline of layout analysis + table recognition: the image is first divided into four kinds of regions (image, text, title, and table) by layout analysis; OCR detection and recognition is then performed on the image, text, and title regions, and table recognition is performed on the table region; the image regions are also stored for later use.
<img src="./docs/table/ppstructure.GIF" width="100%"/>
### 3.1.1 Layout recognition returns single-word coordinates
The figure below shows the effect of locating individual characters based on the layout analysis from the previous section; see the [doc](./return_word_pos.md).
![show_0_mdf_v2](https://github.com/PaddlePaddle/PaddleOCR/assets/43341135/799450d4-d2c5-4b61-b490-e160dc0f515c)
<a name="32"></a>
### 3.2 Layout recovery
The figure below shows the effect of layout recovery based on the layout analysis and table recognition results from the previous section.
<img src="./docs/recovery/recovery.jpg" width="100%"/>
<a name="33"></a>
### 3.3 Key information extraction

View File

@ -34,7 +34,7 @@ from ppocr.utils.visual import draw_ser_results, draw_re_results
from tools.infer.predict_system import TextSystem
from ppstructure.layout.predict_layout import LayoutPredictor
from ppstructure.table.predict_table import TableSystem, to_excel
from ppstructure.utility import parse_args, draw_structure_result
from ppstructure.utility import parse_args, draw_structure_result, cal_ocr_word_box
logger = get_logger()
@ -79,6 +79,8 @@ class StructureSystem(object):
from ppstructure.kie.predict_kie_token_ser_re import SerRePredictor
self.kie_predictor = SerRePredictor(args)
self.return_word_box = args.return_word_box
def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
time_dict = {
'image_orientation': 0,
@ -156,17 +158,27 @@ class StructureSystem(object):
]
res = []
for box, rec_res in zip(filter_boxes, filter_rec_res):
rec_str, rec_conf = rec_res
rec_str, rec_conf = rec_res[0], rec_res[1]
for token in style_token:
if token in rec_str:
rec_str = rec_str.replace(token, '')
if not self.recovery:
box += [x1, y1]
res.append({
'text': rec_str,
'confidence': float(rec_conf),
'text_region': box.tolist()
})
if self.return_word_box:
word_box_content_list, word_box_list = cal_ocr_word_box(rec_str, box, rec_res[2])
res.append({
'text': rec_str,
'confidence': float(rec_conf),
'text_region': box.tolist(),
'text_word': word_box_content_list,
'text_word_region': word_box_list
})
else:
res.append({
'text': rec_str,
'confidence': float(rec_conf),
'text_region': box.tolist()
})
res_list.append({
'type': region['label'].lower(),
'bbox': [x1, y1, x2, y2],

View File

@ -152,7 +152,7 @@ cd PaddleOCR/ppstructure
# download model
mkdir inference && cd inference
# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
# Download the ultra-lightweight English table inch model and unzip it

View File

@ -0,0 +1,85 @@
# Return recognition positions
For horizontally laid-out documents, the recognition model returns not only the recognized content but also the position of each character.
## English document recovery
### First, download the inference models:
```bash
cd PaddleOCR/ppstructure
## download model
mkdir inference && cd inference
## Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
## Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
## Download the ultra-lightweight English table inch model and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar
tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
## Download the layout model of publaynet dataset and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar
tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
cd ..
```
### Then run inference under the /ppstructure/ directory with the following command:
```bash
python predict_system.py \
--image_dir=./docs/table/1.png \
--det_model_dir=inference/en_PP-OCRv3_det_infer \
--rec_model_dir=inference/en_PP-OCRv3_rec_infer \
--rec_char_dict_path=../ppocr/utils/en_dict.txt \
--table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
--layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \
--layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \
--vis_font_path=../doc/fonts/simfang.ttf \
--recovery=True \
--output=../output/ \
--return_word_box=True
```
### View the visualization of the inference result under ../output/structure/1/show_0.jpg, as shown in the figure below
![show_0_mdf_v2](https://github.com/PaddlePaddle/PaddleOCR/assets/43341135/799450d4-d2c5-4b61-b490-e160dc0f515c)
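As a hypothetical illustration (field names follow `StructureSystem.__call__` in `ppstructure/predict_system.py` shown earlier in this diff; the values are made up), each OCR entry in the saved result additionally carries per-word contents and boxes when `--return_word_box=True`:
```python
# A minimal sketch of one OCR entry with --return_word_box=True; values are illustrative only.
region_result = {
    "text": "Table 1 shows the results",
    "confidence": 0.98,
    "text_region": [[60, 80], [420, 80], [420, 110], [60, 110]],  # box of the whole text line
    "text_word": ["Table", "1", "shows", "the", "results"],       # words grouped by get_word_info
    "text_word_region": [                                         # one 4-point box per word
        [[60, 80], [130, 80], [130, 110], [60, 110]],
        # ... one box for each remaining word
    ],
}
```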
## Chinese document recovery
### First, download the inference models
```bash
cd PaddleOCR/ppstructure
## download model
cd inference
## Download the detection model of the ultra-lightweight Chinese PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
## Download the recognition model of the ultra-lightweight Chinese PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
## Download the ultra-lightweight Chinese table inch model and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar
tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
## Download the layout model of CDLA dataset and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar
tar xf picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar
cd ..
```
### Upload the test image "2.png" below to the directory ./docs/table/
![2](https://github.com/PaddlePaddle/PaddleOCR/assets/43341135/d0858341-a889-483c-8373-5ecaa57f3b20)
### Then run inference under the /ppstructure/ directory with the following command
```bash
python predict_system.py \
--image_dir=./docs/table/2.png \
--det_model_dir=inference/ch_PP-OCRv3_det_infer \
--rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
--layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_cdla_infer \
--layout_dict_path=../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt \
--vis_font_path=../doc/fonts/chinese_cht.ttf \
--recovery=True \
--output=../output/ \
--return_word_box=True
```
### View the visualization of the inference result under ../output/structure/2/show_0.jpg, as shown in the figure below
![show_1_mdf_v2](https://github.com/PaddlePaddle/PaddleOCR/assets/43341135/3c200538-f2e6-4d79-847a-4c4587efa9f0)

View File

@ -13,10 +13,11 @@
# limitations under the License.
import random
import ast
import PIL
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from tools.infer.utility import draw_ocr_box_txt, str2bool, str2int_tuple, init_args as infer_args
import math
def init_args():
parser = infer_args()
@ -147,8 +148,13 @@ def draw_structure_result(image, result, font_path):
[(box_layout[0], box_layout[1]), (box_layout[2], box_layout[3])],
outline=box_color,
width=3)
left, top, right, bottom = font.getbbox(region['type'])
text_w, text_h = right - left, bottom - top
if int(PIL.__version__.split('.')[0]) < 10:
text_w, text_h = font.getsize(region['type'])
else:
left, top, right, bottom = font.getbbox(region['type'])
text_w, text_h = right - left, bottom - top
draw_layout.rectangle(
[(box_layout[0], box_layout[1]),
(box_layout[0] + text_w, box_layout[1] + text_h)],
@ -167,6 +173,70 @@ def draw_structure_result(image, result, font_path):
txts.append(text_result['text'])
scores.append(text_result['confidence'])
if 'text_word_region' in text_result:
for word_region in text_result['text_word_region']:
char_box = word_region
box_height = int(
math.sqrt((char_box[0][0] - char_box[3][0])**2 + (
char_box[0][1] - char_box[3][1])**2))
box_width = int(
math.sqrt((char_box[0][0] - char_box[1][0])**2 + (
char_box[0][1] - char_box[1][1])**2))
if box_height == 0 or box_width == 0:
continue
boxes.append(word_region)
txts.append("")
scores.append(1.0)
im_show = draw_ocr_box_txt(
img_layout, boxes, txts, scores, font_path=font_path, drop_score=0)
return im_show
def cal_ocr_word_box(rec_str, box, rec_word_info):
'''Calculate the detection box of each word based on the OCR detection and recognition results.'''
col_num, word_list, word_col_list, state_list = rec_word_info
box = box.tolist()
bbox_x_start = box[0][0]
bbox_x_end = box[1][0]
bbox_y_start = box[0][1]
bbox_y_end = box[2][1]
cell_width = (bbox_x_end - bbox_x_start) / col_num
word_box_list = []
word_box_content_list = []
cn_width_list = []
cn_col_list = []
for word, word_col, state in zip(word_list, word_col_list, state_list):
if state == 'cn':
if len(word_col) != 1:
char_seq_length = (word_col[-1] - word_col[0] + 1) * cell_width
char_width = char_seq_length / (len(word_col) - 1)
cn_width_list.append(char_width)
cn_col_list += word_col
word_box_content_list += word
else:
cell_x_start = bbox_x_start + int(word_col[0] * cell_width)
cell_x_end = bbox_x_start + int((word_col[-1] + 1) * cell_width)
cell = ((cell_x_start, bbox_y_start), (cell_x_end, bbox_y_start),
(cell_x_end, bbox_y_end), (cell_x_start, bbox_y_end))
word_box_list.append(cell)
word_box_content_list.append("".join(word))
if len(cn_col_list) != 0:
if len(cn_width_list) != 0:
avg_char_width = np.mean(cn_width_list)
else:
avg_char_width = (bbox_x_end - bbox_x_start) / len(rec_str)
for center_idx in cn_col_list:
center_x = (center_idx + 0.5) * cell_width
cell_x_start = max(int(center_x - avg_char_width / 2),
0) + bbox_x_start
cell_x_end = min(
int(center_x + avg_char_width / 2), bbox_x_end -
bbox_x_start) + bbox_x_start
cell = ((cell_x_start, bbox_y_start), (cell_x_end, bbox_y_start),
(cell_x_end, bbox_y_end), (cell_x_start, bbox_y_end))
word_box_list.append(cell)
return word_box_content_list, word_box_list
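# Hypothetical usage sketch (not part of the original diff): `box` is the 4-point detection
# box of a text line as a numpy array, and `rec_word_info` is the
# (col_num, word_list, word_col_list, state_list) info produced by CTCLabelDecode
# with return_word_box=True.
#
# import numpy as np
# line_box = np.array([[0, 0], [200, 0], [200, 32], [0, 32]], dtype=float)
# rec_word_info = (
#     20,                               # number of CTC columns for this line
#     [list("VGG-16"), ['你', '好']],    # grouped words from get_word_info
#     [[0, 1, 2, 3, 4, 5], [10, 12]],   # CTC column indices of each character
#     ['en&num', 'cn'],                 # group types
# )
# contents, word_boxes = cal_ocr_word_box("VGG-16 你好", line_box, rec_word_info)
# contents -> ['VGG-16', '你', '好'], with one 4-point box per entry in word_boxes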

View File

@ -68,7 +68,7 @@ def export_single_model(model,
shape=[None, 3, 48, -1], dtype="float32"),
]
model = to_static(model, input_spec=other_shape)
elif arch_config["algorithm"] == "SVTR":
elif arch_config["algorithm"] in ["SVTR", "CPPD"]:
other_shape = [
paddle.static.InputSpec(
shape=[None] + input_shape, dtype="float32"),
@ -93,11 +93,12 @@ def export_single_model(model,
]
model = to_static(model, input_spec=other_shape)
elif arch_config["algorithm"] == "ABINet":
if not input_shape:
input_shape = [3, 32, 128]
other_shape = [
paddle.static.InputSpec(
shape=[None, 3, 32, 128], dtype="float32"),
shape=[None] + input_shape, dtype="float32"),
]
# print([None, 3, 32, 128])
model = to_static(model, input_spec=other_shape)
elif arch_config["algorithm"] in ["NRTR", "SPIN", 'RFL']:
other_shape = [
@ -105,7 +106,7 @@ def export_single_model(model,
shape=[None, 1, 32, 100], dtype="float32"),
]
model = to_static(model, input_spec=other_shape)
elif arch_config["algorithm"] == 'SATRN':
elif arch_config["algorithm"] in ['SATRN']:
other_shape = [
paddle.static.InputSpec(
shape=[None, 3, 32, 100], dtype="float32"),
@ -266,10 +267,13 @@ def main():
arch_config = config["Architecture"]
if arch_config["algorithm"] == "SVTR" and arch_config["Head"][
if arch_config["algorithm"] in ["SVTR", "CPPD"] and arch_config["Head"][
"name"] != 'MultiHead':
input_shape = config["Eval"]["dataset"]["transforms"][-2][
'SVTRRecResizeImg']['image_shape']
elif arch_config["algorithm"].lower() == "ABINet".lower():
rec_rs = [c for c in config["Eval"]["dataset"]["transforms"] if 'ABINetRecResizeImg' in c]
input_shape = rec_rs[0]['ABINetRecResizeImg']['image_shape'] if rec_rs else None
else:
input_shape = None

View File

@ -73,7 +73,8 @@ class TextRecognizer(object):
postprocess_params = {
'name': 'VLLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
"use_space_char": args.use_space_char,
"max_text_length": args.max_text_length
}
elif self.rec_algorithm == 'ViTSTR':
postprocess_params = {
@ -113,6 +114,13 @@ class TextRecognizer(object):
"use_space_char": args.use_space_char,
"rm_symbol": True
}
elif self.rec_algorithm in ["CPPD", "CPPDPadding"]:
postprocess_params = {
'name': 'CPPDLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char,
"rm_symbol": True
}
elif self.rec_algorithm == "PREN":
postprocess_params = {'name': 'PRENLabelDecode'}
elif self.rec_algorithm == "CAN":
@ -122,7 +130,14 @@ class TextRecognizer(object):
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == "ParseQ":
postprocess_params = {
'name': 'ParseQLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
self.postprocess_op = build_post_process(postprocess_params)
self.postprocess_params = postprocess_params
self.predictor, self.input_tensor, self.output_tensors, self.config = \
utility.create_predictor(args, 'rec', logger)
self.benchmark = args.benchmark
@ -146,6 +161,7 @@ class TextRecognizer(object):
],
warmup=0,
logger=logger)
self.return_word_box = args.return_word_box
def resize_norm_img(self, img, max_wh_ratio):
imgC, imgH, imgW = self.rec_image_shape
@ -156,7 +172,7 @@ class TextRecognizer(object):
if self.rec_algorithm == 'ViTSTR':
img = image_pil.resize([imgW, imgH], Image.BICUBIC)
else:
img = image_pil.resize([imgW, imgH], Image.LANCZOS)
img = image_pil.resize([imgW, imgH], Image.Resampling.LANCZOS)
img = np.array(img)
norm_img = np.expand_dims(img, -1)
norm_img = norm_img.transpose((2, 0, 1))
@ -348,6 +364,38 @@ class TextRecognizer(object):
resized_image /= 0.5
return resized_image
def resize_norm_img_cppd_padding(self,
img,
image_shape,
padding=True,
interpolation=cv2.INTER_LINEAR):
imgC, imgH, imgW = image_shape
h = img.shape[0]
w = img.shape[1]
if not padding:
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=interpolation)
resized_w = imgW
else:
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
if image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def resize_norm_img_abinet(self, img, image_shape):
imgC, imgH, imgW = image_shape
@ -415,11 +463,12 @@ class TextRecognizer(object):
valid_ratios = []
imgC, imgH, imgW = self.rec_image_shape[:3]
max_wh_ratio = imgW / imgH
# max_wh_ratio = 0
wh_ratio_list = []
for ino in range(beg_img_no, end_img_no):
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
wh_ratio_list.append(wh_ratio)
for ino in range(beg_img_no, end_img_no):
if self.rec_algorithm == "SAR":
norm_img, _, _, valid_ratio = self.resize_norm_img_sar(
@ -436,11 +485,16 @@ class TextRecognizer(object):
gsrm_slf_attn_bias1_list.append(norm_img[3])
gsrm_slf_attn_bias2_list.append(norm_img[4])
norm_img_batch.append(norm_img[0])
elif self.rec_algorithm in ["SVTR", "SATRN"]:
elif self.rec_algorithm in ["SVTR", "SATRN", "ParseQ", "CPPD"]:
norm_img = self.resize_norm_img_svtr(img_list[indices[ino]],
self.rec_image_shape)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
elif self.rec_algorithm in ["CPPDPadding"]:
norm_img = self.resize_norm_img_cppd_padding(
img_list[indices[ino]], self.rec_image_shape)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
elif self.rec_algorithm in ["VisionLAN", "PREN"]:
norm_img = self.resize_norm_img_vl(img_list[indices[ino]],
self.rec_image_shape)
@ -625,7 +679,14 @@ class TextRecognizer(object):
else:
preds = outputs[0]
self.predictor.try_shrink_memory()
rec_result = self.postprocess_op(preds)
if self.postprocess_params['name'] == 'CTCLabelDecode':
rec_result = self.postprocess_op(
preds,
return_word_box=self.return_word_box,
wh_ratio_list=wh_ratio_list,
max_wh_ratio=max_wh_ratio)
else:
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
if self.benchmark:

View File

@ -111,7 +111,7 @@ class TextSystem(object):
rec_res)
filter_boxes, filter_rec_res = [], []
for box, rec_result in zip(dt_boxes, rec_res):
text, score = rec_result
text, score = rec_result[0], rec_result[1]
if score >= self.drop_score:
filter_boxes.append(box)
filter_rec_res.append(rec_result)

View File

@ -19,6 +19,7 @@ import platform
import cv2
import numpy as np
import paddle
import PIL
from PIL import Image, ImageDraw, ImageFont
import math
from paddle import inference
@ -30,15 +31,18 @@ from ppocr.utils.logging import get_logger
def str2bool(v):
return v.lower() in ("true", "yes", "t", "y", "1")
def str2int_tuple(v):
return tuple([int(i.strip()) for i in v.split(",")])
def init_args():
parser = argparse.ArgumentParser()
# params for prediction engine
parser.add_argument("--use_gpu", type=str2bool, default=True)
parser.add_argument("--use_xpu", type=str2bool, default=False)
parser.add_argument("--use_npu", type=str2bool, default=False)
parser.add_argument("--use_mlu", type=str2bool, default=False)
parser.add_argument("--ir_optim", type=str2bool, default=True)
parser.add_argument("--use_tensorrt", type=str2bool, default=False)
parser.add_argument("--min_subgraph_size", type=int, default=15)
@ -147,6 +151,10 @@ def init_args():
parser.add_argument("--show_log", type=str2bool, default=True)
parser.add_argument("--use_onnx", type=str2bool, default=False)
# extended function
parser.add_argument("--return_word_box", type=str2bool, default=False, help='Whether return the bbox of each word (split by space) or chinese character. Only used in ppstructure for layout recovery')
return parser
@ -184,7 +192,10 @@ def create_predictor(args, mode, logger):
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(
model_file_path))
sess = ort.InferenceSession(model_file_path)
if args.use_gpu:
sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider'])
else:
sess = ort.InferenceSession(model_file_path)
return sess, sess.get_inputs()[0], None, None
else:
@ -249,6 +260,8 @@ def create_predictor(args, mode, logger):
elif args.use_npu:
config.enable_custom_device("npu")
elif args.use_mlu:
config.enable_custom_device("mlu")
elif args.use_xpu:
config.enable_xpu(10 * 1024 * 1024)
else:
@ -473,7 +486,11 @@ def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"):
def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"):
font_size = int(sz[1] * 0.99)
font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
length = font.getlength(txt)
if int(PIL.__version__.split('.')[0]) < 10:
length = font.getsize(txt)[0]
else:
length = font.getlength(txt)
if length > sz[0]:
font_size = int(font_size * sz[0] / length)
font = ImageFont.truetype(font_path, font_size, encoding="utf-8")

View File

@ -118,9 +118,11 @@ def main():
os.makedirs(os.path.dirname(save_res_path))
model.eval()
infer_imgs = config['Global']['infer_img']
infer_list = config['Global'].get('infer_list', None)
with open(save_res_path, "w") as fout:
for file in get_image_file_list(config['Global']['infer_img']):
for file in get_image_file_list(infer_imgs, infer_list=infer_list):
logger.info("infer_img: {}".format(file))
with open(file, 'rb') as f:
img = f.read()

View File

@ -185,6 +185,7 @@ def train(config,
eval_class,
pre_best_model_dict,
logger,
step_pre_epoch,
log_writer=None,
scaler=None,
amp_level='O2',
@ -198,6 +199,7 @@ def train(config,
epoch_num = config['Global']['epoch_num']
print_batch_step = config['Global']['print_batch_step']
eval_batch_step = config['Global']['eval_batch_step']
eval_batch_epoch = config['Global'].get('eval_batch_epoch', None)
profiler_options = config['profiler_options']
global_step = 0
@ -205,8 +207,9 @@ def train(config,
global_step = pre_best_model_dict['global_step']
start_eval_step = 0
if type(eval_batch_step) == list and len(eval_batch_step) >= 2:
start_eval_step = eval_batch_step[0]
eval_batch_step = eval_batch_step[1]
start_eval_step = eval_batch_step[0] if not eval_batch_epoch else 0
eval_batch_step = eval_batch_step[1] if not eval_batch_epoch else step_pre_epoch * eval_batch_epoch
if len(valid_dataloader) == 0:
logger.info(
'No Images in eval dataset, evaluation during training ' \
@ -231,7 +234,7 @@ def train(config,
use_srn = config['Architecture']['algorithm'] == "SRN"
extra_input_models = [
"SRN", "NRTR", "SAR", "SEED", "SVTR", "SVTR_LCNet", "SPIN", "VisionLAN",
"RobustScanner", "RFL", 'DRRG', 'SATRN', 'SVTR_HGNet'
"RobustScanner", "RFL", 'DRRG', 'SATRN', 'SVTR_HGNet', "ParseQ", "CPPD"
]
extra_input = False
if config['Architecture']['algorithm'] == 'Distillation':
@ -664,7 +667,7 @@ def preprocess(is_train=False):
'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'LayoutLMv2', 'PREN', 'FCE',
'SVTR', 'SVTR_LCNet', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN',
'VisionLAN', 'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'RFL', 'DRRG',
'CAN', 'Telescope', 'SATRN', 'SVTR_HGNet'
'CAN', 'Telescope', 'SATRN', 'SVTR_HGNet', 'ParseQ', 'CPPD'
]
if use_xpu:

View File

@ -41,7 +41,7 @@ import tools.program as program
dist.get_world_size()
def main(config, device, logger, vdl_writer):
def main(config, device, logger, vdl_writer, seed):
# init dist environment
if config['Global']['distributed']:
dist.init_parallel_env()
@ -50,7 +50,7 @@ def main(config, device, logger, vdl_writer):
# build dataloader
set_signal_handlers()
train_dataloader = build_dataloader(config, 'Train', device, logger)
train_dataloader = build_dataloader(config, 'Train', device, logger, seed)
if len(train_dataloader) == 0:
logger.error(
"No Images in train dataset, please ensure\n" +
@ -61,9 +61,11 @@ def main(config, device, logger, vdl_writer):
return
if config['Eval']:
valid_dataloader = build_dataloader(config, 'Eval', device, logger)
valid_dataloader = build_dataloader(config, 'Eval', device, logger,
seed)
else:
valid_dataloader = None
step_pre_epoch = len(train_dataloader)
# build post process
post_process_class = build_post_process(config['PostProcess'],
@ -93,8 +95,8 @@ def main(config, device, logger, vdl_writer):
'DistillationSARLoss'][
'ignore_index'] = char_num + 1
out_channels_list['SARLabelDecode'] = char_num + 2
elif list(config['Loss']['loss_config_list'][-1].keys())[
0] == 'DistillationNRTRLoss':
elif any('DistillationNRTRLoss' in d
for d in config['Loss']['loss_config_list']):
out_channels_list['NRTRLabelDecode'] = char_num + 3
config['Architecture']['Models'][key]['Head'][
@ -197,9 +199,9 @@ def main(config, device, logger, vdl_writer):
# start train
program.train(config, train_dataloader, valid_dataloader, device, model,
loss_class, optimizer, lr_scheduler, post_process_class,
eval_class, pre_best_model_dict, logger, vdl_writer, scaler,
amp_level, amp_custom_black_list, amp_custom_white_list,
amp_dtype)
eval_class, pre_best_model_dict, logger, step_pre_epoch,
vdl_writer, scaler, amp_level, amp_custom_black_list,
amp_custom_white_list, amp_dtype)
def test_reader(config, device, logger):
@ -224,5 +226,5 @@ if __name__ == '__main__':
config, device, logger, vdl_writer = program.preprocess(is_train=True)
seed = config['Global']['seed'] if 'seed' in config['Global'] else 1024
set_seed(seed)
main(config, device, logger, vdl_writer)
main(config, device, logger, vdl_writer, seed)
# test_reader(config, device, logger)